Example #1
func ParseHtml(body io.Reader, ps []post.Post) (psout []post.Post, err error) {

	// Create a goquery document to parse from
	doc, err := goquery.NewDocumentFromReader(body)
	checkErr(err, "Failed to parse HTML")

	if err == nil {
		fmt.Println("---- Starting to parse ------------------------")

		// Find reddit posts = elements with class "thing"
		thing := doc.Find(".thing")
		for iThing := range thing.Nodes {

			post := post.NewPost()

			// use `single` as a selection of 1 node
			singlething := thing.Eq(iThing)

			// get the reddit post identifier
			reddit_post_id, exists := singlething.Attr("data-fullname")
			if !exists {
				singlehtml, _ := singlething.Html()
				post.Err = fmt.Errorf("data-fullname not found in %s", singlehtml)
			} else {

				// find an element with class title and a child with class may-blank
				// Remove CRLF and unnecessary whitespaces
				post.Title = stringMinifier(singlething.Find(".title .may-blank").Text())

				post.PostId = reddit_post_id
				post.User = singlething.Find(".author").Text()
				post.Url, _ = singlething.Find(".comments.may-blank").Attr("href")
				post.SetScore(singlething.Find(".score.likes").Text())
				reddit_postdate, exists := singlething.Find("time").Attr("datetime")

				if !exists {
					singlehtml, _ := singlething.Html()
					post.Err = fmt.Errorf("datetime not found in %s", singlehtml)
				} else {

					post.SetPostDate(reddit_postdate)

					// Print out the crawled info
					fmt.Println(post.String())
					fmt.Println("-----------------------------------------------")

				}
			}
			ps = append(ps, post)

		}
	}

	return ps, err
}
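Example #1 calls two helpers that are not defined in the snippet (stringMinifier is also used by the later examples). A minimal sketch of what they might look like, inferred from how they are used; the bodies are assumptions, not the original implementations:

// checkErr logs a message when err is non-nil.
func checkErr(err error, msg string) {
	if err != nil {
		log.Printf("%s: %s", msg, err)
	}
}

// stringMinifier collapses CR/LF and runs of whitespace into single spaces.
func stringMinifier(s string) string {
	return strings.Join(strings.Fields(s), " ")
}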
Example #2
// ParseHtmlReddit parses reddit posts from the HTML supplied via an io.Reader and returns the recognized posts in the psout slice of posts.
// Errors which affect only a single post are stored in that post's Err field.
func ParseHtmlReddit(body io.Reader, ps []post.Post) (psout []post.Post, err error) {

	// Create a goquery document to parse from an io.Reader
	doc, err := goquery.NewDocumentFromReader(body)
	if err != nil {
		return ps, errors.New("Failed to parse HTML: " + err.Error())
	}

	// Find reddit posts = elements with class "thing"
	thing := doc.Find(".thing")
	for iThing := range thing.Nodes {

		// Create a new post struct - if the crawling fails the post will have an Err attached
		// but will be added to the outgoing (psout) slice nevertheless
		post := post.NewPost()
		post.Site = "reddit"

		// use `singlething` as a selection of one single post
		singlething := thing.Eq(iThing)

		// get the reddit post identifier
		reddit_post_id, exists := singlething.Attr("data-fullname")
		if !exists {
			singlehtml, _ := singlething.Html()
			post.Err = fmt.Errorf("data-fullname not found in %s", singlehtml)
		} else {
			post.WebPostId = reddit_post_id
			// find an element with class title and a child with class may-blank
			// and remove CRLF and unnecessary whitespaces
			post.Title = stringMinifier(singlething.Find(".title .may-blank").Text())
			// Get the post user
			post.User = singlething.Find(".author").Text()
			// Get the post url
			post.Url, _ = singlething.Find(".comments.may-blank").Attr("href")
			// Get the post likes score
			post.SetScore(singlething.Find(".score.likes").Text())
			// Get the post date
			reddit_postdate, exists := singlething.Find("time").Attr("datetime")

			if !exists {
				singlehtml, _ := singlething.Html()
				post.Err = fmt.Errorf("datetime not found in %s", singlehtml)
			} else {

				post.SetPostDate(reddit_postdate)

			}
		}
		ps = append(ps, post)

	}

	return ps, err
}
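A minimal caller sketch for ParseHtmlReddit; the subreddit URL and the surrounding error handling here are illustrative, not part of the example:

resp, err := http.Get("https://www.reddit.com/r/golang/new")
if err != nil {
	log.Fatal(err)
}
defer resp.Body.Close()

posts, err := ParseHtmlReddit(resp.Body, make([]post.Post, 0))
if err != nil {
	log.Fatal(err)
}
for _, p := range posts {
	if p.Err != nil {
		fmt.Println("single post error: " + p.Err.Error())
	}
}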
func Test() (err error) {
	//drivername := "postgres"
	//dsn := "user=golang password=golang dbname=golang sslmode=disable"
	//dialect := gorp.PostgresDialect{}

	drivername := "mysql"
	dsn := "golang:golang@/golang?parseTime=true&collation=utf8mb4_general_ci"
	dialect := gorp.MySQLDialect{"InnoDB", "utf8mb4"}

	// connect to db using standard Go database/sql API
	db, err := sql.Open(drivername, dsn)
	if err != nil {
		return errors.New("sql.Open failed: " + err.Error())
	}

	// Open doesn't open a connection. Validate DSN data using ping
	if err = db.Ping(); err != nil {
		return errors.New("db.Ping failed: " + err.Error())
	}

	// Set the connection to use utf8mb4
	if dialect.Engine == "InnoDB" {
		_, err = db.Exec("SET NAMES utf8mb4 COLLATE utf8mb4_general_ci")
		if err != nil {
			return errors.New("SET NAMES utf8mb4 COLLATE utf8mb4_general_ci: " + err.Error())
		}
	}

	// construct a gorp DbMap
	dbmap := &gorp.DbMap{Db: db, Dialect: dialect}
	defer dbmap.Db.Close()
	dbmap.DebugLevel = DebugLevel
	// Will log all SQL statements + args as they are run
	// The first arg is a string prefix to prepend to all log messages
	dbmap.TraceOn("[gorp]", log.New(os.Stdout, "fetch:", log.Lmicroseconds))

	// register the structs you wish to use with gorp
	// you can also use the shorter dbmap.AddTable() if you
	// don't want to override the table name

	// SetKeys(true) means we have an auto increment primary key, which
	// will get automatically bound to your struct post-insert
	table := dbmap.AddTableWithName(post.Post{}, "posts_embedded_test")
	table.SetKeys(true, "PID")
	fmt.Printf("AddTableWithName returned: %s\n", table.TableName)

	var r *gorp.RelationMap
	if len(table.Relations) > 0 {
		r = table.Relations[0]
		fmt.Printf("Relation DetailTable: %s\n", r.DetailTable.TableName)
	}

	// Add the comments table
	table = dbmap.AddTableWithName(post.Comment{}, "comments_embedded_test")
	table.SetKeys(true, "Id")
	fmt.Printf("AddTableWithName returned: %s\n", table.TableName)
	if r != nil {
		fmt.Printf("Relation DetailTable: %s\n", r.DetailTable.TableName)
	}

	// create the table. in a production system you'd generally
	// use a migration tool, or create the tables via scripts
	if err = dbmap.CreateTablesIfNotExists(); err != nil {
		return errors.New("Create tables failed: " + err.Error())
	}

	// Force create all indexes for this database
	if err = dbmap.CreateIndexes(); err != nil {
		return errors.New("Create indexes failed: " + err.Error())
	}

	i := 0
	x := 0
	var LastPkForGetTests uint64
	var p post.Post

	rand.Seed(42)

	for i < 10 {
		p = post.NewPost()
		p.Title = fmt.Sprintf("Post number %d", i)
		p.Site = "test"
		p.PostDate = time.Unix(time.Now().Unix(), 0).UTC()
		p.WebPostId = strconv.FormatUint(post.Hash(p.Title+p.PostDate.String()), 10)

		x = 0
		for x < 10 {
			c := p.AddComment()
			c.Title = fmt.Sprintf("Comment %d on post %d: ", x, i)
			c.Title += "\U0001F475 \u2318 \xe2\x8c\x98 \U0001F474 \xF0\x9F\x91\xB4 \U0001F610"

			c.WebCommentId = strconv.FormatUint(post.Hash(c.Title+c.GetCommentDate().String())+uint64(rand.Int63n(100000)), 10)
			if utf8.ValidString(c.Title) {
				fmt.Printf("IS VALID: '%s'\n", c.Title)
			} else {
				fmt.Printf("IS *** NOT*** VALID: '%s'\n", c.Title)

			}
			nihongo := c.Title
			for i, w := 0, 0; i < len(nihongo); i += w {
				runeValue, width := utf8.DecodeRuneInString(nihongo[i:])
				fmt.Printf("%#U starts at byte position %d, lenght %d\n", runeValue, i, width)
				w = width
			}

			x++
		}

		// Inserting a post also inserts all its detail records (=comments)
		err = dbmap.InsertWithChilds(&p)
		if DebugLevel > 3 {
			// Print out the crawled info
			fmt.Println("----------- INSERT POST START -----------------")
			fmt.Println(p.String("IP: "))
		}
		if err != nil {
			return errors.New("Insert failed: " + err.Error())
		}

		LastPkForGetTests = p.Id

		if DebugLevel > 3 {
			// Print out the end of the crawled info
			fmt.Println("----------- INSERT POST END -------------------")
		}

		for y, c := range p.Comments {
			c.Title = fmt.Sprintf("UpdatedComment %d ", y) + c.Title
		}

		p.Title = fmt.Sprintf("UpdatedPost %d ", i) + p.Title
		var rowsaffected int64
		rowsaffected, err = dbmap.UpdateWithChilds(&p)
		if DebugLevel > 3 {
			// Print out the crawled info
			fmt.Println("----------- UPDATE POST START -----------------")
			fmt.Printf("Rows affected: %d\n", rowsaffected)
			fmt.Println(p.String("UP: "))
		}
		if err != nil {
			return errors.New("update failed: " + err.Error())
		}
		if DebugLevel > 3 {
			// Print out the end of the crawled info
			fmt.Println("----------- UPDATE POST END -------------------")
		}

		i++

	}
	fmt.Println("Starting Get tests")

	res, err := dbmap.GetWithChilds(post.Post{}, LastPkForGetTests)

	if err != nil {
		return errors.New("get failed: " + err.Error())
	}
	if res == nil {
		return errors.New(fmt.Sprintf("Get post for id %d did not return any rows ", LastPkForGetTests))
	}

	resp := res.(*post.Post)

	if DebugLevel > 3 {
		// Print out the selected post
		fmt.Println("----------- GET POST START -----------------")
		fmt.Println(resp.String("GP: "))
	}

	if DebugLevel > 3 {
		// Print out the end of the selected post
		fmt.Println("----------- GET POST END -------------------")
	}

	var updateNeeded bool
	updateNeeded, err = AddUpdatableChilds(&p, resp, dbmap)
	if err != nil {
		return errors.New(fmt.Sprintf("AddUpdatableChilds for post '%s' failed: %s", resp.WebPostId, err.Error()))
	}

	if updateNeeded {
		var rowsaffected int64
		rowsaffected, err = dbmap.UpdateWithChilds(resp)
		if DebugLevel > 3 {
			// Print out the crawled info
			fmt.Println("----------- REUPDATE POST START -----------------")
			fmt.Printf("Rows affected: %d\n", rowsaffected)
			fmt.Println(resp.String("RUP: "))
		}
		if err != nil {
			return errors.New("reupdate failed: " + err.Error())
		}
		if DebugLevel > 3 {
			// Print out the end of the crawled info
			fmt.Println("----------- REUPDATE POST END -------------------")
		}
	}

	return
}
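As an aside, the explicit utf8.DecodeRuneInString loop in Test can also be written as a plain range over the string, which performs the same UTF-8 decoding implicitly:

for i, r := range c.Title {
	fmt.Printf("%#U starts at byte position %d\n", r, i)
}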
// ParseHtmlHackerNews parses hackernews posts from the HTML supplied via an io.Reader and returns the recognized posts in the psout slice of posts.
// Errors which affect only a single post are stored in that post's Err field.
func ParseHtmlHackerNews(body io.Reader, ps []*post.Post) (psout []*post.Post, err error) {

	var html string
	// Create a goquery document to parse from an io.Reader
	doc, err := goquery.NewDocumentFromReader(body)
	if err != nil {
		return ps, errors.New("Failed to parse HTML: " + err.Error())
	}

	// Find hackernews posts = elements with class "athing"
	thing := doc.Find(".athing")
	for iThing := range thing.Nodes {

		// Create a new post struct - if the crawling fails the post will have an Err attached
		// but will be added to the outgoing (psout) slice nevertheless
		post := post.NewPost()
		post.Site = "hackernews"
		ps = append(ps, &post)

		// use `singlearticle` as a selection of one single post
		singlearticle := thing.Eq(iThing)

		// Get the next element containing additional info for this post
		scorenode := singlearticle.Next()
		if scorenode.Length() == 0 {
			errhtml, _ := singlearticle.Html()
			post.Err = fmt.Errorf("did not find next sibling for: %s", errhtml)
			continue
		}

		htmlpost := singlearticle.Find(".title a").First()
		if htmlpost.Length() == 0 {
			errhtml, _ := singlearticle.Html()
			post.Err = fmt.Errorf("did not find title for: %s", errhtml)
			continue
		}

		post.Title = htmlpost.Text()
		var exists bool
		post.Url, exists = htmlpost.Attr("href")
		if !exists {
			singlehtml, _ := htmlpost.Html()
			post.Err = fmt.Errorf("href not found in %s", singlehtml)
		}
		post.Url = stringMinifier(post.Url)

		if !(strings.HasPrefix(post.Url, "http")) {
			post.Url = "https://news.ycombinator.com/" + post.Url
		}

		if DebugLevel > 2 {
			fmt.Printf("**** URL post.Url: %s\n", post.Url)
		}

		if DebugLevel > 3 {
			fmt.Printf("---------------------------\n")
			html, _ = scorenode.Html()
			fmt.Printf("HTML: %s\n", html)
			fmt.Printf("---------------------------\n")
		}

		// Get the score
		scoretag := scorenode.Find(".subtext .score").First()
		if scoretag.Length() == 0 {
			post.Err = fmt.Errorf("did not find score for: %v", scorenode)
			continue
		}

		if DebugLevel > 3 {
			fmt.Printf("------- SCORE -------------\n")
			html, _ = scoretag.Html()
			fmt.Printf("HTML: %s\n", html)
			score := scoretag.Text()
			fmt.Printf("TEXT: %s\n", score)
			fmt.Printf("---------------------------\n")
		}

		post.SetScore(strings.Split(scoretag.Text(), " ")[0])

		postid, exists := scoretag.Attr("id")
		if !exists || !strings.Contains(postid, "_") {
			html, _ = scoretag.Html()
			post.Err = fmt.Errorf("did not find postid in %s", html)
			continue
		}

		if DebugLevel > 3 {
			fmt.Printf("------- POST ID -----------\n")
			fmt.Printf("TEXT: %s\n", postid)
			fmt.Printf("---------------------------\n")
		}

		post.WebPostId = strings.Split(postid, "_")[1]

		// Get the username and postdate
		hrefs := scorenode.Find(".subtext a")
		if hrefs.Size() == 0 {
			errhtml, _ := scorenode.Html()
			post.Err = fmt.Errorf("Did not find user and date in %s\n", errhtml)
			continue
		}

		for i := range hrefs.Nodes {
			href := hrefs.Eq(i)
			t, _ := href.Html()
			s, exists := href.Attr("href")
			if exists {
				if strings.HasPrefix(s, "user?id") {
					post.User = t
					continue
				}
				if strings.HasPrefix(s, "item?id") {
					if strings.Contains(t, "ago") {
						postDate, derr := GetDateFromCreatedAgo(t)
						if derr != nil {
							post.Err = fmt.Errorf("failed to convert to date: %s: %s", t, derr.Error())
							continue
						}
						post.PostDate = postDate
					}
				}
			}
			if DebugLevel > 3 {
				fmt.Printf("------- HREF --------------\n")
				fmt.Printf("TEXT: %s\n", t)
				fmt.Printf("HREF: %s\n", s)
				fmt.Printf("---------------------------\n")
			}
		}
		if post.Err == nil {
			err = ParseHtmlComments(&post)
		}
		if DebugLevel > 2 && err == nil {
			fmt.Printf("------ POST DUMP -----------\n")
			fmt.Print(post.String("PARSED: "))
			fmt.Printf("------ POST DUMP END -------\n")
		}

	}

	return ps, err
}
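GetDateFromCreatedAgo is called above but not included. A minimal sketch of what such a helper might do, assuming input strings like "3 hours ago"; this body is an assumption, not the original implementation:

func GetDateFromCreatedAgo(s string) (time.Time, error) {
	fields := strings.Fields(s) // e.g. ["3", "hours", "ago"]
	if len(fields) < 3 {
		return time.Time{}, fmt.Errorf("unexpected 'created ago' format: %q", s)
	}
	n, err := strconv.Atoi(fields[0])
	if err != nil {
		return time.Time{}, err
	}
	var unit time.Duration
	switch strings.TrimSuffix(fields[1], "s") {
	case "minute":
		unit = time.Minute
	case "hour":
		unit = time.Hour
	case "day":
		unit = 24 * time.Hour
	default:
		return time.Time{}, fmt.Errorf("unknown time unit: %q", fields[1])
	}
	return time.Now().Add(-time.Duration(n) * unit).UTC(), nil
}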
// ParseHtmlHackerNews parses hackernews posts from the HTML supplied via an io.Reader and returns the recognized posts in the psout slice of posts.
// Errors which affect only a single post are stored in that post's Err field.
func ParseHtmlHackerNews(body io.Reader, ps []*post.Post) (psout []*post.Post, err error) {

	root, err := html.Parse(body)
	if err != nil {
		err = errors.New("Failed to html.Parse: " + err.Error())
		return
	}

	// define a matcher
	matcher := func(n *html.Node) bool {
		if n.DataAtom == atom.Tr && n.Parent != nil && n.Parent.DataAtom == atom.Tbody {
			matched := scrape.Attr(n, "class") == "athing"
			return matched
		}
		return false
	}

	// grab all articles and loop over them
	articles := scrape.FindAll(root, matcher)
	for _, article := range articles {
		var ok bool
		// Get one post entry
		var titlenode *html.Node

		titlenode, ok = scrape.Find(article,
			func(n *html.Node) bool {
				if n.DataAtom == atom.A && n.Parent != nil && scrape.Attr(n.Parent, "class") == "title" {
					return true
				}
				return false
			})
		if !ok {
			continue
		}
		// Create a new post struct - if the crawling fails the post will have an Err attached
		// but will be added to the outgoing (psout) slice nevertheless
		post := post.NewPost()
		post.Site = "hackernews"

		post.Title = scrape.Text(titlenode)
		post.Url = scrape.Attr(titlenode, "href")
		if strings.HasPrefix(post.Url, "item?id=") {
			post.Url = "https://news.ycombinator.com/" + post.Url
		}

		ps = append(ps, &post)

		// Get additional info for this post
		scorenode := article.NextSibling
		if scorenode == nil {
			post.Err = errors.New("Did not find score for: %s\n" + scrape.Text(article))
			continue
		}

		// Get the subtext containing scores, user and date
		subtext, ok := scrape.Find(scorenode,
			func(n *html.Node) bool {
				return scrape.Attr(n, "class") == "subtext"
			})
		if !ok {
			post.Err = errors.New(fmt.Sprintf("Did not find siblings for subtext %s\n", scorenode.Data))
			continue
		}

		subs := scrape.FindAll(subtext,
			func(n *html.Node) bool {
				// Get the PostId and Score
				// span class="score" id="score_9643579">92 points</span>
				if n.DataAtom == atom.Span && scrape.Attr(n, "class") == "score" && n.Parent != nil && scrape.Attr(n.Parent, "class") == "subtext" {

					// Get score
					// Use a local error so the matcher does not clobber the function's named return err
					scorestr := strings.Split(scrape.Text(n), " ")[0]
					scoreid, converr := strconv.Atoi(scorestr)
					if converr != nil {
						fmt.Printf("Failed to convert score %s to int: %s\n", scorestr, converr.Error())
						return false
					}
					post.Score = scoreid

					// Get PostId
					postidstr := scrape.Attr(n, "id")
					if len(strings.Split(postidstr, "_")) > 1 {
						post.WebPostId = strings.Split(postidstr, "_")[1]
						return true
					}
				}
				// Get the Username and Creation Date for this post
				if n.DataAtom == atom.A && n.Parent != nil && scrape.Attr(n.Parent, "class") == "subtext" {
					href := strings.ToLower(scrape.Attr(n, "href"))
					if href != "" {
						s := strings.Split(href, "?")
						if s[0] == "user" && len(s) > 1 {
							// Username
							u := strings.Split(s[1], "=")
							if len(u) > 1 {
								post.User = u[1]
								return true
							}
						} else {
							if s[0] == "item" && len(s) > 1 {
								// Created date
								createdago := scrape.Text(n)
								if strings.Contains(createdago, "ago") {
									var postDate time.Time

									postDate, err = GetDateFromCreatedAgo(createdago)
									if err != nil {
										err = errors.New(fmt.Sprintf("Failed to convert to date: %V\n", createdago))
										return false
									}
									post.PostDate = postDate

									return true
								}
							}
						}
					}
				} // end "class" == "subtext"
				return false
			})

		if len(subs) == 0 {
			var w bytes.Buffer
			if rerr := html.Render(&w, subtext); rerr != nil {
				fmt.Printf("Render error: %s\n", rerr)
			}
			post.Err = errors.New(fmt.Sprintf("Unable to parse score,user,date from %s:\n %s\n", post.Title, w.String()))
		}
	}

	return ps, err
}
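The scrape-based variant above builds everything from small matcher closures. The repeated class checks could be factored into a reusable helper like this (byClass is a hypothetical convenience wrapper, not part of the example; scrape.Matcher is the matcher type from the scrape package):

// byClass returns a scrape.Matcher that matches nodes whose class attribute equals name.
func byClass(name string) scrape.Matcher {
	return func(n *html.Node) bool {
		return scrape.Attr(n, "class") == name
	}
}

// e.g. subtext, ok := scrape.Find(scorenode, byClass("subtext"))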
Example #6
func RedditPostScraperWithGorm(sub string) (err error) {

	db, err := gorm.Open("mysql", "golang:golang@/golang?charset=utf8&parseTime=True&loc=Local")
	if err != nil {
		return errors.New("gorm.Open failed: " + err.Error())
	}
	defer db.Close()

	// Get the database connection handle [*sql.DB](http://golang.org/pkg/database/sql/#DB)
	// and validate the DSN data with a ping
	if err = db.DB().Ping(); err != nil {
		return errors.New("db.Ping failed: " + err.Error())
	}
	db.DB().SetMaxIdleConns(10)
	db.DB().SetMaxOpenConns(100)

	// Automatically migrate the schema
	db.AutoMigrate(&post.Post{})

	// Get data from reddit
	geturl := "http://www.reddit.com/r/" + sub + "/new"
	resp, err := http.Get(geturl)
	if err != nil {
		return errors.New("Failed to http.Get from " + geturl + ": " + err.Error())
	}
	// When err is nil, resp is guaranteed to be non-nil with a non-nil Body
	defer resp.Body.Close()
	if resp.StatusCode != 200 { // 200 = OK
		httperr := fmt.Sprintf("Failed to http.Get from %s: Http Status code: %d: Msg: %s", geturl, resp.StatusCode, resp.Status)
		return errors.New(httperr)
	}

	// Create a new post slice and then parse the response body into ps
	ps := make([]post.Post, 0)
	ps, err = ParseHtmlReddit(resp.Body, ps)
	if err != nil {
		return errors.New("Error in RedditParseHtml: " + geturl + ": " + err.Error())
	}
	foundnewposts := false
	updatedposts := 0

	// insert rows - auto increment PKs will be set properly after the insert
	for _, p := range ps {
		if p.Err == nil {

			p.Site = "reddit"

			// check if post already exists
			var count int
			querypost := post.NewPost()
			// SELECT count(*) FROM posts WHERE PostId = "xxx";
			err = db.Model(&post.Post{}).Where("PostId = ?", p.PostId).Count(&count).Error
			if err != nil {
				return errors.New("select count(*) from posts failed: " + err.Error())
			}

			if count == 0 {
				foundnewposts = true
				db.Create(&p)
				err = db.Error
				if err != nil {
					return errors.New("insert into table posts failed: " + err.Error())
				}
				if err == nil {
					// Print out the crawled info
					fmt.Println("----------- INSERT ----------------------------")
					fmt.Println(p.String())
				}
			} else {
				// Post already exists, do an update

				score := p.Score

				// Get the first matched record
				// SELECT * FROM posts WHERE PostId = 'xxx' limit 1;
				db.Where("PostId = ?", p.PostId).First(&querypost)
				err = db.Error
				if err != nil {
					return errors.New("Failed: select Id from posts where PostId = " + p.PostId + ": " + err.Error())
				}

				if score != querypost.Score {
					// Carry over the primary key so Save updates the existing row instead of inserting
					p.Id = querypost.Id
					err = db.Save(&p).Error
					if err != nil {
						return errors.New("update table 'posts' failed: " + err.Error())
					} else {
						updatedposts++
						// Print out the update info
						fmt.Println("----------- UPDATE SCORE-----------------------")
						fmt.Println(p.Title)
						fmt.Printf("From %d to %d\n", score, p.Score)
					}
				}
			}
		} else {
			fmt.Println("Single post error in " + geturl + ": " + p.Err.Error())
		}
	}
	if !foundnewposts {
		fmt.Println("No new posts found at " + geturl)
	}

	if updatedposts > 0 {
		fmt.Printf("%d posts have been updated from %s\n", updatedposts, geturl)
	}

	return
}
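A hypothetical entry point wiring the scraper up; the subreddit name is illustrative:

func main() {
	if err := RedditPostScraperWithGorm("golang"); err != nil {
		log.Fatal(err)
	}
}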