Example #1
0
// ParseHtml reads an HTML document from io and appends one post to ps for
// every element carrying the class "thing" (a reddit post in the page).
//
// A post that lacks the "data-fullname" attribute, or whose <time> element
// lacks a "datetime" attribute, is still appended; the problem is recorded
// in that post's Err field instead of aborting the whole run.
//
// It returns the extended slice together with the document parse error, if
// any. When parsing the document fails, ps is returned unchanged.
func ParseHtml(io io.Reader, ps []post.Post) (psout []post.Post, err error) {

	// Build the goquery document from the reader.
	doc, err := goquery.NewDocumentFromReader(io)
	// checkErr is defined elsewhere; it is always called, with or without an
	// error. NOTE(review): presumably it only reports the error - confirm.
	checkErr(err, "Failed to parse HTML")
	if err != nil {
		return ps, err
	}

	fmt.Println("---- Starting to parse ------------------------")

	// Every reddit post is an element with class "thing".
	things := doc.Find(".thing")
	for idx := range things.Nodes {

		entry := post.NewPost()

		// Narrow the selection down to the single node being processed.
		node := things.Eq(idx)

		// The reddit post identifier is mandatory; without it nothing else
		// is extracted for this post.
		fullname, hasID := node.Attr("data-fullname")
		if !hasID {
			markup, _ := node.Html()
			entry.Err = fmt.Errorf("data-fullname not found in %s", markup)
			ps = append(ps, entry)
			continue
		}

		// Title: element with class "title" containing a child with class
		// "may-blank"; the text is passed through stringMinifier.
		entry.Title = stringMinifier(node.Find(".title .may-blank").Text())

		entry.PostId = fullname
		entry.User = node.Find(".author").Text()
		// A missing href simply leaves Url at whatever Attr returns for it.
		entry.Url, _ = node.Find(".comments.may-blank").Attr("href")
		entry.SetScore(node.Find(".score.likes").Text())

		if stamp, hasDate := node.Find("time").Attr("datetime"); hasDate {
			entry.SetPostDate(stamp)

			// NOTE(review): any return value of String is discarded here;
			// presumably it prints the crawled info - confirm.
			entry.String()
			fmt.Println("-----------------------------------------------")
		} else {
			markup, _ := node.Html()
			entry.Err = fmt.Errorf("datetime not found in %s", markup)
		}

		ps = append(ps, entry)
	}

	// err is known to be nil once the guard above has been passed.
	return ps, nil
}
Example #2
0
// ParseHtmlReddit scans reddit HTML read from io and appends every post it
// recognizes (elements with class "thing") to ps, returning the extended
// slice.
//
// Problems that concern a single post only (missing "data-fullname" or
// missing "datetime" attribute) are stored in that post's Err field; the
// post is appended to the result nevertheless. A non-nil error is returned
// only when the document itself cannot be parsed, in which case ps is
// returned unchanged.
func ParseHtmlReddit(io io.Reader, ps []post.Post) (psout []post.Post, err error) {

	// Build the goquery document from the reader.
	doc, err := goquery.NewDocumentFromReader(io)
	if err != nil {
		return ps, errors.New("Failed to parse HTML: " + err.Error())
	}

	// Every reddit post is an element with class "thing".
	entries := doc.Find(".thing")
	for n := range entries.Nodes {

		// A fresh post per element; it is appended to ps even if crawling
		// it fails, carrying the failure in its Err field.
		item := post.NewPost()
		item.Site = "reddit"

		// Selection holding exactly the one post being processed.
		sel := entries.Eq(n)

		// Reddit post identifier; nothing else is read without it.
		fullname, found := sel.Attr("data-fullname")
		if !found {
			raw, _ := sel.Html()
			item.Err = fmt.Errorf("data-fullname not found in %s", raw)
			ps = append(ps, item)
			continue
		}
		item.WebPostId = fullname

		// Title: element with class "title" containing a child with class
		// "may-blank"; the text is passed through stringMinifier.
		titleText := sel.Find(".title .may-blank").Text()
		item.Title = stringMinifier(titleText)

		// Author of the post.
		item.User = sel.Find(".author").Text()

		// Link of the post, taken from the href of the comments anchor.
		item.Url, _ = sel.Find(".comments.may-blank").Attr("href")

		// Likes score, handed over as raw text.
		item.SetScore(sel.Find(".score.likes").Text())

		// Post date, taken from the datetime attribute of the <time> element.
		when, found := sel.Find("time").Attr("datetime")
		if found {
			item.SetPostDate(when)
		} else {
			raw, _ := sel.Html()
			item.Err = fmt.Errorf("datetime not found in %s", raw)
		}

		ps = append(ps, item)
	}

	// err is known to be nil once the guard above has been passed.
	return ps, nil
}