Example #1
0
// ParseHtml parses reddit posts from the HTML read from r and appends one
// post.Post per ".thing" element to ps. Failures that affect only a single
// post (missing data-fullname or datetime attribute) are recorded in that
// post's Err field; the post is still appended. A failure to parse the
// document as a whole is returned as err.
func ParseHtml(r io.Reader, ps []post.Post) (psout []post.Post, err error) {

	// Create a goquery document to parse from.
	doc, err := goquery.NewDocumentFromReader(r)
	checkErr(err, "Failed to parse HTML")
	if err != nil {
		// Document-level failure: return what we already have plus the error.
		return ps, err
	}

	fmt.Println("---- Starting to parse ------------------------")

	// Find reddit posts = elements with class "thing".
	things := doc.Find(".thing")
	for iThing := range things.Nodes {

		// `p` avoids shadowing the imported `post` package.
		p := post.NewPost()

		// Use `single` as a selection holding exactly one post node.
		single := things.Eq(iThing)

		// The reddit post identifier is mandatory; without it the post is
		// recorded with an error but still appended below.
		redditPostID, ok := single.Attr("data-fullname")
		if !ok {
			singlehtml, _ := single.Html()
			p.Err = fmt.Errorf("data-fullname not found in %s", singlehtml)
		} else {

			// Find an element with class title and a child with class
			// may-blank; remove CRLF and unnecessary whitespaces.
			p.Title = stringMinifier(single.Find(".title .may-blank").Text())

			p.PostId = redditPostID
			p.User = single.Find(".author").Text()
			p.Url, _ = single.Find(".comments.may-blank").Attr("href")
			p.SetScore(single.Find(".score.likes").Text())
			redditPostDate, ok := single.Find("time").Attr("datetime")

			if !ok {
				singlehtml, _ := single.Html()
				p.Err = fmt.Errorf("datetime not found in %s", singlehtml)
			} else {
				p.SetPostDate(redditPostDate)

				// Print out the crawled info.
				p.String()
				fmt.Println("-----------------------------------------------")
			}
		}
		ps = append(ps, p)
	}

	return ps, err
}
Example #2
0
// ParseHtmlReddit parses reddit posts from the HTML read from r and returns
// the recognized posts appended to ps in the psout slice.
// Errors which affect only a single post are stored in that post's Err field;
// a failure to parse the document as a whole is returned as err.
func ParseHtmlReddit(r io.Reader, ps []post.Post) (psout []post.Post, err error) {

	// Create a goquery document to parse from an io.Reader.
	doc, err := goquery.NewDocumentFromReader(r)
	if err != nil {
		// Wrap with %w so callers can unwrap the goquery error.
		return ps, fmt.Errorf("Failed to parse HTML: %w", err)
	}

	// Find reddit posts = elements with class "thing".
	things := doc.Find(".thing")
	for iThing := range things.Nodes {

		// Create a new post struct - if the crawling fails the post will have
		// an Err attached but will be added to the outgoing slice nevertheless.
		// `p` avoids shadowing the imported `post` package.
		p := post.NewPost()
		p.Site = "reddit"

		// Use `single` as a selection of one single post.
		single := things.Eq(iThing)

		// Get the reddit post identifier.
		redditPostID, ok := single.Attr("data-fullname")
		if !ok {
			singlehtml, _ := single.Html()
			p.Err = fmt.Errorf("data-fullname not found in %s", singlehtml)
		} else {
			p.WebPostId = redditPostID
			// Find an element with class title and a child with class
			// may-blank and remove CRLF and unnecessary whitespaces.
			p.Title = stringMinifier(single.Find(".title .may-blank").Text())
			// Get the post user.
			p.User = single.Find(".author").Text()
			// Get the post url.
			p.Url, _ = single.Find(".comments.may-blank").Attr("href")
			// Get the post likes score.
			p.SetScore(single.Find(".score.likes").Text())
			// Get the post date.
			redditPostDate, ok := single.Find("time").Attr("datetime")

			if !ok {
				singlehtml, _ := single.Html()
				p.Err = fmt.Errorf("datetime not found in %s", singlehtml)
			} else {
				p.SetPostDate(redditPostDate)
			}
		}
		ps = append(ps, p)
	}

	return ps, err
}
// ParseHtmlHackerNews parses hackernews posts from the HTML read from body
// and returns the recognized posts appended to ps in the psout slice.
// Errors which affect only a single post are stored in that post's Err field;
// a failure to parse the document as a whole is returned as err.
func ParseHtmlHackerNews(body io.Reader, ps []*post.Post) (psout []*post.Post, err error) {

	var html string
	// Create a goquery document to parse from an io.Reader.
	doc, err := goquery.NewDocumentFromReader(body)
	if err != nil {
		return ps, errors.New("Failed to parse HTML: " + err.Error())
	}

	// Find hackernews posts = elements with class "athing".
	thing := doc.Find(".athing")
	for iThing := range thing.Nodes {

		// Create a new post struct - if the crawling fails the post will have
		// an Err attached but will be added to the outgoing (psout) slice
		// nevertheless (it is appended before any of the `continue`s below).
		post := post.NewPost()
		post.Site = "hackernews"
		ps = append(ps, &post)

		// Use `singlearticle` as a selection of one single post.
		singlearticle := thing.Eq(iThing)

		// Get the next sibling row containing score/user/date for this post.
		// goquery's Next() never returns nil; an absent sibling yields an
		// empty selection, so test Size() instead of comparing against nil.
		scorenode := singlearticle.Next()
		if scorenode.Size() == 0 {
			errhtml, _ := singlearticle.Html()
			post.Err = fmt.Errorf("Did not find next sibling for: %s\n", errhtml)
			continue
		}

		htmlpost := singlearticle.Find(".title a").First()
		if htmlpost.Size() == 0 {
			errhtml, _ := singlearticle.Html()
			post.Err = fmt.Errorf("Did not find title for: %s\n", errhtml)
			continue
		}

		post.Title = htmlpost.Text()
		var exists bool
		post.Url, exists = htmlpost.Attr("href")
		if !exists {
			singlehtml, _ := htmlpost.Html()
			post.Err = fmt.Errorf("href not found in %s\n", singlehtml)
		}
		post.Url = stringMinifier(post.Url)

		// Relative urls (e.g. "item?id=...") are anchored at the HN site.
		if !strings.HasPrefix(post.Url, "http") {
			post.Url = "https://news.ycombinator.com/" + post.Url
		}

		if DebugLevel > 2 {
			fmt.Printf("**** URL post.Url: %s\n", post.Url)
		}

		if DebugLevel > 3 {
			fmt.Printf("---------------------------\n")
			html, _ = scorenode.Html()
			fmt.Printf("HTML: %s\n", html)
			fmt.Printf("---------------------------\n")
		}

		// Get the score.
		scoretag := scorenode.Find(".subtext .score").First()
		if scoretag.Size() == 0 {
			post.Err = fmt.Errorf("Did not find score for: %v\n", scorenode)
			continue
		}

		if DebugLevel > 3 {
			fmt.Printf("------- SCORE -------------\n")
			html, _ = scoretag.Html()
			fmt.Printf("HTML: %s\n", html)
			score := scoretag.Text()
			fmt.Printf("TEXT: %s\n", score)
			fmt.Printf("---------------------------\n")
		}

		// Score text looks like "123 points"; take the leading number.
		post.SetScore(strings.Split(scoretag.Text(), " ")[0])

		postid, exists := scoretag.Attr("id")
		if !exists {
			html, _ = scoretag.Html()
			post.Err = fmt.Errorf("Did not find postid in %s\n", html)
			// Without an id the "score_<id>" split below would panic.
			continue
		}

		if DebugLevel > 3 {
			fmt.Printf("------- POST ID -----------\n")
			fmt.Printf("TEXT: %s\n", postid)
			fmt.Printf("---------------------------\n")
		}

		// The id attribute has the form "score_<web post id>".
		idparts := strings.Split(postid, "_")
		if len(idparts) < 2 {
			post.Err = fmt.Errorf("Unexpected postid format in %s\n", postid)
			continue
		}
		post.WebPostId = idparts[1]

		// Get the username and postdate.
		hrefs := scorenode.Find(".subtext a")
		if hrefs.Size() == 0 {
			errhtml, _ := scorenode.Html()
			post.Err = fmt.Errorf("Did not find user and date in %s\n", errhtml)
			continue
		}

		for i := range hrefs.Nodes {
			href := hrefs.Eq(i)
			t, _ := href.Html()
			s, exists := href.Attr("href")
			if exists {
				if strings.HasPrefix(s, "user?id") {
					post.User = t
					continue
				}
				if strings.HasPrefix(s, "item?id") {
					if strings.Contains(t, "ago") {
						var postDate time.Time
						postDate, err = GetDateFromCreatedAgo(t)
						if err != nil {
							post.Err = fmt.Errorf("Failed to convert to date: %s: %s\n", t, err.Error())
							continue
						}
						post.PostDate = postDate
						// NOTE: do not assign post.Err = err here - err is
						// nil at this point and would clobber an earlier
						// per-post error (e.g. a missing title href).
					}
				}
			}
			if DebugLevel > 3 {
				fmt.Printf("------- HREF --------------\n")
				fmt.Printf("TEXT: %s\n", t)
				fmt.Printf("HREF: %s\n", s)
				fmt.Printf("---------------------------\n")
			}
		}
		if post.Err == nil {
			err = ParseHtmlComments(&post)
		}
		if DebugLevel > 2 && err == nil {
			fmt.Printf("------ POST DUMP -----------\n")
			fmt.Print(post.String("PARSED: "))
			fmt.Printf("------ POST DUMP END -------\n")
		}

	}

	return ps, err
}