func ParseHtml(io io.Reader, ps []post.Post) (psout []post.Post, err error) { // Create a qoquery document to parse from doc, err := goquery.NewDocumentFromReader(io) checkErr(err, "Failed to parse HTML") if err == nil { fmt.Println("---- Starting to parse ------------------------") // Find reddit posts = elements with class "thing" thing := doc.Find(".thing") for iThing := range thing.Nodes { post := post.NewPost() // use `single` as a selection of 1 node singlething := thing.Eq(iThing) // get the reddit post identifier reddit_post_id, exists := singlething.Attr("data-fullname") if exists == false { singlehtml, _ := singlething.Html() post.Err = fmt.Errorf("data-fullname not found in %s", singlehtml) } else { // find an element with class title and a child with class may-blank // Remove CRLF and unnecessary whitespaces post.Title = stringMinifier(singlething.Find(".title .may-blank").Text()) post.PostId = reddit_post_id post.User = singlething.Find(".author").Text() post.Url, _ = singlething.Find(".comments.may-blank").Attr("href") post.SetScore(singlething.Find(".score.likes").Text()) reddit_postdate, exists := singlething.Find("time").Attr("datetime") if exists == false { singlehtml, _ := singlething.Html() post.Err = fmt.Errorf("datetime not found in %s", singlehtml) } else { //post := post.NewPost() post.SetPostDate(reddit_postdate) // Print out the crawled info post.String() fmt.Println("-----------------------------------------------") } } ps = append(ps, post) } } return ps, err }
// Parse for posts in html from reddit, input html is an io.Reader and returns recognized posts in a psout slice of posts. // Errors which affect only a single post are stored in their post.Err func ParseHtmlReddit(io io.Reader, ps []post.Post) (psout []post.Post, err error) { // Create a qoquery document to parse from an io.Reader doc, err := goquery.NewDocumentFromReader(io) if err != nil { return ps, errors.New("Failed to parse HTML: " + err.Error()) } // Find reddit posts = elements with class "thing" thing := doc.Find(".thing") for iThing := range thing.Nodes { // Create a new post struct - if the crawling fails the post will have an Err attached // but will be added to the outgoing (psout) slice nevertheless post := post.NewPost() post.Site = "reddit" // use `singlething` as a selection of one single post singlething := thing.Eq(iThing) // get the reddit post identifier reddit_post_id, exists := singlething.Attr("data-fullname") if exists == false { singlehtml, _ := singlething.Html() post.Err = fmt.Errorf("data-fullname not found in %s", singlehtml) } else { post.WebPostId = reddit_post_id // find an element with class title and a child with class may-blank // and remove CRLF and unnecessary whitespaces post.Title = stringMinifier(singlething.Find(".title .may-blank").Text()) // Get the post user post.User = singlething.Find(".author").Text() // Get the post url post.Url, _ = singlething.Find(".comments.may-blank").Attr("href") // Get the post likes score post.SetScore(singlething.Find(".score.likes").Text()) // Get the post date reddit_postdate, exists := singlething.Find("time").Attr("datetime") if exists == false { singlehtml, _ := singlething.Html() post.Err = fmt.Errorf("datetime not found in %s", singlehtml) } else { post.SetPostDate(reddit_postdate) } } ps = append(ps, post) } return ps, err }