func ParseHtml(body io.Reader, ps []post.Post) (psout []post.Post, err error) {
	// Create a goquery document to parse from
	doc, err := goquery.NewDocumentFromReader(body)
	checkErr(err, "Failed to parse HTML")
	if err == nil {
		fmt.Println("---- Starting to parse ------------------------")
		// Find reddit posts = elements with class "thing"
		thing := doc.Find(".thing")
		for iThing := range thing.Nodes {
			post := post.NewPost()

			// use `singlething` as a selection of one single node
			singlething := thing.Eq(iThing)

			// Get the reddit post identifier
			redditPostID, exists := singlething.Attr("data-fullname")
			if !exists {
				singlehtml, _ := singlething.Html()
				post.Err = fmt.Errorf("data-fullname not found in %s", singlehtml)
			} else {
				// Find an element with class "title" and a child with class
				// "may-blank", and remove CRLF and unnecessary whitespace
				post.Title = stringMinifier(singlething.Find(".title .may-blank").Text())
				post.PostId = redditPostID
				post.User = singlething.Find(".author").Text()
				post.Url, _ = singlething.Find(".comments.may-blank").Attr("href")
				post.SetScore(singlething.Find(".score.likes").Text())

				redditPostDate, exists := singlething.Find("time").Attr("datetime")
				if !exists {
					singlehtml, _ := singlething.Html()
					post.Err = fmt.Errorf("datetime not found in %s", singlehtml)
				} else {
					post.SetPostDate(redditPostDate)
					// Print out the crawled info
					fmt.Println(post.String())
					fmt.Println("-----------------------------------------------")
				}
			}
			ps = append(ps, post)
		}
	}
	return ps, err
}
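
// Note: stringMinifier and checkErr are helpers defined elsewhere in this
// package. As used above, stringMinifier strips CRLF and collapses redundant
// whitespace in the scraped title. A minimal sketch of such a helper (an
// illustrative assumption, not the actual implementation) could be:
//
//	func stringMinifier(in string) string {
//		// strings.Fields splits on any run of whitespace (including \r\n),
//		// so rejoining the fields with single spaces minifies the string.
//		return strings.Join(strings.Fields(in), " ")
//	}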
// ParseHtmlReddit parses the reddit front page HTML from an io.Reader and
// returns the recognized posts in the psout slice of posts. Errors that
// affect only a single post are stored in that post's Err field.
func ParseHtmlReddit(body io.Reader, ps []post.Post) (psout []post.Post, err error) {
	// Create a goquery document to parse from an io.Reader
	doc, err := goquery.NewDocumentFromReader(body)
	if err != nil {
		return ps, errors.New("Failed to parse HTML: " + err.Error())
	}

	// Find reddit posts = elements with class "thing"
	thing := doc.Find(".thing")
	for iThing := range thing.Nodes {
		// Create a new post struct - if the crawling fails the post will have
		// an Err attached but will be added to the outgoing (psout) slice nevertheless
		post := post.NewPost()
		post.Site = "reddit"

		// use `singlething` as a selection of one single post
		singlething := thing.Eq(iThing)

		// Get the reddit post identifier
		redditPostID, exists := singlething.Attr("data-fullname")
		if !exists {
			singlehtml, _ := singlething.Html()
			post.Err = fmt.Errorf("data-fullname not found in %s", singlehtml)
		} else {
			post.WebPostId = redditPostID

			// Find an element with class "title" and a child with class
			// "may-blank", and remove CRLF and unnecessary whitespace
			post.Title = stringMinifier(singlething.Find(".title .may-blank").Text())

			// Get the post user
			post.User = singlething.Find(".author").Text()

			// Get the post url
			post.Url, _ = singlething.Find(".comments.may-blank").Attr("href")

			// Get the post likes score
			post.SetScore(singlething.Find(".score.likes").Text())

			// Get the post date
			redditPostDate, exists := singlething.Find("time").Attr("datetime")
			if !exists {
				singlehtml, _ := singlething.Html()
				post.Err = fmt.Errorf("datetime not found in %s", singlehtml)
			} else {
				post.SetPostDate(redditPostDate)
			}
		}
		ps = append(ps, post)
	}
	return ps, err
}
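
// Example usage (a sketch, not part of the parser): fetch the reddit front
// page over HTTP and hand the response body to ParseHtmlReddit. The URL,
// User-Agent string, and helper name are illustrative assumptions (reddit
// tends to throttle requests that use a default client User-Agent); assumes
// net/http is imported.
func fetchRedditPosts() ([]post.Post, error) {
	req, err := http.NewRequest("GET", "https://old.reddit.com/", nil)
	if err != nil {
		return nil, err
	}
	req.Header.Set("User-Agent", "demo-crawler/0.1") // assumed identifier
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	// Start from a nil slice; posts that failed to parse carry a non-nil Err.
	return ParseHtmlReddit(resp.Body, nil)
}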
// ParseHtmlHackerNews parses the hackernews front page HTML from an io.Reader
// and returns the recognized posts in the psout slice of posts. Errors that
// affect only a single post are stored in that post's Err field.
func ParseHtmlHackerNews(body io.Reader, ps []*post.Post) (psout []*post.Post, err error) {
	var html string

	// Create a goquery document to parse from an io.Reader
	doc, err := goquery.NewDocumentFromReader(body)
	if err != nil {
		return ps, errors.New("Failed to parse HTML: " + err.Error())
	}

	// Find hackernews posts = elements with class "athing"
	thing := doc.Find(".athing")
	for iThing := range thing.Nodes {
		// Create a new post struct - if the crawling fails the post will have
		// an Err attached but will be added to the outgoing (psout) slice nevertheless
		post := post.NewPost()
		post.Site = "hackernews"
		ps = append(ps, &post)

		// use `singlearticle` as a selection of one single post
		singlearticle := thing.Eq(iThing)

		// Get the next element containing additional info for this post.
		// Next() never returns nil, so check for an empty selection instead.
		scorenode := singlearticle.Next()
		if scorenode.Length() == 0 {
			errhtml, _ := singlearticle.Html()
			post.Err = fmt.Errorf("did not find next sibling for: %s", errhtml)
			continue
		}

		htmlpost := singlearticle.Find(".title a").First()
		if htmlpost.Size() == 0 {
			errhtml, _ := singlearticle.Html()
			post.Err = fmt.Errorf("did not find title for: %s", errhtml)
			continue
		}

		post.Title = htmlpost.Text()

		var exists bool
		post.Url, exists = htmlpost.Attr("href")
		if !exists {
			singlehtml, _ := htmlpost.Html()
			post.Err = fmt.Errorf("href not found in %s", singlehtml)
		}
		post.Url = stringMinifier(post.Url)

		// Relative urls (e.g. "item?id=...") need the hackernews host prepended
		if !strings.HasPrefix(post.Url, "http") {
			post.Url = "https://news.ycombinator.com/" + post.Url
		}
		if DebugLevel > 2 {
			fmt.Printf("**** URL post.Url: %s\n", post.Url)
		}
		if DebugLevel > 3 {
			fmt.Printf("---------------------------\n")
			html, _ = scorenode.Html()
			fmt.Printf("HTML: %s\n", html)
			fmt.Printf("---------------------------\n")
		}

		// Get the score
		scoretag := scorenode.Find(".subtext .score").First()
		if scoretag.Size() == 0 {
			post.Err = fmt.Errorf("did not find score for: %v", scorenode)
			continue
		}
		if DebugLevel > 3 {
			fmt.Printf("------- SCORE -------------\n")
			html, _ = scoretag.Html()
			fmt.Printf("HTML: %s\n", html)
			score := scoretag.Text()
			fmt.Printf("TEXT: %s\n", score)
			fmt.Printf("---------------------------\n")
		}
		post.SetScore(strings.Split(scoretag.Text(), " ")[0])

		postid, exists := scoretag.Attr("id")
		if !exists {
			html, _ = scoretag.Html()
			post.Err = fmt.Errorf("did not find postid in %s", html)
			// Without the id attribute the WebPostId below cannot be
			// extracted, and splitting an empty string would panic.
			continue
		}
		if DebugLevel > 3 {
			fmt.Printf("------- POST ID -----------\n")
			fmt.Printf("TEXT: %s\n", postid)
			fmt.Printf("---------------------------\n")
		}
		post.WebPostId = strings.Split(postid, "_")[1]

		// Get the username and postdate
		hrefs := scorenode.Find(".subtext a")
		if hrefs.Size() == 0 {
			errhtml, _ := scorenode.Html()
			post.Err = fmt.Errorf("did not find user and date in %s", errhtml)
			continue
		}
		for i := range hrefs.Nodes {
			href := hrefs.Eq(i)
			t, _ := href.Html()
			s, exists := href.Attr("href")
			if exists {
				if strings.HasPrefix(s, "user?id") {
					post.User = t
					continue
				}
				if strings.HasPrefix(s, "item?id") {
					if strings.Contains(t, "ago") {
						var postDate time.Time
						postDate, err = GetDateFromCreatedAgo(t)
						if err != nil {
							post.Err = fmt.Errorf("failed to convert to date: %s: %s", t, err.Error())
							continue
						}
						post.PostDate = postDate
						post.Err = err
					}
				}
			}
			if DebugLevel > 3 {
				fmt.Printf("------- HREF --------------\n")
				fmt.Printf("TEXT: %s\n", t)
				fmt.Printf("HREF: %s\n", s)
				fmt.Printf("---------------------------\n")
			}
		}
		if post.Err == nil {
			err = ParseHtmlComments(&post)
		}
		if DebugLevel > 2 && err == nil {
			fmt.Printf("------ POST DUMP -----------\n")
			fmt.Print(post.String("PARSED: "))
			fmt.Printf("------ POST DUMP END -------\n")
		}
	}
	return ps, err
}
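
// Example usage (again a sketch under the same assumptions): fetch the
// hackernews front page and parse it. ParseHtmlHackerNews takes and returns
// []*post.Post, so per-post failures can be inspected through each post's
// Err field afterwards.
func fetchHackerNewsPosts() ([]*post.Post, error) {
	resp, err := http.Get("https://news.ycombinator.com/news")
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	return ParseHtmlHackerNews(resp.Body, nil)
}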