func ParseHtml(io io.Reader, ps []post.Post) (psout []post.Post, err error) {
	// Create a goquery document to parse from
	doc, err := goquery.NewDocumentFromReader(io)
	checkErr(err, "Failed to parse HTML")
	if err == nil {
		fmt.Println("---- Starting to parse ------------------------")
		// Find reddit posts = elements with class "thing"
		thing := doc.Find(".thing")
		for iThing := range thing.Nodes {
			post := post.NewPost()

			// use `singlething` as a selection of one single node
			singlething := thing.Eq(iThing)

			// get the reddit post identifier
			reddit_post_id, exists := singlething.Attr("data-fullname")
			if exists == false {
				singlehtml, _ := singlething.Html()
				post.Err = fmt.Errorf("data-fullname not found in %s", singlehtml)
			} else {
				// find an element with class title and a child with class may-blank,
				// remove CRLF and unnecessary whitespace
				post.Title = stringMinifier(singlething.Find(".title .may-blank").Text())
				post.PostId = reddit_post_id
				post.User = singlething.Find(".author").Text()
				post.Url, _ = singlething.Find(".comments.may-blank").Attr("href")
				post.SetScore(singlething.Find(".score.likes").Text())

				reddit_postdate, exists := singlething.Find("time").Attr("datetime")
				if exists == false {
					singlehtml, _ := singlething.Html()
					post.Err = fmt.Errorf("datetime not found in %s", singlehtml)
				} else {
					post.SetPostDate(reddit_postdate)
					// Print out the crawled info
					post.String()
					fmt.Println("-----------------------------------------------")
				}
			}
			ps = append(ps, post)
		}
	}
	return ps, err
}
// ParseHtmlReddit parses reddit posts from the HTML read from the io.Reader and
// returns the recognized posts in the psout slice.
// Errors which affect only a single post are stored in that post's Err field.
func ParseHtmlReddit(io io.Reader, ps []post.Post) (psout []post.Post, err error) {
	// Create a goquery document to parse from an io.Reader
	doc, err := goquery.NewDocumentFromReader(io)
	if err != nil {
		return ps, errors.New("Failed to parse HTML: " + err.Error())
	}

	// Find reddit posts = elements with class "thing"
	thing := doc.Find(".thing")
	for iThing := range thing.Nodes {
		// Create a new post struct - if the crawling fails the post will have an Err attached
		// but will be added to the outgoing (psout) slice nevertheless
		post := post.NewPost()
		post.Site = "reddit"

		// use `singlething` as a selection of one single post
		singlething := thing.Eq(iThing)

		// get the reddit post identifier
		reddit_post_id, exists := singlething.Attr("data-fullname")
		if exists == false {
			singlehtml, _ := singlething.Html()
			post.Err = fmt.Errorf("data-fullname not found in %s", singlehtml)
		} else {
			post.WebPostId = reddit_post_id
			// find an element with class title and a child with class may-blank
			// and remove CRLF and unnecessary whitespaces
			post.Title = stringMinifier(singlething.Find(".title .may-blank").Text())
			// Get the post user
			post.User = singlething.Find(".author").Text()
			// Get the post url
			post.Url, _ = singlething.Find(".comments.may-blank").Attr("href")
			// Get the post likes score
			post.SetScore(singlething.Find(".score.likes").Text())
			// Get the post date
			reddit_postdate, exists := singlething.Find("time").Attr("datetime")
			if exists == false {
				singlehtml, _ := singlething.Html()
				post.Err = fmt.Errorf("datetime not found in %s", singlehtml)
			} else {
				post.SetPostDate(reddit_postdate)
			}
		}
		ps = append(ps, post)
	}
	return ps, err
}
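// Usage sketch (not part of the original source): fetch a subreddit's /new page
// and hand the body to ParseHtmlReddit, then inspect each post's Err field.
// The function name below is hypothetical; it assumes it lives next to
// ParseHtmlReddit and that the post package is imported as above.
func fetchAndParseRedditExample(sub string) error {
	resp, err := http.Get("https://www.reddit.com/r/" + sub + "/new")
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	ps, err := ParseHtmlReddit(resp.Body, make([]post.Post, 0))
	if err != nil {
		return err
	}
	for _, p := range ps {
		if p.Err != nil {
			// per-post errors do not abort the whole parse
			fmt.Println("single post error:", p.Err)
			continue
		}
		fmt.Println(p.WebPostId, p.Title)
	}
	return nil
}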
func Test() (err error) {
	//drivername := "postgres"
	//dsn := "user=golang password=golang dbname=golang sslmode=disable"
	//dialect := gorp.PostgresDialect{}

	drivername := "mysql"
	dsn := "golang:golang@/golang?parseTime=true&collation=utf8mb4_general_ci"
	dialect := gorp.MySQLDialect{"InnoDB", "utf8mb4"}

	// connect to db using standard Go database/sql API
	db, err := sql.Open(drivername, dsn)
	if err != nil {
		return errors.New("sql.Open failed: " + err.Error())
	}

	// Open doesn't open a connection. Validate DSN data using ping
	if err = db.Ping(); err != nil {
		return errors.New("db.Ping failed: " + err.Error())
	}

	// Set the connection to use utf8mb4
	if dialect.Engine == "InnoDB" {
		_, err = db.Exec("SET NAMES utf8mb4 COLLATE utf8mb4_general_ci")
		if err != nil {
			return errors.New("SET NAMES utf8mb4 COLLATE utf8mb4_general_ci: " + err.Error())
		}
	}

	// construct a gorp DbMap
	dbmap := &gorp.DbMap{Db: db, Dialect: dialect}
	defer dbmap.Db.Close()
	dbmap.DebugLevel = DebugLevel

	// Will log all SQL statements + args as they are run
	// The first arg is a string prefix to prepend to all log messages
	dbmap.TraceOn("[gorp]", log.New(os.Stdout, "fetch:", log.Lmicroseconds))

	// register the structs you wish to use with gorp
	// you can also use the shorter dbmap.AddTable() if you
	// don't want to override the table name
	// SetKeys(true) means we have an auto increment primary key, which
	// will get automatically bound to your struct post-insert
	table := dbmap.AddTableWithName(post.Post{}, "posts_embedded_test")
	table.SetKeys(true, "PID")
	fmt.Printf("AddTableWithName returned: %s\n", table.TableName)
	var r *gorp.RelationMap
	if len(table.Relations) > 0 {
		r = table.Relations[0]
		fmt.Printf("Relation DetailTable: %s\n", r.DetailTable.TableName)
	}

	// Add the comments table
	table = dbmap.AddTableWithName(post.Comment{}, "comments_embedded_test")
	table.SetKeys(true, "Id")
	fmt.Printf("AddTableWithName returned: %s\n", table.TableName)
	if r != nil {
		fmt.Printf("Relation DetailTable: %s\n", r.DetailTable.TableName)
	}

	// create the tables. In a production system you'd generally
	// use a migration tool, or create the tables via scripts
	if err = dbmap.CreateTablesIfNotExists(); err != nil {
		return errors.New("Create tables failed: " + err.Error())
	}

	// Force create all indexes for this database
	if err = dbmap.CreateIndexes(); err != nil {
		return errors.New("Create indexes failed: " + err.Error())
	}

	i := 0
	x := 0
	var LastPkForGetTests uint64
	var p post.Post
	rand.Seed(42)
	for i < 10 {
		p = post.NewPost()
		p.Title = fmt.Sprintf("Post number %d", i)
		p.Site = "test"
		p.PostDate = time.Unix(time.Now().Unix(), 0).UTC()
		p.WebPostId = strconv.FormatUint(post.Hash(p.Title+p.PostDate.String()), 10)

		x = 0
		for x < 10 {
			c := p.AddComment()
			c.Title = fmt.Sprintf("Comment %d on post %d: ", x, i)
			c.Title += "\U0001F475 \u2318 \xe2\x8c\x98 \U0001F474 \xF0\x9F\x91\xB4 \U0001F610"
			c.WebCommentId = strconv.FormatUint(post.Hash(c.Title+c.GetCommentDate().String())+uint64(rand.Int63n(100000)), 10)
			if utf8.ValidString(c.Title) {
				fmt.Printf("IS VALID: '%s'\n", c.Title)
			} else {
				fmt.Printf("IS *** NOT*** VALID: '%s'\n", c.Title)
			}
			nihongo := c.Title
			for i, w := 0, 0; i < len(nihongo); i += w {
				runeValue, width := utf8.DecodeRuneInString(nihongo[i:])
				fmt.Printf("%#U starts at byte position %d, length %d\n", runeValue, i, width)
				w = width
			}
			x++
		}

		// Inserting a post also inserts all its detail records (=comments)
		err = dbmap.InsertWithChilds(&p)
		if DebugLevel > 3 {
			// Print out the crawled info
			fmt.Println("----------- INSERT POST START -----------------")
			fmt.Println(p.String("IP: "))
		}
		if err != nil {
			return errors.New("Insert failed: " + err.Error())
		}
		LastPkForGetTests = p.Id
		if DebugLevel > 3 {
			// Print out the end of the crawled info
			fmt.Println("----------- INSERT POST END -------------------")
		}

		for y, c := range p.Comments {
			c.Title = fmt.Sprintf("UpdatedComment %d ", y) + c.Title
			x++
		}
		p.Title = fmt.Sprintf("UpdatedPost %d ", i) + p.Title

		var rowsaffected int64
		rowsaffected, err = dbmap.UpdateWithChilds(&p)
		if DebugLevel > 3 {
			// Print out the crawled info
			fmt.Println("----------- UPDATE POST START -----------------")
			fmt.Printf("Rows affected: %d\n", rowsaffected)
			fmt.Println(p.String("UP: "))
		}
		if err != nil {
			return errors.New("update failed: " + err.Error())
		}
		if DebugLevel > 3 {
			// Print out the end of the crawled info
			fmt.Println("----------- UPDATE POST END -------------------")
		}
		i++
	}

	fmt.Println("Starting Get tests")
	res, err := dbmap.GetWithChilds(post.Post{}, LastPkForGetTests)
	if err != nil {
		return errors.New("get failed: " + err.Error())
	}
	if res == nil {
		return fmt.Errorf("Get post for id %d did not return any rows", LastPkForGetTests)
	}
	resp := res.(*post.Post)
	if DebugLevel > 3 {
		// Print out the selected post
		fmt.Println("----------- GET POST START -----------------")
		fmt.Println(resp.String("GP: "))
	}
	if DebugLevel > 3 {
		// Print out the end of the selected post
		fmt.Println("----------- GET POST END -------------------")
	}

	var updateNeeded bool
	updateNeeded, err = AddUpdatableChilds(&p, resp, dbmap)
	if err != nil {
		return fmt.Errorf("AddUpdatableChilds for post '%s' failed: %s", resp.WebPostId, err.Error())
	}
	if updateNeeded {
		var rowsaffected int64
		rowsaffected, err = dbmap.UpdateWithChilds(resp)
		if DebugLevel > 3 {
			// Print out the crawled info
			fmt.Println("----------- REUPDATE POST START -----------------")
			fmt.Printf("Rows affected: %d\n", rowsaffected)
			fmt.Println(resp.String("RUP: "))
		}
		if err != nil {
			return errors.New("reupdate failed: " + err.Error())
		}
		if DebugLevel > 3 {
			// Print out the end of the crawled info
			fmt.Println("----------- REUPDATE POST END -------------------")
		}
	}
	return
}
// ParseHtmlHackerNews parses hackernews posts from the HTML read from the io.Reader
// and returns the recognized posts in the psout slice.
// Errors which affect only a single post are stored in that post's Err field.
func ParseHtmlHackerNews(body io.Reader, ps []*post.Post) (psout []*post.Post, err error) {
	var html string

	// Create a goquery document to parse from an io.Reader
	doc, err := goquery.NewDocumentFromReader(body)
	if err != nil {
		return ps, errors.New("Failed to parse HTML: " + err.Error())
	}

	// Find hackernews posts = elements with class "athing"
	thing := doc.Find(".athing")
	for iThing := range thing.Nodes {
		// Create a new post struct - if the crawling fails the post will have an Err attached
		// but will be added to the outgoing (psout) slice nevertheless
		post := post.NewPost()
		post.Site = "hackernews"
		ps = append(ps, &post)

		// use `singlearticle` as a selection of one single post
		singlearticle := thing.Eq(iThing)

		// Get the next element containing additional info for this post
		scorenode := singlearticle.Next()
		if scorenode == nil {
			errhtml, _ := singlearticle.Html()
			post.Err = fmt.Errorf("Did not find next sibling for: %s\n", errhtml)
			continue
		}

		htmlpost := singlearticle.Find(".title a").First()
		if htmlpost.Size() == 0 {
			errhtml, _ := singlearticle.Html()
			post.Err = fmt.Errorf("Did not find title for: %s\n", errhtml)
			continue
		}

		post.Title = htmlpost.Text()
		var exists bool
		post.Url, exists = htmlpost.Attr("href")
		if exists == false {
			singlehtml, _ := htmlpost.Html()
			post.Err = fmt.Errorf("href not found in %s\n", singlehtml)
		}
		post.Url = stringMinifier(post.Url)
		if !(strings.HasPrefix(post.Url, "http")) {
			post.Url = "https://news.ycombinator.com/" + post.Url
		}
		if DebugLevel > 2 {
			fmt.Printf("**** URL post.Url: %s\n", post.Url)
		}
		if DebugLevel > 3 {
			fmt.Printf("---------------------------\n")
			html, _ = scorenode.Html()
			fmt.Printf("HTML: %s\n", html)
			fmt.Printf("---------------------------\n")
		}

		// Get the score
		scoretag := scorenode.Find(".subtext .score").First()
		if scoretag.Size() == 0 {
			post.Err = fmt.Errorf("Did not find score for: %v\n", scorenode)
			continue
		}
		if DebugLevel > 3 {
			fmt.Printf("------- SCORE -------------\n")
			html, _ = scoretag.Html()
			fmt.Printf("HTML: %s\n", html)
			score := scoretag.Text()
			fmt.Printf("TEXT: %s\n", score)
			fmt.Printf("---------------------------\n")
		}
		post.SetScore(strings.Split(scoretag.Text(), " ")[0])

		postid, exists := scoretag.Attr("id")
		if !exists {
			html, _ = scoretag.Html()
			post.Err = fmt.Errorf("Did not find postid in %s\n", html)
		}
		if DebugLevel > 3 {
			fmt.Printf("------- POST ID -----------\n")
			fmt.Printf("TEXT: %s\n", postid)
			fmt.Printf("---------------------------\n")
		}
		post.WebPostId = strings.Split(postid, "_")[1]

		// Get the username and postdate
		hrefs := scorenode.Find(".subtext a")
		if hrefs.Size() == 0 {
			errhtml, _ := scorenode.Html()
			post.Err = fmt.Errorf("Did not find user and date in %s\n", errhtml)
			continue
		}
		for i := range hrefs.Nodes {
			href := hrefs.Eq(i)
			t, _ := href.Html()
			s, exists := href.Attr("href")
			if exists {
				if strings.HasPrefix(s, "user?id") {
					post.User = t
					continue
				}
				if strings.HasPrefix(s, "item?id") {
					if strings.Contains(t, "ago") {
						var postDate time.Time
						postDate, err = GetDateFromCreatedAgo(t)
						if err != nil {
							post.Err = fmt.Errorf("Failed to convert to date: %s: %s\n", t, err.Error())
							continue
						}
						post.PostDate = postDate
						post.Err = err
					}
				}
			}
			if DebugLevel > 3 {
				fmt.Printf("------- HREF --------------\n")
				fmt.Printf("TEXT: %s\n", t)
				fmt.Printf("HREF: %s\n", s)
				fmt.Printf("---------------------------\n")
			}
		}

		if post.Err == nil {
			err = ParseHtmlComments(&post)
		}

		if DebugLevel > 2 && err == nil {
			fmt.Printf("------ POST DUMP -----------\n")
			fmt.Print(post.String("PARSED: "))
			fmt.Printf("------ POST DUMP END -------\n")
		}
	}
	return ps, err
}
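// GetDateFromCreatedAgo is used above but not shown in this section. The helper
// below is only a hypothetical sketch of the behaviour the parser relies on -
// turning hackernews' relative timestamps such as "3 hours ago" into an absolute
// time.Time - and may differ from the project's real implementation.
func getDateFromCreatedAgoSketch(s string) (time.Time, error) {
	fields := strings.Fields(s) // e.g. "3 hours ago" -> ["3", "hours", "ago"]
	if len(fields) < 2 {
		return time.Time{}, fmt.Errorf("unexpected relative date: %q", s)
	}
	n, err := strconv.Atoi(fields[0])
	if err != nil {
		return time.Time{}, err
	}
	var d time.Duration
	switch strings.TrimSuffix(fields[1], "s") {
	case "minute":
		d = time.Duration(n) * time.Minute
	case "hour":
		d = time.Duration(n) * time.Hour
	case "day":
		d = time.Duration(n) * 24 * time.Hour
	default:
		return time.Time{}, fmt.Errorf("unknown unit in %q", s)
	}
	return time.Now().Add(-d).UTC(), nil
}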
// ParseHtmlHackerNews parses hackernews posts from the HTML read from the io.Reader
// and returns the recognized posts in the psout slice.
// Errors which affect only a single post are stored in that post's Err field.
func ParseHtmlHackerNews(body io.Reader, ps []*post.Post) (psout []*post.Post, err error) {
	root, err := html.Parse(body)
	if err != nil {
		err = errors.New("Failed to html.Parse: " + err.Error())
		return
	}

	// define a matcher
	matcher := func(n *html.Node) bool {
		if n.DataAtom == atom.Tr && n.Parent != nil && n.Parent.DataAtom == atom.Tbody {
			matched := scrape.Attr(n, "class") == "athing"
			return matched
		}
		return false
	}

	// grab all articles and loop over them
	articles := scrape.FindAll(root, matcher)
	for _, article := range articles {
		var ok bool
		// Get one post entry
		var titlenode *html.Node
		titlenode, ok = scrape.Find(article, func(n *html.Node) bool {
			if n.DataAtom == atom.A && n.Parent != nil && scrape.Attr(n.Parent, "class") == "title" {
				return true
			}
			return false
		})
		if !ok {
			continue
		}

		// Create a new post struct - if the crawling fails the post will have an Err attached
		// but will be added to the outgoing (psout) slice nevertheless
		post := post.NewPost()
		post.Site = "hackernews"
		post.Title = scrape.Text(titlenode)
		post.Url = scrape.Attr(titlenode, "href")
		if strings.HasPrefix(post.Url, "item?id=") {
			post.Url = "https://news.ycombinator.com/" + post.Url
		}
		ps = append(ps, &post)

		// Get additional info for this post
		scorenode := article.NextSibling
		if scorenode == nil {
			post.Err = fmt.Errorf("Did not find score for: %s\n", scrape.Text(article))
			continue
		}

		// Get the subtext containing score, user and date
		subtext, ok := scrape.Find(scorenode, func(n *html.Node) bool {
			if scrape.Attr(n, "class") == "subtext" {
				return true
			}
			return false
		})
		if !ok {
			post.Err = fmt.Errorf("Did not find siblings for subtext %s\n", scorenode.Data)
			continue
		}

		subs := scrape.FindAll(subtext, func(n *html.Node) bool {
			// Get the PostId and Score
			// <span class="score" id="score_9643579">92 points</span>
			if n.DataAtom == atom.Span && scrape.Attr(n, "class") == "score" && n.Parent != nil && scrape.Attr(n.Parent, "class") == "subtext" {
				// Get score
				var scoreid int
				scorestr := strings.Split(scrape.Text(n), " ")[0]
				scoreid, err = strconv.Atoi(scorestr)
				if err != nil {
					fmt.Printf("Failed to convert score %s to int: %s\n", scorestr, err.Error())
					return false
				}
				post.Score = scoreid

				// Get PostId
				postidstr := scrape.Attr(n, "id")
				if len(strings.Split(postidstr, "_")) > 1 {
					post.WebPostId = strings.Split(postidstr, "_")[1]
					return true
				}
			}
			// Get the Username and Creation Date for this post
			if n.DataAtom == atom.A && n.Parent != nil && scrape.Attr(n.Parent, "class") == "subtext" {
				href := strings.ToLower(scrape.Attr(n, "href"))
				if href != "" {
					s := strings.Split(href, "?")
					if s[0] == "user" && len(s) > 1 {
						// Username
						u := strings.Split(s[1], "=")
						if len(u) > 1 {
							post.User = u[1]
							return true
						}
					} else if s[0] == "item" && len(s) > 1 {
						// Created date
						createdago := scrape.Text(n)
						if strings.Contains(createdago, "ago") {
							var postDate time.Time
							postDate, err = GetDateFromCreatedAgo(createdago)
							if err != nil {
								err = fmt.Errorf("Failed to convert to date: %v\n", createdago)
								return false
							}
							post.PostDate = postDate
							return true
						}
					}
				}
			} // end "class" == "subtext"
			return false
		})

		if len(subs) == 0 {
			var w bytes.Buffer
			if rerr := html.Render(&w, subtext); rerr != nil {
				fmt.Printf("Render error: %s\n", rerr)
			}
			post.Err = fmt.Errorf("Unable to parse score,user,date from %s:\n %s\n", post.Title, w.String())
		}
	}
	return ps, err
}
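// Usage sketch (not part of the original source): fetch the hackernews front page
// and feed it to ParseHtmlHackerNews. Note that this parser works on a slice of
// *post.Post pointers, unlike the reddit parser above which uses post.Post values.
// The function name and URL below are illustrative only.
func fetchAndParseHackerNewsExample() error {
	resp, err := http.Get("https://news.ycombinator.com/news")
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	ps, err := ParseHtmlHackerNews(resp.Body, make([]*post.Post, 0))
	if err != nil {
		return err
	}
	for _, p := range ps {
		if p.Err != nil {
			fmt.Println("single post error:", p.Err)
			continue
		}
		fmt.Printf("%s (%d points) %s\n", p.Title, p.Score, p.Url)
	}
	return nil
}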
func RedditPostScraperWithGorm(sub string) (err error) {
	db, err := gorm.Open("mysql", "golang:golang@/golang?charset=utf8&parseTime=True&loc=Local")
	if err != nil {
		return errors.New("Failed to open database: " + err.Error())
	}

	// Get database connection handle *sql.DB (http://golang.org/pkg/database/sql/#DB)
	// and invoke `*sql.DB`'s functions with it
	db.DB().Ping()
	db.DB().SetMaxIdleConns(10)
	db.DB().SetMaxOpenConns(100)

	// Automating Migration
	db.AutoMigrate(&post.Post{})

	// Get data from reddit
	geturl := "http://www.reddit.com/r/" + sub + "/new"
	resp, err := http.Get(geturl)
	if err != nil {
		return errors.New("Failed to http.Get from " + geturl + ": " + err.Error())
	}
	if resp != nil {
		if resp.Body == nil {
			return errors.New("Body from " + geturl + " is nil!")
		} else {
			defer resp.Body.Close()
		}
	} else {
		return errors.New("Response from " + geturl + " is nil!")
	}
	if resp.StatusCode != 200 { // 200 = OK
		httperr := fmt.Sprintf("Failed to http.Get from %s: Http Status code: %d: Msg: %s", geturl, resp.StatusCode, resp.Status)
		return errors.New(httperr)
	}

	// Create a new post slice and then parse the response body into ps
	ps := make([]post.Post, 0)
	ps, err = ParseHtmlReddit(resp.Body, ps)
	if err != nil {
		return errors.New("Error in RedditParseHtml: " + geturl + ": " + err.Error())
	}

	foundnewposts := false
	updatedposts := 0

	// insert rows - auto increment PKs will be set properly after the insert
	for _, p := range ps {
		if p.Err == nil {
			p.Site = "reddit"

			// check if post already exists
			var count int
			querypost := post.NewPost()
			// SELECT * FROM posts WHERE PostId = "xxx";
			db.Find(&querypost, "PostId = ?", p.PostId).Count(&count)
			err = db.Error
			if err != nil {
				return errors.New("select count(*) from posts failed: " + err.Error())
			}
			if count == 0 {
				foundnewposts = true
				db.Create(&p)
				err = db.Error
				if err != nil {
					return errors.New("insert into table posts failed: " + err.Error())
				}
				// Print out the crawled info
				fmt.Println("----------- INSERT ----------------------------")
				fmt.Println(p.String())
			} else {
				// Post already exists, do an update
				score := p.Score
				// Get the first matched record
				// SELECT * FROM posts WHERE PostId = 'xxx' LIMIT 1;
				db.Where("PostId = ?", p.PostId).First(&querypost)
				err = db.Error
				if err != nil {
					return errors.New("Failed: select Id from posts where PostId = " + p.PostId + ": " + err.Error())
				}
				if score != querypost.Score {
					db.Save(&p)
					err = db.Error
					if err != nil {
						return errors.New("update table 'posts' failed: " + err.Error())
					}
					updatedposts++
					// Print out the update info
					fmt.Println("----------- UPDATE SCORE-----------------------")
					fmt.Println(p.Title)
					fmt.Printf("From %d to %d\n", querypost.Score, p.Score)
				}
			}
		} else {
			fmt.Println("Single post error in " + geturl + ": " + p.Err.Error())
		}
	}
	if !foundnewposts {
		fmt.Println("No new posts found at " + geturl)
	}
	if updatedposts > 0 {
		fmt.Printf("%d posts have been updated from %s\n", updatedposts, geturl)
	}
	return
}
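// Entry-point sketch (not part of the original source): one way
// RedditPostScraperWithGorm might be driven from a main package.
// The subreddit name is illustrative only.
func main() {
	if err := RedditPostScraperWithGorm("golang"); err != nil {
		log.Fatal("scraper failed: ", err)
	}
}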