func AddUpdatableChilds(htmlpost *post.Post, dbpost *post.Post, dbmap *gorp.DbMap) (updateNeeded bool, err error) { // Check if there are aleady comments in dbpost // If not get them from the database if len(dbpost.Comments) == 0 { pk := dbpost.Id if pk == 0 { err = errors.New("primary key not set in dbpost") return } var res interface{} res, err = dbmap.GetWithChilds(post.Post{}, pk) if err != nil { err = errors.New("get failed: " + err.Error()) return } if res == nil { err = errors.New(fmt.Sprintf("Get post for id %d did not return any rows ", pk)) return } dbpost := res.(*post.Post) if DebugLevel > 3 { // Print out the update info fmt.Println("----------- DB POST -----------------") fmt.Println(dbpost.String("CHECK DB: ")) fmt.Println("----------- DB POST END -------------------") } } if DebugLevel > 3 { // Print out the update info fmt.Println("----------- HTML POST -----------------") fmt.Println(htmlpost.String("CHECK HTML: ")) fmt.Println("----------- HTML POST END -------------------") } updateNeeded = htmlpost.Hash() != dbpost.Hash() if updateNeeded { var UpdatedComments []*post.Comment var found bool if DebugLevel > 2 { fmt.Printf("**** UpdatedComments len %d\n", len(UpdatedComments)) } for _, h := range htmlpost.Comments { found = false htmlHash := h.Hash() for _, d := range dbpost.Comments { if DebugLevel > 2 { fmt.Printf("**** COMPARE\n") fmt.Printf("**** **** d.Hash():%d htmlHash %d\n", d.Hash(), htmlHash) fmt.Printf("**** **** d.Date '%s' h.Date '%s'\n", d.GetCommentDate().String(), h.GetCommentDate().String()) fmt.Printf("**** COMPARE END\n") } if d.Hash() == htmlHash { // post with identical content has been found - do not store this comment found = true if DebugLevel > 2 { fmt.Printf("**** ***************** MATCH d.Hash() == htmlHash %d\n", d.Hash()) } break } if h.WebCommentId == d.WebCommentId { // external unique comment id found - this comment is already stored // but the comment content has been changed - update needed if DebugLevel > 3 { fmt.Printf("**** COMPARE h.WebCommentId\n") fmt.Printf("**** **** h '%s' d '%s'\n", h.WebCommentId, d.WebCommentId) fmt.Printf("**** COMPARE h.WebCommentId END\n") } h.Id = d.Id h.PostId = d.PostId break } } if !found { UpdatedComments = append(UpdatedComments, h) if DebugLevel > 2 { fmt.Printf("**** UpdatedComments len %d\n", len(UpdatedComments)) fmt.Printf("**** **** append(UpdatedComments, h) %s\n", h.String("APP: ")) } } } fmt.Printf("**** htmlpost.Comments len %d\n", len(htmlpost.Comments)) fmt.Printf("**** UpdatedComments len %d\n", len(UpdatedComments)) dbpost.Comments = make([]*post.Comment, len(UpdatedComments), len(UpdatedComments)) fmt.Printf("**** dbpost.Comments1 len %d\n", len(dbpost.Comments)) copy(dbpost.Comments, UpdatedComments) fmt.Printf("**** dbpost.Comments2 len %d\n", len(dbpost.Comments)) } if (DebugLevel > 3) && updateNeeded { // Print out the update info fmt.Println("----------- UPDATE NEEDED -----------------") for i := range htmlpost.Comments { fmt.Println(htmlpost.Comments[i].String("UPDATE NEEDED HTML: ")) if i < len(dbpost.Comments) { fmt.Println(dbpost.Comments[i].String("UPDATE NEEDED DB: ")) } } //fmt.Println(htmlpost.String("UPDATE NEEDED HTML: ")) //fmt.Println(dbpost.String("UPDATE NEEDED DB: ")) fmt.Println("----------- UPDATE NEEDED END -------------------") } return }
func Test() (err error) { //drivername := "postgres" //dsn := "user=golang password=golang dbname=golang sslmode=disable" //dialect := gorp.PostgresDialect{} drivername := "mysql" dsn := "golang:golang@/golang?parseTime=true&collation=utf8mb4_general_ci" dialect := gorp.MySQLDialect{"InnoDB", "utf8mb4"} // connect to db using standard Go database/sql API db, err := sql.Open(drivername, dsn) if err != nil { return errors.New("sql.Open failed: " + err.Error()) } // Open doesn't open a connection. Validate DSN data using ping if err = db.Ping(); err != nil { return errors.New("db.Ping failed: " + err.Error()) } // Set the connection to use utf8mb4 if dialect.Engine == "InnoDB" { _, err = db.Exec("SET NAMES utf8mb4 COLLATE utf8mb4_general_ci") if err != nil { return errors.New("SET NAMES utf8mb4 COLLATE utf8mb4_general_ci: " + err.Error()) } } // construct a gorp DbMap dbmap := &gorp.DbMap{Db: db, Dialect: dialect} defer dbmap.Db.Close() dbmap.DebugLevel = DebugLevel // Will log all SQL statements + args as they are run // The first arg is a string prefix to prepend to all log messages dbmap.TraceOn("[gorp]", log.New(os.Stdout, "fetch:", log.Lmicroseconds)) // register the structs you wish to use with gorp // you can also use the shorter dbmap.AddTable() if you // don't want to override the table name // SetKeys(true) means we have a auto increment primary key, which // will get automatically bound to your struct post-insert table := dbmap.AddTableWithName(post.Post{}, "posts_embedded_test") table.SetKeys(true, "PID") fmt.Printf("AddTableWithName returned: %s\n", table.TableName) var r *gorp.RelationMap if len(table.Relations) > 0 { r = table.Relations[0] fmt.Printf("Relation DetailTable: %s\n", r.DetailTable.TableName) } // Add the comments table table = dbmap.AddTableWithName(post.Comment{}, "comments_embedded_test") table.SetKeys(true, "Id") fmt.Printf("AddTableWithName returned: %s\n", table.TableName) if r != nil { fmt.Printf("Relation DetailTable: %s\n", r.DetailTable.TableName) } // create the table. in a production system you'd generally // use a migration tool, or create the tables via scripts if err = dbmap.CreateTablesIfNotExists(); err != nil { return errors.New("Create tables failed: " + err.Error()) } // Force create all indexes for this database if err = dbmap.CreateIndexes(); err != nil { return errors.New("Create indexes failed: " + err.Error()) } i := 0 x := 0 var LastPkForGetTests uint64 var p post.Post rand.Seed(42) for i < 10 { p = post.NewPost() p.Title = fmt.Sprintf("Post number %d", i) p.Site = "test" p.PostDate = time.Unix(time.Now().Unix(), 0).UTC() p.WebPostId = strconv.FormatUint(post.Hash(p.Title+p.PostDate.String()), 10) x = 0 for x < 10 { c := p.AddComment() c.Title = fmt.Sprintf("Comment %d on post %d: ", x, i) //c.Title = "👩�👦�👦👨�👩�👧�👩�👩�" c.Title += "\U0001F475 \u2318 \xe2\x8c\x98 \U0001F474 \xF0\x9F\x91\xB4 \U0001F610" c.WebCommentId = strconv.FormatUint(post.Hash(c.Title+c.GetCommentDate().String())+uint64(rand.Int63n(100000)), 10) if utf8.ValidString(c.Title) { fmt.Printf("IS VALID: '%s'\n", c.Title) } else { fmt.Printf("IS *** NOT*** VALID: '%s'\n", c.Title) } nihongo := c.Title for i, w := 0, 0; i < len(nihongo); i += w { runeValue, width := utf8.DecodeRuneInString(nihongo[i:]) fmt.Printf("%#U starts at byte position %d, lenght %d\n", runeValue, i, width) w = width } x++ } // Inserting a post also inserts all its detail records (=comments) err = dbmap.InsertWithChilds(&p) if DebugLevel > 3 { // Print out the crawled info fmt.Println("----------- INSERT POST START -----------------") fmt.Println(p.String("IP: ")) } if err != nil { return errors.New("Insert failed: " + err.Error()) } LastPkForGetTests = p.Id if DebugLevel > 3 { // Print out the end of the crawled info fmt.Println("----------- INSERT POST END -------------------") } for y, c := range p.Comments { c.Title = fmt.Sprintf("UpdatedComment %d ", y) + c.Title x++ } p.Title = fmt.Sprintf("UpdatedPost %d ", i) + p.Title var rowsaffected int64 rowsaffected, err = dbmap.UpdateWithChilds(&p) if DebugLevel > 3 { // Print out the crawled info fmt.Println("----------- UPDATE POST START -----------------") fmt.Printf("Rows affected: %d\n", rowsaffected) fmt.Println(p.String("UP: ")) } if err != nil { return errors.New("update failed: " + err.Error()) } if DebugLevel > 3 { // Print out the end of the crawled info fmt.Println("----------- UPDATE POST END -------------------") } i++ } fmt.Println("Starting Get tests") res, err := dbmap.GetWithChilds(post.Post{}, LastPkForGetTests) if err != nil { return errors.New("get failed: " + err.Error()) } if res == nil { return errors.New(fmt.Sprintf("Get post for id %d did not return any rows ", LastPkForGetTests)) } resp := res.(*post.Post) if DebugLevel > 3 { // Print out the selected post fmt.Println("----------- GET POST START -----------------") fmt.Println(resp.String("GP: ")) } if DebugLevel > 3 { // Print out the end of the selected post fmt.Println("----------- GET POST END -------------------") } var updateNeeded bool updateNeeded, err = AddUpdatableChilds(&p, resp, dbmap) if err != nil { return errors.New(fmt.Sprintf("AddUpdatableChilds for post '%s' failed: %s", resp.WebPostId, err.Error())) } if updateNeeded { var rowsaffected int64 rowsaffected, err = dbmap.UpdateWithChilds(resp) if DebugLevel > 3 { // Print out the crawled info fmt.Println("----------- REUPDATE POST START -----------------") fmt.Printf("Rows affected: %d\n", rowsaffected) fmt.Println(resp.String("RUP: ")) } if err != nil { return errors.New("reupdate failed: " + err.Error()) } if DebugLevel > 3 { // Print out the end of the crawled info fmt.Println("----------- REUPDATE POST END -------------------") } } return }
func RedditPostScraper(sub string) (err error) { //drivername := "postgres" //dsn := "user=golang password=golang dbname=golang sslmode=disable" //dialect := gorp.PostgresDialect{} drivername := "mysql" dsn := "golang:golang@/golang?parseTime=true" dialect := gorp.MySQLDialect{"InnoDB", "UTF8"} // connect to db using standard Go database/sql API db, err := sql.Open(drivername, dsn) if err != nil { return errors.New("sql.Open failed: " + err.Error()) } // Open doesn't open a connection. Validate DSN data: if err = db.Ping(); err != nil { return errors.New("db.Ping failed: " + err.Error()) } // construct a gorp DbMap dbmap := &gorp.DbMap{Db: db, Dialect: dialect} defer dbmap.Db.Close() dbmap.DebugLevel = DebugLevel // Will log all SQL statements + args as they are run // The first arg is a string prefix to prepend to all log messages //dbmap.TraceOn("[gorp]", log.New(os.Stdout, "fetch:", log.Lmicroseconds)) // register the structs you wish to use with gorp // you can also use the shorter dbmap.AddTable() if you // don't want to override the table name tablename := "posts_index_test" // SetKeys(true) means we have a auto increment primary key, which // will get automatically bound to your struct post-insert table := dbmap.AddTableWithName(post.Post{}, tablename) table.SetKeys(true, "PID") // create the table. in a production system you'd generally // use a migration tool, or create the tables via scripts if err = dbmap.CreateTablesIfNotExists(); err != nil { return errors.New("Create tables failed: " + err.Error()) } // Force create all indexes for this database if err = dbmap.CreateIndexes(); err != nil { return errors.New("Create indexes failed: " + err.Error()) } // Get data from reddit geturl := "http://www.reddit.com/r/" + sub + "/new" resp, err := http.Get(geturl) if err != nil { return errors.New("Failed to http.Get from " + geturl + ": " + err.Error()) } if resp != nil { if resp.Body == nil { return errors.New("Body from " + geturl + " is nil!") } else { defer resp.Body.Close() } } else { return errors.New("Response from " + geturl + " is nil!") } if resp.StatusCode != 200 { // 200 = OK httperr := fmt.Sprintf("Failed to http.Get from %s: Http Status code: %d: Msg: %s", geturl, resp.StatusCode, resp.Status) return errors.New(httperr) } // Create a new post slice and then parse the response body into ps ps := make([]post.Post, 0) ps, err = ParseHtmlReddit(resp.Body, ps) if err != nil { return errors.New("Error in RedditParseHtml: " + geturl + ": " + err.Error()) } foundnewposts := false updatedposts := 0 // insert rows - auto increment PKs will be set properly after the insert for _, htmlpost := range ps { if htmlpost.Err == nil { var postcount int // Store reddit sub htmlpost.PostSub = sub // check if post already exists intSelectResult := make([]int, 0) postcountsql := "select count(*) from " + dbmap.Dialect.QuoteField(tablename) + " where WebPostId = :post_id" _, err := dbmap.Select(&intSelectResult, postcountsql, map[string]interface{}{ "post_id": htmlpost.WebPostId, }) if err != nil { return errors.New(fmt.Sprintf("Query: %s failed: %s\n", postcountsql, err.Error())) } if len(intSelectResult) == 0 { return errors.New(fmt.Sprintf("Query: %s returned no result\n", postcountsql)) } postcount = intSelectResult[0] // DEBUG if DebugLevel > 3 { fmt.Println("HTMLpost.WebPostId: " + htmlpost.WebPostId) fmt.Printf("HTMLpost.Id: %v\n", htmlpost.Id) fmt.Printf("DBpost count: %v \n", postcount) } // New post? then insert if postcount == 0 { foundnewposts = true err = dbmap.Insert(&htmlpost) if DebugLevel > 2 { // Print out the crawled info fmt.Println("----------- INSERT POST START -----------------") fmt.Println(htmlpost.String()) } if err != nil { return errors.New("insert into table " + dbmap.Dialect.QuoteField(tablename) + " failed: " + err.Error()) } if DebugLevel > 2 { // Print out the end of the crawled info fmt.Println("----------- INSERT POST END -------------------") } } else { // Post already exists, do an update dbposts := make([]post.Post, 0) getpostsql := "select * from " + dbmap.Dialect.QuoteField(tablename) + " where WebPostId = :post_id" _, err := dbmap.Select(&dbposts, getpostsql, map[string]interface{}{ "post_id": htmlpost.WebPostId, }) if err != nil { return errors.New(fmt.Sprintf("Getting WebPostId %s from DB failes\n", htmlpost.WebPostId, err.Error())) } var dbpost post.Post if len(dbposts) > 0 { dbpost = dbposts[0] } else { return errors.New(fmt.Sprintf("Query: %s returned no result\n", getpostsql)) } // DEBUG if DebugLevel > 3 { fmt.Printf("DBPOST: %s\n", dbpost.String()) fmt.Printf("DBpost.Id: %v\n", dbpost.Id) fmt.Printf("DBpost.Score: %v\n", dbpost.Score) } if htmlpost.Score != dbpost.Score { if DebugLevel > 2 { // Print out the update info fmt.Println("----------- UPDATE POST START -----------------") fmt.Println("Title: " + dbpost.Title) fmt.Printf("Id: %v\n", dbpost.Id) fmt.Printf("From score %d to score %d\n", dbpost.Score, htmlpost.Score) fmt.Println("----------- UPDATE POST END -------------------") } dbpost.Score = htmlpost.Score affectedrows, err := dbmap.Update(&dbpost) switch { case err != nil: return errors.New("update table " + tablename + " failed: " + err.Error()) case affectedrows == 0: return errors.New(fmt.Sprintf("update table %s for Id %d did not affect any lines", tablename, dbpost.Id)) default: updatedposts++ } } } } else { if DebugLevel > 1 { fmt.Println("Single post error in " + geturl + ": " + htmlpost.Err.Error()) } } } if !foundnewposts { if DebugLevel > 2 { fmt.Println("No new posts found at " + geturl) } } if updatedposts > 0 { if DebugLevel > 2 { fmt.Printf("%d posts have been updated from %s\n", updatedposts, geturl) } } return }
func ParseHtmlComments(p *post.Post) (err error) { if p.WebPostId == "" { return errors.New(fmt.Sprintf("p.WebPostId is empty in post '%s'", p.String("PC: "))) } // Get comments from hackernews geturl := fmt.Sprintf("http://news.ycombinator.com/item?id=%s", p.WebPostId) // DEBUG //geturl := fmt.Sprintf("https://news.ycombinator.com/item?id=9751858") if DebugLevel > 2 { fmt.Printf("START GET COMMENTS FROM '%s'\n", geturl) } body, err := GetHtmlBody(geturl) if err != nil { return errors.New("GetHtmlBody: " + err.Error()) } // Create a qoquery document to parse from an io.Reader doc, err := goquery.NewDocumentFromReader(body) if err != nil { return errors.New("Failed to parse HTML: " + err.Error()) } // Find hackernews comments = elements with class "athing" thing := doc.Find(".athing") for iThing := range thing.Nodes { // use `singlecomment` as a selection of one single post singlecomment := thing.Eq(iThing) comment := post.NewComment() //p.Comments = append(p.Comments, &comment) comheads := singlecomment.Find(".comhead a") for i := range comheads.Nodes { comhead := comheads.Eq(i) t, _ := comhead.Html() s, exists := comhead.Attr("href") if exists { if strings.HasPrefix(s, "user?id") { comment.User = t continue } if strings.HasPrefix(s, "item?id") { if strings.Contains(t, "ago") { var commentDate time.Time commentDate, err = GetDateFromCreatedAgo(t) if err != nil { comment.Err = errors.New(fmt.Sprintf("Failed to convert to date: %s: %s\n", t, err.Error())) err = nil continue } comment.CommentDate = commentDate if len(strings.Split(s, "=")) > 1 { comment.WebCommentId = strings.Split(s, "=")[1] } //comment.Err = err } } } comments := singlecomment.Find("span.comment") removeReplySelection := comments.Find("span div.reply") removeReplySelection.Remove() var sep string for iComment, _ := range comments.Nodes { s := comments.Eq(iComment) h, _ := s.Html() if !utf8.ValidString(s.Text()) { comment.Err = errors.New(fmt.Sprintf("Ignoring invalid UTF-8: '%s'", s.Text())) break } h, err = HtmlToMarkdown(h) if err != nil { comment.Err = errors.New(fmt.Sprintf("Ignoring markdownifier: '%s'", err.Error())) break } if h != "" { comment.Body = comment.Body + sep + h } sep = "\n" } //fmt.Printf("POST %s BODY = %s\n", p.WebPostId, comment.Body) if comment.Err == nil && len(comment.WebCommentId) > 0 && len(comment.Body) > 0 { p.Comments = append(p.Comments, &comment) } else { p.CommentParseErrors = append(p.CommentParseErrors, &comment) } } } if DebugLevel > 0 { fmt.Printf("GET COMMENTS FROM '%s' yielded %d comments\n", geturl, len(p.Comments)) } return err }
func HackerNewsPostScraper(sub string) (err error) { //drivername := "postgres" //dsn := "user=golang password=golang dbname=golang sslmode=disable" //dialect := gorp.PostgresDialect{} drivername := "mysql" dsn := "golang:golang@/golang?parseTime=true" dialect := gorp.MySQLDialect{"InnoDB", "utf8mb4"} // connect to db using standard Go database/sql API db, err := sql.Open(drivername, dsn) if err != nil { return errors.New("sql.Open failed: " + err.Error()) } // Open doesn't open a connection. Validate DSN data using ping if err = db.Ping(); err != nil { return errors.New("db.Ping failed: " + err.Error()) } // Set the connection to use utf8bmb4 if dialect.Engine == "InnoDB" { fmt.Println("Setting connection to utf8mb4") _, err = db.Exec("SET NAMES utf8mb4 COLLATE utf8mb4_general_ci") if err != nil { return errors.New("SET NAMES utf8mb4 COLLATE utf8mb4_general_ci: " + err.Error()) } } // construct a gorp DbMap dbmap := &gorp.DbMap{Db: db, Dialect: dialect} defer dbmap.Db.Close() dbmap.DebugLevel = DebugLevel // Will log all SQL statements + args as they are run // The first arg is a string prefix to prepend to all log messages //dbmap.TraceOn("[gorp]", log.New(os.Stdout, "Trace:", log.Lmicroseconds)) // register the structs you wish to use with gorp // you can also use the shorter dbmap.AddTable() if you // don't want to override the table name // SetKeys(true) means we have a auto increment primary key, which // will get automatically bound to your struct post-insert table := dbmap.AddTableWithName(post.Post{}, "posts_index_test") table.SetKeys(true, "PID") // Add the comments table table = dbmap.AddTableWithName(post.Comment{}, "comments_index_test") table.SetKeys(true, "Id") // create the table. in a production system you'd generally // use a migration tool, or create the tables via scripts if err = dbmap.CreateTablesIfNotExists(); err != nil { return errors.New("Create tables failed: " + err.Error()) } // Force create all indexes for this database if err = dbmap.CreateIndexes(); err != nil { return errors.New("Create indexes failed: " + err.Error()) } // Get data from hackernews geturl := "http://news.ycombinator.com/" + sub // DEBUG for a special thread //geturl := "https://news.ycombinator.com/item?id=10056146" body, err := GetHtmlBody(geturl) if err != nil { return errors.New("GetHtmlBody: " + err.Error()) } // Create a new post slice and then parse the response body into ps ps := make([]*post.Post, 0) //cs := make([]*post.Comment, 0) ps, err = ParseHtmlHackerNews(body, ps) if err != nil { return errors.New("ParseHtmlHackerNews: " + err.Error()) } // Number of updated posts var updatedPostsCount int64 // Number of new posts var insertedPostsCount int64 var insertedPostsCommentCount int64 var updatedPostsCommentCount int64 // Number of post parsing errors var htmlPostErrorCount uint32 // Number of comment parsing errors var htmlCommentErrorCount uint32 // loop over all parsed posts for _, htmlpost := range ps { if htmlpost.WebPostId == "" { if DebugLevel > 1 { fmt.Printf("WebPostId not set in %s\n", htmlpost.Title) } // Fail early, continue with next post continue } if htmlpost.Err != nil { if DebugLevel > 1 { fmt.Println("Single post error in " + geturl + ": " + htmlpost.Err.Error()) } // Fail early, continue with next post htmlPostErrorCount++ continue } if len(htmlpost.CommentParseErrors) > 0 { for _, c := range htmlpost.CommentParseErrors { htmlCommentErrorCount++ if DebugLevel > 2 { fmt.Println("Single comment error in '" + geturl + "' for WebPostId '" + htmlpost.WebPostId + ": " + c.Err.Error()) } } } // Store post sub htmlpost.PostSub = sub tm, err := dbmap.TableFor(reflect.TypeOf(*htmlpost), true) if err != nil { return errors.New("Failed to get reflection type: " + err.Error()) } if DebugLevel > 3 { fmt.Println("TABLEMAP: " + tm.TableName) } // check if post already exists dbposts := make([]post.Post, 0) getpostsql := "select * from " + dbmap.Dialect.QuotedTableForQuery("", tm.TableName) + " where WebPostId = :post_id" _, err = dbmap.Select(&dbposts, getpostsql, map[string]interface{}{ "post_id": htmlpost.WebPostId, }) if err != nil { return errors.New(fmt.Sprintf("Getting PostId %s from DB failed: %s", htmlpost.WebPostId, err.Error())) } var dbpost *post.Post if len(dbposts) == 1 { dbpost = &dbposts[0] } else if len(dbposts) > 1 { return errors.New(fmt.Sprintf("Query: %s returned %d rows", getpostsql, len(dbposts))) } postcount := len(dbposts) // New post? then insert if postcount == 0 { if DebugLevel > 2 { fmt.Printf("New post found, inserting htmlpost.WebPostId '%s'\n", htmlpost.WebPostId) } // Reset the rowcount info dbmap.LastOpInfo.Reset() htmlpost.CommentCount = uint64(len(htmlpost.Comments)) // Insert the new post into the database err = dbmap.InsertWithChilds(htmlpost) if DebugLevel > 2 { // Print out the crawled info fmt.Println("----------- INSERT POST START -----------------") fmt.Println(htmlpost.String("INSERT: ")) } if err != nil { return errors.New("insert into table " + dbmap.Dialect.QuoteField(tm.TableName) + " failed: " + err.Error()) } if DebugLevel > 2 { // Print out the end of the crawled info fmt.Println("----------- INSERT POST END -------------------") } insertedPostsCount += dbmap.LastOpInfo.RowCount insertedPostsCommentCount += dbmap.LastOpInfo.ChildInsertRowCount } else { // Post already exists, get the full post with its comments from the db res, err := dbmap.GetWithChilds(post.Post{}, 9999999999, 0, dbpost.Id) if err != nil { return errors.New("get failed: " + err.Error()) } if res == nil { return errors.New(fmt.Sprintf("Get post for id %d did not return any rows ", dbpost.Id)) } dbpost = res.(*post.Post) // Check if an update is needed var updateNeeded bool updateNeeded, err = AddUpdatableChilds(htmlpost, dbpost, dbmap) if err != nil { return errors.New(fmt.Sprintf("CheckIfDataChanged for post '%s' failed: %s", htmlpost.WebPostId, err.Error())) } //if htmlpost.Score != dbpost.Score { if updateNeeded { // The post changed, do an update into the database //fmt.Println("Post Date db: " + dbpost.PostDate.String() + ", html: " + htmlpost.PostDate.String()) //fmt.Printf("Post Score db: %d, html: %d\n", dbpost.Score, htmlpost.Score) if DebugLevel > 2 { fmt.Println("----------- UPDATE POST START -----------------") fmt.Println(dbpost.String("UPDATE1: ")) fmt.Printf("From score %d to score %d\n", dbpost.Score, htmlpost.Score) } dbpost.Score = htmlpost.Score dbpost.PostDate = htmlpost.PostDate // Reset the rowcount info dbmap.LastOpInfo.Reset() // Update the posts together with its comments affectedrows, err := dbmap.UpdateWithChilds(dbpost) switch { case err != nil: return errors.New("update table " + tm.TableName + " failed: " + err.Error()) case affectedrows == 0: return errors.New(fmt.Sprintf("update table %s for Id %d did not affect any lines", tm.TableName, dbpost.Id)) default: updatedPostsCount += dbmap.LastOpInfo.RowCount insertedPostsCommentCount += dbmap.LastOpInfo.ChildInsertRowCount updatedPostsCommentCount += dbmap.LastOpInfo.ChildUpdateRowCount dbpost.CommentCount += uint64(dbmap.LastOpInfo.ChildInsertRowCount) _, err = dbmap.Update(dbpost) if err != nil { return errors.New(fmt.Sprintf("Update for post '%s' failed: %s", dbpost.WebPostId, err.Error())) } if DebugLevel > 2 { // Print out the update info fmt.Println("----------- UPDATE POST COMMIT -----------------") fmt.Println(dbpost.String("UPDATE2: ")) fmt.Println("----------- UPDATE POST END -------------------") } } } } } if insertedPostsCount == 0 && updatedPostsCount == 0 { if DebugLevel > 2 { fmt.Println("No new posts found at " + geturl) } return } if DebugLevel > 2 { fmt.Printf("%d new posts have been inserted from %s\n", insertedPostsCount, geturl) fmt.Printf("%d posts have been updated from %s\n", updatedPostsCount, geturl) fmt.Printf("%d new comments have been inserted from %s\n", insertedPostsCommentCount, geturl) fmt.Printf("%d comments have been updated from %s\n", updatedPostsCommentCount, geturl) fmt.Printf("%d comment errors\n", htmlCommentErrorCount) } return }
func HackerNewsPostScraper(sub string) (err error) { //drivername := "postgres" //dsn := "user=golang password=golang dbname=golang sslmode=disable" //dialect := gorp.PostgresDialect{} drivername := "mysql" dsn := "golang:golang@/golang?parseTime=true" dialect := gorp.MySQLDialect{"InnoDB", "UTF8"} // connect to db using standard Go database/sql API db, err := sql.Open(drivername, dsn) if err != nil { return errors.New("sql.Open failed: " + err.Error()) } // Open doesn't open a connection. Validate DSN data using ping if err = db.Ping(); err != nil { return errors.New("db.Ping failed: " + err.Error()) } // construct a gorp DbMap dbmap := &gorp.DbMap{Db: db, Dialect: dialect} defer dbmap.Db.Close() dbmap.DebugLevel = DebugLevel // Will log all SQL statements + args as they are run // The first arg is a string prefix to prepend to all log messages //dbmap.TraceOn("[gorp]", log.New(os.Stdout, "fetch:", log.Lmicroseconds)) // register the structs you wish to use with gorp // you can also use the shorter dbmap.AddTable() if you // don't want to override the table name tablename := "posts_index_test" // SetKeys(true) means we have a auto increment primary key, which // will get automatically bound to your struct post-insert table := dbmap.AddTableWithName(post.Post{}, tablename) table.SetKeys(true, "PID") // create the table. in a production system you'd generally // use a migration tool, or create the tables via scripts if err = dbmap.CreateTablesIfNotExists(); err != nil { return errors.New("Create tables failed: " + err.Error()) } // Force create all indexes for this database if err = dbmap.CreateIndexes(); err != nil { return errors.New("Create indexes failed: " + err.Error()) } // Get data from hackernews geturl := "http://news.ycombinator.com/" body, err := GetHtmlBody(geturl) if err != nil { return errors.New("GetHtmlBody: " + err.Error()) } // Create a new post slice and then parse the response body into ps ps := make([]*post.Post, 0) ps, err = ParseHtmlHackerNews(body, ps) if err != nil { return errors.New("ParseHtmlHackerNews: " + err.Error()) } // Number of updated posts var updatedPostsCount uint32 // Number of new posts var insertedPostsCount uint32 // insert rows - auto increment PKs will be set properly after the insert for _, htmlpost := range ps { if htmlpost.WebPostId == "" { if DebugLevel > 1 { fmt.Printf("WebPostId not set in %s\n", htmlpost.Title) } // Fail early, continue with next post continue } if htmlpost.Err != nil { if DebugLevel > 1 { fmt.Println("Single post error in " + geturl + ": " + htmlpost.Err.Error()) } // Fail early, continue with next post continue } // Store post sub htmlpost.PostSub = sub // check if post already exists intSelectResult := make([]int, 0) postcountsql := "select count(*) from " + dbmap.Dialect.QuoteField(tablename) + " where WebPostId = :post_id" _, err := dbmap.Select(&intSelectResult, postcountsql, map[string]interface{}{ "post_id": htmlpost.WebPostId, }) if err != nil { return errors.New(fmt.Sprintf("Query: %s failed: %s\n", postcountsql, err.Error())) } if len(intSelectResult) == 0 { return errors.New(fmt.Sprintf("Query: %s returned no result\n", postcountsql)) } postcount := intSelectResult[0] // New post? then insert if postcount == 0 { // Insert the new post into the database err = dbmap.Insert(htmlpost) if DebugLevel > 2 { // Print out the crawled info fmt.Println("----------- INSERT POST START -----------------") fmt.Println(htmlpost.String()) } if err != nil { return errors.New("insert into table " + dbmap.Dialect.QuoteField(tablename) + " failed: " + err.Error()) } if DebugLevel > 2 { // Print out the end of the crawled info fmt.Println("----------- INSERT POST END -------------------") } insertedPostsCount++ } else { // Post already exists, do an update // Create a slice of posts to select into dbposts := make([]post.Post, 0) getpostsql := "select * from " + dbmap.Dialect.QuoteField(tablename) + " where WebPostId = :post_id" _, err := dbmap.Select(&dbposts, getpostsql, map[string]interface{}{ "post_id": htmlpost.WebPostId, }) if err != nil { return errors.New(fmt.Sprintf("Getting WebPostId %s from DB failed: %s\n", htmlpost.WebPostId, err.Error())) } var dbpost post.Post if len(dbposts) > 0 { dbpost = dbposts[0] } else { return errors.New(fmt.Sprintf("Query: %s returned no result\n", getpostsql)) } if htmlpost.Score != dbpost.Score { // The post score changed, do an update into the database //fmt.Println("Post Date db: " + dbpost.PostDate.String() + ", html: " + htmlpost.PostDate.String()) //fmt.Printf("Post Score db: %d, html: %d\n", dbpost.Score, htmlpost.Score) if DebugLevel > 2 { fmt.Println("----------- UPDATE POST START -----------------") fmt.Println(dbpost.String()) fmt.Printf("From score %d to score %d\n", dbpost.Score, htmlpost.Score) } dbpost.Score = htmlpost.Score dbpost.PostDate = htmlpost.PostDate affectedrows, err := dbmap.Update(&dbpost) switch { case err != nil: return errors.New("update table " + tablename + " failed: " + err.Error()) case affectedrows == 0: return errors.New(fmt.Sprintf("update table %s for Id %d did not affect any lines", tablename, dbpost.Id)) default: updatedPostsCount++ if DebugLevel > 2 { // Print out the update info fmt.Println("----------- UPDATE POST COMMIT -----------------") fmt.Println(dbpost.String()) fmt.Println("----------- UPDATE POST END -------------------") } } } } } if insertedPostsCount == 0 && updatedPostsCount == 0 { if DebugLevel > 2 { fmt.Println("No new posts found at " + geturl) } } if updatedPostsCount > 0 && DebugLevel > 2 { fmt.Printf("%d existing posts have been updated from %s\n", updatedPostsCount, geturl) } if insertedPostsCount > 0 && DebugLevel > 2 { fmt.Printf("%d new posts have been inserted from %s\n", insertedPostsCount, geturl) } return }