func HackerNewsPostScraper(sub string) (err error) { //drivername := "postgres" //dsn := "user=golang password=golang dbname=golang sslmode=disable" //dialect := gorp.PostgresDialect{} drivername := "mysql" dsn := "golang:golang@/golang?parseTime=true" dialect := gorp.MySQLDialect{"InnoDB", "utf8mb4"} // connect to db using standard Go database/sql API db, err := sql.Open(drivername, dsn) if err != nil { return errors.New("sql.Open failed: " + err.Error()) } // Open doesn't open a connection. Validate DSN data using ping if err = db.Ping(); err != nil { return errors.New("db.Ping failed: " + err.Error()) } // Set the connection to use utf8bmb4 if dialect.Engine == "InnoDB" { fmt.Println("Setting connection to utf8mb4") _, err = db.Exec("SET NAMES utf8mb4 COLLATE utf8mb4_general_ci") if err != nil { return errors.New("SET NAMES utf8mb4 COLLATE utf8mb4_general_ci: " + err.Error()) } } // construct a gorp DbMap dbmap := &gorp.DbMap{Db: db, Dialect: dialect} defer dbmap.Db.Close() dbmap.DebugLevel = DebugLevel // Will log all SQL statements + args as they are run // The first arg is a string prefix to prepend to all log messages //dbmap.TraceOn("[gorp]", log.New(os.Stdout, "Trace:", log.Lmicroseconds)) // register the structs you wish to use with gorp // you can also use the shorter dbmap.AddTable() if you // don't want to override the table name // SetKeys(true) means we have a auto increment primary key, which // will get automatically bound to your struct post-insert table := dbmap.AddTableWithName(post.Post{}, "posts_index_test") table.SetKeys(true, "PID") // Add the comments table table = dbmap.AddTableWithName(post.Comment{}, "comments_index_test") table.SetKeys(true, "Id") // create the table. in a production system you'd generally // use a migration tool, or create the tables via scripts if err = dbmap.CreateTablesIfNotExists(); err != nil { return errors.New("Create tables failed: " + err.Error()) } // Force create all indexes for this database if err = dbmap.CreateIndexes(); err != nil { return errors.New("Create indexes failed: " + err.Error()) } // Get data from hackernews geturl := "http://news.ycombinator.com/" + sub // DEBUG for a special thread //geturl := "https://news.ycombinator.com/item?id=10056146" body, err := GetHtmlBody(geturl) if err != nil { return errors.New("GetHtmlBody: " + err.Error()) } // Create a new post slice and then parse the response body into ps ps := make([]*post.Post, 0) //cs := make([]*post.Comment, 0) ps, err = ParseHtmlHackerNews(body, ps) if err != nil { return errors.New("ParseHtmlHackerNews: " + err.Error()) } // Number of updated posts var updatedPostsCount int64 // Number of new posts var insertedPostsCount int64 var insertedPostsCommentCount int64 var updatedPostsCommentCount int64 // Number of post parsing errors var htmlPostErrorCount uint32 // Number of comment parsing errors var htmlCommentErrorCount uint32 // loop over all parsed posts for _, htmlpost := range ps { if htmlpost.WebPostId == "" { if DebugLevel > 1 { fmt.Printf("WebPostId not set in %s\n", htmlpost.Title) } // Fail early, continue with next post continue } if htmlpost.Err != nil { if DebugLevel > 1 { fmt.Println("Single post error in " + geturl + ": " + htmlpost.Err.Error()) } // Fail early, continue with next post htmlPostErrorCount++ continue } if len(htmlpost.CommentParseErrors) > 0 { for _, c := range htmlpost.CommentParseErrors { htmlCommentErrorCount++ if DebugLevel > 2 { fmt.Println("Single comment error in '" + geturl + "' for WebPostId '" + htmlpost.WebPostId + ": " + c.Err.Error()) } } } // Store post sub htmlpost.PostSub = sub tm, err := dbmap.TableFor(reflect.TypeOf(*htmlpost), true) if err != nil { return errors.New("Failed to get reflection type: " + err.Error()) } if DebugLevel > 3 { fmt.Println("TABLEMAP: " + tm.TableName) } // check if post already exists dbposts := make([]post.Post, 0) getpostsql := "select * from " + dbmap.Dialect.QuotedTableForQuery("", tm.TableName) + " where WebPostId = :post_id" _, err = dbmap.Select(&dbposts, getpostsql, map[string]interface{}{ "post_id": htmlpost.WebPostId, }) if err != nil { return errors.New(fmt.Sprintf("Getting PostId %s from DB failed: %s", htmlpost.WebPostId, err.Error())) } var dbpost *post.Post if len(dbposts) == 1 { dbpost = &dbposts[0] } else if len(dbposts) > 1 { return errors.New(fmt.Sprintf("Query: %s returned %d rows", getpostsql, len(dbposts))) } postcount := len(dbposts) // New post? then insert if postcount == 0 { if DebugLevel > 2 { fmt.Printf("New post found, inserting htmlpost.WebPostId '%s'\n", htmlpost.WebPostId) } // Reset the rowcount info dbmap.LastOpInfo.Reset() htmlpost.CommentCount = uint64(len(htmlpost.Comments)) // Insert the new post into the database err = dbmap.InsertWithChilds(htmlpost) if DebugLevel > 2 { // Print out the crawled info fmt.Println("----------- INSERT POST START -----------------") fmt.Println(htmlpost.String("INSERT: ")) } if err != nil { return errors.New("insert into table " + dbmap.Dialect.QuoteField(tm.TableName) + " failed: " + err.Error()) } if DebugLevel > 2 { // Print out the end of the crawled info fmt.Println("----------- INSERT POST END -------------------") } insertedPostsCount += dbmap.LastOpInfo.RowCount insertedPostsCommentCount += dbmap.LastOpInfo.ChildInsertRowCount } else { // Post already exists, get the full post with its comments from the db res, err := dbmap.GetWithChilds(post.Post{}, 9999999999, 0, dbpost.Id) if err != nil { return errors.New("get failed: " + err.Error()) } if res == nil { return errors.New(fmt.Sprintf("Get post for id %d did not return any rows ", dbpost.Id)) } dbpost = res.(*post.Post) // Check if an update is needed var updateNeeded bool updateNeeded, err = AddUpdatableChilds(htmlpost, dbpost, dbmap) if err != nil { return errors.New(fmt.Sprintf("CheckIfDataChanged for post '%s' failed: %s", htmlpost.WebPostId, err.Error())) } //if htmlpost.Score != dbpost.Score { if updateNeeded { // The post changed, do an update into the database //fmt.Println("Post Date db: " + dbpost.PostDate.String() + ", html: " + htmlpost.PostDate.String()) //fmt.Printf("Post Score db: %d, html: %d\n", dbpost.Score, htmlpost.Score) if DebugLevel > 2 { fmt.Println("----------- UPDATE POST START -----------------") fmt.Println(dbpost.String("UPDATE1: ")) fmt.Printf("From score %d to score %d\n", dbpost.Score, htmlpost.Score) } dbpost.Score = htmlpost.Score dbpost.PostDate = htmlpost.PostDate // Reset the rowcount info dbmap.LastOpInfo.Reset() // Update the posts together with its comments affectedrows, err := dbmap.UpdateWithChilds(dbpost) switch { case err != nil: return errors.New("update table " + tm.TableName + " failed: " + err.Error()) case affectedrows == 0: return errors.New(fmt.Sprintf("update table %s for Id %d did not affect any lines", tm.TableName, dbpost.Id)) default: updatedPostsCount += dbmap.LastOpInfo.RowCount insertedPostsCommentCount += dbmap.LastOpInfo.ChildInsertRowCount updatedPostsCommentCount += dbmap.LastOpInfo.ChildUpdateRowCount dbpost.CommentCount += uint64(dbmap.LastOpInfo.ChildInsertRowCount) _, err = dbmap.Update(dbpost) if err != nil { return errors.New(fmt.Sprintf("Update for post '%s' failed: %s", dbpost.WebPostId, err.Error())) } if DebugLevel > 2 { // Print out the update info fmt.Println("----------- UPDATE POST COMMIT -----------------") fmt.Println(dbpost.String("UPDATE2: ")) fmt.Println("----------- UPDATE POST END -------------------") } } } } } if insertedPostsCount == 0 && updatedPostsCount == 0 { if DebugLevel > 2 { fmt.Println("No new posts found at " + geturl) } return } if DebugLevel > 2 { fmt.Printf("%d new posts have been inserted from %s\n", insertedPostsCount, geturl) fmt.Printf("%d posts have been updated from %s\n", updatedPostsCount, geturl) fmt.Printf("%d new comments have been inserted from %s\n", insertedPostsCommentCount, geturl) fmt.Printf("%d comments have been updated from %s\n", updatedPostsCommentCount, geturl) fmt.Printf("%d comment errors\n", htmlCommentErrorCount) } return }
func RedditPostScraper(sub string) (err error) { //drivername := "postgres" //dsn := "user=golang password=golang dbname=golang sslmode=disable" //dialect := gorp.PostgresDialect{} drivername := "mysql" dsn := "golang:golang@/golang?parseTime=true" dialect := gorp.MySQLDialect{"InnoDB", "UTF8"} // connect to db using standard Go database/sql API db, err := sql.Open(drivername, dsn) if err != nil { return errors.New("sql.Open failed: " + err.Error()) } // Open doesn't open a connection. Validate DSN data: if err = db.Ping(); err != nil { return errors.New("db.Ping failed: " + err.Error()) } // construct a gorp DbMap dbmap := &gorp.DbMap{Db: db, Dialect: dialect} defer dbmap.Db.Close() dbmap.DebugLevel = DebugLevel // Will log all SQL statements + args as they are run // The first arg is a string prefix to prepend to all log messages //dbmap.TraceOn("[gorp]", log.New(os.Stdout, "fetch:", log.Lmicroseconds)) // register the structs you wish to use with gorp // you can also use the shorter dbmap.AddTable() if you // don't want to override the table name tablename := "posts_index_test" // SetKeys(true) means we have a auto increment primary key, which // will get automatically bound to your struct post-insert table := dbmap.AddTableWithName(post.Post{}, tablename) table.SetKeys(true, "PID") // create the table. in a production system you'd generally // use a migration tool, or create the tables via scripts if err = dbmap.CreateTablesIfNotExists(); err != nil { return errors.New("Create tables failed: " + err.Error()) } // Force create all indexes for this database if err = dbmap.CreateIndexes(); err != nil { return errors.New("Create indexes failed: " + err.Error()) } // Get data from reddit geturl := "http://www.reddit.com/r/" + sub + "/new" resp, err := http.Get(geturl) if err != nil { return errors.New("Failed to http.Get from " + geturl + ": " + err.Error()) } if resp != nil { if resp.Body == nil { return errors.New("Body from " + geturl + " is nil!") } else { defer resp.Body.Close() } } else { return errors.New("Response from " + geturl + " is nil!") } if resp.StatusCode != 200 { // 200 = OK httperr := fmt.Sprintf("Failed to http.Get from %s: Http Status code: %d: Msg: %s", geturl, resp.StatusCode, resp.Status) return errors.New(httperr) } // Create a new post slice and then parse the response body into ps ps := make([]post.Post, 0) ps, err = ParseHtmlReddit(resp.Body, ps) if err != nil { return errors.New("Error in RedditParseHtml: " + geturl + ": " + err.Error()) } foundnewposts := false updatedposts := 0 // insert rows - auto increment PKs will be set properly after the insert for _, htmlpost := range ps { if htmlpost.Err == nil { var postcount int // Store reddit sub htmlpost.PostSub = sub // check if post already exists intSelectResult := make([]int, 0) postcountsql := "select count(*) from " + dbmap.Dialect.QuoteField(tablename) + " where WebPostId = :post_id" _, err := dbmap.Select(&intSelectResult, postcountsql, map[string]interface{}{ "post_id": htmlpost.WebPostId, }) if err != nil { return errors.New(fmt.Sprintf("Query: %s failed: %s\n", postcountsql, err.Error())) } if len(intSelectResult) == 0 { return errors.New(fmt.Sprintf("Query: %s returned no result\n", postcountsql)) } postcount = intSelectResult[0] // DEBUG if DebugLevel > 3 { fmt.Println("HTMLpost.WebPostId: " + htmlpost.WebPostId) fmt.Printf("HTMLpost.Id: %v\n", htmlpost.Id) fmt.Printf("DBpost count: %v \n", postcount) } // New post? then insert if postcount == 0 { foundnewposts = true err = dbmap.Insert(&htmlpost) if DebugLevel > 2 { // Print out the crawled info fmt.Println("----------- INSERT POST START -----------------") fmt.Println(htmlpost.String()) } if err != nil { return errors.New("insert into table " + dbmap.Dialect.QuoteField(tablename) + " failed: " + err.Error()) } if DebugLevel > 2 { // Print out the end of the crawled info fmt.Println("----------- INSERT POST END -------------------") } } else { // Post already exists, do an update dbposts := make([]post.Post, 0) getpostsql := "select * from " + dbmap.Dialect.QuoteField(tablename) + " where WebPostId = :post_id" _, err := dbmap.Select(&dbposts, getpostsql, map[string]interface{}{ "post_id": htmlpost.WebPostId, }) if err != nil { return errors.New(fmt.Sprintf("Getting WebPostId %s from DB failes\n", htmlpost.WebPostId, err.Error())) } var dbpost post.Post if len(dbposts) > 0 { dbpost = dbposts[0] } else { return errors.New(fmt.Sprintf("Query: %s returned no result\n", getpostsql)) } // DEBUG if DebugLevel > 3 { fmt.Printf("DBPOST: %s\n", dbpost.String()) fmt.Printf("DBpost.Id: %v\n", dbpost.Id) fmt.Printf("DBpost.Score: %v\n", dbpost.Score) } if htmlpost.Score != dbpost.Score { if DebugLevel > 2 { // Print out the update info fmt.Println("----------- UPDATE POST START -----------------") fmt.Println("Title: " + dbpost.Title) fmt.Printf("Id: %v\n", dbpost.Id) fmt.Printf("From score %d to score %d\n", dbpost.Score, htmlpost.Score) fmt.Println("----------- UPDATE POST END -------------------") } dbpost.Score = htmlpost.Score affectedrows, err := dbmap.Update(&dbpost) switch { case err != nil: return errors.New("update table " + tablename + " failed: " + err.Error()) case affectedrows == 0: return errors.New(fmt.Sprintf("update table %s for Id %d did not affect any lines", tablename, dbpost.Id)) default: updatedposts++ } } } } else { if DebugLevel > 1 { fmt.Println("Single post error in " + geturl + ": " + htmlpost.Err.Error()) } } } if !foundnewposts { if DebugLevel > 2 { fmt.Println("No new posts found at " + geturl) } } if updatedposts > 0 { if DebugLevel > 2 { fmt.Printf("%d posts have been updated from %s\n", updatedposts, geturl) } } return }
func HackerNewsPostScraper(sub string) (err error) { //drivername := "postgres" //dsn := "user=golang password=golang dbname=golang sslmode=disable" //dialect := gorp.PostgresDialect{} drivername := "mysql" dsn := "golang:golang@/golang?parseTime=true" dialect := gorp.MySQLDialect{"InnoDB", "UTF8"} // connect to db using standard Go database/sql API db, err := sql.Open(drivername, dsn) if err != nil { return errors.New("sql.Open failed: " + err.Error()) } // Open doesn't open a connection. Validate DSN data using ping if err = db.Ping(); err != nil { return errors.New("db.Ping failed: " + err.Error()) } // construct a gorp DbMap dbmap := &gorp.DbMap{Db: db, Dialect: dialect} defer dbmap.Db.Close() dbmap.DebugLevel = DebugLevel // Will log all SQL statements + args as they are run // The first arg is a string prefix to prepend to all log messages //dbmap.TraceOn("[gorp]", log.New(os.Stdout, "fetch:", log.Lmicroseconds)) // register the structs you wish to use with gorp // you can also use the shorter dbmap.AddTable() if you // don't want to override the table name tablename := "posts_index_test" // SetKeys(true) means we have a auto increment primary key, which // will get automatically bound to your struct post-insert table := dbmap.AddTableWithName(post.Post{}, tablename) table.SetKeys(true, "PID") // create the table. in a production system you'd generally // use a migration tool, or create the tables via scripts if err = dbmap.CreateTablesIfNotExists(); err != nil { return errors.New("Create tables failed: " + err.Error()) } // Force create all indexes for this database if err = dbmap.CreateIndexes(); err != nil { return errors.New("Create indexes failed: " + err.Error()) } // Get data from hackernews geturl := "http://news.ycombinator.com/" body, err := GetHtmlBody(geturl) if err != nil { return errors.New("GetHtmlBody: " + err.Error()) } // Create a new post slice and then parse the response body into ps ps := make([]*post.Post, 0) ps, err = ParseHtmlHackerNews(body, ps) if err != nil { return errors.New("ParseHtmlHackerNews: " + err.Error()) } // Number of updated posts var updatedPostsCount uint32 // Number of new posts var insertedPostsCount uint32 // insert rows - auto increment PKs will be set properly after the insert for _, htmlpost := range ps { if htmlpost.WebPostId == "" { if DebugLevel > 1 { fmt.Printf("WebPostId not set in %s\n", htmlpost.Title) } // Fail early, continue with next post continue } if htmlpost.Err != nil { if DebugLevel > 1 { fmt.Println("Single post error in " + geturl + ": " + htmlpost.Err.Error()) } // Fail early, continue with next post continue } // Store post sub htmlpost.PostSub = sub // check if post already exists intSelectResult := make([]int, 0) postcountsql := "select count(*) from " + dbmap.Dialect.QuoteField(tablename) + " where WebPostId = :post_id" _, err := dbmap.Select(&intSelectResult, postcountsql, map[string]interface{}{ "post_id": htmlpost.WebPostId, }) if err != nil { return errors.New(fmt.Sprintf("Query: %s failed: %s\n", postcountsql, err.Error())) } if len(intSelectResult) == 0 { return errors.New(fmt.Sprintf("Query: %s returned no result\n", postcountsql)) } postcount := intSelectResult[0] // New post? then insert if postcount == 0 { // Insert the new post into the database err = dbmap.Insert(htmlpost) if DebugLevel > 2 { // Print out the crawled info fmt.Println("----------- INSERT POST START -----------------") fmt.Println(htmlpost.String()) } if err != nil { return errors.New("insert into table " + dbmap.Dialect.QuoteField(tablename) + " failed: " + err.Error()) } if DebugLevel > 2 { // Print out the end of the crawled info fmt.Println("----------- INSERT POST END -------------------") } insertedPostsCount++ } else { // Post already exists, do an update // Create a slice of posts to select into dbposts := make([]post.Post, 0) getpostsql := "select * from " + dbmap.Dialect.QuoteField(tablename) + " where WebPostId = :post_id" _, err := dbmap.Select(&dbposts, getpostsql, map[string]interface{}{ "post_id": htmlpost.WebPostId, }) if err != nil { return errors.New(fmt.Sprintf("Getting WebPostId %s from DB failed: %s\n", htmlpost.WebPostId, err.Error())) } var dbpost post.Post if len(dbposts) > 0 { dbpost = dbposts[0] } else { return errors.New(fmt.Sprintf("Query: %s returned no result\n", getpostsql)) } if htmlpost.Score != dbpost.Score { // The post score changed, do an update into the database //fmt.Println("Post Date db: " + dbpost.PostDate.String() + ", html: " + htmlpost.PostDate.String()) //fmt.Printf("Post Score db: %d, html: %d\n", dbpost.Score, htmlpost.Score) if DebugLevel > 2 { fmt.Println("----------- UPDATE POST START -----------------") fmt.Println(dbpost.String()) fmt.Printf("From score %d to score %d\n", dbpost.Score, htmlpost.Score) } dbpost.Score = htmlpost.Score dbpost.PostDate = htmlpost.PostDate affectedrows, err := dbmap.Update(&dbpost) switch { case err != nil: return errors.New("update table " + tablename + " failed: " + err.Error()) case affectedrows == 0: return errors.New(fmt.Sprintf("update table %s for Id %d did not affect any lines", tablename, dbpost.Id)) default: updatedPostsCount++ if DebugLevel > 2 { // Print out the update info fmt.Println("----------- UPDATE POST COMMIT -----------------") fmt.Println(dbpost.String()) fmt.Println("----------- UPDATE POST END -------------------") } } } } } if insertedPostsCount == 0 && updatedPostsCount == 0 { if DebugLevel > 2 { fmt.Println("No new posts found at " + geturl) } } if updatedPostsCount > 0 && DebugLevel > 2 { fmt.Printf("%d existing posts have been updated from %s\n", updatedPostsCount, geturl) } if insertedPostsCount > 0 && DebugLevel > 2 { fmt.Printf("%d new posts have been inserted from %s\n", insertedPostsCount, geturl) } return }