func HackerNewsPostScraper(sub string) (err error) {
	//drivername := "postgres"
	//dsn := "user=golang password=golang dbname=golang sslmode=disable"
	//dialect := gorp.PostgresDialect{}

	drivername := "mysql"
	dsn := "golang:golang@/golang?parseTime=true"
	dialect := gorp.MySQLDialect{"InnoDB", "utf8mb4"}

	// connect to db using standard Go database/sql API
	db, err := sql.Open(drivername, dsn)
	if err != nil {
		return errors.New("sql.Open failed: " + err.Error())
	}

	// Open doesn't open a connection. Validate DSN data using ping
	if err = db.Ping(); err != nil {
		return errors.New("db.Ping failed: " + err.Error())
	}

	// Set the connection to use utf8bmb4
	if dialect.Engine == "InnoDB" {
		fmt.Println("Setting connection to utf8mb4")
		_, err = db.Exec("SET NAMES utf8mb4 COLLATE utf8mb4_general_ci")
		if err != nil {
			return errors.New("SET NAMES utf8mb4 COLLATE utf8mb4_general_ci: " + err.Error())
		}
	}

	// construct a gorp DbMap
	dbmap := &gorp.DbMap{Db: db, Dialect: dialect}
	defer dbmap.Db.Close()
	dbmap.DebugLevel = DebugLevel
	// Will log all SQL statements + args as they are run
	// The first arg is a string prefix to prepend to all log messages
	//dbmap.TraceOn("[gorp]", log.New(os.Stdout, "Trace:", log.Lmicroseconds))

	// register the structs you wish to use with gorp
	// you can also use the shorter dbmap.AddTable() if you
	// don't want to override the table name

	// SetKeys(true) means we have a auto increment primary key, which
	// will get automatically bound to your struct post-insert
	table := dbmap.AddTableWithName(post.Post{}, "posts_index_test")
	table.SetKeys(true, "PID")

	// Add the comments table
	table = dbmap.AddTableWithName(post.Comment{}, "comments_index_test")
	table.SetKeys(true, "Id")

	// create the table. in a production system you'd generally
	// use a migration tool, or create the tables via scripts
	if err = dbmap.CreateTablesIfNotExists(); err != nil {
		return errors.New("Create tables failed: " + err.Error())
	}

	// Force create all indexes for this database
	if err = dbmap.CreateIndexes(); err != nil {
		return errors.New("Create indexes failed: " + err.Error())
	}

	// Get data from hackernews
	geturl := "http://news.ycombinator.com/" + sub
	// DEBUG for a special thread
	//geturl := "https://news.ycombinator.com/item?id=10056146"
	body, err := GetHtmlBody(geturl)
	if err != nil {
		return errors.New("GetHtmlBody: " + err.Error())
	}

	// Create a new post slice and then parse the response body into ps
	ps := make([]*post.Post, 0)
	//cs := make([]*post.Comment, 0)
	ps, err = ParseHtmlHackerNews(body, ps)
	if err != nil {
		return errors.New("ParseHtmlHackerNews: " + err.Error())
	}

	// Number of updated posts
	var updatedPostsCount int64
	// Number of new posts
	var insertedPostsCount int64

	var insertedPostsCommentCount int64
	var updatedPostsCommentCount int64

	// Number of post parsing errors
	var htmlPostErrorCount uint32
	// Number of comment parsing errors
	var htmlCommentErrorCount uint32

	// loop over all parsed posts
	for _, htmlpost := range ps {

		if htmlpost.WebPostId == "" {
			if DebugLevel > 1 {
				fmt.Printf("WebPostId not set in %s\n", htmlpost.Title)
			}
			// Fail early, continue with next post
			continue
		}

		if htmlpost.Err != nil {
			if DebugLevel > 1 {
				fmt.Println("Single post error in " + geturl + ": " + htmlpost.Err.Error())
			}
			// Fail early, continue with next post
			htmlPostErrorCount++
			continue
		}

		if len(htmlpost.CommentParseErrors) > 0 {
			for _, c := range htmlpost.CommentParseErrors {
				htmlCommentErrorCount++
				if DebugLevel > 2 {
					fmt.Println("Single comment error in '" + geturl + "' for WebPostId '" + htmlpost.WebPostId + ": " + c.Err.Error())
				}

			}
		}

		// Store post sub
		htmlpost.PostSub = sub

		tm, err := dbmap.TableFor(reflect.TypeOf(*htmlpost), true)
		if err != nil {
			return errors.New("Failed to get reflection type: " + err.Error())
		}
		if DebugLevel > 3 {
			fmt.Println("TABLEMAP: " + tm.TableName)
		}
		// check if post already exists
		dbposts := make([]post.Post, 0)
		getpostsql := "select * from " + dbmap.Dialect.QuotedTableForQuery("", tm.TableName) + " where WebPostId = :post_id"
		_, err = dbmap.Select(&dbposts, getpostsql, map[string]interface{}{
			"post_id": htmlpost.WebPostId,
		})
		if err != nil {
			return errors.New(fmt.Sprintf("Getting PostId %s from DB failed: %s", htmlpost.WebPostId, err.Error()))
		}
		var dbpost *post.Post
		if len(dbposts) == 1 {
			dbpost = &dbposts[0]
		} else if len(dbposts) > 1 {
			return errors.New(fmt.Sprintf("Query: %s returned %d rows", getpostsql, len(dbposts)))
		}
		postcount := len(dbposts)

		// New post? then insert
		if postcount == 0 {

			if DebugLevel > 2 {
				fmt.Printf("New post found, inserting htmlpost.WebPostId '%s'\n", htmlpost.WebPostId)
			}

			// Reset the rowcount info
			dbmap.LastOpInfo.Reset()
			htmlpost.CommentCount = uint64(len(htmlpost.Comments))
			// Insert the new post into the database
			err = dbmap.InsertWithChilds(htmlpost)

			if DebugLevel > 2 {
				// Print out the crawled info
				fmt.Println("----------- INSERT POST START -----------------")
				fmt.Println(htmlpost.String("INSERT: "))
			}
			if err != nil {
				return errors.New("insert into table " + dbmap.Dialect.QuoteField(tm.TableName) + " failed: " + err.Error())
			}
			if DebugLevel > 2 {
				// Print out the end of the crawled info
				fmt.Println("----------- INSERT POST END -------------------")
			}
			insertedPostsCount += dbmap.LastOpInfo.RowCount
			insertedPostsCommentCount += dbmap.LastOpInfo.ChildInsertRowCount

		} else {
			// Post already exists, get the full post with its comments from the db

			res, err := dbmap.GetWithChilds(post.Post{}, 9999999999, 0, dbpost.Id)
			if err != nil {
				return errors.New("get failed: " + err.Error())
			}
			if res == nil {
				return errors.New(fmt.Sprintf("Get post for id %d did not return any rows ", dbpost.Id))
			}
			dbpost = res.(*post.Post)

			// Check if an update is needed
			var updateNeeded bool
			updateNeeded, err = AddUpdatableChilds(htmlpost, dbpost, dbmap)
			if err != nil {
				return errors.New(fmt.Sprintf("CheckIfDataChanged for post '%s' failed: %s", htmlpost.WebPostId, err.Error()))
			}
			//if htmlpost.Score != dbpost.Score {
			if updateNeeded {
				// The post changed, do an update into the database

				//fmt.Println("Post Date db: " + dbpost.PostDate.String() + ", html: " + htmlpost.PostDate.String())
				//fmt.Printf("Post Score db: %d, html: %d\n", dbpost.Score, htmlpost.Score)

				if DebugLevel > 2 {
					fmt.Println("----------- UPDATE POST START -----------------")
					fmt.Println(dbpost.String("UPDATE1: "))
					fmt.Printf("From score %d to score %d\n", dbpost.Score, htmlpost.Score)
				}
				dbpost.Score = htmlpost.Score
				dbpost.PostDate = htmlpost.PostDate

				// Reset the rowcount info
				dbmap.LastOpInfo.Reset()

				// Update the posts together with its comments
				affectedrows, err := dbmap.UpdateWithChilds(dbpost)

				switch {
				case err != nil:
					return errors.New("update table " + tm.TableName + " failed: " + err.Error())
				case affectedrows == 0:
					return errors.New(fmt.Sprintf("update table %s for Id %d did not affect any lines", tm.TableName, dbpost.Id))
				default:

					updatedPostsCount += dbmap.LastOpInfo.RowCount
					insertedPostsCommentCount += dbmap.LastOpInfo.ChildInsertRowCount
					updatedPostsCommentCount += dbmap.LastOpInfo.ChildUpdateRowCount

					dbpost.CommentCount += uint64(dbmap.LastOpInfo.ChildInsertRowCount)
					_, err = dbmap.Update(dbpost)

					if err != nil {
						return errors.New(fmt.Sprintf("Update for post '%s' failed: %s", dbpost.WebPostId, err.Error()))
					}

					if DebugLevel > 2 {
						// Print out the update info
						fmt.Println("----------- UPDATE POST COMMIT -----------------")
						fmt.Println(dbpost.String("UPDATE2: "))
						fmt.Println("----------- UPDATE POST END -------------------")
					}
				}
			}

		}
	}
	if insertedPostsCount == 0 && updatedPostsCount == 0 {
		if DebugLevel > 2 {
			fmt.Println("No new posts found at " + geturl)
		}
		return
	}

	if DebugLevel > 2 {
		fmt.Printf("%d new posts have been inserted from %s\n", insertedPostsCount, geturl)
		fmt.Printf("%d posts have been updated from %s\n", updatedPostsCount, geturl)
		fmt.Printf("%d new comments have been inserted from %s\n", insertedPostsCommentCount, geturl)
		fmt.Printf("%d comments have been updated from %s\n", updatedPostsCommentCount, geturl)
		fmt.Printf("%d comment errors\n", htmlCommentErrorCount)

	}

	return
}
Exemplo n.º 2
0
func RedditPostScraper(sub string) (err error) {
	//drivername := "postgres"
	//dsn := "user=golang password=golang dbname=golang sslmode=disable"
	//dialect := gorp.PostgresDialect{}

	drivername := "mysql"
	dsn := "golang:golang@/golang?parseTime=true"
	dialect := gorp.MySQLDialect{"InnoDB", "UTF8"}

	// connect to db using standard Go database/sql API
	db, err := sql.Open(drivername, dsn)
	if err != nil {
		return errors.New("sql.Open failed: " + err.Error())
	}

	// Open doesn't open a connection. Validate DSN data:
	if err = db.Ping(); err != nil {
		return errors.New("db.Ping failed: " + err.Error())
	}

	// construct a gorp DbMap
	dbmap := &gorp.DbMap{Db: db, Dialect: dialect}
	defer dbmap.Db.Close()
	dbmap.DebugLevel = DebugLevel
	// Will log all SQL statements + args as they are run
	// The first arg is a string prefix to prepend to all log messages
	//dbmap.TraceOn("[gorp]", log.New(os.Stdout, "fetch:", log.Lmicroseconds))

	// register the structs you wish to use with gorp
	// you can also use the shorter dbmap.AddTable() if you
	// don't want to override the table name
	tablename := "posts_index_test"
	// SetKeys(true) means we have a auto increment primary key, which
	// will get automatically bound to your struct post-insert
	table := dbmap.AddTableWithName(post.Post{}, tablename)
	table.SetKeys(true, "PID")

	// create the table. in a production system you'd generally
	// use a migration tool, or create the tables via scripts
	if err = dbmap.CreateTablesIfNotExists(); err != nil {
		return errors.New("Create tables failed: " + err.Error())
	}

	// Force create all indexes for this database
	if err = dbmap.CreateIndexes(); err != nil {
		return errors.New("Create indexes failed: " + err.Error())
	}

	// Get data from reddit
	geturl := "http://www.reddit.com/r/" + sub + "/new"
	resp, err := http.Get(geturl)
	if err != nil {
		return errors.New("Failed to http.Get from " + geturl + ": " + err.Error())
	}
	if resp != nil {
		if resp.Body == nil {
			return errors.New("Body from " + geturl + " is nil!")
		} else {
			defer resp.Body.Close()
		}
	} else {
		return errors.New("Response from " + geturl + " is nil!")
	}
	if resp.StatusCode != 200 { // 200 = OK
		httperr := fmt.Sprintf("Failed to http.Get from %s: Http Status code: %d: Msg: %s", geturl, resp.StatusCode, resp.Status)
		return errors.New(httperr)
	}

	// Create a new post slice and then parse the response body into ps
	ps := make([]post.Post, 0)
	ps, err = ParseHtmlReddit(resp.Body, ps)
	if err != nil {
		return errors.New("Error in RedditParseHtml: " + geturl + ": " + err.Error())
	}
	foundnewposts := false
	updatedposts := 0

	// insert rows - auto increment PKs will be set properly after the insert
	for _, htmlpost := range ps {
		if htmlpost.Err == nil {
			var postcount int

			// Store reddit sub
			htmlpost.PostSub = sub

			// check if post already exists
			intSelectResult := make([]int, 0)
			postcountsql := "select count(*) from " + dbmap.Dialect.QuoteField(tablename) +
				" where WebPostId = :post_id"
			_, err := dbmap.Select(&intSelectResult, postcountsql, map[string]interface{}{
				"post_id": htmlpost.WebPostId,
			})
			if err != nil {
				return errors.New(fmt.Sprintf("Query: %s failed: %s\n", postcountsql, err.Error()))
			}
			if len(intSelectResult) == 0 {
				return errors.New(fmt.Sprintf("Query: %s returned no result\n", postcountsql))
			}
			postcount = intSelectResult[0]

			// DEBUG
			if DebugLevel > 3 {
				fmt.Println("HTMLpost.WebPostId: " + htmlpost.WebPostId)
				fmt.Printf("HTMLpost.Id: %v\n", htmlpost.Id)
				fmt.Printf("DBpost count: %v \n", postcount)
			}

			// New post? then insert
			if postcount == 0 {
				foundnewposts = true
				err = dbmap.Insert(&htmlpost)

				if DebugLevel > 2 {
					// Print out the crawled info
					fmt.Println("----------- INSERT POST START -----------------")
					fmt.Println(htmlpost.String())
				}
				if err != nil {
					return errors.New("insert into table " + dbmap.Dialect.QuoteField(tablename) + " failed: " + err.Error())
				}
				if DebugLevel > 2 {
					// Print out the end of the crawled info
					fmt.Println("----------- INSERT POST END -------------------")
				}
			} else {
				// Post already exists, do an update
				dbposts := make([]post.Post, 0)
				getpostsql := "select * from " + dbmap.Dialect.QuoteField(tablename) + " where WebPostId = :post_id"
				_, err := dbmap.Select(&dbposts, getpostsql, map[string]interface{}{
					"post_id": htmlpost.WebPostId,
				})
				if err != nil {
					return errors.New(fmt.Sprintf("Getting WebPostId %s from DB failes\n", htmlpost.WebPostId, err.Error()))
				}
				var dbpost post.Post
				if len(dbposts) > 0 {
					dbpost = dbposts[0]
				} else {
					return errors.New(fmt.Sprintf("Query: %s returned no result\n", getpostsql))
				}
				// DEBUG
				if DebugLevel > 3 {
					fmt.Printf("DBPOST: %s\n", dbpost.String())
					fmt.Printf("DBpost.Id: %v\n", dbpost.Id)
					fmt.Printf("DBpost.Score: %v\n", dbpost.Score)
				}

				if htmlpost.Score != dbpost.Score {

					if DebugLevel > 2 {
						// Print out the update info
						fmt.Println("----------- UPDATE POST START -----------------")
						fmt.Println("Title: " + dbpost.Title)
						fmt.Printf("Id: %v\n", dbpost.Id)
						fmt.Printf("From score %d to score %d\n", dbpost.Score, htmlpost.Score)
						fmt.Println("----------- UPDATE POST END -------------------")
					}

					dbpost.Score = htmlpost.Score
					affectedrows, err := dbmap.Update(&dbpost)
					switch {
					case err != nil:
						return errors.New("update table " + tablename + " failed: " + err.Error())
					case affectedrows == 0:
						return errors.New(fmt.Sprintf("update table %s for Id %d did not affect any lines", tablename, dbpost.Id))
					default:
						updatedposts++
					}
				}
			}
		} else {
			if DebugLevel > 1 {
				fmt.Println("Single post error in " + geturl + ": " + htmlpost.Err.Error())
			}
		}
	}
	if !foundnewposts {
		if DebugLevel > 2 {
			fmt.Println("No new posts found at " + geturl)
		}
	}

	if updatedposts > 0 {
		if DebugLevel > 2 {
			fmt.Printf("%d posts have been updated from %s\n", updatedposts, geturl)
		}
	}

	return
}
Exemplo n.º 3
0
func HackerNewsPostScraper(sub string) (err error) {
	//drivername := "postgres"
	//dsn := "user=golang password=golang dbname=golang sslmode=disable"
	//dialect := gorp.PostgresDialect{}

	drivername := "mysql"
	dsn := "golang:golang@/golang?parseTime=true"
	dialect := gorp.MySQLDialect{"InnoDB", "UTF8"}

	// connect to db using standard Go database/sql API
	db, err := sql.Open(drivername, dsn)
	if err != nil {
		return errors.New("sql.Open failed: " + err.Error())
	}

	// Open doesn't open a connection. Validate DSN data using ping
	if err = db.Ping(); err != nil {
		return errors.New("db.Ping failed: " + err.Error())
	}

	// construct a gorp DbMap
	dbmap := &gorp.DbMap{Db: db, Dialect: dialect}
	defer dbmap.Db.Close()
	dbmap.DebugLevel = DebugLevel
	// Will log all SQL statements + args as they are run
	// The first arg is a string prefix to prepend to all log messages
	//dbmap.TraceOn("[gorp]", log.New(os.Stdout, "fetch:", log.Lmicroseconds))

	// register the structs you wish to use with gorp
	// you can also use the shorter dbmap.AddTable() if you
	// don't want to override the table name
	tablename := "posts_index_test"
	// SetKeys(true) means we have a auto increment primary key, which
	// will get automatically bound to your struct post-insert
	table := dbmap.AddTableWithName(post.Post{}, tablename)
	table.SetKeys(true, "PID")

	// create the table. in a production system you'd generally
	// use a migration tool, or create the tables via scripts
	if err = dbmap.CreateTablesIfNotExists(); err != nil {
		return errors.New("Create tables failed: " + err.Error())
	}

	// Force create all indexes for this database
	if err = dbmap.CreateIndexes(); err != nil {
		return errors.New("Create indexes failed: " + err.Error())
	}

	// Get data from hackernews
	geturl := "http://news.ycombinator.com/"
	body, err := GetHtmlBody(geturl)
	if err != nil {
		return errors.New("GetHtmlBody: " + err.Error())
	}

	// Create a new post slice and then parse the response body into ps
	ps := make([]*post.Post, 0)
	ps, err = ParseHtmlHackerNews(body, ps)
	if err != nil {
		return errors.New("ParseHtmlHackerNews: " + err.Error())
	}

	// Number of updated posts
	var updatedPostsCount uint32
	// Number of new posts
	var insertedPostsCount uint32

	// insert rows - auto increment PKs will be set properly after the insert
	for _, htmlpost := range ps {

		if htmlpost.WebPostId == "" {
			if DebugLevel > 1 {
				fmt.Printf("WebPostId not set in %s\n", htmlpost.Title)
			}
			// Fail early, continue with next post
			continue
		}

		if htmlpost.Err != nil {
			if DebugLevel > 1 {
				fmt.Println("Single post error in " + geturl + ": " + htmlpost.Err.Error())
			}
			// Fail early, continue with next post
			continue
		}

		// Store post sub
		htmlpost.PostSub = sub

		// check if post already exists
		intSelectResult := make([]int, 0)
		postcountsql := "select count(*) from " + dbmap.Dialect.QuoteField(tablename) +
			" where WebPostId = :post_id"
		_, err := dbmap.Select(&intSelectResult, postcountsql, map[string]interface{}{
			"post_id": htmlpost.WebPostId,
		})
		if err != nil {
			return errors.New(fmt.Sprintf("Query: %s failed: %s\n", postcountsql, err.Error()))
		}
		if len(intSelectResult) == 0 {
			return errors.New(fmt.Sprintf("Query: %s returned no result\n", postcountsql))
		}
		postcount := intSelectResult[0]

		// New post? then insert
		if postcount == 0 {

			// Insert the new post into the database
			err = dbmap.Insert(htmlpost)

			if DebugLevel > 2 {
				// Print out the crawled info
				fmt.Println("----------- INSERT POST START -----------------")
				fmt.Println(htmlpost.String())
			}
			if err != nil {
				return errors.New("insert into table " + dbmap.Dialect.QuoteField(tablename) + " failed: " + err.Error())
			}
			if DebugLevel > 2 {
				// Print out the end of the crawled info
				fmt.Println("----------- INSERT POST END -------------------")
			}
			insertedPostsCount++

		} else {
			// Post already exists, do an update
			// Create a slice of posts to select into
			dbposts := make([]post.Post, 0)
			getpostsql := "select * from " + dbmap.Dialect.QuoteField(tablename) + " where WebPostId = :post_id"
			_, err := dbmap.Select(&dbposts, getpostsql, map[string]interface{}{
				"post_id": htmlpost.WebPostId,
			})
			if err != nil {
				return errors.New(fmt.Sprintf("Getting WebPostId %s from DB failed: %s\n", htmlpost.WebPostId, err.Error()))
			}
			var dbpost post.Post
			if len(dbposts) > 0 {
				dbpost = dbposts[0]
			} else {
				return errors.New(fmt.Sprintf("Query: %s returned no result\n", getpostsql))
			}

			if htmlpost.Score != dbpost.Score {
				// The post score changed, do an update into the database

				//fmt.Println("Post Date db: " + dbpost.PostDate.String() + ", html: " + htmlpost.PostDate.String())
				//fmt.Printf("Post Score db: %d, html: %d\n", dbpost.Score, htmlpost.Score)

				if DebugLevel > 2 {
					fmt.Println("----------- UPDATE POST START -----------------")
					fmt.Println(dbpost.String())
					fmt.Printf("From score %d to score %d\n", dbpost.Score, htmlpost.Score)
				}
				dbpost.Score = htmlpost.Score
				dbpost.PostDate = htmlpost.PostDate
				affectedrows, err := dbmap.Update(&dbpost)
				switch {
				case err != nil:
					return errors.New("update table " + tablename + " failed: " + err.Error())
				case affectedrows == 0:
					return errors.New(fmt.Sprintf("update table %s for Id %d did not affect any lines", tablename, dbpost.Id))
				default:
					updatedPostsCount++
					if DebugLevel > 2 {
						// Print out the update info
						fmt.Println("----------- UPDATE POST COMMIT -----------------")
						fmt.Println(dbpost.String())
						fmt.Println("----------- UPDATE POST END -------------------")
					}
				}
			}

		}
	}
	if insertedPostsCount == 0 && updatedPostsCount == 0 {
		if DebugLevel > 2 {
			fmt.Println("No new posts found at " + geturl)
		}
	}

	if updatedPostsCount > 0 && DebugLevel > 2 {
		fmt.Printf("%d existing posts have been updated from %s\n", updatedPostsCount, geturl)
	}

	if insertedPostsCount > 0 && DebugLevel > 2 {
		fmt.Printf("%d new posts have been inserted from %s\n", insertedPostsCount, geturl)
	}

	return
}