func HackerNewsPostScraper(sub string) (err error) {
	//drivername := "postgres"
	//dsn := "user=golang password=golang dbname=golang sslmode=disable"
	//dialect := gorp.PostgresDialect{}

	drivername := "mysql"
	dsn := "golang:golang@/golang?parseTime=true"
	dialect := gorp.MySQLDialect{"InnoDB", "utf8mb4"}

	// connect to db using standard Go database/sql API
	db, err := sql.Open(drivername, dsn)
	if err != nil {
		return errors.New("sql.Open failed: " + err.Error())
	}

	// Open doesn't open a connection. Validate DSN data using ping
	if err = db.Ping(); err != nil {
		return errors.New("db.Ping failed: " + err.Error())
	}

	// Set the connection to use utf8bmb4
	if dialect.Engine == "InnoDB" {
		fmt.Println("Setting connection to utf8mb4")
		_, err = db.Exec("SET NAMES utf8mb4 COLLATE utf8mb4_general_ci")
		if err != nil {
			return errors.New("SET NAMES utf8mb4 COLLATE utf8mb4_general_ci: " + err.Error())
		}
	}

	// construct a gorp DbMap
	dbmap := &gorp.DbMap{Db: db, Dialect: dialect}
	defer dbmap.Db.Close()
	dbmap.DebugLevel = DebugLevel
	// Will log all SQL statements + args as they are run
	// The first arg is a string prefix to prepend to all log messages
	//dbmap.TraceOn("[gorp]", log.New(os.Stdout, "Trace:", log.Lmicroseconds))

	// register the structs you wish to use with gorp
	// you can also use the shorter dbmap.AddTable() if you
	// don't want to override the table name

	// SetKeys(true) means we have a auto increment primary key, which
	// will get automatically bound to your struct post-insert
	table := dbmap.AddTableWithName(post.Post{}, "posts_index_test")
	table.SetKeys(true, "PID")

	// Add the comments table
	table = dbmap.AddTableWithName(post.Comment{}, "comments_index_test")
	table.SetKeys(true, "Id")

	// create the table. in a production system you'd generally
	// use a migration tool, or create the tables via scripts
	if err = dbmap.CreateTablesIfNotExists(); err != nil {
		return errors.New("Create tables failed: " + err.Error())
	}

	// Force create all indexes for this database
	if err = dbmap.CreateIndexes(); err != nil {
		return errors.New("Create indexes failed: " + err.Error())
	}

	// Get data from hackernews
	geturl := "http://news.ycombinator.com/" + sub
	// DEBUG for a special thread
	//geturl := "https://news.ycombinator.com/item?id=10056146"
	body, err := GetHtmlBody(geturl)
	if err != nil {
		return errors.New("GetHtmlBody: " + err.Error())
	}

	// Create a new post slice and then parse the response body into ps
	ps := make([]*post.Post, 0)
	//cs := make([]*post.Comment, 0)
	ps, err = ParseHtmlHackerNews(body, ps)
	if err != nil {
		return errors.New("ParseHtmlHackerNews: " + err.Error())
	}

	// Number of updated posts
	var updatedPostsCount int64
	// Number of new posts
	var insertedPostsCount int64

	var insertedPostsCommentCount int64
	var updatedPostsCommentCount int64

	// Number of post parsing errors
	var htmlPostErrorCount uint32
	// Number of comment parsing errors
	var htmlCommentErrorCount uint32

	// loop over all parsed posts
	for _, htmlpost := range ps {

		if htmlpost.WebPostId == "" {
			if DebugLevel > 1 {
				fmt.Printf("WebPostId not set in %s\n", htmlpost.Title)
			}
			// Fail early, continue with next post
			continue
		}

		if htmlpost.Err != nil {
			if DebugLevel > 1 {
				fmt.Println("Single post error in " + geturl + ": " + htmlpost.Err.Error())
			}
			// Fail early, continue with next post
			htmlPostErrorCount++
			continue
		}

		if len(htmlpost.CommentParseErrors) > 0 {
			for _, c := range htmlpost.CommentParseErrors {
				htmlCommentErrorCount++
				if DebugLevel > 2 {
					fmt.Println("Single comment error in '" + geturl + "' for WebPostId '" + htmlpost.WebPostId + ": " + c.Err.Error())
				}

			}
		}

		// Store post sub
		htmlpost.PostSub = sub

		tm, err := dbmap.TableFor(reflect.TypeOf(*htmlpost), true)
		if err != nil {
			return errors.New("Failed to get reflection type: " + err.Error())
		}
		if DebugLevel > 3 {
			fmt.Println("TABLEMAP: " + tm.TableName)
		}
		// check if post already exists
		dbposts := make([]post.Post, 0)
		getpostsql := "select * from " + dbmap.Dialect.QuotedTableForQuery("", tm.TableName) + " where WebPostId = :post_id"
		_, err = dbmap.Select(&dbposts, getpostsql, map[string]interface{}{
			"post_id": htmlpost.WebPostId,
		})
		if err != nil {
			return errors.New(fmt.Sprintf("Getting PostId %s from DB failed: %s", htmlpost.WebPostId, err.Error()))
		}
		var dbpost *post.Post
		if len(dbposts) == 1 {
			dbpost = &dbposts[0]
		} else if len(dbposts) > 1 {
			return errors.New(fmt.Sprintf("Query: %s returned %d rows", getpostsql, len(dbposts)))
		}
		postcount := len(dbposts)

		// New post? then insert
		if postcount == 0 {

			if DebugLevel > 2 {
				fmt.Printf("New post found, inserting htmlpost.WebPostId '%s'\n", htmlpost.WebPostId)
			}

			// Reset the rowcount info
			dbmap.LastOpInfo.Reset()
			htmlpost.CommentCount = uint64(len(htmlpost.Comments))
			// Insert the new post into the database
			err = dbmap.InsertWithChilds(htmlpost)

			if DebugLevel > 2 {
				// Print out the crawled info
				fmt.Println("----------- INSERT POST START -----------------")
				fmt.Println(htmlpost.String("INSERT: "))
			}
			if err != nil {
				return errors.New("insert into table " + dbmap.Dialect.QuoteField(tm.TableName) + " failed: " + err.Error())
			}
			if DebugLevel > 2 {
				// Print out the end of the crawled info
				fmt.Println("----------- INSERT POST END -------------------")
			}
			insertedPostsCount += dbmap.LastOpInfo.RowCount
			insertedPostsCommentCount += dbmap.LastOpInfo.ChildInsertRowCount

		} else {
			// Post already exists, get the full post with its comments from the db

			res, err := dbmap.GetWithChilds(post.Post{}, 9999999999, 0, dbpost.Id)
			if err != nil {
				return errors.New("get failed: " + err.Error())
			}
			if res == nil {
				return errors.New(fmt.Sprintf("Get post for id %d did not return any rows ", dbpost.Id))
			}
			dbpost = res.(*post.Post)

			// Check if an update is needed
			var updateNeeded bool
			updateNeeded, err = AddUpdatableChilds(htmlpost, dbpost, dbmap)
			if err != nil {
				return errors.New(fmt.Sprintf("CheckIfDataChanged for post '%s' failed: %s", htmlpost.WebPostId, err.Error()))
			}
			//if htmlpost.Score != dbpost.Score {
			if updateNeeded {
				// The post changed, do an update into the database

				//fmt.Println("Post Date db: " + dbpost.PostDate.String() + ", html: " + htmlpost.PostDate.String())
				//fmt.Printf("Post Score db: %d, html: %d\n", dbpost.Score, htmlpost.Score)

				if DebugLevel > 2 {
					fmt.Println("----------- UPDATE POST START -----------------")
					fmt.Println(dbpost.String("UPDATE1: "))
					fmt.Printf("From score %d to score %d\n", dbpost.Score, htmlpost.Score)
				}
				dbpost.Score = htmlpost.Score
				dbpost.PostDate = htmlpost.PostDate

				// Reset the rowcount info
				dbmap.LastOpInfo.Reset()

				// Update the posts together with its comments
				affectedrows, err := dbmap.UpdateWithChilds(dbpost)

				switch {
				case err != nil:
					return errors.New("update table " + tm.TableName + " failed: " + err.Error())
				case affectedrows == 0:
					return errors.New(fmt.Sprintf("update table %s for Id %d did not affect any lines", tm.TableName, dbpost.Id))
				default:

					updatedPostsCount += dbmap.LastOpInfo.RowCount
					insertedPostsCommentCount += dbmap.LastOpInfo.ChildInsertRowCount
					updatedPostsCommentCount += dbmap.LastOpInfo.ChildUpdateRowCount

					dbpost.CommentCount += uint64(dbmap.LastOpInfo.ChildInsertRowCount)
					_, err = dbmap.Update(dbpost)

					if err != nil {
						return errors.New(fmt.Sprintf("Update for post '%s' failed: %s", dbpost.WebPostId, err.Error()))
					}

					if DebugLevel > 2 {
						// Print out the update info
						fmt.Println("----------- UPDATE POST COMMIT -----------------")
						fmt.Println(dbpost.String("UPDATE2: "))
						fmt.Println("----------- UPDATE POST END -------------------")
					}
				}
			}

		}
	}
	if insertedPostsCount == 0 && updatedPostsCount == 0 {
		if DebugLevel > 2 {
			fmt.Println("No new posts found at " + geturl)
		}
		return
	}

	if DebugLevel > 2 {
		fmt.Printf("%d new posts have been inserted from %s\n", insertedPostsCount, geturl)
		fmt.Printf("%d posts have been updated from %s\n", updatedPostsCount, geturl)
		fmt.Printf("%d new comments have been inserted from %s\n", insertedPostsCommentCount, geturl)
		fmt.Printf("%d comments have been updated from %s\n", updatedPostsCommentCount, geturl)
		fmt.Printf("%d comment errors\n", htmlCommentErrorCount)

	}

	return
}
func Test() (err error) {
	//drivername := "postgres"
	//dsn := "user=golang password=golang dbname=golang sslmode=disable"
	//dialect := gorp.PostgresDialect{}

	drivername := "mysql"
	dsn := "golang:golang@/golang?parseTime=true&collation=utf8mb4_general_ci"
	dialect := gorp.MySQLDialect{"InnoDB", "utf8mb4"}

	// connect to db using standard Go database/sql API
	db, err := sql.Open(drivername, dsn)
	if err != nil {
		return errors.New("sql.Open failed: " + err.Error())
	}

	// Open doesn't open a connection. Validate DSN data using ping
	if err = db.Ping(); err != nil {
		return errors.New("db.Ping failed: " + err.Error())
	}

	// Set the connection to use utf8mb4
	if dialect.Engine == "InnoDB" {
		_, err = db.Exec("SET NAMES utf8mb4 COLLATE utf8mb4_general_ci")
		if err != nil {
			return errors.New("SET NAMES utf8mb4 COLLATE utf8mb4_general_ci: " + err.Error())
		}
	}

	// construct a gorp DbMap
	dbmap := &gorp.DbMap{Db: db, Dialect: dialect}
	defer dbmap.Db.Close()
	dbmap.DebugLevel = DebugLevel
	// Will log all SQL statements + args as they are run
	// The first arg is a string prefix to prepend to all log messages
	dbmap.TraceOn("[gorp]", log.New(os.Stdout, "fetch:", log.Lmicroseconds))

	// register the structs you wish to use with gorp
	// you can also use the shorter dbmap.AddTable() if you
	// don't want to override the table name

	// SetKeys(true) means we have a auto increment primary key, which
	// will get automatically bound to your struct post-insert
	table := dbmap.AddTableWithName(post.Post{}, "posts_embedded_test")
	table.SetKeys(true, "PID")
	fmt.Printf("AddTableWithName returned: %s\n", table.TableName)

	var r *gorp.RelationMap
	if len(table.Relations) > 0 {
		r = table.Relations[0]
		fmt.Printf("Relation DetailTable: %s\n", r.DetailTable.TableName)
	}

	// Add the comments table
	table = dbmap.AddTableWithName(post.Comment{}, "comments_embedded_test")
	table.SetKeys(true, "Id")
	fmt.Printf("AddTableWithName returned: %s\n", table.TableName)
	if r != nil {
		fmt.Printf("Relation DetailTable: %s\n", r.DetailTable.TableName)
	}

	// create the table. in a production system you'd generally
	// use a migration tool, or create the tables via scripts
	if err = dbmap.CreateTablesIfNotExists(); err != nil {
		return errors.New("Create tables failed: " + err.Error())
	}

	// Force create all indexes for this database
	if err = dbmap.CreateIndexes(); err != nil {
		return errors.New("Create indexes failed: " + err.Error())
	}

	i := 0
	x := 0
	var LastPkForGetTests uint64
	var p post.Post

	rand.Seed(42)

	for i < 10 {
		p = post.NewPost()
		p.Title = fmt.Sprintf("Post number %d", i)
		p.Site = "test"
		p.PostDate = time.Unix(time.Now().Unix(), 0).UTC()
		p.WebPostId = strconv.FormatUint(post.Hash(p.Title+p.PostDate.String()), 10)

		x = 0
		for x < 10 {
			c := p.AddComment()
			c.Title = fmt.Sprintf("Comment %d on post %d: ", x, i)
			//c.Title = "👩�👦�👦👨�👩�👧�👩�👩�"
			c.Title += "\U0001F475 \u2318 \xe2\x8c\x98 \U0001F474 \xF0\x9F\x91\xB4 \U0001F610"

			c.WebCommentId = strconv.FormatUint(post.Hash(c.Title+c.GetCommentDate().String())+uint64(rand.Int63n(100000)), 10)
			if utf8.ValidString(c.Title) {
				fmt.Printf("IS VALID: '%s'\n", c.Title)
			} else {
				fmt.Printf("IS *** NOT*** VALID: '%s'\n", c.Title)

			}
			nihongo := c.Title
			for i, w := 0, 0; i < len(nihongo); i += w {
				runeValue, width := utf8.DecodeRuneInString(nihongo[i:])
				fmt.Printf("%#U starts at byte position %d, lenght %d\n", runeValue, i, width)
				w = width
			}

			x++
		}

		// Inserting a post also inserts all its detail records (=comments)
		err = dbmap.InsertWithChilds(&p)
		if DebugLevel > 3 {
			// Print out the crawled info
			fmt.Println("----------- INSERT POST START -----------------")
			fmt.Println(p.String("IP: "))
		}
		if err != nil {
			return errors.New("Insert failed: " + err.Error())
		}

		LastPkForGetTests = p.Id

		if DebugLevel > 3 {
			// Print out the end of the crawled info
			fmt.Println("----------- INSERT POST END -------------------")
		}

		for y, c := range p.Comments {

			c.Title = fmt.Sprintf("UpdatedComment %d ", y) + c.Title
			x++
		}

		p.Title = fmt.Sprintf("UpdatedPost %d ", i) + p.Title
		var rowsaffected int64
		rowsaffected, err = dbmap.UpdateWithChilds(&p)
		if DebugLevel > 3 {
			// Print out the crawled info
			fmt.Println("----------- UPDATE POST START -----------------")
			fmt.Printf("Rows affected: %d\n", rowsaffected)
			fmt.Println(p.String("UP: "))
		}
		if err != nil {
			return errors.New("update failed: " + err.Error())
		}
		if DebugLevel > 3 {
			// Print out the end of the crawled info
			fmt.Println("----------- UPDATE POST END -------------------")
		}

		i++

	}
	fmt.Println("Starting Get tests")

	res, err := dbmap.GetWithChilds(post.Post{}, LastPkForGetTests)

	if err != nil {
		return errors.New("get failed: " + err.Error())
	}
	if res == nil {
		return errors.New(fmt.Sprintf("Get post for id %d did not return any rows ", LastPkForGetTests))
	}

	resp := res.(*post.Post)

	if DebugLevel > 3 {
		// Print out the selected post
		fmt.Println("----------- GET POST START -----------------")
		fmt.Println(resp.String("GP: "))
	}

	if DebugLevel > 3 {
		// Print out the end of the selected post
		fmt.Println("----------- GET POST END -------------------")
	}

	var updateNeeded bool
	updateNeeded, err = AddUpdatableChilds(&p, resp, dbmap)
	if err != nil {
		return errors.New(fmt.Sprintf("AddUpdatableChilds for post '%s' failed: %s", resp.WebPostId, err.Error()))
	}

	if updateNeeded {
		var rowsaffected int64
		rowsaffected, err = dbmap.UpdateWithChilds(resp)
		if DebugLevel > 3 {
			// Print out the crawled info
			fmt.Println("----------- REUPDATE POST START -----------------")
			fmt.Printf("Rows affected: %d\n", rowsaffected)
			fmt.Println(resp.String("RUP: "))
		}
		if err != nil {
			return errors.New("reupdate failed: " + err.Error())
		}
		if DebugLevel > 3 {
			// Print out the end of the crawled info
			fmt.Println("----------- REUPDATE POST END -------------------")
		}
	}

	return
}
Ejemplo n.º 3
0
func HackerNewsPostScraper(sub string) (err error) {
	//drivername := "postgres"
	//dsn := "user=golang password=golang dbname=golang sslmode=disable"
	//dialect := gorp.PostgresDialect{}

	drivername := "mysql"
	dsn := "golang:golang@/golang?parseTime=true"
	dialect := gorp.MySQLDialect{"InnoDB", "UTF8"}

	// connect to db using standard Go database/sql API
	db, err := sql.Open(drivername, dsn)
	if err != nil {
		return errors.New("sql.Open failed: " + err.Error())
	}

	// Open doesn't open a connection. Validate DSN data using ping
	if err = db.Ping(); err != nil {
		return errors.New("db.Ping failed: " + err.Error())
	}

	// construct a gorp DbMap
	dbmap := &gorp.DbMap{Db: db, Dialect: dialect}
	defer dbmap.Db.Close()
	dbmap.DebugLevel = DebugLevel
	// Will log all SQL statements + args as they are run
	// The first arg is a string prefix to prepend to all log messages
	//dbmap.TraceOn("[gorp]", log.New(os.Stdout, "fetch:", log.Lmicroseconds))

	// register the structs you wish to use with gorp
	// you can also use the shorter dbmap.AddTable() if you
	// don't want to override the table name
	tablename := "posts_index_test"
	// SetKeys(true) means we have a auto increment primary key, which
	// will get automatically bound to your struct post-insert
	table := dbmap.AddTableWithName(post.Post{}, tablename)
	table.SetKeys(true, "PID")

	// create the table. in a production system you'd generally
	// use a migration tool, or create the tables via scripts
	if err = dbmap.CreateTablesIfNotExists(); err != nil {
		return errors.New("Create tables failed: " + err.Error())
	}

	// Force create all indexes for this database
	if err = dbmap.CreateIndexes(); err != nil {
		return errors.New("Create indexes failed: " + err.Error())
	}

	// Get data from hackernews
	geturl := "http://news.ycombinator.com/"
	body, err := GetHtmlBody(geturl)
	if err != nil {
		return errors.New("GetHtmlBody: " + err.Error())
	}

	// Create a new post slice and then parse the response body into ps
	ps := make([]*post.Post, 0)
	ps, err = ParseHtmlHackerNews(body, ps)
	if err != nil {
		return errors.New("ParseHtmlHackerNews: " + err.Error())
	}

	// Number of updated posts
	var updatedPostsCount uint32
	// Number of new posts
	var insertedPostsCount uint32

	// insert rows - auto increment PKs will be set properly after the insert
	for _, htmlpost := range ps {

		if htmlpost.WebPostId == "" {
			if DebugLevel > 1 {
				fmt.Printf("WebPostId not set in %s\n", htmlpost.Title)
			}
			// Fail early, continue with next post
			continue
		}

		if htmlpost.Err != nil {
			if DebugLevel > 1 {
				fmt.Println("Single post error in " + geturl + ": " + htmlpost.Err.Error())
			}
			// Fail early, continue with next post
			continue
		}

		// Store post sub
		htmlpost.PostSub = sub

		// check if post already exists
		intSelectResult := make([]int, 0)
		postcountsql := "select count(*) from " + dbmap.Dialect.QuoteField(tablename) +
			" where WebPostId = :post_id"
		_, err := dbmap.Select(&intSelectResult, postcountsql, map[string]interface{}{
			"post_id": htmlpost.WebPostId,
		})
		if err != nil {
			return errors.New(fmt.Sprintf("Query: %s failed: %s\n", postcountsql, err.Error()))
		}
		if len(intSelectResult) == 0 {
			return errors.New(fmt.Sprintf("Query: %s returned no result\n", postcountsql))
		}
		postcount := intSelectResult[0]

		// New post? then insert
		if postcount == 0 {

			// Insert the new post into the database
			err = dbmap.Insert(htmlpost)

			if DebugLevel > 2 {
				// Print out the crawled info
				fmt.Println("----------- INSERT POST START -----------------")
				fmt.Println(htmlpost.String())
			}
			if err != nil {
				return errors.New("insert into table " + dbmap.Dialect.QuoteField(tablename) + " failed: " + err.Error())
			}
			if DebugLevel > 2 {
				// Print out the end of the crawled info
				fmt.Println("----------- INSERT POST END -------------------")
			}
			insertedPostsCount++

		} else {
			// Post already exists, do an update
			// Create a slice of posts to select into
			dbposts := make([]post.Post, 0)
			getpostsql := "select * from " + dbmap.Dialect.QuoteField(tablename) + " where WebPostId = :post_id"
			_, err := dbmap.Select(&dbposts, getpostsql, map[string]interface{}{
				"post_id": htmlpost.WebPostId,
			})
			if err != nil {
				return errors.New(fmt.Sprintf("Getting WebPostId %s from DB failed: %s\n", htmlpost.WebPostId, err.Error()))
			}
			var dbpost post.Post
			if len(dbposts) > 0 {
				dbpost = dbposts[0]
			} else {
				return errors.New(fmt.Sprintf("Query: %s returned no result\n", getpostsql))
			}

			if htmlpost.Score != dbpost.Score {
				// The post score changed, do an update into the database

				//fmt.Println("Post Date db: " + dbpost.PostDate.String() + ", html: " + htmlpost.PostDate.String())
				//fmt.Printf("Post Score db: %d, html: %d\n", dbpost.Score, htmlpost.Score)

				if DebugLevel > 2 {
					fmt.Println("----------- UPDATE POST START -----------------")
					fmt.Println(dbpost.String())
					fmt.Printf("From score %d to score %d\n", dbpost.Score, htmlpost.Score)
				}
				dbpost.Score = htmlpost.Score
				dbpost.PostDate = htmlpost.PostDate
				affectedrows, err := dbmap.Update(&dbpost)
				switch {
				case err != nil:
					return errors.New("update table " + tablename + " failed: " + err.Error())
				case affectedrows == 0:
					return errors.New(fmt.Sprintf("update table %s for Id %d did not affect any lines", tablename, dbpost.Id))
				default:
					updatedPostsCount++
					if DebugLevel > 2 {
						// Print out the update info
						fmt.Println("----------- UPDATE POST COMMIT -----------------")
						fmt.Println(dbpost.String())
						fmt.Println("----------- UPDATE POST END -------------------")
					}
				}
			}

		}
	}
	if insertedPostsCount == 0 && updatedPostsCount == 0 {
		if DebugLevel > 2 {
			fmt.Println("No new posts found at " + geturl)
		}
	}

	if updatedPostsCount > 0 && DebugLevel > 2 {
		fmt.Printf("%d existing posts have been updated from %s\n", updatedPostsCount, geturl)
	}

	if insertedPostsCount > 0 && DebugLevel > 2 {
		fmt.Printf("%d new posts have been inserted from %s\n", insertedPostsCount, geturl)
	}

	return
}