// AddUpdatableChilds decides whether the stored post dbpost has to be updated
// from the freshly parsed htmlpost and, if so, stages the comments to write.
//
// When dbpost carries no comments, the post is read again through
// dbmap.GetWithChilds using dbpost.Id, and the comments returned by that call
// are attached to dbpost so that the comparison below runs against them.
//
// updateNeeded reports whether htmlpost.Hash() differs from dbpost.Hash(). If
// it does, dbpost.Comments is replaced by the comments of htmlpost that are
// not matched by hash against the comments dbpost held before the call.
//
// An error is returned when dbpost.Id is zero, when the lookup fails, or when
// the lookup does not yield a usable *post.Post.
func AddUpdatableChilds(htmlpost *post.Post, dbpost *post.Post, dbmap *gorp.DbMap) (updateNeeded bool, err error) {
	if len(dbpost.Comments) == 0 {
		pk := dbpost.Id
		if pk == 0 {
			return false, errors.New("primary key not set in dbpost")
		}
		res, getErr := dbmap.GetWithChilds(post.Post{}, pk)
		if getErr != nil {
			return false, errors.New("get failed: " + getErr.Error())
		}
		if res == nil {
			return false, fmt.Errorf("Get post for id %d did not return any rows ", pk)
		}
		stored, ok := res.(*post.Post)
		if !ok || stored == nil {
			return false, fmt.Errorf("Get post for id %d returned unexpected value of type %T", pk, res)
		}

		// Attach the fetched comments to the caller's post. Declaring a new
		// local named dbpost here would shadow the parameter and throw the
		// fetched comments away at the end of this block.
		dbpost.Comments = stored.Comments

		if DebugLevel > 3 {
			fmt.Println("----------- DB POST -----------------")
			fmt.Println(stored.String("CHECK DB: "))
			fmt.Println("----------- DB POST END -------------------")
		}
	}
	if DebugLevel > 3 {
		fmt.Println("----------- HTML POST -----------------")
		fmt.Println(htmlpost.String("CHECK HTML: "))
		fmt.Println("----------- HTML POST END -------------------")
	}

	updateNeeded = htmlpost.Hash() != dbpost.Hash()
	if !updateNeeded {
		return false, nil
	}

	var staged []*post.Comment
	if DebugLevel > 2 {
		fmt.Printf("**** UpdatedComments len %d\n", len(staged))
	}

	for _, h := range htmlpost.Comments {
		unchanged := false
		htmlHash := h.Hash()

		// The first held comment that matches either by hash or by
		// WebCommentId ends the search for this parsed comment.
		for _, d := range dbpost.Comments {
			if DebugLevel > 2 {
				fmt.Printf("**** COMPARE\n")
				fmt.Printf("**** **** d.Hash():%d htmlHash %d\n", d.Hash(), htmlHash)
				fmt.Printf("**** **** d.Date '%s' h.Date '%s'\n", d.GetCommentDate().String(), h.GetCommentDate().String())
				fmt.Printf("**** COMPARE END\n")
			}
			if d.Hash() == htmlHash {
				// Identical content is already held: nothing to store.
				unchanged = true
				if DebugLevel > 2 {
					fmt.Printf("**** ***************** MATCH d.Hash() == htmlHash %d\n", d.Hash())
				}
				break
			}
			if h.WebCommentId == d.WebCommentId {
				// Same external comment id but different content: reuse the
				// keys of the held comment so the parsed one refers to it.
				if DebugLevel > 3 {
					fmt.Printf("**** COMPARE h.WebCommentId\n")
					fmt.Printf("**** **** h '%s' d '%s'\n", h.WebCommentId, d.WebCommentId)
					fmt.Printf("**** COMPARE h.WebCommentId END\n")
				}
				h.Id = d.Id
				h.PostId = d.PostId
				break
			}
		}

		if unchanged {
			continue
		}
		staged = append(staged, h)
		if DebugLevel > 2 {
			fmt.Printf("**** UpdatedComments len %d\n", len(staged))
			fmt.Printf("**** **** append(UpdatedComments, h) %s\n", h.String("APP: "))
		}
	}

	// These traces are gated like the other "****" traces in this function
	// instead of being printed on every call.
	if DebugLevel > 2 {
		fmt.Printf("**** htmlpost.Comments len %d\n", len(htmlpost.Comments))
		fmt.Printf("**** UpdatedComments len %d\n", len(staged))
	}
	// A fresh slice is allocated so dbpost.Comments is non-nil even when
	// nothing was staged.
	dbpost.Comments = make([]*post.Comment, len(staged))
	if DebugLevel > 2 {
		fmt.Printf("**** dbpost.Comments1 len %d\n", len(dbpost.Comments))
	}
	copy(dbpost.Comments, staged)
	if DebugLevel > 2 {
		fmt.Printf("**** dbpost.Comments2 len %d\n", len(dbpost.Comments))
	}

	if DebugLevel > 3 {
		fmt.Println("----------- UPDATE NEEDED -----------------")
		for i := range htmlpost.Comments {
			fmt.Println(htmlpost.Comments[i].String("UPDATE NEEDED HTML: "))
			if i < len(dbpost.Comments) {
				fmt.Println(dbpost.Comments[i].String("UPDATE NEEDED DB: "))
			}
		}
		fmt.Println("----------- UPDATE NEEDED END -------------------")
	}

	return true, nil
}
// Test exercises the gorp parent/child helpers against a local MySQL database.
//
// It registers post.Post and post.Comment under the "posts_embedded_test" and
// "comments_embedded_test" table names, creates tables and indexes, then
// inserts ten posts with ten comments each and updates every one of them right
// after its insert. Finally the last inserted post is read back with its
// comments and passed through AddUpdatableChilds, followed by one more update
// if that call reports one is needed.
//
// The first failing step aborts the run and its error is returned.
func Test() (err error) {
	//drivername := "postgres"
	//dsn := "user=golang password=golang dbname=golang sslmode=disable"
	//dialect := gorp.PostgresDialect{}

	drivername := "mysql"
	dsn := "golang:golang@/golang?parseTime=true&collation=utf8mb4_general_ci"
	dialect := gorp.MySQLDialect{"InnoDB", "utf8mb4"}

	// connect to db using standard Go database/sql API
	db, err := sql.Open(drivername, dsn)
	if err != nil {
		return errors.New("sql.Open failed: " + err.Error())
	}
	// Register the close immediately so the handle is also released when one
	// of the early returns below (Ping, SET NAMES) is taken.
	defer db.Close()

	// Open doesn't open a connection. Validate DSN data using ping
	if err = db.Ping(); err != nil {
		return errors.New("db.Ping failed: " + err.Error())
	}

	// Set the connection to use utf8mb4
	if dialect.Engine == "InnoDB" {
		_, err = db.Exec("SET NAMES utf8mb4 COLLATE utf8mb4_general_ci")
		if err != nil {
			return errors.New("SET NAMES utf8mb4 COLLATE utf8mb4_general_ci: " + err.Error())
		}
	}

	// construct a gorp DbMap
	dbmap := &gorp.DbMap{Db: db, Dialect: dialect}
	dbmap.DebugLevel = DebugLevel
	// Will log all SQL statements + args as they are run
	// The first arg is a string prefix to prepend to all log messages
	dbmap.TraceOn("[gorp]", log.New(os.Stdout, "fetch:", log.Lmicroseconds))

	// SetKeys(true) means we have a auto increment primary key, which
	// will get automatically bound to your struct post-insert
	table := dbmap.AddTableWithName(post.Post{}, "posts_embedded_test")
	table.SetKeys(true, "PID")
	fmt.Printf("AddTableWithName returned: %s\n", table.TableName)

	// Remember the first relation of the posts table so it can be printed
	// again once the comments table has been registered.
	var r *gorp.RelationMap
	if len(table.Relations) > 0 {
		r = table.Relations[0]
		fmt.Printf("Relation DetailTable: %s\n", r.DetailTable.TableName)
	}

	// Add the comments table
	table = dbmap.AddTableWithName(post.Comment{}, "comments_embedded_test")
	table.SetKeys(true, "Id")
	fmt.Printf("AddTableWithName returned: %s\n", table.TableName)
	if r != nil {
		fmt.Printf("Relation DetailTable: %s\n", r.DetailTable.TableName)
	}

	// create the table. in a production system you'd generally
	// use a migration tool, or create the tables via scripts
	if err = dbmap.CreateTablesIfNotExists(); err != nil {
		return errors.New("Create tables failed: " + err.Error())
	}

	// Force create all indexes for this database
	if err = dbmap.CreateIndexes(); err != nil {
		return errors.New("Create indexes failed: " + err.Error())
	}

	var LastPkForGetTests uint64
	// p is declared outside the loop because the last generated post is
	// compared against its stored copy further down.
	var p post.Post

	// Fixed seed: the random part of the generated comment ids is repeatable.
	rand.Seed(42)

	for i := 0; i < 10; i++ {
		p = post.NewPost()
		p.Title = fmt.Sprintf("Post number %d", i)
		p.Site = "test"
		// Truncate the timestamp to whole seconds.
		p.PostDate = time.Unix(time.Now().Unix(), 0).UTC()
		p.WebPostId = strconv.FormatUint(post.Hash(p.Title+p.PostDate.String()), 10)

		for x := 0; x < 10; x++ {
			c := p.AddComment()
			c.Title = fmt.Sprintf("Comment %d on post %d: ", x, i)
			// Append a mix of 4-byte code points, a 3-byte symbol and raw
			// UTF-8 byte escapes to the title.
			c.Title += "\U0001F475 \u2318 \xe2\x8c\x98 \U0001F474 \xF0\x9F\x91\xB4 \U0001F610"

			c.WebCommentId = strconv.FormatUint(post.Hash(c.Title+c.GetCommentDate().String())+uint64(rand.Int63n(100000)), 10)
			if utf8.ValidString(c.Title) {
				fmt.Printf("IS VALID: '%s'\n", c.Title)
			} else {
				fmt.Printf("IS *** NOT*** VALID: '%s'\n", c.Title)
			}

			// Dump every rune of the title with its byte offset and width.
			title := c.Title
			for pos := 0; pos < len(title); {
				runeValue, width := utf8.DecodeRuneInString(title[pos:])
				fmt.Printf("%#U starts at byte position %d, lenght %d\n", runeValue, pos, width)
				pos += width
			}
		}

		// Inserting a post also inserts all its detail records (=comments)
		err = dbmap.InsertWithChilds(&p)
		if DebugLevel > 3 {
			// Printed before the error check so a failing post is visible too.
			fmt.Println("----------- INSERT POST START -----------------")
			fmt.Println(p.String("IP: "))
		}
		if err != nil {
			return errors.New("Insert failed: " + err.Error())
		}

		LastPkForGetTests = p.Id

		if DebugLevel > 3 {
			fmt.Println("----------- INSERT POST END -------------------")
		}

		// Change every comment and the post itself, then write them back.
		for y, c := range p.Comments {
			c.Title = fmt.Sprintf("UpdatedComment %d ", y) + c.Title
		}
		p.Title = fmt.Sprintf("UpdatedPost %d ", i) + p.Title

		var rowsaffected int64
		rowsaffected, err = dbmap.UpdateWithChilds(&p)
		if DebugLevel > 3 {
			fmt.Println("----------- UPDATE POST START -----------------")
			fmt.Printf("Rows affected: %d\n", rowsaffected)
			fmt.Println(p.String("UP: "))
		}
		if err != nil {
			return errors.New("update failed: " + err.Error())
		}
		if DebugLevel > 3 {
			fmt.Println("----------- UPDATE POST END -------------------")
		}
	}
	fmt.Println("Starting Get tests")

	res, err := dbmap.GetWithChilds(post.Post{}, LastPkForGetTests)
	if err != nil {
		return errors.New("get failed: " + err.Error())
	}
	if res == nil {
		return fmt.Errorf("Get post for id %d did not return any rows ", LastPkForGetTests)
	}

	resp := res.(*post.Post)

	if DebugLevel > 3 {
		fmt.Println("----------- GET POST START -----------------")
		fmt.Println(resp.String("GP: "))
		fmt.Println("----------- GET POST END -------------------")
	}

	var updateNeeded bool
	updateNeeded, err = AddUpdatableChilds(&p, resp, dbmap)
	if err != nil {
		return fmt.Errorf("AddUpdatableChilds for post '%s' failed: %s", resp.WebPostId, err.Error())
	}
	if !updateNeeded {
		return nil
	}

	var rowsaffected int64
	rowsaffected, err = dbmap.UpdateWithChilds(resp)
	if DebugLevel > 3 {
		fmt.Println("----------- REUPDATE POST START -----------------")
		fmt.Printf("Rows affected: %d\n", rowsaffected)
		fmt.Println(resp.String("RUP: "))
	}
	if err != nil {
		return errors.New("reupdate failed: " + err.Error())
	}
	if DebugLevel > 3 {
		fmt.Println("----------- REUPDATE POST END -------------------")
	}

	return nil
}
Exemplo n.º 3
0
// RedditPostScraper downloads the "new" listing of the subreddit sub, parses
// it with ParseHtmlReddit and synchronises the result with the
// "posts_index_test" table of a local MySQL database.
//
// A parsed post whose WebPostId is not stored yet is inserted. For a post that
// is already stored only the score is compared and, if it differs, written
// back. Posts that carry a parse error are skipped.
//
// The first database, HTTP or parse failure aborts the run and is returned.
func RedditPostScraper(sub string) (err error) {
	//drivername := "postgres"
	//dsn := "user=golang password=golang dbname=golang sslmode=disable"
	//dialect := gorp.PostgresDialect{}

	drivername := "mysql"
	dsn := "golang:golang@/golang?parseTime=true"
	dialect := gorp.MySQLDialect{"InnoDB", "UTF8"}

	// connect to db using standard Go database/sql API
	db, err := sql.Open(drivername, dsn)
	if err != nil {
		return errors.New("sql.Open failed: " + err.Error())
	}
	// Register the close immediately so the handle is also released when the
	// Ping below fails.
	defer db.Close()

	// Open doesn't open a connection. Validate DSN data:
	if err = db.Ping(); err != nil {
		return errors.New("db.Ping failed: " + err.Error())
	}

	// construct a gorp DbMap
	dbmap := &gorp.DbMap{Db: db, Dialect: dialect}
	dbmap.DebugLevel = DebugLevel
	// Will log all SQL statements + args as they are run
	// The first arg is a string prefix to prepend to all log messages
	//dbmap.TraceOn("[gorp]", log.New(os.Stdout, "fetch:", log.Lmicroseconds))

	tablename := "posts_index_test"
	// SetKeys(true) means we have a auto increment primary key, which
	// will get automatically bound to your struct post-insert
	table := dbmap.AddTableWithName(post.Post{}, tablename)
	table.SetKeys(true, "PID")

	// create the table. in a production system you'd generally
	// use a migration tool, or create the tables via scripts
	if err = dbmap.CreateTablesIfNotExists(); err != nil {
		return errors.New("Create tables failed: " + err.Error())
	}

	// Force create all indexes for this database
	if err = dbmap.CreateIndexes(); err != nil {
		return errors.New("Create indexes failed: " + err.Error())
	}

	// Get data from reddit
	geturl := "http://www.reddit.com/r/" + sub + "/new"
	resp, err := http.Get(geturl)
	if err != nil {
		return errors.New("Failed to http.Get from " + geturl + ": " + err.Error())
	}
	if resp == nil {
		return errors.New("Response from " + geturl + " is nil!")
	}
	if resp.Body == nil {
		return errors.New("Body from " + geturl + " is nil!")
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return fmt.Errorf("Failed to http.Get from %s: Http Status code: %d: Msg: %s", geturl, resp.StatusCode, resp.Status)
	}

	// Create a new post slice and then parse the response body into ps
	ps := make([]post.Post, 0)
	ps, err = ParseHtmlReddit(resp.Body, ps)
	if err != nil {
		return errors.New("Error in RedditParseHtml: " + geturl + ": " + err.Error())
	}
	foundnewposts := false
	updatedposts := 0

	// insert rows - auto increment PKs will be set properly after the insert
	for _, htmlpost := range ps {
		if htmlpost.Err != nil {
			if DebugLevel > 1 {
				fmt.Println("Single post error in " + geturl + ": " + htmlpost.Err.Error())
			}
			continue
		}

		// Store reddit sub
		htmlpost.PostSub = sub

		// check if post already exists
		intSelectResult := make([]int, 0)
		postcountsql := "select count(*) from " + dbmap.Dialect.QuoteField(tablename) +
			" where WebPostId = :post_id"
		_, countErr := dbmap.Select(&intSelectResult, postcountsql, map[string]interface{}{
			"post_id": htmlpost.WebPostId,
		})
		if countErr != nil {
			return fmt.Errorf("Query: %s failed: %s\n", postcountsql, countErr.Error())
		}
		if len(intSelectResult) == 0 {
			return fmt.Errorf("Query: %s returned no result\n", postcountsql)
		}
		postcount := intSelectResult[0]

		if DebugLevel > 3 {
			fmt.Println("HTMLpost.WebPostId: " + htmlpost.WebPostId)
			fmt.Printf("HTMLpost.Id: %v\n", htmlpost.Id)
			fmt.Printf("DBpost count: %v \n", postcount)
		}

		// New post? then insert
		if postcount == 0 {
			foundnewposts = true
			insertErr := dbmap.Insert(&htmlpost)

			if DebugLevel > 2 {
				// Printed before the error check so a failing post is visible too.
				fmt.Println("----------- INSERT POST START -----------------")
				fmt.Println(htmlpost.String())
			}
			if insertErr != nil {
				return errors.New("insert into table " + dbmap.Dialect.QuoteField(tablename) + " failed: " + insertErr.Error())
			}
			if DebugLevel > 2 {
				fmt.Println("----------- INSERT POST END -------------------")
			}
			continue
		}

		// Post already exists, do an update
		dbposts := make([]post.Post, 0)
		getpostsql := "select * from " + dbmap.Dialect.QuoteField(tablename) + " where WebPostId = :post_id"
		_, selectErr := dbmap.Select(&dbposts, getpostsql, map[string]interface{}{
			"post_id": htmlpost.WebPostId,
		})
		if selectErr != nil {
			// The format string needs one verb per argument; without the
			// second verb the underlying error text is not rendered properly.
			return fmt.Errorf("Getting WebPostId %s from DB failed: %s\n", htmlpost.WebPostId, selectErr.Error())
		}
		if len(dbposts) == 0 {
			return fmt.Errorf("Query: %s returned no result\n", getpostsql)
		}
		dbpost := dbposts[0]

		if DebugLevel > 3 {
			fmt.Printf("DBPOST: %s\n", dbpost.String())
			fmt.Printf("DBpost.Id: %v\n", dbpost.Id)
			fmt.Printf("DBpost.Score: %v\n", dbpost.Score)
		}

		// Only the score is synchronised for posts that are already stored.
		if htmlpost.Score == dbpost.Score {
			continue
		}

		if DebugLevel > 2 {
			fmt.Println("----------- UPDATE POST START -----------------")
			fmt.Println("Title: " + dbpost.Title)
			fmt.Printf("Id: %v\n", dbpost.Id)
			fmt.Printf("From score %d to score %d\n", dbpost.Score, htmlpost.Score)
			fmt.Println("----------- UPDATE POST END -------------------")
		}

		dbpost.Score = htmlpost.Score
		affectedrows, updateErr := dbmap.Update(&dbpost)
		switch {
		case updateErr != nil:
			return errors.New("update table " + tablename + " failed: " + updateErr.Error())
		case affectedrows == 0:
			return fmt.Errorf("update table %s for Id %d did not affect any lines", tablename, dbpost.Id)
		default:
			updatedposts++
		}
	}

	if !foundnewposts && DebugLevel > 2 {
		fmt.Println("No new posts found at " + geturl)
	}
	if updatedposts > 0 && DebugLevel > 2 {
		fmt.Printf("%d posts have been updated from %s\n", updatedposts, geturl)
	}

	return nil
}
// ParseHtmlComments downloads the Hacker News item page of p and collects the
// comments found on it.
//
// Every element with class "athing" that has links inside ".comhead" is
// treated as one comment row and is classified exactly once: a row with no
// parse error, a WebCommentId and a non-empty body is appended to p.Comments;
// every other such row is appended to p.CommentParseErrors with its Err set.
// Rows without any ".comhead" link are skipped.
//
// Problems with a single comment never fail the call. An error is returned
// only when p.WebPostId is empty, when the page cannot be fetched, or when the
// HTML cannot be parsed.
func ParseHtmlComments(p *post.Post) (err error) {
	if p.WebPostId == "" {
		return fmt.Errorf("p.WebPostId is empty in post '%s'", p.String("PC: "))
	}
	// Get comments from hackernews
	geturl := fmt.Sprintf("http://news.ycombinator.com/item?id=%s", p.WebPostId)
	// DEBUG
	//geturl := fmt.Sprintf("https://news.ycombinator.com/item?id=9751858")

	if DebugLevel > 2 {
		fmt.Printf("START GET COMMENTS FROM '%s'\n", geturl)
	}

	body, err := GetHtmlBody(geturl)
	if err != nil {
		return errors.New("GetHtmlBody: " + err.Error())
	}
	// Create a goquery document to parse from an io.Reader
	doc, err := goquery.NewDocumentFromReader(body)
	if err != nil {
		return errors.New("Failed to parse HTML: " + err.Error())
	}

	// Find hackernews comments = elements with class "athing"
	things := doc.Find(".athing")
	for iThing := range things.Nodes {
		// singlecomment is the selection of one single row
		singlecomment := things.Eq(iThing)

		comheads := singlecomment.Find(".comhead a")
		if len(comheads.Nodes) == 0 {
			continue
		}

		comment := post.NewComment()

		// Pass 1: header links carry the user, the date and the comment id.
		for i := range comheads.Nodes {
			comhead := comheads.Eq(i)
			href, exists := comhead.Attr("href")
			if !exists {
				continue
			}
			// A failing Html() yields an empty text, which matches neither
			// branch below in a harmful way, so its error is ignored.
			text, _ := comhead.Html()

			switch {
			case strings.HasPrefix(href, "user?id"):
				comment.User = text
			case strings.HasPrefix(href, "item?id") && strings.Contains(text, "ago"):
				commentDate, dateErr := GetDateFromCreatedAgo(text)
				if dateErr != nil {
					comment.Err = fmt.Errorf("Failed to convert to date: %s: %s\n", text, dateErr.Error())
					continue
				}
				comment.CommentDate = commentDate
				if parts := strings.Split(href, "="); len(parts) > 1 {
					comment.WebCommentId = parts[1]
				}
			}
		}

		// Pass 2: the body. This runs once per row; running it once per
		// header link would concatenate the body repeatedly and append the
		// same comment several times.
		comments := singlecomment.Find("span.comment")
		comments.Find("span div.reply").Remove()

		var sep string
		for iComment := range comments.Nodes {
			s := comments.Eq(iComment)

			h, _ := s.Html()

			if !utf8.ValidString(s.Text()) {
				comment.Err = fmt.Errorf("Ignoring invalid UTF-8: '%s'", s.Text())
				break
			}

			// The conversion error is kept local: it is recorded on the
			// comment and must not become the return value of this function.
			h, mdErr := HtmlToMarkdown(h)
			if mdErr != nil {
				comment.Err = fmt.Errorf("Ignoring markdownifier: '%s'", mdErr.Error())
				break
			}

			if h != "" {
				comment.Body = comment.Body + sep + h
			}
			sep = "\n"
		}
		//fmt.Printf("POST %s BODY = %s\n", p.WebPostId, comment.Body)

		// Pass 3: classify. Rejected comments always carry an Err so that
		// consumers can report a reason without checking for nil.
		switch {
		case comment.Err != nil:
			p.CommentParseErrors = append(p.CommentParseErrors, &comment)
		case len(comment.WebCommentId) == 0 || len(comment.Body) == 0:
			comment.Err = errors.New("comment has no WebCommentId or no body")
			p.CommentParseErrors = append(p.CommentParseErrors, &comment)
		default:
			p.Comments = append(p.Comments, &comment)
		}
	}

	if DebugLevel > 0 {
		fmt.Printf("GET COMMENTS FROM '%s' yielded %d comments\n", geturl, len(p.Comments))
	}

	return nil
}
// HackerNewsPostScraper downloads the Hacker News listing sub, parses it with
// ParseHtmlHackerNews and synchronises posts and their comments with the
// "posts_index_test" and "comments_index_test" tables of a local MySQL
// database.
//
// A parsed post whose WebPostId is not stored yet is inserted together with
// its comments. A post that is stored exactly once is read back with its
// comments and handed to AddUpdatableChilds; if that reports a change, score,
// post date and the staged comments are written and the stored comment count
// is increased by the number of inserted comments. Posts without a WebPostId
// or with a parse error are skipped.
//
// The first database or parse failure aborts the run and is returned.
func HackerNewsPostScraper(sub string) (err error) {
	//drivername := "postgres"
	//dsn := "user=golang password=golang dbname=golang sslmode=disable"
	//dialect := gorp.PostgresDialect{}

	drivername := "mysql"
	dsn := "golang:golang@/golang?parseTime=true"
	dialect := gorp.MySQLDialect{"InnoDB", "utf8mb4"}

	// connect to db using standard Go database/sql API
	db, err := sql.Open(drivername, dsn)
	if err != nil {
		return errors.New("sql.Open failed: " + err.Error())
	}
	// Register the close immediately so the handle is also released when one
	// of the early returns below (Ping, SET NAMES) is taken.
	defer db.Close()

	// Open doesn't open a connection. Validate DSN data using ping
	if err = db.Ping(); err != nil {
		return errors.New("db.Ping failed: " + err.Error())
	}

	// Set the connection to use utf8mb4
	if dialect.Engine == "InnoDB" {
		fmt.Println("Setting connection to utf8mb4")
		_, err = db.Exec("SET NAMES utf8mb4 COLLATE utf8mb4_general_ci")
		if err != nil {
			return errors.New("SET NAMES utf8mb4 COLLATE utf8mb4_general_ci: " + err.Error())
		}
	}

	// construct a gorp DbMap
	dbmap := &gorp.DbMap{Db: db, Dialect: dialect}
	dbmap.DebugLevel = DebugLevel
	// Will log all SQL statements + args as they are run
	// The first arg is a string prefix to prepend to all log messages
	//dbmap.TraceOn("[gorp]", log.New(os.Stdout, "Trace:", log.Lmicroseconds))

	// SetKeys(true) means we have a auto increment primary key, which
	// will get automatically bound to your struct post-insert
	table := dbmap.AddTableWithName(post.Post{}, "posts_index_test")
	table.SetKeys(true, "PID")

	// Add the comments table
	table = dbmap.AddTableWithName(post.Comment{}, "comments_index_test")
	table.SetKeys(true, "Id")

	// create the table. in a production system you'd generally
	// use a migration tool, or create the tables via scripts
	if err = dbmap.CreateTablesIfNotExists(); err != nil {
		return errors.New("Create tables failed: " + err.Error())
	}

	// Force create all indexes for this database
	if err = dbmap.CreateIndexes(); err != nil {
		return errors.New("Create indexes failed: " + err.Error())
	}

	// Get data from hackernews
	geturl := "http://news.ycombinator.com/" + sub
	// DEBUG for a special thread
	//geturl := "https://news.ycombinator.com/item?id=10056146"
	body, err := GetHtmlBody(geturl)
	if err != nil {
		return errors.New("GetHtmlBody: " + err.Error())
	}

	// Create a new post slice and then parse the response body into ps
	ps := make([]*post.Post, 0)
	ps, err = ParseHtmlHackerNews(body, ps)
	if err != nil {
		return errors.New("ParseHtmlHackerNews: " + err.Error())
	}

	var (
		// Number of updated / new posts
		updatedPostsCount  int64
		insertedPostsCount int64
		// Number of new / updated comments
		insertedPostsCommentCount int64
		updatedPostsCommentCount  int64
		// Number of post / comment parsing errors
		htmlPostErrorCount    uint32
		htmlCommentErrorCount uint32
	)

	// loop over all parsed posts
	for _, htmlpost := range ps {

		if htmlpost.WebPostId == "" {
			if DebugLevel > 1 {
				fmt.Printf("WebPostId not set in %s\n", htmlpost.Title)
			}
			// Fail early, continue with next post
			continue
		}

		if htmlpost.Err != nil {
			if DebugLevel > 1 {
				fmt.Println("Single post error in " + geturl + ": " + htmlpost.Err.Error())
			}
			// Fail early, continue with next post
			htmlPostErrorCount++
			continue
		}

		for _, c := range htmlpost.CommentParseErrors {
			htmlCommentErrorCount++
			if DebugLevel > 2 {
				// Guard against entries without an error value; calling
				// Error() on a nil Err would panic.
				reason := "unknown error"
				if c.Err != nil {
					reason = c.Err.Error()
				}
				fmt.Println("Single comment error in '" + geturl + "' for WebPostId '" + htmlpost.WebPostId + ": " + reason)
			}
		}

		// Store post sub
		htmlpost.PostSub = sub

		tm, tmErr := dbmap.TableFor(reflect.TypeOf(*htmlpost), true)
		if tmErr != nil {
			return errors.New("Failed to get reflection type: " + tmErr.Error())
		}
		if DebugLevel > 3 {
			fmt.Println("TABLEMAP: " + tm.TableName)
		}

		// check if post already exists
		dbposts := make([]post.Post, 0)
		getpostsql := "select * from " + dbmap.Dialect.QuotedTableForQuery("", tm.TableName) + " where WebPostId = :post_id"
		_, selectErr := dbmap.Select(&dbposts, getpostsql, map[string]interface{}{
			"post_id": htmlpost.WebPostId,
		})
		if selectErr != nil {
			return fmt.Errorf("Getting PostId %s from DB failed: %s", htmlpost.WebPostId, selectErr.Error())
		}
		if len(dbposts) > 1 {
			return fmt.Errorf("Query: %s returned %d rows", getpostsql, len(dbposts))
		}

		// New post? then insert
		if len(dbposts) == 0 {
			if DebugLevel > 2 {
				fmt.Printf("New post found, inserting htmlpost.WebPostId '%s'\n", htmlpost.WebPostId)
			}

			// Reset the rowcount info
			dbmap.LastOpInfo.Reset()
			htmlpost.CommentCount = uint64(len(htmlpost.Comments))
			// Insert the new post into the database
			insertErr := dbmap.InsertWithChilds(htmlpost)

			if DebugLevel > 2 {
				// Printed before the error check so a failing post is visible too.
				fmt.Println("----------- INSERT POST START -----------------")
				fmt.Println(htmlpost.String("INSERT: "))
			}
			if insertErr != nil {
				return errors.New("insert into table " + dbmap.Dialect.QuoteField(tm.TableName) + " failed: " + insertErr.Error())
			}
			if DebugLevel > 2 {
				fmt.Println("----------- INSERT POST END -------------------")
			}
			insertedPostsCount += dbmap.LastOpInfo.RowCount
			insertedPostsCommentCount += dbmap.LastOpInfo.ChildInsertRowCount
			continue
		}

		// Post already exists, get the full post with its comments from the db
		storedID := dbposts[0].Id
		// NOTE(review): the meaning of the two numeric arguments is defined by
		// GetWithChilds, which is not visible here — confirm against gorp.
		res, getErr := dbmap.GetWithChilds(post.Post{}, 9999999999, 0, storedID)
		if getErr != nil {
			return errors.New("get failed: " + getErr.Error())
		}
		if res == nil {
			return fmt.Errorf("Get post for id %d did not return any rows ", storedID)
		}
		dbpost, ok := res.(*post.Post)
		if !ok || dbpost == nil {
			return fmt.Errorf("Get post for id %d returned unexpected value of type %T", storedID, res)
		}

		// Check if an update is needed
		updateNeeded, checkErr := AddUpdatableChilds(htmlpost, dbpost, dbmap)
		if checkErr != nil {
			return fmt.Errorf("AddUpdatableChilds for post '%s' failed: %s", htmlpost.WebPostId, checkErr.Error())
		}
		if !updateNeeded {
			continue
		}

		// The post changed, do an update into the database
		if DebugLevel > 2 {
			fmt.Println("----------- UPDATE POST START -----------------")
			fmt.Println(dbpost.String("UPDATE1: "))
			fmt.Printf("From score %d to score %d\n", dbpost.Score, htmlpost.Score)
		}
		dbpost.Score = htmlpost.Score
		dbpost.PostDate = htmlpost.PostDate

		// Reset the rowcount info
		dbmap.LastOpInfo.Reset()

		// Update the posts together with its comments
		affectedrows, updateErr := dbmap.UpdateWithChilds(dbpost)
		switch {
		case updateErr != nil:
			return errors.New("update table " + tm.TableName + " failed: " + updateErr.Error())
		case affectedrows == 0:
			return fmt.Errorf("update table %s for Id %d did not affect any lines", tm.TableName, dbpost.Id)
		}

		updatedPostsCount += dbmap.LastOpInfo.RowCount
		insertedPostsCommentCount += dbmap.LastOpInfo.ChildInsertRowCount
		updatedPostsCommentCount += dbmap.LastOpInfo.ChildUpdateRowCount

		// Persist the new comment total with a second, post-only update.
		dbpost.CommentCount += uint64(dbmap.LastOpInfo.ChildInsertRowCount)
		if _, countErr := dbmap.Update(dbpost); countErr != nil {
			return fmt.Errorf("Update for post '%s' failed: %s", dbpost.WebPostId, countErr.Error())
		}

		if DebugLevel > 2 {
			fmt.Println("----------- UPDATE POST COMMIT -----------------")
			fmt.Println(dbpost.String("UPDATE2: "))
			fmt.Println("----------- UPDATE POST END -------------------")
		}
	}

	if insertedPostsCount == 0 && updatedPostsCount == 0 {
		if DebugLevel > 2 {
			fmt.Println("No new posts found at " + geturl)
		}
		return nil
	}

	if DebugLevel > 2 {
		fmt.Printf("%d new posts have been inserted from %s\n", insertedPostsCount, geturl)
		fmt.Printf("%d posts have been updated from %s\n", updatedPostsCount, geturl)
		fmt.Printf("%d new comments have been inserted from %s\n", insertedPostsCommentCount, geturl)
		fmt.Printf("%d comments have been updated from %s\n", updatedPostsCommentCount, geturl)
		fmt.Printf("%d post errors\n", htmlPostErrorCount)
		fmt.Printf("%d comment errors\n", htmlCommentErrorCount)
	}

	return nil
}
Exemplo n.º 6
0
// HackerNewsPostScraper downloads the Hacker News front page, parses it with
// ParseHtmlHackerNews and synchronises the posts with the "posts_index_test"
// table of a local MySQL database.
//
// A parsed post whose WebPostId is not stored yet is inserted. For a post that
// is already stored the score is compared and, if it differs, score and post
// date are written back. Posts without a WebPostId or with a parse error are
// skipped. sub is only recorded as PostSub on each parsed post; the fetched
// URL does not depend on it.
//
// The first database or parse failure aborts the run and is returned.
func HackerNewsPostScraper(sub string) (err error) {
	//drivername := "postgres"
	//dsn := "user=golang password=golang dbname=golang sslmode=disable"
	//dialect := gorp.PostgresDialect{}

	drivername := "mysql"
	dsn := "golang:golang@/golang?parseTime=true"
	dialect := gorp.MySQLDialect{"InnoDB", "UTF8"}

	// connect to db using standard Go database/sql API
	db, err := sql.Open(drivername, dsn)
	if err != nil {
		return errors.New("sql.Open failed: " + err.Error())
	}
	// Register the close immediately so the handle is also released when the
	// Ping below fails.
	defer db.Close()

	// Open doesn't open a connection. Validate DSN data using ping
	if err = db.Ping(); err != nil {
		return errors.New("db.Ping failed: " + err.Error())
	}

	// construct a gorp DbMap
	dbmap := &gorp.DbMap{Db: db, Dialect: dialect}
	dbmap.DebugLevel = DebugLevel
	// Will log all SQL statements + args as they are run
	// The first arg is a string prefix to prepend to all log messages
	//dbmap.TraceOn("[gorp]", log.New(os.Stdout, "fetch:", log.Lmicroseconds))

	tablename := "posts_index_test"
	// SetKeys(true) means we have a auto increment primary key, which
	// will get automatically bound to your struct post-insert
	table := dbmap.AddTableWithName(post.Post{}, tablename)
	table.SetKeys(true, "PID")

	// create the table. in a production system you'd generally
	// use a migration tool, or create the tables via scripts
	if err = dbmap.CreateTablesIfNotExists(); err != nil {
		return errors.New("Create tables failed: " + err.Error())
	}

	// Force create all indexes for this database
	if err = dbmap.CreateIndexes(); err != nil {
		return errors.New("Create indexes failed: " + err.Error())
	}

	// Get data from hackernews
	geturl := "http://news.ycombinator.com/"
	body, err := GetHtmlBody(geturl)
	if err != nil {
		return errors.New("GetHtmlBody: " + err.Error())
	}

	// Create a new post slice and then parse the response body into ps
	ps := make([]*post.Post, 0)
	ps, err = ParseHtmlHackerNews(body, ps)
	if err != nil {
		return errors.New("ParseHtmlHackerNews: " + err.Error())
	}

	var (
		// Number of updated posts
		updatedPostsCount uint32
		// Number of new posts
		insertedPostsCount uint32
	)

	// insert rows - auto increment PKs will be set properly after the insert
	for _, htmlpost := range ps {

		if htmlpost.WebPostId == "" {
			if DebugLevel > 1 {
				fmt.Printf("WebPostId not set in %s\n", htmlpost.Title)
			}
			// Fail early, continue with next post
			continue
		}

		if htmlpost.Err != nil {
			if DebugLevel > 1 {
				fmt.Println("Single post error in " + geturl + ": " + htmlpost.Err.Error())
			}
			// Fail early, continue with next post
			continue
		}

		// Store post sub
		htmlpost.PostSub = sub

		// check if post already exists
		intSelectResult := make([]int, 0)
		postcountsql := "select count(*) from " + dbmap.Dialect.QuoteField(tablename) +
			" where WebPostId = :post_id"
		_, countErr := dbmap.Select(&intSelectResult, postcountsql, map[string]interface{}{
			"post_id": htmlpost.WebPostId,
		})
		if countErr != nil {
			return fmt.Errorf("Query: %s failed: %s\n", postcountsql, countErr.Error())
		}
		if len(intSelectResult) == 0 {
			return fmt.Errorf("Query: %s returned no result\n", postcountsql)
		}

		// New post? then insert
		if intSelectResult[0] == 0 {
			// Insert the new post into the database
			insertErr := dbmap.Insert(htmlpost)

			if DebugLevel > 2 {
				// Printed before the error check so a failing post is visible too.
				fmt.Println("----------- INSERT POST START -----------------")
				fmt.Println(htmlpost.String())
			}
			if insertErr != nil {
				return errors.New("insert into table " + dbmap.Dialect.QuoteField(tablename) + " failed: " + insertErr.Error())
			}
			if DebugLevel > 2 {
				fmt.Println("----------- INSERT POST END -------------------")
			}
			insertedPostsCount++
			continue
		}

		// Post already exists, do an update
		// Create a slice of posts to select into
		dbposts := make([]post.Post, 0)
		getpostsql := "select * from " + dbmap.Dialect.QuoteField(tablename) + " where WebPostId = :post_id"
		_, selectErr := dbmap.Select(&dbposts, getpostsql, map[string]interface{}{
			"post_id": htmlpost.WebPostId,
		})
		if selectErr != nil {
			return fmt.Errorf("Getting WebPostId %s from DB failed: %s\n", htmlpost.WebPostId, selectErr.Error())
		}
		if len(dbposts) == 0 {
			return fmt.Errorf("Query: %s returned no result\n", getpostsql)
		}
		dbpost := dbposts[0]

		// Only a changed score triggers a write for stored posts.
		if htmlpost.Score == dbpost.Score {
			continue
		}

		if DebugLevel > 2 {
			fmt.Println("----------- UPDATE POST START -----------------")
			fmt.Println(dbpost.String())
			fmt.Printf("From score %d to score %d\n", dbpost.Score, htmlpost.Score)
		}
		dbpost.Score = htmlpost.Score
		dbpost.PostDate = htmlpost.PostDate
		affectedrows, updateErr := dbmap.Update(&dbpost)
		switch {
		case updateErr != nil:
			return errors.New("update table " + tablename + " failed: " + updateErr.Error())
		case affectedrows == 0:
			return fmt.Errorf("update table %s for Id %d did not affect any lines", tablename, dbpost.Id)
		}

		updatedPostsCount++
		if DebugLevel > 2 {
			fmt.Println("----------- UPDATE POST COMMIT -----------------")
			fmt.Println(dbpost.String())
			fmt.Println("----------- UPDATE POST END -------------------")
		}
	}

	if DebugLevel > 2 {
		if insertedPostsCount == 0 && updatedPostsCount == 0 {
			fmt.Println("No new posts found at " + geturl)
		}
		if updatedPostsCount > 0 {
			fmt.Printf("%d existing posts have been updated from %s\n", updatedPostsCount, geturl)
		}
		if insertedPostsCount > 0 {
			fmt.Printf("%d new posts have been inserted from %s\n", insertedPostsCount, geturl)
		}
	}

	return nil
}