Пример #1
0
// ParseHtml extracts reddit posts from the HTML read from io and appends one
// post per element with class "thing" to ps, which is then returned.
//
// If the goquery document cannot be built, the error is handed to checkErr and
// returned together with the unmodified ps. Problems that concern a single
// post (missing "data-fullname" or "datetime" attribute) are stored in that
// post's Err field; such a post is appended to the result nevertheless.
//
// Every completely parsed post is printed to standard output.
//
// Note: the parameter name io shadows the io package inside the body, and the
// named result psout is never assigned - the function always returns ps.
func ParseHtml(io io.Reader, ps []post.Post) (psout []post.Post, err error) {

	// Create a goquery document to parse from
	doc, err := goquery.NewDocumentFromReader(io)
	checkErr(err, "Failed to parse HTML")
	if err != nil {
		return ps, err
	}

	fmt.Println("---- Starting to parse ------------------------")

	// Find reddit posts = elements with class "thing"
	things := doc.Find(".thing")
	for i := range things.Nodes {

		p := post.NewPost()

		// single is a selection holding exactly one "thing" node
		single := things.Eq(i)

		// Get the reddit post identifier
		postID, ok := single.Attr("data-fullname")
		if !ok {
			// The HTML is only used for the error text, so a rendering
			// failure is deliberately ignored here.
			html, _ := single.Html()
			p.Err = fmt.Errorf("data-fullname not found in %s", html)
			ps = append(ps, p)
			continue
		}

		// Find an element with class title and a child with class may-blank,
		// then remove CRLF and unnecessary whitespace.
		p.Title = stringMinifier(single.Find(".title .may-blank").Text())

		p.PostId = postID
		p.User = single.Find(".author").Text()
		// A missing href leaves Url empty; this is not treated as an error.
		p.Url, _ = single.Find(".comments.may-blank").Attr("href")
		p.SetScore(single.Find(".score.likes").Text())

		postDate, ok := single.Find("time").Attr("datetime")
		if !ok {
			// Best effort, see above.
			html, _ := single.Html()
			p.Err = fmt.Errorf("datetime not found in %s", html)
			ps = append(ps, p)
			continue
		}
		p.SetPostDate(postDate)

		// Print out the crawled info. The result of String() has to be passed
		// to Println - calling String() on its own prints nothing.
		fmt.Println(p.String())
		fmt.Println("-----------------------------------------------")

		ps = append(ps, p)
	}

	return ps, nil
}
// ParseHtmlHackerNews parses hackernews posts from the HTML read from body and
// appends a pointer to one post per element with class "athing" to ps, which
// is then returned.
//
// A non-nil error is returned only when the HTML document itself cannot be
// parsed. Errors which affect only a single post are stored in that post's
// Err field; the post is added to the returned slice nevertheless.
//
// The named result psout is never assigned - the function always returns ps.
func ParseHtmlHackerNews(body io.Reader, ps []*post.Post) (psout []*post.Post, err error) {

	// Create a goquery document to parse from an io.Reader
	doc, err := goquery.NewDocumentFromReader(body)
	if err != nil {
		return ps, errors.New("Failed to parse HTML: " + err.Error())
	}

	// Find hackernews posts = elements with class "athing"
	things := doc.Find(".athing")
	for iThing := range things.Nodes {

		// Create a new post struct - if the crawling fails the post will have
		// an Err attached but stays in the outgoing slice nevertheless.
		// p is a fresh variable per iteration, so every stored pointer is distinct.
		p := post.NewPost()
		p.Site = "hackernews"
		ps = append(ps, &p)

		// article is a selection of one single post
		article := things.Eq(iThing)

		// Get the next element containing additional info for this post.
		// Next() never returns nil; a missing sibling shows up as an empty selection.
		scorenode := article.Next()
		if scorenode.Size() == 0 {
			errhtml, _ := article.Html()
			p.Err = fmt.Errorf("Did not find next sibling for: %s\n", errhtml)
			continue
		}

		titleLink := article.Find(".title a").First()
		if titleLink.Size() == 0 {
			errhtml, _ := article.Html()
			p.Err = fmt.Errorf("Did not find title for: %s\n", errhtml)
			continue
		}

		p.Title = titleLink.Text()
		link, ok := titleLink.Attr("href")
		if !ok {
			// Record the problem but keep collecting the remaining fields.
			errhtml, _ := titleLink.Html()
			p.Err = fmt.Errorf("href not found in %s\n", errhtml)
		}
		p.Url = stringMinifier(link)

		// Links that do not start with "http" are treated as relative to hackernews
		if !strings.HasPrefix(p.Url, "http") {
			p.Url = "https://news.ycombinator.com/" + p.Url
		}

		if DebugLevel > 2 {
			fmt.Printf("**** URL post.Url: %s\n", p.Url)
		}

		if DebugLevel > 3 {
			fmt.Printf("---------------------------\n")
			html, _ := scorenode.Html()
			fmt.Printf("HTML: %s\n", html)
			fmt.Printf("---------------------------\n")
		}

		// Get the score
		scoretag := scorenode.Find(".subtext .score").First()
		if scoretag.Size() == 0 {
			// Report the HTML of the node; formatting the selection itself
			// would only print pointer values.
			errhtml, _ := scorenode.Html()
			p.Err = fmt.Errorf("Did not find score for: %v\n", errhtml)
			continue
		}

		if DebugLevel > 3 {
			fmt.Printf("------- SCORE -------------\n")
			html, _ := scoretag.Html()
			fmt.Printf("HTML: %s\n", html)
			fmt.Printf("TEXT: %s\n", scoretag.Text())
			fmt.Printf("---------------------------\n")
		}

		// The score is the first space separated token of the tag text.
		// strings.Split always yields at least one element, so [0] is safe.
		p.SetScore(strings.Split(scoretag.Text(), " ")[0])

		postid, ok := scoretag.Attr("id")
		if !ok {
			errhtml, _ := scoretag.Html()
			p.Err = fmt.Errorf("Did not find postid in %s\n", errhtml)
			continue
		}

		if DebugLevel > 3 {
			fmt.Printf("------- POST ID -----------\n")
			fmt.Printf("TEXT: %s\n", postid)
			fmt.Printf("---------------------------\n")
		}

		// The web post id is the part after the first underscore of the id
		// attribute. Guard the index so an unexpected id cannot panic.
		idParts := strings.Split(postid, "_")
		if len(idParts) < 2 {
			p.Err = fmt.Errorf("Unexpected postid format: %s\n", postid)
			continue
		}
		p.WebPostId = idParts[1]

		// Get the username and postdate
		hrefs := scorenode.Find(".subtext a")
		if hrefs.Size() == 0 {
			errhtml, _ := scorenode.Html()
			p.Err = fmt.Errorf("Did not find user and date in %s\n", errhtml)
			continue
		}

		for i := range hrefs.Nodes {
			href := hrefs.Eq(i)
			t, _ := href.Html()
			s, exists := href.Attr("href")
			if exists {
				if strings.HasPrefix(s, "user?id") {
					p.User = t
					continue
				}
				if strings.HasPrefix(s, "item?id") && strings.Contains(t, "ago") {
					postDate, dateErr := GetDateFromCreatedAgo(t)
					if dateErr != nil {
						p.Err = fmt.Errorf("Failed to convert to date: %s: %s\n", t, dateErr.Error())
						continue
					}
					// Only the date is stored here; an error recorded earlier
					// for this post must not be reset.
					p.PostDate = postDate
				}
			}
			if DebugLevel > 3 {
				fmt.Printf("------- HREF --------------\n")
				fmt.Printf("TEXT: %s\n", t)
				fmt.Printf("HREF: %s\n", s)
				fmt.Printf("---------------------------\n")
			}
		}

		// Comments are only fetched for posts that parsed cleanly. A failure
		// concerns this post only and is therefore kept in its Err field.
		if p.Err == nil {
			if commentsErr := ParseHtmlComments(&p); commentsErr != nil && p.Err == nil {
				p.Err = commentsErr
			}
		}
		if DebugLevel > 2 && p.Err == nil {
			fmt.Printf("------ POST DUMP -----------\n")
			fmt.Print(p.String("PARSED: "))
			fmt.Printf("------ POST DUMP END -------\n")
		}

	}

	return ps, nil
}
Пример #3
0
// RedditPostScraper downloads http://www.reddit.com/r/<sub>/new, parses the
// page with ParseHtmlReddit and inserts every post whose PostId is not yet
// present into the MySQL table "posts". The table is created if it does not
// exist. Newly inserted posts are printed to standard output.
//
// Posts carrying a per-post error (post.Err) are reported on standard output
// and skipped. Any database, HTTP or page level parse failure aborts the run
// and is returned as an error.
//
// NOTE(review): the DSN including its credentials is hard-coded below.
func RedditPostScraper(sub string) (err error) {

	// connect to db using standard Go database/sql API
	//db, err := sql.Open("mysql", "user:password@/dbname")
	db, err := sql.Open("mysql", "golang:golang@/golang")
	if err != nil {
		return errors.New("sql.Open failed: " + err.Error())
	}
	// Register the close right away so the handle is also released on the
	// early error returns below (e.g. a failing Ping).
	defer db.Close()

	// Open doesn't open a connection. Validate DSN data:
	if err = db.Ping(); err != nil {
		return errors.New("db.Ping failed: " + err.Error())
	}

	// construct a gorp DbMap
	dbmap := &gorp.DbMap{Db: db, Dialect: gorp.MySQLDialect{"InnoDB", "UTF8"}}

	// register the structs you wish to use with gorp
	// you can also use the shorter dbmap.AddTable() if you
	// don't want to override the table name
	//
	// SetKeys(true) means we have a auto increment primary key, which
	// will get automatically bound to your struct post-insert
	table := dbmap.AddTableWithName(post.Post{}, "posts")
	table.SetKeys(true, "Id")
	// Site: varchar(32), not null
	table.ColMap("Site").SetMaxSize(32)
	table.ColMap("Site").SetNotNull(true)
	// PostId: varchar(32), not null, with a unique index
	table.ColMap("PostId").SetMaxSize(32)
	table.ColMap("PostId").SetNotNull(true)
	table.ColMap("PostId").SetUnique(true)

	// create the table. in a production system you'd generally
	// use a migration tool, or create the tables via scripts
	if err = dbmap.CreateTablesIfNotExists(); err != nil {
		return errors.New("Create table 'posts' failed: " + err.Error())
	}

	// Get data from reddit
	geturl := "http://www.reddit.com/r/" + sub + "/new"
	resp, err := http.Get(geturl)
	if err != nil {
		return errors.New("Failed to http.Get from " + geturl + ": " + err.Error())
	}
	// With a nil error the response and its body are guaranteed to be non-nil,
	// so the body can be closed unconditionally.
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		httperr := fmt.Sprintf("Failed to http.Get from %s: Http Status code: %d: Msg: %s", geturl, resp.StatusCode, resp.Status)
		return errors.New(httperr)
	}

	// Create a new post slice and then parse the response body into ps
	ps := make([]post.Post, 0)
	ps, err = ParseHtmlReddit(resp.Body, ps)
	if err != nil {
		return errors.New("Error in RedditParseHtml: " + geturl + ": " + err.Error())
	}
	foundnewposts := false

	// insert rows - auto increment PKs will be set properly after the insert
	for _, p := range ps {
		if p.Err != nil {
			fmt.Println("Single post error in " + geturl + ": " + p.Err.Error())
			continue
		}

		// check if post already exists
		count, selErr := dbmap.SelectInt("select count(*) from posts where PostId = ?", p.PostId)
		if selErr != nil {
			return errors.New("select count(*) from posts failed: " + selErr.Error())
		}
		if count != 0 {
			continue
		}

		foundnewposts = true
		if insErr := dbmap.Insert(&p); insErr != nil {
			return errors.New("insert into table posts failed: " + insErr.Error())
		}

		// Print out the crawled info
		fmt.Println(p.String())
		fmt.Println("-----------------------------------------------")
	}
	if !foundnewposts {
		fmt.Println("No new posts found at " + geturl)
	}

	return nil
}