コード例 #1
0
ファイル: scrape.go プロジェクト: Newbrict/EzSkins
// getMatchInfoTitle scans the token stream for the first <title> start
// tag and returns the text token that immediately follows it. If the
// stream ends before a title is found, it logs the failure and returns
// the empty string.
func getMatchInfoTitle(z *html.Tokenizer) string {
	for {
		switch z.Next() {
		case html.ErrorToken:
			// Exhausted the document without finding a title.
			Error.Printf("Could not get title...")
			return ""
		case html.StartTagToken:
			if z.Token().Data == "title" {
				// The very next token is the title's text content.
				z.Next()
				return z.Token().Data
			}
		}
	}
}
コード例 #2
0
// parse2 walks the token stream building a Schedule: a font tag with
// class "PageHeading" updates the current date, and each white or grey
// table row (<tr bgcolor="#ffffff"/"#f5f5f5">) is parsed into one Game
// stamped with that date. EOF ends parsing and returns the schedule.
func parse2(z *html.Tokenizer) (*Schedule, error) {

	sched := &Schedule{}
	date := ""

	for {
		if tt := z.Next(); tt == html.ErrorToken {
			// End of input: hand back whatever was accumulated.
			return sched, nil
		} else if tt == html.StartTagToken {
			tok := z.Token()
			switch {
			case isTokenTagWithAttr("font", "class", "PageHeading", &tok, z):
				// The heading's text token carries the date for the
				// games that follow.
				z.Next()
				date = z.Token().Data
			case isTokenTagWithAttr("tr", "bgcolor", "#ffffff", &tok, z),
				isTokenTagWithAttr("tr", "bgcolor", "#f5f5f5", &tok, z):
				g, err := parseGame(date, z)
				if err != nil {
					return nil, err
				}
				sched.Games = append(sched.Games, g)
			}
		}
	}
}
コード例 #3
0
// advanceToTextToken consumes tokens until it reaches the next text
// token and returns a pointer to it, or nil if the stream ends first.
func advanceToTextToken(z *html.Tokenizer) *html.Token {
	for {
		switch z.Next() {
		case html.ErrorToken:
			// Ran out of input with no text token found.
			return nil
		case html.TextToken:
			tok := z.Token()
			return &tok
		}
	}
}
コード例 #4
0
// parseGame extracts one Game row from the tokenizer: game number,
// game time, the two team names, and their scores. date is the
// schedule date (in "1/2/2006" layout) that, combined with the parsed
// time, yields the game's timestamp. Returns an error when any of the
// expected markup is missing or the date/time fails to parse.
func parseGame(date string, z *html.Tokenizer) (Game, error) {
	var game Game

	// The game number is the text of the first <td>.
	if advanceToStartTag("td", z) == nil {
		return game, errors.New("Unable to find Game Number")
	}
	z.Next()
	gameNum := strings.TrimSpace(z.Token().Data)

	// The game time sits inside a <div> within the next <td>.
	if advanceToStartTag("td", z) == nil {
		return game, errors.New("Unable to find Game Time")
	}
	if advanceToStartTag("div", z) == nil {
		return game, errors.New("Unable to find Game Time")
	}
	z.Next()
	gameTime := strings.TrimSpace(z.Token().Data)
	if gameTime == "" {
		// The div's first token was not text; scan ahead to the next
		// text token.
		t := advanceToTextToken(z)
		if t == nil {
			// BUG FIX: the original dereferenced t unconditionally and
			// panicked with a nil-pointer dereference when the stream
			// ended before any text token.
			return game, errors.New("Unable to find Game Time")
		}
		gameTime = strings.TrimSpace(t.Data)
	}

	var homeTeam, homeScore, awayTeam, awayScore string

	homeTeam = parseTeamName(z)
	homeScore = parseScore(z)
	if len(homeScore) > 3 {
		// A "score" longer than 3 characters is actually the away team
		// name, meaning no scores were posted for this game; leave both
		// scores empty.
		awayTeam = homeScore
		homeScore = ""
	} else {
		awayTeam = parseTeamName(z)
		awayScore = parseScore(z)
	}

	gameDate, err := time.Parse("1/2/2006 3:04 PM", date+" "+gameTime)
	if err != nil {
		return game, err
	}

	return Game{gameDate, gameNum, homeTeam, homeScore, awayTeam, awayScore}, nil
}
コード例 #5
0
// advanceToStartTag consumes tokens until it reaches a start tag whose
// name equals tagName, returning a pointer to that token, or nil if
// the stream ends first.
func advanceToStartTag(tagName string, z *html.Tokenizer) *html.Token {
	for {
		switch z.Next() {
		case html.ErrorToken:
			// End of input: the tag was never seen.
			return nil
		case html.StartTagToken:
			if tok := z.Token(); tok.Data == tagName {
				return &tok
			}
		}
	}
}
コード例 #6
0
ファイル: scrape.go プロジェクト: Newbrict/EzSkins
// getMatchInfoBets scrapes the bets placed on a match. Each
// <span class="user"> updates the current bettor name; each div whose
// class contains "item" yields one Bet recording that bettor, the
// item name, and whether the skin is StatTrak. Returns all collected
// bets when the stream ends.
func getMatchInfoBets(z *html.Tokenizer) (bets []*Bet) {
	var bettor string

	for {
		tt := z.Next()
		if tt == html.ErrorToken {
			return
		}
		if tt != html.StartTagToken {
			continue
		}
		t := z.Token()

		switch t.Data {
		case "span":
			for _, a := range t.Attr {
				if a.Key == "class" && a.Val == "user" {
					// The bettor's name is two tokens into the span.
					z.Next()
					z.Next()
					bettor = strings.TrimSpace(z.Token().Data)
				}
			}
		case "div":
			for _, a := range t.Attr {
				if a.Key != "class" || !strings.Contains(a.Val, "item") {
					continue
				}
				z.Next()
				z.Next()
				inner := z.Token()

				// BUG FIX: guard the attribute indexing below — the
				// original indexed Attr[0] and Attr[2] unconditionally
				// and panicked with index-out-of-range on markup that
				// lacked the expected attributes. Malformed entries
				// are now skipped instead.
				if len(inner.Attr) == 0 {
					continue
				}
				// StatTrak items carry a "clreff" class marker and sit
				// four tokens deeper than plain items.
				statTrak := strings.Contains(inner.Attr[0].Val, "clreff")
				if statTrak {
					z.Next()
					z.Next()
					z.Next()
					z.Next()
					inner = z.Token()
				}
				if len(inner.Attr) < 3 {
					continue
				}
				// The item name is the third attribute's value —
				// presumably a title/alt attr; TODO confirm against
				// the live markup.
				bets = append(bets, &Bet{bettor, inner.Attr[2].Val, statTrak})
			}
		}
	}
}
コード例 #7
0
ファイル: html.go プロジェクト: hiromaily/golibs
// ParseToken is to parse token
func ParseToken(z *html.Tokenizer, tag string) {
	for {
		tt := z.Next()

		switch {
		case tt == html.ErrorToken:
			// End of the document, we're done
			return
		case tt == html.StartTagToken:
			t := z.Token()

			// check element
			checkElement(t, tag)
		}
	}
}
コード例 #8
0
ファイル: scrape.go プロジェクト: Newbrict/EzSkins
// getMatchInfoDateTime scrapes a match's date and time: the date is
// the title attribute of a <div class="half">, and the time is the
// trimmed text token that follows it. If the stream ends first, it
// logs the failure and returns two empty strings.
func getMatchInfoDateTime(z *html.Tokenizer) (matchDate, matchTime string) {
	for {
		switch z.Next() {
		case html.ErrorToken:
			Error.Printf("Could not get date and time...")
			return "", ""
		case html.StartTagToken:
			t := z.Token()
			if t.Data != "div" {
				continue
			}
			halfSeen := false
			for _, a := range t.Attr {
				if a.Key == "class" && a.Val == "half" {
					halfSeen = true
				}
				if halfSeen && a.Key == "title" {
					// Definitely the date div now: grab both the date
					// (title attr) and the time (next text token).
					matchDate = a.Val
					z.Next()
					matchTime = strings.TrimSpace(z.Token().Data)
					return
				}
			}
		}
	}
}
コード例 #9
0
ファイル: NYT.go プロジェクト: opinionated/scraper-core
// DoParse extracts the article body text from NYT article HTML.
// Phase 1 advances to the opening <p class="story-body-text
// story-content"> marker; phase 2 accumulates text (and link anchor
// text) into the article data until the <footer class="story-footer
// story-content"> marker. Returns an error if EOF is hit before
// either marker.
func (article *NYTArticle) DoParse(parser *html.Tokenizer) error {

	// Phase 1: skip ahead to the first story paragraph.
articleOpeningTagLoop:
	for {
		token := parser.Next()

		switch {
		case token == html.ErrorToken:
			// Ran out of input before the article body started.
			return fmt.Errorf("problem moving article %s to open tag", article.GetTitle())
		case token == html.StartTagToken:
			tmp := parser.Token()
			isStartArticle := tmp.Data == "p"
			if isStartArticle {
				for _, attr := range tmp.Attr {
					if attr.Key == "class" && attr.Val == "story-body-text story-content" {
						break articleOpeningTagLoop
					}
				}
			}
		}
	}

	// Phase 2: collect body text until the story footer.
	// isInParagraph gates which text tokens are appended; it starts
	// true because we are already inside the first paragraph.
	isInParagraph := true
articleClosingTagLoop:
	for {
		token := parser.Next()
		switch {
		case token == html.ErrorToken:
			// EOF before the footer marker was seen.
			return fmt.Errorf("problem scraping article %s", article.GetTitle())
		case token == html.StartTagToken:
			tmp := parser.Token()
			isEndArticle := tmp.Data == "footer"
			if isEndArticle {
				for _, attr := range tmp.Attr {
					if attr.Key == "class" && attr.Val == "story-footer story-content" {
						break articleClosingTagLoop
					}
				}
			}

			// Only paragraphs marked story-body-text re-enable text
			// collection; other <p> tags leave the flag as-is.
			if tmp.Data == "p" {
				for _, attr := range tmp.Attr {
					if attr.Key == "class" && strings.Contains(attr.Val, "story-body-text") {
						isInParagraph = true
					}
				}
				if isInParagraph {
					continue
				}
			}

			// is a link: splice the anchor's text into the body,
			// skipping "visually-hidden" helper links.
			if tmp.Data == "a" {
				shouldSkip := false
				for _, attr := range tmp.Attr {
					if attr.Key == "class" && strings.Contains(attr.Val, "visually-hidden") {
						shouldSkip = true
					}
				}

				if shouldSkip {
					continue
				}

				parser.Next()
				tmp = parser.Token()
				newBody := strings.TrimSpace(article.GetData()) + " " + strings.TrimSpace(tmp.Data) + " "
				article.SetData(newBody)
				isInParagraph = true
			}

		case token == html.EndTagToken:
			tmp := parser.Token()
			if tmp.Data == "p" {
				// Leaving a paragraph: stop collecting until the next
				// story paragraph or link re-arms the flag.
				isInParagraph = false
			}

		default:
			// Remaining token kinds (chiefly text): append while
			// inside a paragraph.
			if !isInParagraph {
				continue
			}
			tmp := parser.Token()

			newBody := article.GetData()
			// add a space on the left just in case there is a comment or something
			// NOTE(review): tmp.Data[0] would panic on an empty text
			// token — confirm the tokenizer never emits one here.
			if unicode.IsPunct(rune(tmp.Data[0])) {
				newBody = strings.TrimSpace(newBody)
			}
			newBody = newBody + strings.TrimSpace(tmp.Data)
			article.SetData(newBody)
			isInParagraph = false
		}
	}
	// Debug dump of the scraped body once the footer is reached.
	fmt.Println(article.GetData())
	return nil
}
コード例 #10
0
ファイル: Economist.go プロジェクト: opinionated/scraper-core
// DoParse extracts the article body text from Economist article HTML.
// Phase 1 advances to the opening <p class="main-content"> marker;
// phase 2 accumulates text tokens into the article data until the
// <footer class="story-footer story-content"> marker. EOF before
// either marker ends parsing without an error (best-effort scrape).
func (article *ECONArticle) DoParse(parser *html.Tokenizer) error {

	// Phase 1: skip ahead to the start-of-article marker.
	// (The article block ends with div class "content clearfix".)
articleOpeningTagLoop:
	for {
		token := parser.Next()

		switch {
		case token == html.ErrorToken:
			// BUG FIX: removed leftover debug prints (including
			// unprofessional language) that went to stdout; the
			// control flow is unchanged.
			return nil
		case token == html.StartTagToken:
			tmp := parser.Token()
			isStartArticle := tmp.Data == "p"
			if isStartArticle {
				for _, attr := range tmp.Attr {
					if attr.Key == "class" && attr.Val == "main-content" {
						break articleOpeningTagLoop
					}
				}
			}
		}
	}

	// Phase 2: collect body text until the footer marker.
	isInParagraph := true
articleClosingTagLoop:
	for {
		token := parser.Next()
		switch {
		case token == html.ErrorToken:
			// EOF before the footer: keep whatever was collected.
			return nil
		case token == html.StartTagToken:
			tmp := parser.Token()
			isEndArticle := tmp.Data == "footer"
			if isEndArticle {
				for _, attr := range tmp.Attr {
					if attr.Key == "class" && attr.Val == "story-footer story-content" {
						break articleClosingTagLoop
					}
				}
			}
			// Any start tag re-arms text collection for the next
			// text token.
			isInParagraph = true
		default:
			if !isInParagraph {
				continue
			}
			tmp := parser.Token()

			// Append the trimmed text token to the accumulated body.
			newBody := article.GetData() + strings.TrimSpace(tmp.Data)
			article.SetData(newBody)
			isInParagraph = false
		}
	}
	// Debug dump of the scraped body once the footer is reached.
	fmt.Println(article.GetData())
	return nil
}
コード例 #11
0
ファイル: WSJScraper.go プロジェクト: jpatsenker/Opinionated
// DoParse extracts the article body text from WSJ article HTML in four
// phases: advance to the <article> tag, then to the byline header
// (<div class="clearfix byline-wrap">), then to the first <p>, and
// finally accumulate paragraph text (tracking nesting depth) until the
// depth drops below the article level. EOF at any phase returns nil.
func (article *WSJArticle) DoParse(parser *html.Tokenizer) error {

	// find the start of the article
	// starts at the top of the html body, ends at the article tag
articleTagLoop:
	for {
		token := parser.Next()

		switch {
		case token == html.ErrorToken:
			fmt.Println("OH NOSE!!!! ERROR before we hit the end")
			return nil
		case token == html.StartTagToken:
			tmp := parser.Token()

			isStartArticle := tmp.Data == "article"
			if isStartArticle {
				break articleTagLoop
			}
		}
	}

	// find the article header, which has author, time etc
	// starts at the article tag, ends at the article header
	// TODO: get author info and such here
articleStartLoop:
	for {
		token := parser.Next()

		switch {
		case token == html.ErrorToken:
			return nil
		case token == html.StartTagToken:
			tmp := parser.Token()

			isStartArticleBody := tmp.Data == "div"
			// loop until we are at the first paragraph of the article body
			if isStartArticleBody {
				// Re-verify: only a div with the byline-wrap class
				// counts as the header.
				isStartArticleBody = false
				for _, attr := range tmp.Attr {
					if attr.Key == "class" && attr.Val == "clearfix byline-wrap" {
						isStartArticleBody = true
						break
					}
				}
				if isStartArticleBody {
					break articleStartLoop
				}
			}
		}
	}

	// find the start of the article
	// starts at the end of the article header, ends at the first article paragraph
articleBodyStartLoop:
	for {
		token := parser.Next()
		switch {
		case token == html.ErrorToken:
			return nil
		case token == html.StartTagToken:
			tmp := parser.Token()
			isStartArticleBody := tmp.Data == "p"
			if isStartArticleBody {
				break articleBodyStartLoop
			}
		}
	}

	// pull the article out of the html
	// starts at first paragraph, returns at the end of the article
	// depth counts open tags relative to the article body; when it
	// reaches -1 we have closed past the article and are done.
	isInParagraph := true // true because we start inside the first paragraph
	depth := 1            // one because this loop starts at first paragraph
	for {
		token := parser.Next()
		switch {
		case token == html.ErrorToken:
			fmt.Println("hit err, depth is:", depth)
			return nil
		case token == html.StartTagToken:
			depth++
			tmp := parser.Token()

			isParagraph := tmp.Data == "p"
			if isParagraph {
				// start of a new paragraph
				// NOTE(review): the message says depth != 0 but the
				// check is depth != 1 — confirm which invariant was
				// intended.
				if depth != 1 {
					fmt.Println("ERROR: hit new paragraph while depth != 0")
				}
				if isInParagraph {
					fmt.Println("ERROR: hit unexpected new paragraph tag while in paragraph")
				}
				isInParagraph = true
			}

			// text can have embeded links
			isLink := tmp.Data == "a"
			if isLink {
				if !isInParagraph {
					fmt.Println("ERROR: hit unexpected link outside of a paragraph")
					continue
				}

				// if we are in a paragraph, append the link name
				parser.Next()
				tmp = parser.Token()
				newBody := article.GetData() + tmp.Data
				article.SetData(newBody)
			}
		case token == html.EndTagToken:
			depth--
			tmp := parser.Token().Data
			if depth == -1 {
				// done with article when we are at a higher level than it
				return nil
			}

			if tmp == "p" {
				// add a paragraph and trim the space
				// NOTE(review): TrimSpace wraps the whole string, so
				// the appended "\n" is trimmed right back off —
				// confirm whether a paragraph break was intended.
				article.SetData(strings.TrimSpace(article.GetData() + "\n"))
				isInParagraph = false
			}

		default:
			if !isInParagraph {
				// if not inside a text paragraph, continue on
				continue
			}

			// get the paragraph text and append it to the article body
			// TODO: look into using a string builder instead of adding things on
			tmp := parser.Token()
			newBody := article.GetData()
			// add a space on the left just in case there is a comment or something
			newBody = newBody + strings.TrimSpace(tmp.Data) + " "

			article.SetData(newBody)
		}

	}
	// NOTE(review): unreachable — the loop above only exits via the
	// return statements inside it.
	return nil
}