func getMatchInfoTitle(z *html.Tokenizer) string {
	eof := false
	for !eof {
		tt := z.Next()
		switch {
		case tt == html.ErrorToken:
			eof = true
		case tt == html.StartTagToken:
			t := z.Token()
			// Check if the token is a <title> tag
			isTitle := t.Data == "title"
			if isTitle {
				z.Next()
				// This is the title
				return z.Token().Data
			}
		}
	}
	// If we reached here something went wrong :^(
	Error.Printf("Could not get title...")
	return ""
}

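// Error is used by several parsers in this section but is not declared here. A
// typical setup, assuming it is a plain *log.Logger from the standard library
// (an assumption, not the original declaration):
var Error = log.New(os.Stderr, "ERROR: ", log.LstdFlags)
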
func parse2(z *html.Tokenizer) (*Schedule, error) {
	schedule := &Schedule{}
	currentDate := ""
	for {
		tt := z.Next()
		switch tt {
		case html.ErrorToken:
			// End of input: return whatever was collected.
			return schedule, nil
		case html.StartTagToken:
			t := z.Token()
			if isTokenTagWithAttr("font", "class", "PageHeading", &t, z) {
				// A PageHeading holds the date that applies to the rows that follow.
				z.Next()
				currentDate = z.Token().Data
			} else if isTokenTagWithAttr("tr", "bgcolor", "#ffffff", &t, z) || isTokenTagWithAttr("tr", "bgcolor", "#f5f5f5", &t, z) {
				// Game rows alternate between these two background colors.
				game, err := parseGame(currentDate, z)
				if err != nil {
					return nil, err
				}
				schedule.Games = append(schedule.Games, game)
			}
		}
	}
}

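// parse2 leans on isTokenTagWithAttr, which is not shown in this section. A minimal
// sketch of what it might look like follows; the tokenizer parameter is kept only to
// match the call sites above and is unused here, so the real helper may differ.
func isTokenTagWithAttr(tagName, attrKey, attrVal string, t *html.Token, z *html.Tokenizer) bool {
	if t.Data != tagName {
		return false
	}
	for _, a := range t.Attr {
		if a.Key == attrKey && a.Val == attrVal {
			return true
		}
	}
	return false
}
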
func advanceToTextToken(z *html.Tokenizer) *html.Token {
	for {
		tt := z.Next()
		switch tt {
		case html.ErrorToken:
			return nil
		case html.TextToken:
			t := z.Token()
			return &t
		}
	}
}

func parseGame(date string, z *html.Tokenizer) (Game, error) {
	var game Game

	// First <td> holds the game number.
	td := advanceToStartTag("td", z)
	if td == nil {
		return game, errors.New("unable to find game number")
	}
	z.Next()
	gameNum := strings.TrimSpace(z.Token().Data)

	// Second <td> wraps a <div> holding the game time.
	td = advanceToStartTag("td", z)
	if td == nil {
		return game, errors.New("unable to find game time")
	}
	td = advanceToStartTag("div", z)
	if td == nil {
		return game, errors.New("unable to find game time")
	}
	z.Next()
	gameTime := strings.TrimSpace(z.Token().Data)
	if gameTime == "" {
		// The time text is sometimes nested one token deeper.
		t := advanceToTextToken(z)
		if t == nil {
			return game, errors.New("unable to find game time")
		}
		gameTime = strings.TrimSpace(t.Data)
	}

	var homeTeam, homeScore, awayTeam, awayScore string
	skipAwayScore := false
	homeTeam = parseTeamName(z)
	homeScore = parseScore(z)
	// A "score" longer than three characters is actually the away team name,
	// which means no scores have been posted for this game yet.
	if len(homeScore) > 3 {
		awayTeam = homeScore
		homeScore = ""
		skipAwayScore = true
	} else {
		awayTeam = parseTeamName(z)
	}
	if !skipAwayScore {
		awayScore = parseScore(z)
	} else {
		awayScore = ""
	}

	gameDate, err := time.Parse("1/2/2006 3:04 PM", date+" "+gameTime)
	if err != nil {
		return game, err
	}
	return Game{gameDate, gameNum, homeTeam, homeScore, awayTeam, awayScore}, nil
}

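// parseGame builds a Game value and parse2 appends it to Schedule.Games, but neither
// type is defined in this section. The struct sketches below are inferred from the
// positional literal Game{gameDate, gameNum, ...}; field names are assumptions.
type Game struct {
	Date      time.Time
	Number    string
	HomeTeam  string
	HomeScore string
	AwayTeam  string
	AwayScore string
}

type Schedule struct {
	Games []Game
}

// parseTeamName and parseScore are also not shown. A plausible sketch, assuming each
// value sits in the text of the next <td> cell, mirroring how parseGame reads the
// game number; the real helpers may traverse differently.
func parseTeamName(z *html.Tokenizer) string {
	if advanceToStartTag("td", z) == nil {
		return ""
	}
	t := advanceToTextToken(z)
	if t == nil {
		return ""
	}
	return strings.TrimSpace(t.Data)
}

func parseScore(z *html.Tokenizer) string {
	// Same traversal as parseTeamName in this sketch; scores sit in their own cell.
	return parseTeamName(z)
}
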
func advanceToStartTag(tagName string, z *html.Tokenizer) *html.Token {
	for {
		tt := z.Next()
		switch tt {
		case html.ErrorToken:
			return nil
		case html.StartTagToken:
			t := z.Token()
			if t.Data == tagName {
				return &t
			}
		}
	}
}

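// Usage sketch: the helpers above all expect a *html.Tokenizer from
// golang.org/x/net/html, which also needs net/http here. The function name and URL
// are placeholders, not part of the original code.
func fetchSchedule() (*Schedule, error) {
	resp, err := http.Get("https://example.com/schedule") // placeholder URL
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	return parse2(html.NewTokenizer(resp.Body))
}
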
func getMatchInfoBets(z *html.Tokenizer) (bets []*Bet) {
	var bettor string
	var item string
	var statTrak bool
	eof := false
	for !eof {
		tt := z.Next()
		switch {
		case tt == html.ErrorToken:
			eof = true
		case tt == html.StartTagToken:
			t := z.Token()
			isDiv := t.Data == "div"
			isSpan := t.Data == "span"
			if isSpan {
				for _, a := range t.Attr {
					if a.Key == "class" && a.Val == "user" {
						// The bettor name is two tokens into the <span class="user"> block.
						z.Next()
						z.Next()
						t := z.Token()
						bettor = strings.TrimSpace(t.Data)
					}
				}
			}
			if isDiv {
				for _, a := range t.Attr {
					if a.Key == "class" && strings.Contains(a.Val, "item") {
						z.Next()
						z.Next()
						t = z.Token()
						// Get StatTrak status
						statTrak = strings.Contains(t.Attr[0].Val, "clreff")
						if statTrak {
							// Skip past the StatTrak effect markup to reach the item itself.
							z.Next()
							z.Next()
							z.Next()
							z.Next()
							t = z.Token()
						}
						item = t.Attr[2].Val
						thisBet := &Bet{bettor, item, statTrak}
						bets = append(bets, thisBet)
					}
				}
			}
		}
	}
	return
}

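// Bet is constructed above as &Bet{bettor, item, statTrak} but is not defined in this
// section. A minimal sketch inferred from that literal; field names are assumptions.
type Bet struct {
	Bettor   string
	Item     string
	StatTrak bool
}
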
// ParseToken walks the token stream and passes every start tag to checkElement
// for comparison against the requested tag name.
func ParseToken(z *html.Tokenizer, tag string) {
	for {
		tt := z.Next()
		switch {
		case tt == html.ErrorToken:
			// End of the document, we're done
			return
		case tt == html.StartTagToken:
			t := z.Token()
			// check element
			checkElement(t, tag)
		}
	}
}

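// checkElement is not shown in this section. A minimal sketch, assuming it simply
// reports whether the start tag matches the requested tag name; the real helper
// may do more.
func checkElement(t html.Token, tag string) {
	if t.Data == tag {
		fmt.Printf("found <%s> with %d attribute(s)\n", tag, len(t.Attr))
	}
}
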
func getMatchInfoDateTime(z *html.Tokenizer) (matchDate, matchTime string) {
	eof := false
	for !eof {
		tt := z.Next()
		switch {
		case tt == html.ErrorToken:
			eof = true
		case tt == html.StartTagToken:
			t := z.Token()
			isDiv := t.Data == "div"
			if isDiv {
				possibleDate := false
				for _, a := range t.Attr {
					if a.Key == "class" && a.Val == "half" {
						possibleDate = true
					}
					if possibleDate && a.Key == "title" {
						// Definitely a date now, grab both date and time
						matchDate = a.Val
						z.Next()
						matchTime = z.Token().Data
						// Trim the whitespace around time
						matchTime = strings.TrimSpace(matchTime)
						return
					}
				}
			}
		}
	}
	Error.Printf("Could not get date and time...")
	return "", ""
}

func (article *NYTArticle) DoParse(parser *html.Tokenizer) error {
articleOpeningTagLoop:
	for {
		token := parser.Next()
		switch {
		case token == html.ErrorToken:
			return fmt.Errorf("problem moving article %s to open tag", article.GetTitle())
		case token == html.StartTagToken:
			tmp := parser.Token()
			isStartArticle := tmp.Data == "p"
			if isStartArticle {
				for _, attr := range tmp.Attr {
					if attr.Key == "class" && attr.Val == "story-body-text story-content" {
						break articleOpeningTagLoop
					}
				}
			}
		}
	}

	isInParagraph := true
articleClosingTagLoop:
	for {
		token := parser.Next()
		switch {
		case token == html.ErrorToken:
			return fmt.Errorf("problem scraping article %s", article.GetTitle())
		case token == html.StartTagToken:
			tmp := parser.Token()
			isEndArticle := tmp.Data == "footer"
			if isEndArticle {
				for _, attr := range tmp.Attr {
					if attr.Key == "class" && attr.Val == "story-footer story-content" {
						break articleClosingTagLoop
					}
				}
			}
			if tmp.Data == "p" {
				for _, attr := range tmp.Attr {
					if attr.Key == "class" && strings.Contains(attr.Val, "story-body-text") {
						isInParagraph = true
					}
				}
				if isInParagraph {
					continue
				}
			}
			// is a link
			if tmp.Data == "a" {
				shouldSkip := false
				for _, attr := range tmp.Attr {
					if attr.Key == "class" && strings.Contains(attr.Val, "visually-hidden") {
						shouldSkip = true
					}
				}
				if shouldSkip {
					continue
				}
				parser.Next()
				tmp = parser.Token()
				newBody := strings.TrimSpace(article.GetData()) + " " + strings.TrimSpace(tmp.Data) + " "
				article.SetData(newBody)
				isInParagraph = true
			}
		case token == html.EndTagToken:
			tmp := parser.Token()
			if tmp.Data == "p" {
				isInParagraph = false
			}
		default:
			if !isInParagraph {
				continue
			}
			tmp := parser.Token()
			newBody := article.GetData()
			// if the new text starts with punctuation, attach it directly by
			// trimming the trailing space left after a link
			if len(tmp.Data) > 0 && unicode.IsPunct(rune(tmp.Data[0])) {
				newBody = strings.TrimSpace(newBody)
			}
			newBody = newBody + strings.TrimSpace(tmp.Data)
			article.SetData(newBody)
			isInParagraph = false
		}
	}
	fmt.Println(article.GetData())
	return nil
}

func (article *ECONArticle) DoParse(parser *html.Tokenizer) error {
	// ENDS WITH div class content clearfix everywhere
articleOpeningTagLoop:
	for {
		token := parser.Next()
		switch {
		case token == html.ErrorToken:
			fmt.Println("hit error token before finding the article start")
			return nil
		case token == html.StartTagToken:
			tmp := parser.Token()
			isStartArticle := tmp.Data == "p"
			if isStartArticle {
				for _, attr := range tmp.Attr {
					if attr.Key == "class" && attr.Val == "main-content" {
						fmt.Println("found article start")
						break articleOpeningTagLoop
					}
				}
			}
		}
	}

	isInParagraph := true
articleClosingTagLoop:
	for {
		token := parser.Next()
		switch {
		case token == html.ErrorToken:
			fmt.Println("hit error token before finding the article end")
			return nil
		case token == html.StartTagToken:
			tmp := parser.Token()
			isEndArticle := tmp.Data == "footer"
			if isEndArticle {
				for _, attr := range tmp.Attr {
					if attr.Key == "class" && attr.Val == "story-footer story-content" {
						fmt.Println("found article end")
						break articleClosingTagLoop
					}
				}
			}
			isInParagraph = true
		default:
			if !isInParagraph {
				continue
			}
			tmp := parser.Token()
			newBody := article.GetData()
			// add a space on the left just in case there is a comment or something
			newBody = newBody + strings.TrimSpace(tmp.Data)
			article.SetData(newBody)
			isInParagraph = false
			//fmt.Println("Next p", newBody)
		}
	}
	fmt.Println(article.GetData())
	return nil
}

func (article *WSJArticle) DoParse(parser *html.Tokenizer) error {
	// find the start of the article
	// starts at the top of the html body, ends at the article tag
articleTagLoop:
	for {
		token := parser.Next()
		switch {
		case token == html.ErrorToken:
			fmt.Println("hit error token before finding the article tag")
			return nil
		case token == html.StartTagToken:
			tmp := parser.Token()
			isStartArticle := tmp.Data == "article"
			if isStartArticle {
				break articleTagLoop
			}
		}
	}

	// find the article header, which has author, time etc
	// starts at the article tag, ends at the article header
	// TODO: get author info and such here
articleStartLoop:
	for {
		token := parser.Next()
		switch {
		case token == html.ErrorToken:
			return nil
		case token == html.StartTagToken:
			tmp := parser.Token()
			isStartArticleBody := tmp.Data == "div"
			// loop until we are at the first paragraph of the article body
			if isStartArticleBody {
				isStartArticleBody = false
				for _, attr := range tmp.Attr {
					if attr.Key == "class" && attr.Val == "clearfix byline-wrap" {
						isStartArticleBody = true
						break
					}
				}
				if isStartArticleBody {
					break articleStartLoop
				}
			}
		}
	}

	// find the start of the article body
	// starts at the end of the article header, ends at the first article paragraph
articleBodyStartLoop:
	for {
		token := parser.Next()
		switch {
		case token == html.ErrorToken:
			return nil
		case token == html.StartTagToken:
			tmp := parser.Token()
			isStartArticleBody := tmp.Data == "p"
			if isStartArticleBody {
				break articleBodyStartLoop
			}
		}
	}

	// pull the article out of the html
	// starts at first paragraph, returns at the end of the article
	isInParagraph := true // true because we start inside the first paragraph
	depth := 1            // one because this loop starts at the first paragraph
	for {
		token := parser.Next()
		switch {
		case token == html.ErrorToken:
			fmt.Println("hit err, depth is:", depth)
			return nil
		case token == html.StartTagToken:
			depth++
			tmp := parser.Token()
			isParagraph := tmp.Data == "p"
			if isParagraph {
				// start of a new paragraph
				if depth != 1 {
					fmt.Println("ERROR: hit new paragraph while depth != 1")
				}
				if isInParagraph {
					fmt.Println("ERROR: hit unexpected new paragraph tag while in paragraph")
				}
				isInParagraph = true
			}
			// text can have embedded links
			isLink := tmp.Data == "a"
			if isLink {
				if !isInParagraph {
					fmt.Println("ERROR: hit unexpected link outside of a paragraph")
					continue
				}
				// if we are in a paragraph, append the link text
				parser.Next()
				tmp = parser.Token()
				newBody := article.GetData() + tmp.Data
				article.SetData(newBody)
			}
		case token == html.EndTagToken:
			depth--
			tmp := parser.Token().Data
			if depth == -1 {
				// done with the article once we are at a higher level than it
				return nil
			}
			if tmp == "p" {
				// end the paragraph: trim the trailing space, then add the break
				article.SetData(strings.TrimSpace(article.GetData()) + "\n")
				isInParagraph = false
			}
		default:
			if !isInParagraph {
				// if not inside a text paragraph, continue on
				continue
			}
			// get the paragraph text and append it to the article body
			// TODO: look into using a strings.Builder instead of concatenating
			tmp := parser.Token()
			newBody := article.GetData()
			// add a space on the right so adjacent text nodes do not run together
			newBody = newBody + strings.TrimSpace(tmp.Data) + " "
			article.SetData(newBody)
		}
	}
	return nil
}
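
// The three DoParse methods above rely on GetTitle, GetData, and SetData, which are
// not shown here. A minimal sketch of the kind of embedded type that could back them;
// the names and layout are assumptions, not the original definitions.
type articleBody struct {
	title string
	data  string
}

func (a *articleBody) GetTitle() string { return a.title }
func (a *articleBody) GetData() string  { return a.data }
func (a *articleBody) SetData(s string) { a.data = s }

// Usage sketch, assuming NYTArticle embeds something like articleBody and resp.Body
// holds the article HTML:
//
//	article := &NYTArticle{}
//	err := article.DoParse(html.NewTokenizer(resp.Body))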