// parse2 walks the tokenized schedule page and collects games.
//
// It tracks the most recently seen date heading (<font class="PageHeading">)
// and treats every <tr> with bgcolor #ffffff or #f5f5f5 as a game row,
// delegating row parsing to parseGame with the current date string.
// Returns the (possibly empty) schedule at end of input; only parseGame
// errors are propagated.
func parse2(z *html.Tokenizer) (*Schedule, error) {
	schedule := &Schedule{}
	currentDate := ""
	for {
		tt := z.Next()
		switch tt {
		case html.ErrorToken:
			// End of document (or read error): return what we have so far.
			return schedule, nil
		case html.StartTagToken:
			t := z.Token()
			if isTokenTagWithAttr("font", "class", "PageHeading", &t, z) {
				// The heading's text node carries the date used by all
				// following game rows until the next heading.
				z.Next()
				currentDate = z.Token().Data
			} else if isTokenTagWithAttr("tr", "bgcolor", "#ffffff", &t, z) || isTokenTagWithAttr("tr", "bgcolor", "#f5f5f5", &t, z) {
				// Both alternating row colors denote game rows.
				game, err := parseGame(currentDate, z)
				if err != nil {
					return nil, err
				}
				schedule.Games = append(schedule.Games, game)
			}
		}
	}
}
func getMatchInfoTitle(z *html.Tokenizer) string { eof := false for !eof { tt := z.Next() switch { case tt == html.ErrorToken: eof = true case tt == html.StartTagToken: t := z.Token() // Check if the token is a <title> tag isTitle := t.Data == "title" if isTitle { z.Next() // This is the title return z.Token().Data } } } // If we reached here something went wrong :^( Error.Printf("Could not get title...") return "" }
// skipSubtreeIfUicRemove reports whether the current element carries the
// UicRemove attribute and, if so, consumes its entire subtree so the
// caller can drop it from the output.
//
// Returns false immediately when the attribute is absent. A self-closing
// tag has no subtree, so it returns true right away. Otherwise the
// tokenizer is advanced, balancing start/end tags (self-closing void tags
// do not increase depth), until the matching end tag (depth < 0) or end
// of input is reached.
func skipSubtreeIfUicRemove(z *html.Tokenizer, tt html.TokenType, tagName string, attrs []html.Attribute) bool {
	_, foundRemoveTag := getAttr(attrs, UicRemove)
	if !foundRemoveTag {
		return false
	}
	if isSelfClosingTag(tagName, tt) {
		return true
	}
	depth := 0
	for {
		tt := z.Next()
		tag, _ := z.TagName()
		switch {
		case tt == html.ErrorToken:
			// Unbalanced/truncated input: treat the rest as skipped.
			return true
		case tt == html.StartTagToken && !isSelfClosingTag(string(tag), tt):
			depth++
		case tt == html.EndTagToken:
			depth--
			if depth < 0 {
				// Closed the element we started inside.
				return true
			}
		}
	}
}
// getQ skips four tokens and sends the text at the resulting position
// on ch.
//
// NOTE(review): the tokenizer is passed by value, so the caller's
// tokenizer does not observe the position advance — confirm this is
// intentional before reusing the caller's tokenizer afterwards. The fixed
// number of Next() calls assumes an exact markup shape; verify against
// the page being parsed.
func getQ(tknzer html.Tokenizer, ch chan string) {
	tknzer.Next()
	tknzer.Next()
	tknzer.Next()
	tknzer.Next()
	ch <- string(tknzer.Text())
}
// readResgiterNowurl reads the href of the next tag into registerNowURL.
// (The method name keeps its historical spelling; Parse calls it by this
// name.)
//
// It advances one token and, if that tag has attributes, inspects only
// the first one — assumes href is the first attribute on the
// register-now link.
func (item *AnimeConventionItem) readResgiterNowurl(t *html.Tokenizer) {
	t.Next()
	if _, hasmore := t.TagName(); hasmore {
		if key, val, _ := t.TagAttr(); strings.EqualFold(string(key), "href") {
			item.registerNowURL = string(val)
		}
	}
}
func getMatchInfoBets(z *html.Tokenizer) (bets []*Bet) { var bettor string var item string var statTrak bool eof := false for !eof { tt := z.Next() switch { case tt == html.ErrorToken: eof = true case tt == html.StartTagToken: t := z.Token() isDiv := t.Data == "div" isSpan := t.Data == "span" if isSpan { for _, a := range t.Attr { if a.Key == "class" && a.Val == "user" { z.Next() z.Next() t := z.Token() bettor = strings.TrimSpace(t.Data) } } } if isDiv { for _, a := range t.Attr { if a.Key == "class" && strings.Contains(a.Val, "item") { z.Next() z.Next() t = z.Token() // Get StatTrak status statTrak = strings.Contains(t.Attr[0].Val, "clreff") if statTrak { z.Next() z.Next() z.Next() z.Next() t = z.Token() } item = t.Attr[2].Val thisBet := &Bet{bettor, item, statTrak} bets = append(bets, thisBet) } } } } } return }
func getText(z *html.Tokenizer) string { tt := z.Next() switch tt { case html.ErrorToken: panic(z.Err()) case html.TextToken: return string(z.Text()) } return "" }
// readCountry stores the convention's country.
//
// When the current tag still has unread attributes (hasmore), the country
// is taken from the next attribute's value — assumes that attribute
// carries the country content. Otherwise the country is the text of the
// following token.
func (item *AnimeConventionItem) readCountry(t *html.Tokenizer, hasmore bool) {
	if hasmore {
		_, val, _ := t.TagAttr()
		item.country = string(val)
		return
	}
	t.Next()
	item.country = string(t.Text())
}
func (item *AnimeConventionItem) readNameAndLink(t *html.Tokenizer) { if label := t.Next(); label == html.StartTagToken { _, hasmore := t.TagName() if hasmore { if key, val, _ := t.TagAttr(); strings.EqualFold(string(key), "href") { item.siteURL = string(val) } } } if label := t.Next(); label == html.TextToken { item.name = string(t.Text()) } }
func advanceToTextToken(z *html.Tokenizer) *html.Token { for { tt := z.Next() switch tt { case html.ErrorToken: return nil case html.TextToken: t := z.Token() return &t } } }
// parseTableX86 fills in.Performance from an HTML timing table.
//
// It scans until the enclosing </table>. Each <tr> is consumed cell by
// cell: column 0 is the architecture name, column 1 the latency, column 2
// the throughput; on reaching column 2 the Timing row is committed to the
// map keyed by architecture. Parse errors in the numeric cells are
// ignored (value stays 0). Panics when a </tr> or <table> start appears
// where the expected structure forbids it.
func parseTableX86(z *html.Tokenizer, in *Intrinsic) *Intrinsic {
	in.Performance = make(map[string]Timing)
	for {
		tt := z.Next()
		switch tt {
		case html.ErrorToken:
			return in
		case html.StartTagToken, html.EndTagToken:
			tn, _ := z.TagName()
			tns := strings.ToLower(string(tn))
			switch tns {
			case "tr":
				if tt == html.StartTagToken {
					n := 0 // current column index within the row
					p := Timing{}
					for {
						tt = z.Next()
						tn, _ = z.TagName()
						tns = strings.ToLower(string(tn))
						if tt == html.EndTagToken && tns == "tr" {
							break
						}
						if tt == html.StartTagToken && tns == "td" {
							switch n {
							case 0:
								p.Arch = getText(z)
							case 1:
								p.Latency, _ = strconv.ParseFloat(getText(z), 64)
							case 2:
								p.Throughput, _ = strconv.ParseFloat(getText(z), 64)
								// Row complete: record it.
								in.Performance[p.Arch] = p
							}
							n++
						}
					}
				} else {
					panic("tr ended")
				}
			case "table":
				if tt == html.EndTagToken {
					return in
				} else {
					panic("table started")
				}
			}
		}
	}
}
// parseGame parses one schedule row into a Game.
//
// date is the date heading currently in effect; it is combined with the
// row's time cell and parsed with layout "1/2/2006 3:04 PM".
//
// Layout assumptions: the first <td> text is the game number; the next
// <td> contains a <div> whose text is the game time (with a forward
// text-token search as fallback when the immediate token is empty).
// Team/score cells follow via parseTeamName/parseScore. A "score" longer
// than 3 characters is taken to actually be the away team's name (the
// score columns are absent), in which case both scores stay empty.
func parseGame(date string, z *html.Tokenizer) (Game, error) {
	var game Game
	td := advanceToStartTag("td", z)
	if td == nil {
		return game, errors.New("Unable to find Game Number")
	}
	z.Next()
	gameNum := strings.TrimSpace(z.Token().Data)
	td = advanceToStartTag("td", z)
	if td == nil {
		return game, errors.New("Unable to find Game Time")
	}
	td = advanceToStartTag("div", z)
	if td == nil {
		return game, errors.New("Unable to find Game Time")
	}
	z.Next()
	gameTime := strings.TrimSpace(z.Token().Data)
	if gameTime == "" {
		// Time text was not the immediately following token; scan ahead.
		t := advanceToTextToken(z)
		gameTime = strings.TrimSpace(t.Data)
	}
	var homeTeam, homeScore, awayTeam, awayScore string
	skipAwayScore := false
	homeTeam = parseTeamName(z)
	homeScore = parseScore(z)
	if len(homeScore) > 3 {
		// Not a score: the row has no score columns and this value is the
		// away team's name.
		awayTeam = homeScore
		homeScore = ""
		skipAwayScore = true
	} else {
		awayTeam = parseTeamName(z)
	}
	if !skipAwayScore {
		awayScore = parseScore(z)
	} else {
		awayScore = ""
	}
	gameDate, err := time.Parse("1/2/2006 3:04 PM", date+" "+gameTime)
	if err != nil {
		return game, err
	}
	return Game{gameDate, gameNum, homeTeam, homeScore, awayTeam, awayScore}, nil
}
func (item *AnimeConventionItem) Parse(t *html.Tokenizer) { for { label := t.Next() switch label { case html.ErrorToken: fmt.Errorf("%v\n", t.Err()) return case html.TextToken: switch string(t.Text()) { case "Advance Rates:": //fmt.Println("rate") item.readadvanceRate(t) case "At-Door Rates:": item.readatDoorRate(t) } case html.StartTagToken, html.EndTagToken, html.SelfClosingTagToken: tag, hasmore := t.TagName() if strings.EqualFold(string(tag), "big") { item.readResgiterNowurl(t) } else if hasmore { key, val, hasmore := t.TagAttr() if strings.EqualFold(string(key), "itemprop") { //fmt.Println(string(val)) switch string(val) { case "description": item.readDescription(t) case "latitude": item.readLatitude(t) case "longitude": item.readLongitude(t) case "startDate": item.readStartDate(t) case "endDate": item.readEndDate(t) case "location": item.readLocation(t) case "addressLocality": item.readCity(t) case "addressRegion": item.readState(t) case "addressCountry": item.readCountry(t, hasmore) case "name": item.readNameAndLink(t) } } } } } }
func (item *AnimeConventionItem) readLocation(t *html.Tokenizer) { for { if label := t.Next(); label == html.StartTagToken { _, hasmore := t.TagName() if hasmore { if _, val, _ := t.TagAttr(); strings.EqualFold(string(val), "name") { break } } } } if label := t.Next(); label == html.TextToken { item.location = string(t.Text()) } }
func advanceToStartTag(tagName string, z *html.Tokenizer) *html.Token { for { tt := z.Next() switch tt { case html.ErrorToken: return nil case html.StartTagToken: t := z.Token() if t.Data == tagName { return &t } } } }
func (item *AnimeConventionItem) readRates(t *html.Tokenizer) string { rates := "" for { label := t.Next() if label == html.EndTagToken { val, _ := t.TagName() if strings.EqualFold(string(val), "p") { break } } if label == html.TextToken { rates = strings.Join([]string{rates, string(t.Text())}, "\n") } } return strings.TrimSpace(rates) }
// ParseToken is to parse token func ParseToken(z *html.Tokenizer, tag string) { for { tt := z.Next() switch { case tt == html.ErrorToken: // End of the document, we're done return case tt == html.StartTagToken: t := z.Token() // check element checkElement(t, tag) } } }
func parseFragment(z *html.Tokenizer) (f Fragment, dependencies []*FetchDefinition, err error) { attrs := make([]html.Attribute, 0, 10) dependencies = make([]*FetchDefinition, 0, 0) buff := bytes.NewBuffer(nil) forloop: for { tt := z.Next() tag, _ := z.TagName() raw := byteCopy(z.Raw()) // create a copy here, because readAttributes modifies z.Raw, if attributes contain an & attrs = readAttributes(z, attrs) switch { case tt == html.ErrorToken: if z.Err() != io.EOF { return nil, nil, z.Err() } break forloop case tt == html.StartTagToken || tt == html.SelfClosingTagToken: if string(tag) == UicInclude { if replaceTextStart, replaceTextEnd, err := getInclude(z, attrs); err != nil { return nil, nil, err } else { fmt.Fprintf(buff, replaceTextStart) // Enhancement: WriteOut sub tree, to allow alternative content // for optional includes. fmt.Fprintf(buff, replaceTextEnd) continue } } if skipSubtreeIfUicRemove(z, tt, string(tag), attrs) { continue } case tt == html.EndTagToken: if string(tag) == UicFragment || string(tag) == UicTail { break forloop } } buff.Write(raw) } return StringFragment(buff.String()), dependencies, nil }
// buildTokenArray flattens the token stream into a []tagInfo slice for
// later pattern matching.
//
// For a text token: if it is the very first token seen it is recorded as
// a raw-only entry, and every text token is additionally recorded with
// the tokenizer's current tag name / first attribute. NOTE(review):
// TagName/TagAttr on a text token yield empty values — presumably only
// the txt field matters for those entries; confirm before relying on
// tag/key/val there. Start tags record the tag plus its first attribute;
// end and self-closing tags do the same with closingTag set.
func buildTokenArray(tokenizer *xhtml.Tokenizer) []tagInfo {
	tokens := []tagInfo{}
	for tt := tokenizer.Next(); tt != xhtml.ErrorToken; tt = tokenizer.Next() {
		switch tt {
		case xhtml.TextToken:
			txt := string(tokenizer.Text())
			if len(tokens) == 0 {
				// Leading text before any tag: keep it as a raw entry.
				info := tagInfo{
					raw: txt,
				}
				tokens = append(tokens, info)
			}
			tn, _ := tokenizer.TagName()
			key, val, _ := tokenizer.TagAttr()
			info := tagInfo{
				tag: string(tn),
				key: string(key),
				val: string(val),
				txt: txt,
			}
			tokens = append(tokens, info)
		case xhtml.StartTagToken:
			tn, _ := tokenizer.TagName()
			key, val, _ := tokenizer.TagAttr()
			info := tagInfo{
				tag: string(tn),
				key: string(key),
				val: string(val),
			}
			tokens = append(tokens, info)
		case xhtml.SelfClosingTagToken, xhtml.EndTagToken:
			tn, _ := tokenizer.TagName()
			key, val, _ := tokenizer.TagAttr()
			info := tagInfo{
				tag:        string(tn),
				key:        string(key),
				val:        string(val),
				closingTag: true,
			}
			tokens = append(tokens, info)
		}
	}
	return tokens
}
// parseHead copies the contents of <head> into c.head, with two
// transformations: subtrees marked with the uic-remove attribute are
// dropped, and <script> elements whose type is ScriptTypeMeta are parsed
// as meta JSON into c (via parseMetaJson) instead of being copied.
// Stops at </head> or EOF; non-EOF tokenizer errors are returned.
// The head fragment is only stored when non-empty after trimming.
func (parser *HtmlContentParser) parseHead(z *html.Tokenizer, c *MemoryContent) error {
	attrs := make([]html.Attribute, 0, 10)
	headBuff := bytes.NewBuffer(nil)
forloop:
	for {
		tt := z.Next()
		tag, _ := z.TagName()
		raw := byteCopy(z.Raw()) // create a copy here, because readAttributes modifies z.Raw, if attributes contain an &
		attrs = readAttributes(z, attrs)
		switch {
		case tt == html.ErrorToken:
			if z.Err() != io.EOF {
				return z.Err()
			}
			break forloop
		case tt == html.StartTagToken || tt == html.SelfClosingTagToken:
			if skipSubtreeIfUicRemove(z, tt, string(tag), attrs) {
				continue
			}
			if string(tag) == "script" && attrHasValue(attrs, "type", ScriptTypeMeta) {
				if err := parseMetaJson(z, c); err != nil {
					return err
				}
				continue
			}
		case tt == html.EndTagToken:
			if string(tag) == "head" {
				break forloop
			}
		}
		headBuff.Write(raw)
	}
	s := headBuff.String()
	st := strings.Trim(s, " \n")
	if len(st) > 0 {
		c.head = StringFragment(st)
	}
	return nil
}
func parseMetaJson(z *html.Tokenizer, c *MemoryContent) error { tt := z.Next() if tt != html.TextToken { return fmt.Errorf("expected text node for meta json, but found %v, (%s)", tt.String(), z.Raw()) } bytes := z.Text() err := json.Unmarshal(bytes, &c.meta) if err != nil { return fmt.Errorf("error while parsing json from meta json element: %v", err.Error()) } tt = z.Next() tag, _ := z.TagName() if tt != html.EndTagToken || string(tag) != "script" { return fmt.Errorf("Tag not properly ended. Expected </script>, but found %s", z.Raw()) } return nil }
func skipCompleteTag(z *html.Tokenizer, tagName string) error { forloop: for { tt := z.Next() tag, _ := z.TagName() switch { case tt == html.ErrorToken: if z.Err() != io.EOF { return z.Err() } break forloop case tt == html.EndTagToken: tagAsString := string(tag) if tagAsString == tagName { break forloop } } } return nil }
// parseToken consumes one token — and, for a start tag, its entire
// subtree via recursion — appending the resulting elements to
// parent/htmlDoc.
//
// Returns (errorToken, parentEnded, unsetEndTag):
//   - errorToken: the stream ended; callers unwind completely.
//   - parentEnded: an end tag was consumed and offered to parent via
//     setEndTagRaw.
//   - unsetEndTag: an end-tag name that (presumably) could not be
//     attached at this level and bubbles up the recursion until an
//     ancestor claims it — confirm against setEndTagRaw's contract.
func parseToken(tokenizer *html.Tokenizer, htmlDoc *htmlDocument, parent *tagElement) (bool, bool, string) {
	tokenType := tokenizer.Next()
	switch tokenType {
	case html.ErrorToken:
		return true, false, ""
	case html.TextToken:
		text := string(tokenizer.Text())
		if strings.TrimSpace(text) == "" {
			// Whitespace-only text is dropped.
			break
		}
		textElement := &textElement{text: text}
		appendElement(htmlDoc, parent, textElement)
	case html.StartTagToken:
		tagElement := &tagElement{tagName: getTagName(tokenizer), startTagRaw: string(tokenizer.Raw())}
		appendElement(htmlDoc, parent, tagElement)
		// Recursively parse children until this element (or an ancestor)
		// is closed.
		for {
			errorToken, parentEnded, unsetEndTag := parseToken(tokenizer, htmlDoc, tagElement)
			if errorToken {
				return true, false, ""
			}
			if parentEnded {
				if unsetEndTag != "" {
					// The end tag belongs further up; pass it along.
					return false, false, unsetEndTag
				}
				break
			}
			if unsetEndTag != "" {
				// Offer the bubbled-up end tag to this element.
				return false, false, setEndTagRaw(tokenizer, tagElement, unsetEndTag)
			}
		}
	case html.EndTagToken:
		return false, true, setEndTagRaw(tokenizer, parent, getTagName(tokenizer))
	case html.DoctypeToken, html.SelfClosingTagToken, html.CommentToken:
		// Leaf-like tokens become elements with no children.
		tagElement := &tagElement{tagName: getTagName(tokenizer), startTagRaw: string(tokenizer.Raw())}
		appendElement(htmlDoc, parent, tagElement)
	}
	return false, false, ""
}
// Parse scans the convention list page for links to event info pages
// ("/events/info.shtml...") and spawns one crawler goroutine per link.
//
// Each matching href is turned into an absolute animecons.com URL and
// taskNum counts the launched crawlers. Only the first attribute of each
// tag is examined, so href is assumed to come first. Returns at end of
// input without waiting for the goroutines — synchronization presumably
// happens elsewhere via ConventionList/taskNum; confirm before reuse.
func (list *AnimeConventionList) Parse(t *html.Tokenizer) {
	for {
		next := t.Next()
		switch next {
		case html.ErrorToken:
			return
		case html.TextToken:
			//fmt.Println(string(t.Text()))
		case html.StartTagToken, html.EndTagToken, html.SelfClosingTagToken:
			_, hasmore := t.TagName()
			if hasmore {
				key, val, _ := t.TagAttr()
				if strings.EqualFold(string(key), "href") && strings.HasPrefix(string(val), "/events/info.shtml") {
					var item = &AnimeConventionItem{}
					item.url = strings.Join([]string{"http://animecons.com", string(val)}, "")
					list.taskNum++
					go item.crawlInformation(&list.ConventionList)
					//time.Sleep(100 * time.Millisecond)
				}
			}
		}
	}
}
// getMatchInfoDateTime finds the match date and time on the page.
//
// It looks for a <div class="half"> that also carries a title attribute:
// the title value is the date, and the following token's text (trimmed)
// is the time. NOTE(review): this relies on the class attribute appearing
// before title in the tag's attribute order — confirm against the page
// markup. Logs and returns empty strings when no such div is found
// before EOF.
func getMatchInfoDateTime(z *html.Tokenizer) (matchDate, matchTime string) {
	eof := false
	for !eof {
		tt := z.Next()
		switch {
		case tt == html.ErrorToken:
			eof = true
		case tt == html.StartTagToken:
			t := z.Token()
			isDiv := t.Data == "div"
			if isDiv {
				possibleDate := false
				for _, a := range t.Attr {
					if a.Key == "class" && a.Val == "half" {
						possibleDate = true
					}
					if possibleDate && a.Key == "title" {
						// Definitely a date now, grab both date and time
						matchDate = a.Val
						z.Next()
						matchTime = z.Token().Data
						// Trim the whitespace around time
						matchTime = strings.TrimSpace(matchTime)
						return
					}
				}
			}
		}
	}
	Error.Printf("Could not get date and time...")
	return "", ""
}
// getTextR recursively collects the text content of the current element.
//
// The caller is assumed to be positioned inside one open element (depth
// starts at 1). Nested <div>s contribute a carriage return before their
// text; <span>s are wrapped in single quotes on open and close. Depth is
// adjusted on div/span start and end tags, and the accumulated string is
// returned when the opening element's subtree balances out (depth == 0).
// Panics on a tokenizer error, including EOF before the element closes.
func getTextR(z *html.Tokenizer) string {
	r := ""
	depth := 1
	for {
		tt := z.Next()
		switch tt {
		case html.ErrorToken:
			panic(z.Err())
		case html.TextToken:
			r += string(z.Text())
		case html.StartTagToken:
			tn, _ := z.TagName()
			tns := strings.ToLower(string(tn))
			switch tns {
			case "div":
				r += "\r"
				depth++
			case "span":
				r += "'"
				depth++
			}
		case html.EndTagToken:
			tn, _ := z.TagName()
			tns := strings.ToLower(string(tn))
			switch tns {
			case "div":
				depth--
			case "span":
				r += "'"
				depth--
			}
		}
		if depth == 0 {
			return r
		}
	}
}
func extractTitleFromTree(z *html.Tokenizer) string { depth := 0 for { tt := z.Next() switch tt { case html.ErrorToken: return "" case html.TextToken: if depth > 0 { title := strings.TrimSpace(string(z.Text())) lower := strings.ToLower(title) if strings.HasPrefix(lower, "imgur") { return "" } return title } case html.StartTagToken: tn, _ := z.TagName() if string(tn) == "title" { depth++ } } } }
// parseBody walks the <body> of the page and fills c.
//
// The body tag's own attributes (the tokenizer is assumed to be
// positioned on it) are stored in c.bodyAttributes. Within the body:
//   - subtrees marked uic-remove are skipped,
//   - uic-fragment / uic-tail elements are parsed via parseFragment into
//     c.body[name] / c.tail, registering their dependencies,
//   - uic-fetch elements register a FetchDefinition,
//   - uic-include elements are replaced by their replacement text,
//   - everything else is copied verbatim.
//
// When no "" (default) fragment exists afterwards, the copied markup
// becomes the default fragment. Stops at </body> or EOF; non-EOF
// tokenizer errors are returned.
func (parser *HtmlContentParser) parseBody(z *html.Tokenizer, c *MemoryContent) error {
	attrs := make([]html.Attribute, 0, 10)
	bodyBuff := bytes.NewBuffer(nil)
	attrs = readAttributes(z, attrs)
	if len(attrs) > 0 {
		c.bodyAttributes = StringFragment(joinAttrs(attrs))
	}
forloop:
	for {
		tt := z.Next()
		tag, _ := z.TagName()
		raw := byteCopy(z.Raw()) // create a copy here, because readAttributes modifies z.Raw, if attributes contain an &
		attrs = readAttributes(z, attrs)
		switch {
		case tt == html.ErrorToken:
			if z.Err() != io.EOF {
				return z.Err()
			}
			break forloop
		case tt == html.StartTagToken || tt == html.SelfClosingTagToken:
			if skipSubtreeIfUicRemove(z, tt, string(tag), attrs) {
				continue
			}
			if string(tag) == UicFragment {
				if f, deps, err := parseFragment(z); err != nil {
					return err
				} else {
					c.body[getFragmentName(attrs)] = f
					for _, dep := range deps {
						c.requiredContent[dep.URL] = dep
					}
				}
				continue
			}
			if string(tag) == UicTail {
				if f, deps, err := parseFragment(z); err != nil {
					return err
				} else {
					c.tail = f
					for _, dep := range deps {
						c.requiredContent[dep.URL] = dep
					}
				}
				continue
			}
			if string(tag) == UicFetch {
				if fd, err := getFetch(z, attrs); err != nil {
					return err
				} else {
					c.requiredContent[fd.URL] = fd
					continue
				}
			}
			if string(tag) == UicInclude {
				if replaceTextStart, replaceTextEnd, err := getInclude(z, attrs); err != nil {
					return err
				} else {
					bodyBuff.WriteString(replaceTextStart)
					// Enhancement: WriteOut sub tree, to allow alternative content
					// for optional includes.
					bodyBuff.WriteString(replaceTextEnd)
					continue
				}
			}
		case tt == html.EndTagToken:
			if string(tag) == "body" {
				break forloop
			}
		}
		bodyBuff.Write(raw)
	}
	s := bodyBuff.String()
	if _, defaultFragmentExists := c.body[""]; !defaultFragmentExists {
		if st := strings.Trim(s, " \n"); len(st) > 0 {
			c.body[""] = StringFragment(st)
		}
	}
	return nil
}
// DoParse scrapes the body text of a NYT article from the token stream.
//
// Phase 1 advances to the opening <p class="story-body-text
// story-content"> tag; failure to find it before EOF is an error.
// Phase 2 accumulates text until the story footer
// (<footer class="story-footer story-content">): story-body <p> tags
// re-enter paragraph mode, <a> links (except visually-hidden ones)
// contribute the text of their immediately following token, and any
// other token encountered while inside a paragraph is appended, gluing
// leading punctuation onto the previous text.
//
// NOTE(review): the default branch indexes tmp.Data[0] — an empty text
// token would panic; and rune(tmp.Data[0]) inspects a single byte, which
// is not the first rune for multi-byte UTF-8. Confirm inputs before
// relying on this path.
func (article *NYTArticle) DoParse(parser *html.Tokenizer) error {
articleOpeningTagLoop:
	for {
		token := parser.Next()
		switch {
		case token == html.ErrorToken:
			return fmt.Errorf("problem moving article %s to open tag", article.GetTitle())
		case token == html.StartTagToken:
			tmp := parser.Token()
			isStartArticle := tmp.Data == "p"
			if isStartArticle {
				for _, attr := range tmp.Attr {
					if attr.Key == "class" && attr.Val == "story-body-text story-content" {
						break articleOpeningTagLoop
					}
				}
			}
		}
	}
	isInParagraph := true
articleClosingTagLoop:
	for {
		token := parser.Next()
		switch {
		case token == html.ErrorToken:
			return fmt.Errorf("problem scraping article %s", article.GetTitle())
		case token == html.StartTagToken:
			tmp := parser.Token()
			isEndArticle := tmp.Data == "footer"
			if isEndArticle {
				for _, attr := range tmp.Attr {
					if attr.Key == "class" && attr.Val == "story-footer story-content" {
						break articleClosingTagLoop
					}
				}
			}
			if tmp.Data == "p" {
				for _, attr := range tmp.Attr {
					if attr.Key == "class" && strings.Contains(attr.Val, "story-body-text") {
						isInParagraph = true
					}
				}
				if isInParagraph {
					continue
				}
			}
			// is a link
			if tmp.Data == "a" {
				shouldSkip := false
				for _, attr := range tmp.Attr {
					if attr.Key == "class" && strings.Contains(attr.Val, "visually-hidden") {
						shouldSkip = true
					}
				}
				if shouldSkip {
					continue
				}
				// Link text is taken from the next token verbatim.
				parser.Next()
				tmp = parser.Token()
				newBody := strings.TrimSpace(article.GetData()) + " " + strings.TrimSpace(tmp.Data) + " "
				article.SetData(newBody)
				isInParagraph = true
			}
		case token == html.EndTagToken:
			tmp := parser.Token()
			if tmp.Data == "p" {
				isInParagraph = false
			}
		default:
			if !isInParagraph {
				continue
			}
			tmp := parser.Token()
			newBody := article.GetData()
			// add a space on the left just in case there is a comment or something
			if unicode.IsPunct(rune(tmp.Data[0])) {
				newBody = strings.TrimSpace(newBody)
			}
			newBody = newBody + strings.TrimSpace(tmp.Data)
			article.SetData(newBody)
			isInParagraph = false
		}
	}
	fmt.Println(article.GetData())
	return nil
}
func nextToken(tokenizer *html.Tokenizer) error { if t := tokenizer.Next(); t == html.ErrorToken { return tokenizer.Err() } return nil }