func flushTagToken(htmlBuf *[]byte, tz *html.Tokenizer, url string) string { *htmlBuf = append(*htmlBuf, '<') tagName, hasAttr := tz.TagName() *htmlBuf = append(*htmlBuf, tagName...) if hasAttr { for { attrKey, attrValue, hasMore := tz.TagAttr() *htmlBuf = append(*htmlBuf, ' ') *htmlBuf = append(*htmlBuf, attrKey...) *htmlBuf = append(*htmlBuf, '=', '"') if tagAttrToProxy[string(tagName)][string(attrKey)] { urlInAttr := string(attrValue) *htmlBuf = append(*htmlBuf, []byte(GetProxiedUrl(urlInAttr, url))...) } else { *htmlBuf = append(*htmlBuf, attrValue...) } *htmlBuf = append(*htmlBuf, '"') if !hasMore { break } } } *htmlBuf = append(*htmlBuf, '>') if string(tagName) == "head" { *htmlBuf = append(*htmlBuf, []byte(getJsHookTag())...) } return string(tagName) }
func getInclude(z *html.Tokenizer, attrs []html.Attribute) (startMarker, endMarker string, error error) { var srcString string if url, hasUrl := getAttr(attrs, "src"); !hasUrl { return "", "", fmt.Errorf("include definition without src %s", z.Raw()) } else { srcString = strings.TrimSpace(url.Val) if strings.HasPrefix(srcString, "#") { srcString = srcString[1:] } } required := false if r, hasRequired := getAttr(attrs, "required"); hasRequired { if requiredBool, err := strconv.ParseBool(r.Val); err != nil { return "", "", fmt.Errorf("error parsing bool in %s: %s", z.Raw(), err.Error()) } else { required = requiredBool } } if required { return fmt.Sprintf("§[> %s]§", srcString), "", nil } else { return fmt.Sprintf("§[#> %s]§", srcString), fmt.Sprintf("§[/%s]§", srcString), nil } }
func skipSubtreeIfUicRemove(z *html.Tokenizer, tt html.TokenType, tagName string, attrs []html.Attribute) bool { _, foundRemoveTag := getAttr(attrs, UicRemove) if !foundRemoveTag { return false } if isSelfClosingTag(tagName, tt) { return true } depth := 0 for { tt := z.Next() tag, _ := z.TagName() switch { case tt == html.ErrorToken: return true case tt == html.StartTagToken && !isSelfClosingTag(string(tag), tt): depth++ case tt == html.EndTagToken: depth-- if depth < 0 { return true } } } }
func advanceToTextToken(z *html.Tokenizer) *html.Token { for { tt := z.Next() switch tt { case html.ErrorToken: return nil case html.TextToken: t := z.Token() return &t } } }
func readAttributes(z *html.Tokenizer, buff []html.Attribute) []html.Attribute { buff = buff[:0] for { key, value, more := z.TagAttr() if key != nil { buff = append(buff, html.Attribute{Key: string(key), Val: string(value)}) } if !more { return buff } } }
func (item *AnimeConventionItem) readNameAndLink(t *html.Tokenizer) { if label := t.Next(); label == html.StartTagToken { _, hasmore := t.TagName() if hasmore { if key, val, _ := t.TagAttr(); strings.EqualFold(string(key), "href") { item.siteURL = string(val) } } } if label := t.Next(); label == html.TextToken { item.name = string(t.Text()) } }
func parse2(z *html.Tokenizer) (*Schedule, error) { schedule := &Schedule{} currentDate := "" for { tt := z.Next() switch tt { case html.ErrorToken: return schedule, nil case html.StartTagToken: t := z.Token() if isTokenTagWithAttr("font", "class", "PageHeading", &t, z) { z.Next() currentDate = z.Token().Data } else if isTokenTagWithAttr("tr", "bgcolor", "#ffffff", &t, z) || isTokenTagWithAttr("tr", "bgcolor", "#f5f5f5", &t, z) { game, err := parseGame(currentDate, z) if err != nil { return nil, err } schedule.Games = append(schedule.Games, game) } } } }
// AttrMap parses the attributes of the current element into a friendly map. // It only makes sense to call this while processing a start or self closing tag token. func AttrMap(hasAttr bool, z *html.Tokenizer) map[string]string { attrs := make(map[string]string) if !hasAttr { return attrs } for { k, v, more := z.TagAttr() attrs[string(k)] = string(v) if !more { break } } return attrs }
func (item *AnimeConventionItem) Parse(t *html.Tokenizer) { for { label := t.Next() switch label { case html.ErrorToken: fmt.Errorf("%v\n", t.Err()) return case html.TextToken: switch string(t.Text()) { case "Advance Rates:": //fmt.Println("rate") item.readadvanceRate(t) case "At-Door Rates:": item.readatDoorRate(t) } case html.StartTagToken, html.EndTagToken, html.SelfClosingTagToken: tag, hasmore := t.TagName() if strings.EqualFold(string(tag), "big") { item.readResgiterNowurl(t) } else if hasmore { key, val, hasmore := t.TagAttr() if strings.EqualFold(string(key), "itemprop") { //fmt.Println(string(val)) switch string(val) { case "description": item.readDescription(t) case "latitude": item.readLatitude(t) case "longitude": item.readLongitude(t) case "startDate": item.readStartDate(t) case "endDate": item.readEndDate(t) case "location": item.readLocation(t) case "addressLocality": item.readCity(t) case "addressRegion": item.readState(t) case "addressCountry": item.readCountry(t, hasmore) case "name": item.readNameAndLink(t) } } } } } }
func getMatchInfoTitle(z *html.Tokenizer) string { eof := false for !eof { tt := z.Next() switch { case tt == html.ErrorToken: eof = true case tt == html.StartTagToken: t := z.Token() // Check if the token is a <title> tag isTitle := t.Data == "title" if isTitle { z.Next() // This is the title return z.Token().Data } } } // If we reached here something went wrong :^( Error.Printf("Could not get title...") return "" }
func advanceToStartTag(tagName string, z *html.Tokenizer) *html.Token { for { tt := z.Next() switch tt { case html.ErrorToken: return nil case html.StartTagToken: t := z.Token() if t.Data == tagName { return &t } } } }
func (item *AnimeConventionItem) readLocation(t *html.Tokenizer) { for { if label := t.Next(); label == html.StartTagToken { _, hasmore := t.TagName() if hasmore { if _, val, _ := t.TagAttr(); strings.EqualFold(string(val), "name") { break } } } } if label := t.Next(); label == html.TextToken { item.location = string(t.Text()) } }
// ParseToken is to parse token func ParseToken(z *html.Tokenizer, tag string) { for { tt := z.Next() switch { case tt == html.ErrorToken: // End of the document, we're done return case tt == html.StartTagToken: t := z.Token() // check element checkElement(t, tag) } } }
func parseFragment(z *html.Tokenizer) (f Fragment, dependencies []*FetchDefinition, err error) { attrs := make([]html.Attribute, 0, 10) dependencies = make([]*FetchDefinition, 0, 0) buff := bytes.NewBuffer(nil) forloop: for { tt := z.Next() tag, _ := z.TagName() raw := byteCopy(z.Raw()) // create a copy here, because readAttributes modifies z.Raw, if attributes contain an & attrs = readAttributes(z, attrs) switch { case tt == html.ErrorToken: if z.Err() != io.EOF { return nil, nil, z.Err() } break forloop case tt == html.StartTagToken || tt == html.SelfClosingTagToken: if string(tag) == UicInclude { if replaceTextStart, replaceTextEnd, err := getInclude(z, attrs); err != nil { return nil, nil, err } else { fmt.Fprintf(buff, replaceTextStart) // Enhancement: WriteOut sub tree, to allow alternative content // for optional includes. fmt.Fprintf(buff, replaceTextEnd) continue } } if skipSubtreeIfUicRemove(z, tt, string(tag), attrs) { continue } case tt == html.EndTagToken: if string(tag) == UicFragment || string(tag) == UicTail { break forloop } } buff.Write(raw) } return StringFragment(buff.String()), dependencies, nil }
func parseDivX86(z *html.Tokenizer, in *Intrinsic) *Intrinsic { more := true var k, v []byte for more { k, v, more = z.TagAttr() // fmt.Println("attr:", string(k)) switch string(k) { case "class": val := string(v) if strings.Contains(val, "intrinsic") { in.FinishX86() return NewIntrinsic() } switch val { case "cpuid": in.CpuID = getText(z) case "instruction": in.Instruction = strings.ToUpper(getText(z)) case "rettype": in.RetType = fixTypeX86(getText(z)) case "param_type": in.cParam = &Param{Type: fixTypeX86(getText(z))} case "param_name": in.cParam.Name = getText(z) if !in.Params.HasParam(in.cParam.Name) { in.Params = append(in.Params, *in.cParam) } in.cParam = nil case "description": in.Description = strings.TrimSpace(getTextR(z)) case "name": in.OrgName = getText(z) in.Name = fixFuncNameX86(in.OrgName) case "operation": in.Operation = strings.TrimSpace(getText(z)) default: //fmt.Println("unparsed class:", string(v)) } } } return in }
func (parser *HtmlContentParser) parseHead(z *html.Tokenizer, c *MemoryContent) error { attrs := make([]html.Attribute, 0, 10) headBuff := bytes.NewBuffer(nil) forloop: for { tt := z.Next() tag, _ := z.TagName() raw := byteCopy(z.Raw()) // create a copy here, because readAttributes modifies z.Raw, if attributes contain an & attrs = readAttributes(z, attrs) switch { case tt == html.ErrorToken: if z.Err() != io.EOF { return z.Err() } break forloop case tt == html.StartTagToken || tt == html.SelfClosingTagToken: if skipSubtreeIfUicRemove(z, tt, string(tag), attrs) { continue } if string(tag) == "script" && attrHasValue(attrs, "type", ScriptTypeMeta) { if err := parseMetaJson(z, c); err != nil { return err } continue } case tt == html.EndTagToken: if string(tag) == "head" { break forloop } } headBuff.Write(raw) } s := headBuff.String() st := strings.Trim(s, " \n") if len(st) > 0 { c.head = StringFragment(st) } return nil }
func parseMetaJson(z *html.Tokenizer, c *MemoryContent) error { tt := z.Next() if tt != html.TextToken { return fmt.Errorf("expected text node for meta json, but found %v, (%s)", tt.String(), z.Raw()) } bytes := z.Text() err := json.Unmarshal(bytes, &c.meta) if err != nil { return fmt.Errorf("error while parsing json from meta json element: %v", err.Error()) } tt = z.Next() tag, _ := z.TagName() if tt != html.EndTagToken || string(tag) != "script" { return fmt.Errorf("Tag not properly ended. Expected </script>, but found %s", z.Raw()) } return nil }
func getQ(tknzer html.Tokenizer, ch chan string) { tknzer.Next() tknzer.Next() tknzer.Next() tknzer.Next() ch <- string(tknzer.Text()) }
func parseGame(date string, z *html.Tokenizer) (Game, error) { var game Game td := advanceToStartTag("td", z) if td == nil { return game, errors.New("Unable to find Game Number") } z.Next() gameNum := strings.TrimSpace(z.Token().Data) td = advanceToStartTag("td", z) if td == nil { return game, errors.New("Unable to find Game Time") } td = advanceToStartTag("div", z) if td == nil { return game, errors.New("Unable to find Game Time") } z.Next() gameTime := strings.TrimSpace(z.Token().Data) if gameTime == "" { t := advanceToTextToken(z) gameTime = strings.TrimSpace(t.Data) } var homeTeam, homeScore, awayTeam, awayScore string skipAwayScore := false homeTeam = parseTeamName(z) homeScore = parseScore(z) if len(homeScore) > 3 { awayTeam = homeScore homeScore = "" skipAwayScore = true } else { awayTeam = parseTeamName(z) } if !skipAwayScore { awayScore = parseScore(z) } else { awayScore = "" } gameDate, err := time.Parse("1/2/2006 3:04 PM", date+" "+gameTime) if err != nil { return game, err } return Game{gameDate, gameNum, homeTeam, homeScore, awayTeam, awayScore}, nil }
func buildHTML(tokenizer *html.Tokenizer) (s string, err error) { buf := new(bytes.Buffer) bp := 0 if tag, _ := tokenizer.TagName(); string(tag) == "div" { div := tokenizer.Raw() buf.Write(div) bp = len(div) err = nextToken(tokenizer) } ep := bp for err != io.EOF { if err != nil && err != io.EOF { return } ep = buf.Len() b := tokenizer.Raw() if _, err := buf.Write(b); err != nil { return "", err } err = nextToken(tokenizer) } b := buf.Bytes() if bp > 0 { b = b[bp:ep] } return string(b), nil }
func parseTableX86(z *html.Tokenizer, in *Intrinsic) *Intrinsic { in.Performance = make(map[string]Timing) for { tt := z.Next() switch tt { case html.ErrorToken: return in case html.StartTagToken, html.EndTagToken: tn, _ := z.TagName() tns := strings.ToLower(string(tn)) switch tns { case "tr": if tt == html.StartTagToken { n := 0 p := Timing{} for { tt = z.Next() tn, _ = z.TagName() tns = strings.ToLower(string(tn)) if tt == html.EndTagToken && tns == "tr" { break } if tt == html.StartTagToken && tns == "td" { switch n { case 0: p.Arch = getText(z) case 1: p.Latency, _ = strconv.ParseFloat(getText(z), 64) case 2: p.Throughput, _ = strconv.ParseFloat(getText(z), 64) in.Performance[p.Arch] = p } n++ } } } else { panic("tr ended") } case "table": if tt == html.EndTagToken { return in } else { panic("table started") } } } } }
func (item *AnimeConventionItem) readResgiterNowurl(t *html.Tokenizer) { t.Next() if _, hasmore := t.TagName(); hasmore { if key, val, _ := t.TagAttr(); strings.EqualFold(string(key), "href") { item.registerNowURL = string(val) } } }
func getTextR(z *html.Tokenizer) string { r := "" depth := 1 for { tt := z.Next() switch tt { case html.ErrorToken: panic(z.Err()) case html.TextToken: r += string(z.Text()) case html.StartTagToken: tn, _ := z.TagName() tns := strings.ToLower(string(tn)) switch tns { case "div": r += "\r" depth++ case "span": r += "'" depth++ } case html.EndTagToken: tn, _ := z.TagName() tns := strings.ToLower(string(tn)) switch tns { case "div": depth-- case "span": r += "'" depth-- } } if depth == 0 { return r } } }
func (item *AnimeConventionItem) readCountry(t *html.Tokenizer, hasmore bool) { if hasmore { _, val, _ := t.TagAttr() item.country = string(val) return } t.Next() item.country = string(t.Text()) }
func getText(z *html.Tokenizer) string { tt := z.Next() switch tt { case html.ErrorToken: panic(z.Err()) case html.TextToken: return string(z.Text()) } return "" }
func skipCompleteTag(z *html.Tokenizer, tagName string) error { forloop: for { tt := z.Next() tag, _ := z.TagName() switch { case tt == html.ErrorToken: if z.Err() != io.EOF { return z.Err() } break forloop case tt == html.EndTagToken: tagAsString := string(tag) if tagAsString == tagName { break forloop } } } return nil }
// parseToken reads the next token from tokenizer and attaches the element it
// produces to parent inside htmlDoc (via appendElement). For a start tag it
// recurses to read the element's children.
//
// The three results are, in order (names taken from how the recursive call
// below receives them):
//   - errorToken: the tokenizer reported an error token; parsing should stop.
//   - parentEnded: an end tag was read, so the element passed as parent is done.
//   - unsetEndTag: name of an end tag that has not been matched to an element
//     yet, or "" when there is none.
func parseToken(tokenizer *html.Tokenizer, htmlDoc *htmlDocument, parent *tagElement) (bool, bool, string) {
	tokenType := tokenizer.Next()
	switch tokenType {
	case html.ErrorToken:
		return true, false, ""
	case html.TextToken:
		text := string(tokenizer.Text())
		// Whitespace-only text is not added to the document.
		if strings.TrimSpace(text) == "" {
			break
		}
		textElement := &textElement{text: text}
		appendElement(htmlDoc, parent, textElement)
	case html.StartTagToken:
		tagElement := &tagElement{tagName: getTagName(tokenizer), startTagRaw: string(tokenizer.Raw())}
		appendElement(htmlDoc, parent, tagElement)
		// Read children until an end tag closes this element or input ends.
		for {
			errorToken, parentEnded, unsetEndTag := parseToken(tokenizer, htmlDoc, tagElement)
			if errorToken {
				return true, false, ""
			}
			if parentEnded {
				// An end tag was read directly below this element. If its name
				// did not match this element, hand the name up to the caller.
				if unsetEndTag != "" {
					return false, false, unsetEndTag
				}
				// Matched: this element is complete (break leaves the for loop).
				break
			}
			// A nested element passed up an unmatched end tag name; try to
			// match it against this element and stop reading children here.
			// The tokenizer has not advanced since that end tag was read, so
			// Raw() inside setEndTagRaw still refers to it.
			if unsetEndTag != "" {
				return false, false, setEndTagRaw(tokenizer, tagElement, unsetEndTag)
			}
		}
	case html.EndTagToken:
		// Record the end tag on parent when the names match; otherwise the
		// name comes back as the unmatched end tag.
		return false, true, setEndTagRaw(tokenizer, parent, getTagName(tokenizer))
	case html.DoctypeToken, html.SelfClosingTagToken, html.CommentToken:
		// These tokens have no children; they are stored as childless tag
		// elements carrying their raw text.
		tagElement := &tagElement{tagName: getTagName(tokenizer), startTagRaw: string(tokenizer.Raw())}
		appendElement(htmlDoc, parent, tagElement)
	}
	return false, false, ""
}
func (item *AnimeConventionItem) readRates(t *html.Tokenizer) string { rates := "" for { label := t.Next() if label == html.EndTagToken { val, _ := t.TagName() if strings.EqualFold(string(val), "p") { break } } if label == html.TextToken { rates = strings.Join([]string{rates, string(t.Text())}, "\n") } } return strings.TrimSpace(rates) }
func getMatchInfoDateTime(z *html.Tokenizer) (matchDate, matchTime string) { eof := false for !eof { tt := z.Next() switch { case tt == html.ErrorToken: eof = true case tt == html.StartTagToken: t := z.Token() isDiv := t.Data == "div" if isDiv { possibleDate := false for _, a := range t.Attr { if a.Key == "class" && a.Val == "half" { possibleDate = true } if possibleDate && a.Key == "title" { // Definitely a date now, grab both date and time matchDate = a.Val z.Next() matchTime = z.Token().Data // Trim the whitespace around time matchTime = strings.TrimSpace(matchTime) return } } } } } Error.Printf("Could not get date and time...") return "", "" }
// setEndTagRaw sets an endTagRaw to the parent. func setEndTagRaw(tokenizer *html.Tokenizer, parent *tagElement, tagName string) string { if parent != nil && parent.tagName == tagName { parent.endTagRaw = string(tokenizer.Raw()) return "" } return tagName }