// buildHTML re-serializes the token stream into an HTML string.
// If the current token is a <div> start tag, the wrapper div's raw bytes are
// written to the buffer but stripped from the result: bp marks the end of the
// opening <div>, and ep marks the start of the last token written (presumably
// the matching </div> — TODO confirm with nextToken's contract), so b[bp:ep]
// yields only the div's inner content.
func buildHTML(tokenizer *html.Tokenizer) (s string, err error) {
	buf := new(bytes.Buffer)
	bp := 0
	if tag, _ := tokenizer.TagName(); string(tag) == "div" {
		div := tokenizer.Raw()
		buf.Write(div)
		bp = len(div)
		err = nextToken(tokenizer)
	}
	ep := bp
	for err != io.EOF {
		if err != nil && err != io.EOF {
			// Non-EOF error: return via named results ("" and the error).
			return
		}
		// Remember where this token begins; after the loop, ep points at the
		// start of the final token, which is excluded when bp > 0.
		ep = buf.Len()
		b := tokenizer.Raw()
		if _, err := buf.Write(b); err != nil {
			return "", err
		}
		err = nextToken(tokenizer)
	}
	b := buf.Bytes()
	if bp > 0 {
		// Strip the wrapper <div> … and trailing token, keeping the inner HTML.
		b = b[bp:ep]
	}
	return string(b), nil
}
func skipSubtreeIfUicRemove(z *html.Tokenizer, tt html.TokenType, tagName string, attrs []html.Attribute) bool { _, foundRemoveTag := getAttr(attrs, UicRemove) if !foundRemoveTag { return false } if isSelfClosingTag(tagName, tt) { return true } depth := 0 for { tt := z.Next() tag, _ := z.TagName() switch { case tt == html.ErrorToken: return true case tt == html.StartTagToken && !isSelfClosingTag(string(tag), tt): depth++ case tt == html.EndTagToken: depth-- if depth < 0 { return true } } } }
func flushTagToken(htmlBuf *[]byte, tz *html.Tokenizer, url string) string { *htmlBuf = append(*htmlBuf, '<') tagName, hasAttr := tz.TagName() *htmlBuf = append(*htmlBuf, tagName...) if hasAttr { for { attrKey, attrValue, hasMore := tz.TagAttr() *htmlBuf = append(*htmlBuf, ' ') *htmlBuf = append(*htmlBuf, attrKey...) *htmlBuf = append(*htmlBuf, '=', '"') if tagAttrToProxy[string(tagName)][string(attrKey)] { urlInAttr := string(attrValue) *htmlBuf = append(*htmlBuf, []byte(GetProxiedUrl(urlInAttr, url))...) } else { *htmlBuf = append(*htmlBuf, attrValue...) } *htmlBuf = append(*htmlBuf, '"') if !hasMore { break } } } *htmlBuf = append(*htmlBuf, '>') if string(tagName) == "head" { *htmlBuf = append(*htmlBuf, []byte(getJsHookTag())...) } return string(tagName) }
func (item *AnimeConventionItem) readResgiterNowurl(t *html.Tokenizer) { t.Next() if _, hasmore := t.TagName(); hasmore { if key, val, _ := t.TagAttr(); strings.EqualFold(string(key), "href") { item.registerNowURL = string(val) } } }
func (item *AnimeConventionItem) readNameAndLink(t *html.Tokenizer) { if label := t.Next(); label == html.StartTagToken { _, hasmore := t.TagName() if hasmore { if key, val, _ := t.TagAttr(); strings.EqualFold(string(key), "href") { item.siteURL = string(val) } } } if label := t.Next(); label == html.TextToken { item.name = string(t.Text()) } }
// parseTableX86 populates in.Performance from the intrinsic's HTML timing
// table. Each <tr> is expected to hold three <td> cells — architecture name,
// latency, throughput — and the completed Timing is keyed by architecture.
// Returns in when </table> (or end of input) is reached; panics on markup
// that violates the expected row structure.
func parseTableX86(z *html.Tokenizer, in *Intrinsic) *Intrinsic {
	in.Performance = make(map[string]Timing)
	for {
		tt := z.Next()
		switch tt {
		case html.ErrorToken:
			return in
		case html.StartTagToken, html.EndTagToken:
			tn, _ := z.TagName()
			tns := strings.ToLower(string(tn))
			switch tns {
			case "tr":
				if tt == html.StartTagToken {
					// Inner loop collects this row's cells; n counts the
					// <td> elements seen so far.
					n := 0
					p := Timing{}
					for {
						tt = z.Next()
						tn, _ = z.TagName()
						tns = strings.ToLower(string(tn))
						if tt == html.EndTagToken && tns == "tr" {
							break
						}
						if tt == html.StartTagToken && tns == "td" {
							switch n {
							case 0:
								p.Arch = getText(z)
							case 1:
								// Parse failures deliberately leave the
								// zero value in place.
								p.Latency, _ = strconv.ParseFloat(getText(z), 64)
							case 2:
								p.Throughput, _ = strconv.ParseFloat(getText(z), 64)
								// Third cell completes the row; record it.
								in.Performance[p.Arch] = p
							}
							n++
						}
					}
				} else {
					// A </tr> at this level means the inner loop missed it —
					// the table markup is not what this parser expects.
					panic("tr ended")
				}
			case "table":
				if tt == html.EndTagToken {
					return in
				} else {
					// Nested <table> inside the timing table is unsupported.
					panic("table started")
				}
			}
		}
	}
}
func (item *AnimeConventionItem) Parse(t *html.Tokenizer) { for { label := t.Next() switch label { case html.ErrorToken: fmt.Errorf("%v\n", t.Err()) return case html.TextToken: switch string(t.Text()) { case "Advance Rates:": //fmt.Println("rate") item.readadvanceRate(t) case "At-Door Rates:": item.readatDoorRate(t) } case html.StartTagToken, html.EndTagToken, html.SelfClosingTagToken: tag, hasmore := t.TagName() if strings.EqualFold(string(tag), "big") { item.readResgiterNowurl(t) } else if hasmore { key, val, hasmore := t.TagAttr() if strings.EqualFold(string(key), "itemprop") { //fmt.Println(string(val)) switch string(val) { case "description": item.readDescription(t) case "latitude": item.readLatitude(t) case "longitude": item.readLongitude(t) case "startDate": item.readStartDate(t) case "endDate": item.readEndDate(t) case "location": item.readLocation(t) case "addressLocality": item.readCity(t) case "addressRegion": item.readState(t) case "addressCountry": item.readCountry(t, hasmore) case "name": item.readNameAndLink(t) } } } } } }
func (item *AnimeConventionItem) readLocation(t *html.Tokenizer) { for { if label := t.Next(); label == html.StartTagToken { _, hasmore := t.TagName() if hasmore { if _, val, _ := t.TagAttr(); strings.EqualFold(string(val), "name") { break } } } } if label := t.Next(); label == html.TextToken { item.location = string(t.Text()) } }
func (item *AnimeConventionItem) readRates(t *html.Tokenizer) string { rates := "" for { label := t.Next() if label == html.EndTagToken { val, _ := t.TagName() if strings.EqualFold(string(val), "p") { break } } if label == html.TextToken { rates = strings.Join([]string{rates, string(t.Text())}, "\n") } } return strings.TrimSpace(rates) }
func parseFragment(z *html.Tokenizer) (f Fragment, dependencies []*FetchDefinition, err error) { attrs := make([]html.Attribute, 0, 10) dependencies = make([]*FetchDefinition, 0, 0) buff := bytes.NewBuffer(nil) forloop: for { tt := z.Next() tag, _ := z.TagName() raw := byteCopy(z.Raw()) // create a copy here, because readAttributes modifies z.Raw, if attributes contain an & attrs = readAttributes(z, attrs) switch { case tt == html.ErrorToken: if z.Err() != io.EOF { return nil, nil, z.Err() } break forloop case tt == html.StartTagToken || tt == html.SelfClosingTagToken: if string(tag) == UicInclude { if replaceTextStart, replaceTextEnd, err := getInclude(z, attrs); err != nil { return nil, nil, err } else { fmt.Fprintf(buff, replaceTextStart) // Enhancement: WriteOut sub tree, to allow alternative content // for optional includes. fmt.Fprintf(buff, replaceTextEnd) continue } } if skipSubtreeIfUicRemove(z, tt, string(tag), attrs) { continue } case tt == html.EndTagToken: if string(tag) == UicFragment || string(tag) == UicTail { break forloop } } buff.Write(raw) } return StringFragment(buff.String()), dependencies, nil }
// parseHead copies the contents of the <head> element into c.head, skipping
// uic-remove subtrees and extracting meta-JSON <script> elements into c.meta
// instead of passing them through. Stops at </head> or end of input; a
// non-EOF tokenizer error is returned.
func (parser *HtmlContentParser) parseHead(z *html.Tokenizer, c *MemoryContent) error {
	attrs := make([]html.Attribute, 0, 10)
	headBuff := bytes.NewBuffer(nil)
forloop:
	for {
		tt := z.Next()
		tag, _ := z.TagName()
		raw := byteCopy(z.Raw()) // create a copy here, because readAttributes modifies z.Raw, if attributes contain an &
		attrs = readAttributes(z, attrs)
		switch {
		case tt == html.ErrorToken:
			if z.Err() != io.EOF {
				return z.Err()
			}
			break forloop
		case tt == html.StartTagToken || tt == html.SelfClosingTagToken:
			if skipSubtreeIfUicRemove(z, tt, string(tag), attrs) {
				continue
			}
			// Script elements of the meta type carry JSON configuration; they
			// are parsed into c.meta and not copied into the head fragment.
			if string(tag) == "script" && attrHasValue(attrs, "type", ScriptTypeMeta) {
				if err := parseMetaJson(z, c); err != nil {
					return err
				}
				continue
			}
		case tt == html.EndTagToken:
			if string(tag) == "head" {
				break forloop
			}
		}
		// Everything not skipped above is passed through verbatim.
		headBuff.Write(raw)
	}
	s := headBuff.String()
	st := strings.Trim(s, " \n")
	if len(st) > 0 {
		c.head = StringFragment(st)
	}
	return nil
}
func parseMetaJson(z *html.Tokenizer, c *MemoryContent) error { tt := z.Next() if tt != html.TextToken { return fmt.Errorf("expected text node for meta json, but found %v, (%s)", tt.String(), z.Raw()) } bytes := z.Text() err := json.Unmarshal(bytes, &c.meta) if err != nil { return fmt.Errorf("error while parsing json from meta json element: %v", err.Error()) } tt = z.Next() tag, _ := z.TagName() if tt != html.EndTagToken || string(tag) != "script" { return fmt.Errorf("Tag not properly ended. Expected </script>, but found %s", z.Raw()) } return nil }
func skipCompleteTag(z *html.Tokenizer, tagName string) error { forloop: for { tt := z.Next() tag, _ := z.TagName() switch { case tt == html.ErrorToken: if z.Err() != io.EOF { return z.Err() } break forloop case tt == html.EndTagToken: tagAsString := string(tag) if tagAsString == tagName { break forloop } } } return nil }
func (list *AnimeConventionList) Parse(t *html.Tokenizer) { for { next := t.Next() switch next { case html.ErrorToken: return case html.TextToken: //fmt.Println(string(t.Text())) case html.StartTagToken, html.EndTagToken, html.SelfClosingTagToken: _, hasmore := t.TagName() if hasmore { key, val, _ := t.TagAttr() if strings.EqualFold(string(key), "href") && strings.HasPrefix(string(val), "/events/info.shtml") { var item = &AnimeConventionItem{} item.url = strings.Join([]string{"http://animecons.com", string(val)}, "") list.taskNum++ go item.crawlInformation(&list.ConventionList) //time.Sleep(100 * time.Millisecond) } } } } }
func getTextR(z *html.Tokenizer) string { r := "" depth := 1 for { tt := z.Next() switch tt { case html.ErrorToken: panic(z.Err()) case html.TextToken: r += string(z.Text()) case html.StartTagToken: tn, _ := z.TagName() tns := strings.ToLower(string(tn)) switch tns { case "div": r += "\r" depth++ case "span": r += "'" depth++ } case html.EndTagToken: tn, _ := z.TagName() tns := strings.ToLower(string(tn)) switch tns { case "div": depth-- case "span": r += "'" depth-- } } if depth == 0 { return r } } }
func extractTitleFromTree(z *html.Tokenizer) string { depth := 0 for { tt := z.Next() switch tt { case html.ErrorToken: return "" case html.TextToken: if depth > 0 { title := strings.TrimSpace(string(z.Text())) lower := strings.ToLower(title) if strings.HasPrefix(lower, "imgur") { return "" } return title } case html.StartTagToken: tn, _ := z.TagName() if string(tn) == "title" { depth++ } } } }
func buildTokenArray(tokenizer *xhtml.Tokenizer) []tagInfo { tokens := []tagInfo{} for tt := tokenizer.Next(); tt != xhtml.ErrorToken; tt = tokenizer.Next() { switch tt { case xhtml.TextToken: txt := string(tokenizer.Text()) if len(tokens) == 0 { info := tagInfo{ raw: txt, } tokens = append(tokens, info) } tn, _ := tokenizer.TagName() key, val, _ := tokenizer.TagAttr() info := tagInfo{ tag: string(tn), key: string(key), val: string(val), txt: txt, } tokens = append(tokens, info) case xhtml.StartTagToken: tn, _ := tokenizer.TagName() key, val, _ := tokenizer.TagAttr() info := tagInfo{ tag: string(tn), key: string(key), val: string(val), } tokens = append(tokens, info) case xhtml.SelfClosingTagToken, xhtml.EndTagToken: tn, _ := tokenizer.TagName() key, val, _ := tokenizer.TagAttr() info := tagInfo{ tag: string(tn), key: string(key), val: string(val), closingTag: true, } tokens = append(tokens, info) } } return tokens }
// getTagName gets a tagName from tokenizer. func getTagName(tokenizer *html.Tokenizer) string { tagName, _ := tokenizer.TagName() return string(tagName) }
// parseBody parses the <body> element into c: body attributes are captured,
// uic-fragment/uic-tail subtrees are parsed into named fragments, uic-fetch
// elements register fetch dependencies, uic-include elements are replaced by
// their start/end text, and uic-remove subtrees are dropped. Remaining
// markup becomes the default ("") body fragment if none was declared.
// Stops at </body> or end of input; a non-EOF tokenizer error is returned.
func (parser *HtmlContentParser) parseBody(z *html.Tokenizer, c *MemoryContent) error {
	attrs := make([]html.Attribute, 0, 10)
	bodyBuff := bytes.NewBuffer(nil)
	// The tokenizer is currently positioned on the <body> tag itself, so
	// these attributes belong to <body>.
	attrs = readAttributes(z, attrs)
	if len(attrs) > 0 {
		c.bodyAttributes = StringFragment(joinAttrs(attrs))
	}
forloop:
	for {
		tt := z.Next()
		tag, _ := z.TagName()
		raw := byteCopy(z.Raw()) // create a copy here, because readAttributes modifies z.Raw, if attributes contain an &
		attrs = readAttributes(z, attrs)
		switch {
		case tt == html.ErrorToken:
			if z.Err() != io.EOF {
				return z.Err()
			}
			break forloop
		case tt == html.StartTagToken || tt == html.SelfClosingTagToken:
			if skipSubtreeIfUicRemove(z, tt, string(tag), attrs) {
				continue
			}
			if string(tag) == UicFragment {
				// Named fragment: its content and dependencies are stored
				// separately, keyed by the fragment name attribute.
				if f, deps, err := parseFragment(z); err != nil {
					return err
				} else {
					c.body[getFragmentName(attrs)] = f
					for _, dep := range deps {
						c.requiredContent[dep.URL] = dep
					}
				}
				continue
			}
			if string(tag) == UicTail {
				// Tail fragment: rendered at the end of the page.
				if f, deps, err := parseFragment(z); err != nil {
					return err
				} else {
					c.tail = f
					for _, dep := range deps {
						c.requiredContent[dep.URL] = dep
					}
				}
				continue
			}
			if string(tag) == UicFetch {
				if fd, err := getFetch(z, attrs); err != nil {
					return err
				} else {
					c.requiredContent[fd.URL] = fd
					continue
				}
			}
			if string(tag) == UicInclude {
				if replaceTextStart, replaceTextEnd, err := getInclude(z, attrs); err != nil {
					return err
				} else {
					bodyBuff.WriteString(replaceTextStart)
					// Enhancement: WriteOut sub tree, to allow alternative content
					// for optional includes.
					bodyBuff.WriteString(replaceTextEnd)
					continue
				}
			}
		case tt == html.EndTagToken:
			if string(tag) == "body" {
				break forloop
			}
		}
		// Everything not handled above is passed through verbatim.
		bodyBuff.Write(raw)
	}
	s := bodyBuff.String()
	// Leftover markup becomes the default fragment only when no explicit
	// default ("") fragment was declared in the page.
	if _, defaultFragmentExists := c.body[""]; !defaultFragmentExists {
		if st := strings.Trim(s, " \n"); len(st) > 0 {
			c.body[""] = StringFragment(st)
		}
	}
	return nil
}
// pullNode recursively builds a Markup tree from the token stream under root.
// Text, comment and doctype tokens become text children (comments are
// re-wrapped in <!-- -->); each start tag becomes a child node whose subtree
// is filled by a recursive call; an end tag terminates the current level.
func pullNode(tokens *html.Tokenizer, root *Markup) {
	var node *Markup
	for {
		token := tokens.Next()
		switch token {
		case html.ErrorToken:
			return
		case html.TextToken, html.CommentToken, html.DoctypeToken:
			text := strings.TrimSpace(string(tokens.Text()))
			if text == "" {
				continue
			}
			if token == html.CommentToken {
				text = "<!--" + text + "-->"
			}
			// Attach text to the most recently created child when one
			// exists, otherwise to the current root.
			if node != nil {
				NewText(text).Apply(node)
				continue
			}
			NewText(text).Apply(root)
			continue
		case html.StartTagToken, html.EndTagToken, html.SelfClosingTagToken:
			if token == html.EndTagToken {
				// End of the current element: return to the caller's level.
				node = nil
				return
			}
			tagName, hasAttr := tokens.TagName()
			node = NewMarkup(string(tagName), token == html.SelfClosingTagToken)
			node.Apply(root)
			if hasAttr {
			attrLoop:
				for {
					key, val, more := tokens.TagAttr()
					NewAttr(string(key), string(val)).Apply(node)
					if !more {
						break attrLoop
					}
				}
			}
			// Self-closing tags have no children to descend into.
			if token == html.SelfClosingTagToken {
				continue
			}
			// Descend into this element's subtree.
			pullNode(tokens, node)
		}
	}
}