// traverse_html_token walks the token stream using the high-level Token()
// API, printing each token's type, data, and attributes.
func traverse_html_token(z *html.Tokenizer) {
	for {
		if z.Next() == html.ErrorToken {
			return
		}
		token := z.Token()
		token_type := token.Type
		fmt.Printf("|token_type:%-20s|token.Data:%-10s|token.Attr:%-10s|\n",
			token_type, token.Data, token.Attr)
	}
}
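// Usage sketch (not part of the original source): run traverse_html_token
// over a small fragment. Assumes "strings" is imported alongside "fmt" and
// "golang.org/x/net/html".
func exampleTraverseHTMLToken() {
	z := html.NewTokenizer(strings.NewReader(`<p class="intro">Hello, <b>world</b>!</p>`))
	traverse_html_token(z)
}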
// traverse_html_tokenizer walks the token stream using the low-level
// accessors (Text, TagName, TagAttr). Each accessor is only meaningful for
// certain token types; this demo calls all of them on every token to show
// what they return.
func traverse_html_tokenizer(z *html.Tokenizer) {
	for {
		if z.Next() == html.ErrorToken {
			return
		}
		text_b := z.Text()
		tag_name_b, hasAttri := z.TagName()
		tag_attr_key_b, tag_attr_value_b, _ := z.TagAttr()

		text := string(text_b)
		tag_name := string(tag_name_b)
		tag_attr_key := string(tag_attr_key_b)
		tag_attr_value := string(tag_attr_value_b)

		fmt.Printf("|Tokenizer.Text:%-10s|Tokenizer.TagName:%-10s|hasAttri:%-10t|tag_attr_key:%-10s|tag_attr_value:%-10s|\n",
			text, tag_name, hasAttri, tag_attr_key, tag_attr_value)
	}
}
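// Usage sketch (not part of the original source): the same fragment through
// the low-level accessors, for comparison with traverse_html_token above.
func exampleTraverseHTMLTokenizer() {
	z := html.NewTokenizer(strings.NewReader(`<p class="intro">Hello, <b>world</b>!</p>`))
	traverse_html_tokenizer(z)
}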
// burnTokensUntilEndTag discards tokens until it consumes the end tag whose
// name matches tagName, or the stream ends.
func burnTokensUntilEndTag(firewood *html.Tokenizer, tagName string) {
	rawTagName := []byte(tagName)
	for {
		token := firewood.Next()
		switch token {
		case html.ErrorToken:
			return
		case html.EndTagToken:
			name, _ := firewood.TagName()
			// log.Println("Struck token " + string(name))
			if bytes.Equal(name, rawTagName) {
				// log.Println("Extinguishing token fire.")
				return
			}
		}
	}
}
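// Usage sketch (not part of the original source): skip the contents of every
// <script> element while scanning a document. Assumes "bytes" is imported.
func exampleBurnScripts(z *html.Tokenizer) {
	for {
		tt := z.Next()
		if tt == html.ErrorToken {
			return
		}
		if tt == html.StartTagToken {
			name, _ := z.TagName()
			if bytes.Equal(name, []byte("script")) {
				burnTokensUntilEndTag(z, "script")
			}
		}
	}
}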
// skip forward to the next text token, and return its contents as a string
func next(z *html.Tokenizer) string {
	for tt := z.Next(); true; tt = z.Next() {
		if tt == html.TextToken {
			res := string(z.Text())
			if debug {
				fmt.Printf("next: %q\n", res)
			}
			return res
		}
		if tt == html.ErrorToken {
			return ""
		}
		if debug {
			fmt.Println("skipping: ", tt)
		}
	}
	return ""
}
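// Usage sketch (not part of the original source): pull successive text nodes
// from a list; next returns "" once the stream is exhausted.
func exampleNext() {
	z := html.NewTokenizer(strings.NewReader("<ul><li>one</li><li>two</li></ul>"))
	for s := next(z); s != ""; s = next(z) {
		fmt.Println(s)
	}
}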
// textUpToEndTag accumulates the contents of every text token until the end
// tag named tagName (or the end of the stream) and returns them as bytes.
func textUpToEndTag(tokenizer *html.Tokenizer, tagName string) []byte {
	var textBuffer bytes.Buffer
	rawTagName := []byte(tagName)
	for done := false; !done; {
		token := tokenizer.Next()
		switch token {
		case html.TextToken:
			textBuffer.Write(tokenizer.Text())
		case html.EndTagToken:
			name, _ := tokenizer.TagName()
			if bytes.Equal(rawTagName, name) {
				done = true
			}
		case html.ErrorToken:
			done = true
		}
	}
	return textBuffer.Bytes()
}
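// Usage sketch (not part of the original source): grab the <title> text by
// scanning for its start tag, then collecting text up to the matching end tag.
func exampleTitle(z *html.Tokenizer) string {
	for {
		tt := z.Next()
		if tt == html.ErrorToken {
			return ""
		}
		if tt == html.StartTagToken {
			name, _ := z.TagName()
			if bytes.Equal(name, []byte("title")) {
				return string(textUpToEndTag(z, "title"))
			}
		}
	}
}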
// parseToken consumes one token and records it under parent. It returns
// (errorToken, parentEnded, unsetEndTag): errorToken reports that the stream
// is exhausted, parentEnded reports that the end tag closing parent was
// consumed, and unsetEndTag carries an end-tag name that has not yet been
// matched to its element.
func parseToken(tokenizer *html.Tokenizer, htmlDoc *htmlDocument, parent *tagElement) (bool, bool, string) {
	tokenType := tokenizer.Next()
	switch tokenType {
	case html.ErrorToken:
		return true, false, ""
	case html.TextToken:
		text := string(tokenizer.Text())
		if strings.TrimSpace(text) == "" {
			break
		}
		textElement := &textElement{text: text}
		appendElement(htmlDoc, parent, textElement)
	case html.StartTagToken:
		tagElement := &tagElement{tagName: getTagName(tokenizer), startTagRaw: string(tokenizer.Raw())}
		appendElement(htmlDoc, parent, tagElement)
		// Recursively parse children until this element's end tag is seen.
		for {
			errorToken, parentEnded, unsetEndTag := parseToken(tokenizer, htmlDoc, tagElement)
			if errorToken {
				return true, false, ""
			}
			if parentEnded {
				if unsetEndTag != "" {
					return false, false, unsetEndTag
				}
				break
			}
			if unsetEndTag != "" {
				return false, false, setEndTagRaw(tokenizer, tagElement, unsetEndTag)
			}
		}
	case html.EndTagToken:
		return false, true, setEndTagRaw(tokenizer, parent, getTagName(tokenizer))
	case html.DoctypeToken, html.SelfClosingTagToken, html.CommentToken:
		tagElement := &tagElement{tagName: getTagName(tokenizer), startTagRaw: string(tokenizer.Raw())}
		appendElement(htmlDoc, parent, tagElement)
	}
	return false, false, ""
}
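// Usage sketch (not part of the original source; assumes htmlDocument,
// tagElement, and the helpers used by parseToken are defined elsewhere in
// this package, and that appendElement and setEndTagRaw tolerate a nil
// parent at the top level): drive parseToken until the tokenizer is
// exhausted, building htmlDoc's top-level elements.
func parseDocument(tokenizer *html.Tokenizer, htmlDoc *htmlDocument) {
	for {
		if errorToken, _, _ := parseToken(tokenizer, htmlDoc, nil); errorToken {
			return
		}
	}
}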