// setEndTagRaw sets an endTagRaw to the parent. func setEndTagRaw(tokenizer *html.Tokenizer, parent *tagElement, tagName string) string { if parent != nil && parent.tagName == tagName { parent.endTagRaw = string(tokenizer.Raw()) return "" } return tagName }
/* * Read current tag with attributes. * @param tk *html.Tokenizer - tokenizer instance * @return *Tag - reference to read tag */ func readTag(tk *html.Tokenizer) *Tag { // we are only interested in certain tags tag, _ := tk.TagName() name := string(tag) switch name { //----------------------------------------------------- // external script files //----------------------------------------------------- case "script": attrs := getAttrs(tk) if attrs != nil { if _, ok := attrs["src"]; ok { // add external reference to script file return NewTag("script", attrs) } } //----------------------------------------------------- // external image //----------------------------------------------------- case "img": attrs := getAttrs(tk) if attrs != nil { return NewTag("img", attrs) } //----------------------------------------------------- // external links (style sheets) //----------------------------------------------------- case "link": attrs := getAttrs(tk) if attrs != nil { if _, ok := attrs["href"]; ok { // add external reference to link return NewTag("link", attrs) } } //----------------------------------------------------- // input fields //----------------------------------------------------- case "input": attrs := getAttrs(tk) if attrs != nil { if _, ok := attrs["type"]; ok { // add external reference to link return NewTag("input", attrs) } } } //----------------------------------------------------- // ignore all other tags (no tag processed). //----------------------------------------------------- return nil }
func hasTablPluieClass(z *html.Tokenizer) bool { key, val, more := z.TagAttr() if string(key) == "class" && string(val) == "tablPluie" { return true } if more { return hasTablPluieClass(z) } return false }
func traverse_html_token(z *html.Tokenizer) { for { if z.Next() == html.ErrorToken { return } token := z.Token() token_type := token.Type fmt.Printf("|token_type:%-20s|token.Data:%-10s|token.Attr:%-10s|\n", token_type, token.Data, token.Attr) } }
func getAttrVal(tokenizer *html.Tokenizer, attrName string) string { for { key, val, moreAttr := tokenizer.TagAttr() if string(key) == attrName { return string(val) } if !moreAttr { return "" } } }
// parseAnchorAttrs iterates over all of the attributes in the current anchor token. // If a href is found, it adds the link value to the links slice. // Returns the new link slice. func parseAnchorAttrs(tokenizer *html.Tokenizer, links []*URL) []*URL { //TODO: rework this to be cleaner, passing in `links` to be appended to //isn't great for { key, val, moreAttr := tokenizer.TagAttr() if bytes.Compare(key, []byte("href")) == 0 { u, err := ParseURL(strings.TrimSpace(string(val))) if err == nil { links = append(links, u) } } if !moreAttr { return links } } }
func burnTokensUntilEndTag(firewood *html.Tokenizer, tagName string) { rawTagName := []byte(tagName) for { token := firewood.Next() switch token { case html.ErrorToken: return case html.EndTagToken: name, _ := firewood.TagName() // log.Println("Struck token " + string(name)) if bytes.Equal(name, rawTagName) { // log.Println("Extinguishing token fire.") return } } } }
/* * Get list of attributes for a tag. * If the tag is at the end of a HTML fragment and not all attributes * can be read by the tokenizer, this call terminates with a "nil" * map to indicate failure. The tag is than dropped (for an eavesdropper * this looks like a cached resource) * @param tk *html.Tokenizer - tokenizer instance * @return map[string]string - list of attributes */ func getAttrs(tk *html.Tokenizer) (list map[string]string) { // handle panic during parsing defer func() { if r := recover(); r != nil { logger.Printf(logger.WARN, "[sid.html] Skipping fragmented tag: %v\n", r) list = nil } }() // parse attributes from HTML text list = make(map[string]string) for { key, val, more := tk.TagAttr() list[string(key)] = string(val) if !more { break } } return }
func traverse_html_tokenizer(z *html.Tokenizer) { for { if z.Next() == html.ErrorToken { return } text_b := z.Text() tag_name_b, hasAttri := z.TagName() tag_attr_key_b, tag_attr_value_b, _ := z.TagAttr() text := string(text_b) tag_name := string(tag_name_b) tag_attr_key := string(tag_attr_key_b) tag_attr_value := string(tag_attr_value_b) fmt.Printf("|Tokenizer.Text:%-10s|Tokenizer.TagName:%-10s|hasAttri:%-10t|tag_attr_key:%-10s|tag_attr_value:%-10s|\n", text, tag_name, hasAttri, tag_attr_key, tag_attr_value) } }
// parseToken consumes the next token and attaches the resulting element
// to parent (or to htmlDoc at the top level). It returns three values:
//   - errorToken:  the tokenizer hit an error token (e.g. EOF); stop parsing
//   - parentEnded: an end tag closed the caller's element
//   - unsetEndTag: an end-tag name that did not match the element it was
//     seen under and must be propagated up to an ancestor
func parseToken(tokenizer *html.Tokenizer, htmlDoc *htmlDocument, parent *tagElement) (bool, bool, string) {
	tokenType := tokenizer.Next()
	switch tokenType {
	case html.ErrorToken:
		return true, false, ""
	case html.TextToken:
		text := string(tokenizer.Text())
		// whitespace-only text nodes are dropped
		if strings.TrimSpace(text) == "" {
			break
		}
		textElement := &textElement{text: text}
		appendElement(htmlDoc, parent, textElement)
	case html.StartTagToken:
		tagElement := &tagElement{tagName: getTagName(tokenizer), startTagRaw: string(tokenizer.Raw())}
		appendElement(htmlDoc, parent, tagElement)
		// recursively consume children until this element is closed
		for {
			errorToken, parentEnded, unsetEndTag := parseToken(tokenizer, htmlDoc, tagElement)
			if errorToken {
				return true, false, ""
			}
			if parentEnded {
				// an end tag closed the child; if its name did not match
				// the child, hand it up to our own caller
				if unsetEndTag != "" {
					return false, false, unsetEndTag
				}
				break
			}
			if unsetEndTag != "" {
				// try to close this element with the propagated end-tag
				// name; pass it further up if it does not match either
				return false, false, setEndTagRaw(tokenizer, tagElement, unsetEndTag)
			}
		}
	case html.EndTagToken:
		// close the caller's element (or propagate the name upward when
		// it does not match — see setEndTagRaw)
		return false, true, setEndTagRaw(tokenizer, parent, getTagName(tokenizer))
	case html.DoctypeToken, html.SelfClosingTagToken, html.CommentToken:
		// leaf elements: raw text only, no children or end tag
		tagElement := &tagElement{tagName: getTagName(tokenizer), startTagRaw: string(tokenizer.Raw())}
		appendElement(htmlDoc, parent, tagElement)
	}
	return false, false, ""
}
// skip forward to the next text, and return it as a string func next(z *html.Tokenizer) string { for tt := z.Next(); true; tt = z.Next() { if tt == html.TextToken { res := string(z.Text()) if debug { fmt.Printf("next: %q\n", res) } return res } if tt == html.ErrorToken { return "" } if debug { fmt.Println("skipping: ", tt) } } return "" }
func textUpToEndTag(tokenizer *html.Tokenizer, tagName string) []byte { var textBuffer bytes.Buffer rawTagName := []byte(tagName) for done := false; !done; { token := tokenizer.Next() switch token { case html.TextToken: textBuffer.Write(tokenizer.Text()) case html.EndTagToken: name, _ := tokenizer.TagName() if bytes.Equal(rawTagName, name) { done = true } case html.ErrorToken: done = true } } return textBuffer.Bytes() }
// getTagName gets a tagName from tokenizer. func getTagName(tokenizer *html.Tokenizer) string { tagName, _ := tokenizer.TagName() return string(tagName) }