/* * Read current tag with attributes. * @param tk *html.Tokenizer - tokenizer instance * @return *Tag - reference to read tag */ func readTag(tk *html.Tokenizer) *Tag { // we are only interested in certain tags tag, _ := tk.TagName() name := string(tag) switch name { //----------------------------------------------------- // external script files //----------------------------------------------------- case "script": attrs := getAttrs(tk) if attrs != nil { if _, ok := attrs["src"]; ok { // add external reference to script file return NewTag("script", attrs) } } //----------------------------------------------------- // external image //----------------------------------------------------- case "img": attrs := getAttrs(tk) if attrs != nil { return NewTag("img", attrs) } //----------------------------------------------------- // external links (style sheets) //----------------------------------------------------- case "link": attrs := getAttrs(tk) if attrs != nil { if _, ok := attrs["href"]; ok { // add external reference to link return NewTag("link", attrs) } } //----------------------------------------------------- // input fields //----------------------------------------------------- case "input": attrs := getAttrs(tk) if attrs != nil { if _, ok := attrs["type"]; ok { // add external reference to link return NewTag("input", attrs) } } } //----------------------------------------------------- // ignore all other tags (no tag processed). //----------------------------------------------------- return nil }
func traverse_html_tokenizer(z *html.Tokenizer) { for { if z.Next() == html.ErrorToken { return } text_b := z.Text() tag_name_b, hasAttri := z.TagName() tag_attr_key_b, tag_attr_value_b, _ := z.TagAttr() text := string(text_b) tag_name := string(tag_name_b) tag_attr_key := string(tag_attr_key_b) tag_attr_value := string(tag_attr_value_b) fmt.Printf("|Tokenizer.Text:%-10s|Tokenizer.TagName:%-10s|hasAttri:%-10t|tag_attr_key:%-10s|tag_attr_value:%-10s|\n", text, tag_name, hasAttri, tag_attr_key, tag_attr_value) } }
func burnTokensUntilEndTag(firewood *html.Tokenizer, tagName string) { rawTagName := []byte(tagName) for { token := firewood.Next() switch token { case html.ErrorToken: return case html.EndTagToken: name, _ := firewood.TagName() // log.Println("Struck token " + string(name)) if bytes.Equal(name, rawTagName) { // log.Println("Extinguishing token fire.") return } } } }
func textUpToEndTag(tokenizer *html.Tokenizer, tagName string) []byte { var textBuffer bytes.Buffer rawTagName := []byte(tagName) for done := false; !done; { token := tokenizer.Next() switch token { case html.TextToken: textBuffer.Write(tokenizer.Text()) case html.EndTagToken: name, _ := tokenizer.TagName() if bytes.Equal(rawTagName, name) { done = true } case html.ErrorToken: done = true } } return textBuffer.Bytes() }
// getTagName gets a tagName from tokenizer. func getTagName(tokenizer *html.Tokenizer) string { tagName, _ := tokenizer.TagName() return string(tagName) }