Example #1
0
// traverse_html_tokenizer walks every token produced by z and prints one
// formatted row per token to standard output. It stops, without printing a
// row, as soon as z.Next reports html.ErrorToken.
//
// For each token it queries Text, TagName and TagAttr regardless of the token
// type. TagAttr is called only once per token, so at most one attribute
// key/value pair appears in each row.
func traverse_html_tokenizer(z *html.Tokenizer) {
	const rowFormat = "|Tokenizer.Text:%-10s|Tokenizer.TagName:%-10s|hasAttri:%-10t|tag_attr_key:%-10s|tag_attr_value:%-10s|\n"

	for z.Next() != html.ErrorToken {
		// Keep the accessor order Text -> TagName -> TagAttr unchanged.
		rawText := z.Text()
		rawName, hasAttr := z.TagName()
		rawKey, rawVal, _ := z.TagAttr()

		fmt.Printf(rowFormat, string(rawText), string(rawName), hasAttr, string(rawKey), string(rawVal))
	}
}
Example #2
0
// next advances z until it reaches a text token and returns that token's text
// as a string. Tokens of any other type are skipped. If the tokenizer reports
// html.ErrorToken before a text token is found, next returns "".
//
// When the package-level debug flag is set, the returned text and every
// skipped token type are printed to standard output.
func next(z *html.Tokenizer) string {
	for {
		switch kind := z.Next(); kind {
		case html.TextToken:
			text := string(z.Text())
			if debug {
				fmt.Printf("next: %q\n", text)
			}
			return text
		case html.ErrorToken:
			return ""
		default:
			if debug {
				fmt.Println("skipping: ", kind)
			}
		}
	}
}
Example #3
0
// textUpToEndTag consumes tokens from tokenizer and returns the concatenated
// text of every text token seen before the first end tag named tagName.
//
// Scanning stops when that end tag is read (the end tag itself is consumed)
// or when the tokenizer reports html.ErrorToken, whichever comes first; in
// both cases the text collected so far is returned. All other token types are
// ignored. Nesting is not tracked: the first end tag with a matching name
// ends the scan.
//
// The tag name is matched case-insensitively. Tokenizer.TagName always
// returns the lower-cased name, so an exact byte comparison would never match
// a caller-supplied name such as "DIV" and the scan would run to end of input.
func textUpToEndTag(tokenizer *html.Tokenizer, tagName string) []byte {
	var textBuffer bytes.Buffer
	wantName := []byte(tagName)
	for {
		switch tokenizer.Next() {
		case html.TextToken:
			textBuffer.Write(tokenizer.Text())
		case html.EndTagToken:
			name, _ := tokenizer.TagName()
			if bytes.EqualFold(wantName, name) {
				return textBuffer.Bytes()
			}
		case html.ErrorToken:
			return textBuffer.Bytes()
		}
	}
}
Example #4
0
// parseToken reads exactly one token from tokenizer and records it in htmlDoc
// under parent. A start tag additionally triggers recursive calls that consume
// the tokens following it, with the new element as their parent.
//
// Results, in order:
//   - true when the tokenizer produced html.ErrorToken, either for this token
//     or inside a nested call; the other two results are then false and "".
//   - true only when the token just read was an end tag.
//   - a string produced by setEndTagRaw, or passed up unchanged from a nested
//     call. NOTE(review): presumably the name of an end tag that has not yet
//     been attached to its element - confirm against setEndTagRaw.
//
// Text tokens consisting only of whitespace are dropped. Doctype,
// self-closing and comment tokens are each stored as a childless tagElement.
func parseToken(tokenizer *html.Tokenizer, htmlDoc *htmlDocument, parent *tagElement) (bool, bool, string) {
	switch tokenizer.Next() {
	case html.ErrorToken:
		return true, false, ""

	case html.EndTagToken:
		return false, true, setEndTagRaw(tokenizer, parent, getTagName(tokenizer))

	case html.TextToken:
		if content := string(tokenizer.Text()); strings.TrimSpace(content) != "" {
			appendElement(htmlDoc, parent, &textElement{text: content})
		}

	case html.DoctypeToken, html.SelfClosingTagToken, html.CommentToken:
		// getTagName is evaluated before Raw, as the field order dictates.
		leaf := &tagElement{tagName: getTagName(tokenizer), startTagRaw: string(tokenizer.Raw())}
		appendElement(htmlDoc, parent, leaf)

	case html.StartTagToken:
		child := &tagElement{tagName: getTagName(tokenizer), startTagRaw: string(tokenizer.Raw())}
		appendElement(htmlDoc, parent, child)

		// Keep consuming tokens as children until an end tag, an error, or a
		// pending end-tag string stops the loop.
		for {
			failed, closed, pending := parseToken(tokenizer, htmlDoc, child)
			switch {
			case failed:
				return true, false, ""
			case closed && pending != "":
				// An end tag was read but left a pending string: pass it up.
				return false, false, pending
			case closed:
				// End tag read with nothing pending: this element is done.
				return false, false, ""
			case pending != "":
				// A nested call passed a pending string up: try it on child.
				return false, false, setEndTagRaw(tokenizer, child, pending)
			}
		}
	}
	return false, false, ""
}