Example #1
0
// setEndTagRaw sets an endTagRaw to the parent.
func setEndTagRaw(tokenizer *html.Tokenizer, parent *tagElement, tagName string) string {
	if parent != nil && parent.tagName == tagName {
		parent.endTagRaw = string(tokenizer.Raw())
		return ""
	}
	return tagName
}
Example #2
0
File: html.go Project: bfix/sid
/*
 * Read current tag with attributes.
 * @param tk *html.Tokenizer - tokenizer instance
 * @return *Tag - reference to read tag
 */
func readTag(tk *html.Tokenizer) *Tag {

	// we are only interested in certain tags
	tag, _ := tk.TagName()
	name := string(tag)
	switch name {
	//-----------------------------------------------------
	// external script files
	//-----------------------------------------------------
	case "script":
		attrs := getAttrs(tk)
		if attrs != nil {
			if _, ok := attrs["src"]; ok {
				// add external reference to script file
				return NewTag("script", attrs)
			}
		}

	//-----------------------------------------------------
	// external image
	//-----------------------------------------------------
	case "img":
		attrs := getAttrs(tk)
		if attrs != nil {
			return NewTag("img", attrs)
		}

	//-----------------------------------------------------
	// external links (style sheets)
	//-----------------------------------------------------
	case "link":
		attrs := getAttrs(tk)
		if attrs != nil {
			if _, ok := attrs["href"]; ok {
				// add external reference to link
				return NewTag("link", attrs)
			}
		}

	//-----------------------------------------------------
	// input fields
	//-----------------------------------------------------
	case "input":
		attrs := getAttrs(tk)
		if attrs != nil {
			if _, ok := attrs["type"]; ok {
				// add external reference to link
				return NewTag("input", attrs)
			}
		}
	}
	//-----------------------------------------------------
	// ignore all other tags (no tag processed).
	//-----------------------------------------------------
	return nil
}
Example #3
0
func hasTablPluieClass(z *html.Tokenizer) bool {
	key, val, more := z.TagAttr()
	if string(key) == "class" && string(val) == "tablPluie" {
		return true
	}
	if more {
		return hasTablPluieClass(z)
	}
	return false
}
Example #4
0
func traverse_html_token(z *html.Tokenizer) {
	for {
		if z.Next() == html.ErrorToken {
			return
		}
		token := z.Token()
		token_type := token.Type
		fmt.Printf("|token_type:%-20s|token.Data:%-10s|token.Attr:%-10s|\n", token_type, token.Data, token.Attr)
	}
}
Example #5
0
func getAttrVal(tokenizer *html.Tokenizer, attrName string) string {

	for {

		key, val, moreAttr := tokenizer.TagAttr()

		if string(key) == attrName {
			return string(val)
		}

		if !moreAttr {
			return ""
		}
	}
}
Example #6
0
// parseAnchorAttrs iterates over all of the attributes in the current anchor token.
// If a href is found, it adds the link value to the links slice.
// Returns the new link slice.
func parseAnchorAttrs(tokenizer *html.Tokenizer, links []*URL) []*URL {
	//TODO: rework this to be cleaner, passing in `links` to be appended to
	//isn't great
	for {
		key, val, moreAttr := tokenizer.TagAttr()
		if bytes.Compare(key, []byte("href")) == 0 {
			u, err := ParseURL(strings.TrimSpace(string(val)))
			if err == nil {
				links = append(links, u)
			}
		}
		if !moreAttr {
			return links
		}
	}
}
Example #7
0
func burnTokensUntilEndTag(firewood *html.Tokenizer, tagName string) {
	rawTagName := []byte(tagName)
	for {
		token := firewood.Next()
		switch token {
		case html.ErrorToken:
			return
		case html.EndTagToken:
			name, _ := firewood.TagName()
			// log.Println("Struck token " + string(name))
			if bytes.Equal(name, rawTagName) {
				// log.Println("Extinguishing token fire.")
				return
			}
		}
	}
}
Example #8
0
File: html.go Project: bfix/sid
/*
 * Get list of attributes for a tag.
 * If the tag is at the end of a HTML fragment and not all attributes
 * can be read by the tokenizer, this call terminates with a "nil"
 * map to indicate failure. The tag is than dropped (for an eavesdropper
 * this looks like a cached resource)
 * @param tk *html.Tokenizer - tokenizer instance
 * @return map[string]string - list of attributes
 */
func getAttrs(tk *html.Tokenizer) (list map[string]string) {

	// handle panic during parsing
	defer func() {
		if r := recover(); r != nil {
			logger.Printf(logger.WARN, "[sid.html] Skipping fragmented tag: %v\n", r)
			list = nil
		}
	}()

	// parse attributes from HTML text
	list = make(map[string]string)
	for {
		key, val, more := tk.TagAttr()
		list[string(key)] = string(val)
		if !more {
			break
		}
	}
	return
}
Example #9
0
func traverse_html_tokenizer(z *html.Tokenizer) {
	for {
		if z.Next() == html.ErrorToken {
			return
		}
		text_b := z.Text()
		tag_name_b, hasAttri := z.TagName()
		tag_attr_key_b, tag_attr_value_b, _ := z.TagAttr()
		text := string(text_b)
		tag_name := string(tag_name_b)
		tag_attr_key := string(tag_attr_key_b)
		tag_attr_value := string(tag_attr_value_b)
		fmt.Printf("|Tokenizer.Text:%-10s|Tokenizer.TagName:%-10s|hasAttri:%-10t|tag_attr_key:%-10s|tag_attr_value:%-10s|\n", text, tag_name, hasAttri, tag_attr_key, tag_attr_value)
	}
}
Example #10
0
func parseToken(tokenizer *html.Tokenizer, htmlDoc *htmlDocument, parent *tagElement) (bool, bool, string) {
	tokenType := tokenizer.Next()
	switch tokenType {
	case html.ErrorToken:
		return true, false, ""
	case html.TextToken:
		text := string(tokenizer.Text())
		if strings.TrimSpace(text) == "" {
			break
		}
		textElement := &textElement{text: text}
		appendElement(htmlDoc, parent, textElement)
	case html.StartTagToken:
		tagElement := &tagElement{tagName: getTagName(tokenizer), startTagRaw: string(tokenizer.Raw())}
		appendElement(htmlDoc, parent, tagElement)
		for {
			errorToken, parentEnded, unsetEndTag := parseToken(tokenizer, htmlDoc, tagElement)
			if errorToken {
				return true, false, ""
			}
			if parentEnded {
				if unsetEndTag != "" {
					return false, false, unsetEndTag
				}
				break
			}
			if unsetEndTag != "" {
				return false, false, setEndTagRaw(tokenizer, tagElement, unsetEndTag)
			}
		}
	case html.EndTagToken:
		return false, true, setEndTagRaw(tokenizer, parent, getTagName(tokenizer))
	case html.DoctypeToken, html.SelfClosingTagToken, html.CommentToken:
		tagElement := &tagElement{tagName: getTagName(tokenizer), startTagRaw: string(tokenizer.Raw())}
		appendElement(htmlDoc, parent, tagElement)
	}
	return false, false, ""
}
Example #11
0
// skip forward to the next text, and return it as a string
func next(z *html.Tokenizer) string {
	for tt := z.Next(); true; tt = z.Next() {
		if tt == html.TextToken {
			res := string(z.Text())
			if debug {
				fmt.Printf("next: %q\n", res)
			}
			return res
		}
		if tt == html.ErrorToken {
			return ""
		}
		if debug {
			fmt.Println("skipping: ", tt)
		}
	}
	return ""
}
Example #12
0
func textUpToEndTag(tokenizer *html.Tokenizer, tagName string) []byte {
	var textBuffer bytes.Buffer
	rawTagName := []byte(tagName)
	for done := false; !done; {
		token := tokenizer.Next()
		switch token {
		case html.TextToken:
			textBuffer.Write(tokenizer.Text())
		case html.EndTagToken:
			name, _ := tokenizer.TagName()
			if bytes.Equal(rawTagName, name) {
				done = true
			}
		case html.ErrorToken:
			done = true
		}
	}
	return textBuffer.Bytes()
}
Example #13
0
// getTagName gets a tagName from tokenizer.
func getTagName(tokenizer *html.Tokenizer) string {
	tagName, _ := tokenizer.TagName()
	return string(tagName)
}