Example #1
0
// fixHtml parses wild as HTML and returns the well-formed rendering of the
// children of its <body> element. If the input cannot be parsed or rendered,
// or the parsed tree has no <body> element, it returns the input with HTML
// special characters escaped instead.
//
// The parsed tree is handed to fixImgs together with linkUrl before it is
// rendered.
func fixHtml(linkUrl string, wild []byte) (well []byte) {
	n, err := html.Parse(bytes.NewReader(wild))
	if err != nil {
		return []byte(html.EscapeString(string(wild)))
	}

	fixImgs(linkUrl, n)

	// Locate the <body> element in the tree rather than searching the rendered
	// bytes for "<body>" and "</body>". A byte search is fooled by those
	// literals appearing inside comments or scripts (truncating the output in
	// the middle of a comment) and never matches a <body> tag that carries
	// attributes.
	root := n.FirstChild
	for root != nil && !(root.Type == html.ElementNode && root.Data == "html") {
		root = root.NextSibling
	}
	if root == nil {
		return []byte(html.EscapeString(string(wild)))
	}
	body := root.FirstChild
	for body != nil && !(body.Type == html.ElementNode && body.Data == "body") {
		body = body.NextSibling
	}
	if body == nil {
		return []byte(html.EscapeString(string(wild)))
	}

	// A bytes.Buffer panics with bytes.ErrTooLarge when it cannot grow. Treat
	// that like any other rendering failure by overwriting the named result
	// with the escaped input; every other panic is re-raised unchanged.
	defer func() {
		if err := recover(); err == bytes.ErrTooLarge {
			well = []byte(html.EscapeString(string(wild)))
		} else if err != nil {
			panic(err)
		}
	}()

	// Render only the body's children, so the result excludes the surrounding
	// <html>, <head> and <body> tags.
	buf := bytes.NewBuffer(make([]byte, 0, len(wild)*2))
	for c := body.FirstChild; c != nil; c = c.NextSibling {
		if err := html.Render(buf, c); err != nil {
			return []byte(html.EscapeString(string(wild)))
		}
	}
	return buf.Bytes()
}
Example #2
0
// Html serializes n and its subtree to HTML markup, approximating the original
// HTML as seen by the parser. Two differences from the input:
// a) the void elements img, meta, br and input are written self-closing
// (e.g. "<br />") when they have no children, and
// b) attribute values, class names and text are escaped.
func (n *Node) Html() (s string) {
	s = "<" + n.Name
	for _, a := range n.Attr {
		s += " " + a.Key + "=\"" + html.EscapeString(a.Val) + "\""
	}
	if len(n.class) > 0 {
		s += " class=\""
		for i, c := range n.class {
			if i > 0 {
				s += " "
			}
			// Class names end up inside a quoted attribute value, so they are
			// escaped like every other attribute value.
			s += html.EscapeString(c)
		}
		s += "\""
	}

	// Void elements are closed inside the start tag itself. Emitting ">" first
	// and " />" afterwards would produce broken markup such as "<br> />".
	// A void element that unexpectedly has children falls through to the
	// regular open/close form so that no content is lost.
	if len(n.subs) == 0 {
		switch n.Name {
		case "img", "meta", "br", "input":
			return s + " />"
		}
	}

	s += ">"
	for _, c := range n.subs {
		if c.Name == TEXT_NODE {
			s += html.EscapeString(c.Text)
		} else {
			s += c.Html()
		}
	}
	return s + "</" + n.Name + ">"
}
Example #3
0
// sanitizeHtmlSafe sanitizes the given input by tokenizing it as HTML5, then
// whitelisting known to be safe elements and attributes. All other HTML is
// escaped, unsafe attributes are stripped.
//
// It panics if the tokenizer reports an error other than io.EOF or yields a
// token type that is not handled here.
func sanitizeHtmlSafe(input []byte) []byte {
	r := bytes.NewReader(input)
	var w bytes.Buffer
	tokenizer := html.NewTokenizer(r)
	// Errors from individual writes are sticky inside the bufio.Writer and are
	// checked once, when it is flushed at the end.
	wr := bufio.NewWriter(&w)

	// Iterate through all tokens in the input stream and sanitize them.
	for t := tokenizer.Next(); t != html.ErrorToken; t = tokenizer.Next() {
		switch t {
		case html.TextToken:
			// Text is written escaped.
			wr.WriteString(tokenizer.Token().String())
		case html.SelfClosingTagToken, html.StartTagToken:
			// HTML tags are escaped unless whitelisted.
			tag, hasAttributes := tokenizer.TagName()
			tagName := string(tag)
			if !whitelistTags[tagName] {
				wr.WriteString(html.EscapeString(string(tokenizer.Raw())))
				break
			}

			// Both tables depend only on the tag, so look them up once per
			// tag instead of once per attribute. A tag without an entry
			// yields an empty set, which rejects every attribute.
			allowedAttrs := whitelistAttrs[tagName]
			checkedAttrs := protocolAttrs[tagName]

			wr.WriteString("<")
			wr.Write(tag)
			for hasAttributes {
				var key, val []byte
				key, val, hasAttributes = tokenizer.TagAttr()
				attrName := string(key)
				// Only include whitelisted attributes for the given tagName.
				if !allowedAttrs[attrName] {
					continue
				}
				// If it's an attribute that requires protocol checking, strip
				// it when its value is not known to be safe.
				if checkedAttrs[attrName] && !protocolAllowed(val) {
					continue
				}
				wr.WriteByte(' ')
				wr.Write(key)
				wr.WriteString(`="`)
				wr.WriteString(html.EscapeString(string(val)))
				wr.WriteByte('"')
			}
			wr.WriteString(">")
		case html.EndTagToken:
			tag, _ := tokenizer.TagName()
			if whitelistTags[string(tag)] {
				// Emit a canonical end tag built from the whitelisted name
				// only. Copying the raw token would pass through whatever
				// untrusted bytes follow the name inside the end tag.
				wr.WriteString("</")
				wr.Write(tag)
				wr.WriteByte('>')
			} else {
				wr.WriteString(html.EscapeString(string(tokenizer.Raw())))
			}
		case html.CommentToken:
			// Comments are not really expected and are copied through as-is.
			// NOTE(review): the raw comment is untrusted and unescaped —
			// confirm that passing it through is acceptable for all consumers.
			wr.Write(tokenizer.Raw())
		case html.DoctypeToken:
			// Escape DOCTYPES, entities etc can be dangerous
			wr.WriteString(html.EscapeString(string(tokenizer.Raw())))
		default:
			panic(fmt.Errorf("Unexpected token type %v", t))
		}
	}
	// The loop ends on an ErrorToken; io.EOF is the normal end of input.
	if err := tokenizer.Err(); err != nil && err != io.EOF {
		panic(err)
	}
	if err := wr.Flush(); err != nil {
		panic(err)
	}
	return w.Bytes()
}