// FixHtml parses bytes as HTML and returns well-formed HTML if the parse // was successful, or escaped HTML, if not. func fixHtml(linkUrl string, wild []byte) (well []byte) { n, err := html.Parse(bytes.NewReader(wild)) if err != nil { return []byte(html.EscapeString(string(wild))) } fixImgs(linkUrl, n) defer func() { if err := recover(); err == bytes.ErrTooLarge { well = []byte(html.EscapeString(string(wild))) } else if err != nil { panic(err) } }() buf := bytes.NewBuffer(make([]byte, 0, len(wild)*2)) if err := html.Render(buf, n); err != nil { return []byte(html.EscapeString(string(wild))) } well = buf.Bytes() openBody := []byte("<body>") i := bytes.Index(well, openBody) if i < 0 { return []byte(html.EscapeString(string(wild))) } well = well[i+len(openBody):] closeBody := []byte("</body>") i = bytes.Index(well, closeBody) if i < 0 { return []byte(html.EscapeString(string(wild))) } return well[:i] }
// Should produce original html as seen by parser. // Two differences: a) selfclosing tags are selfclosing in the output // and b) attribute values and text is escaped. func (n *Node) Html() (s string) { s = "<" + n.Name for _, a := range n.Attr { s += " " + a.Key + "=\"" + html.EscapeString(a.Val) + "\"" } if len(n.class) > 0 { s += " class=\"" for i, c := range n.class { if i > 0 { s += " " } s += c } s += "\"" } s += ">" for _, c := range n.subs { if c.Name == TEXT_NODE { s += html.EscapeString(c.Text) } else { s += c.Html() } } switch n.Name { case "img", "meta", "br", "input": s += " />" default: s += "</" + n.Name + ">" } return }
// Sanitizes the given input by parsing it as HTML5, then whitelisting known to // be safe elements and attributes. All other HTML is escaped, unsafe attributes // are stripped. func sanitizeHtmlSafe(input []byte) []byte { r := bytes.NewReader(input) var w bytes.Buffer tokenizer := html.NewTokenizer(r) wr := bufio.NewWriter(&w) // Iterate through all tokens in the input stream and sanitize them. for t := tokenizer.Next(); t != html.ErrorToken; t = tokenizer.Next() { switch t { case html.TextToken: // Text is written escaped. wr.WriteString(tokenizer.Token().String()) case html.SelfClosingTagToken, html.StartTagToken: // HTML tags are escaped unless whitelisted. tag, hasAttributes := tokenizer.TagName() tagName := string(tag) if whitelistTags[tagName] { wr.WriteString("<") wr.Write(tag) for hasAttributes { var key, val []byte key, val, hasAttributes = tokenizer.TagAttr() attrName := string(key) // Only include whitelisted attributes for the given tagName. tagWhitelistedAttrs, ok := whitelistAttrs[tagName] if ok && tagWhitelistedAttrs[attrName] { // For whitelisted attributes, if it's an attribute that requires // protocol checking, do so and strip it if it's not known to be safe. tagProtocolAttrs, ok := protocolAttrs[tagName] if ok && tagProtocolAttrs[attrName] { if !protocolAllowed(val) { continue } } wr.WriteByte(' ') wr.Write(key) wr.WriteString(`="`) wr.WriteString(html.EscapeString(string(val))) wr.WriteByte('"') } } wr.WriteString(">") } else { wr.WriteString(html.EscapeString(string(tokenizer.Raw()))) } case html.EndTagToken: // Whitelisted tokens can be written in raw. tag, _ := tokenizer.TagName() if whitelistTags[string(tag)] { wr.Write(tokenizer.Raw()) } else { wr.WriteString(html.EscapeString(string(tokenizer.Raw()))) } case html.CommentToken: // Comments are not really expected, but harmless. wr.Write(tokenizer.Raw()) case html.DoctypeToken: // Escape DOCTYPES, entities etc can be dangerous wr.WriteString(html.EscapeString(string(tokenizer.Raw()))) default: tokenizer.Token() panic(fmt.Errorf("Unexpected token type %v", t)) } } err := tokenizer.Err() if err != nil && err != io.EOF { panic(tokenizer.Err()) } wr.Flush() return w.Bytes() }