// Performs the actual sanitization process. func (p *Policy) sanitize(r io.Reader) *bytes.Buffer { // It is possible that the developer has created the policy via: // p := bluemonday.Policy{} // rather than: // p := bluemonday.NewPolicy() // If this is the case, and if they haven't yet triggered an action that // would initiliaze the maps, then we need to do that. p.init() var buff bytes.Buffer tokenizer := html.NewTokenizer(r) skipElementContent := false skipClosingTag := false for { if tokenizer.Next() == html.ErrorToken { err := tokenizer.Err() if err == io.EOF { // End of input means end of processing return &buff } // Raw tokenizer error return &bytes.Buffer{} } token := tokenizer.Token() switch token.Type { case html.DoctypeToken: if p.allowDocType { buff.WriteString(token.String()) } case html.CommentToken: // Comments are ignored by default case html.StartTagToken: aps, ok := p.elsAndAttrs[token.Data] if !ok { if _, ok := p.setOfElementsToSkipContent[token.Data]; ok { skipElementContent = true } break } if len(token.Attr) != 0 { token.Attr = p.sanitizeAttrs(token.Data, token.Attr, aps) } if len(token.Attr) == 0 { if !p.allowNoAttrs(token.Data) { skipClosingTag = true break } } buff.WriteString(token.String()) case html.EndTagToken: if skipClosingTag { skipClosingTag = false break } if _, ok := p.elsAndAttrs[token.Data]; !ok { if _, ok := p.setOfElementsToSkipContent[token.Data]; ok { skipElementContent = false } break } buff.WriteString(token.String()) case html.SelfClosingTagToken: aps, ok := p.elsAndAttrs[token.Data] if !ok { break } if len(token.Attr) != 0 { token.Attr = p.sanitizeAttrs(token.Data, token.Attr, aps) } if len(token.Attr) == 0 && !p.allowNoAttrs(token.Data) { break } buff.WriteString(token.String()) case html.TextToken: if !skipElementContent { buff.WriteString(token.String()) } default: // A token that didn't exist in the html package when we wrote this return &bytes.Buffer{} } } }
// prescan scans content for a <meta> element that declares the document's
// character encoding, following the HTML "prescan a byte stream to determine
// its encoding" approach. It returns the detected encoding and its canonical
// name, or (nil, "") if the input is exhausted without a usable declaration.
func prescan(content []byte) (e encoding.Encoding, name string) {
	z := html.NewTokenizer(bytes.NewReader(content))
	for {
		switch z.Next() {
		case html.ErrorToken:
			// End of input (or malformed markup): nothing found.
			return nil, ""

		case html.StartTagToken, html.SelfClosingTagToken:
			// Only <meta> tags can declare an encoding; skip everything else.
			tagName, hasAttr := z.TagName()
			if !bytes.Equal(tagName, []byte("meta")) {
				continue
			}
			// attrList records attribute names already seen on this tag so
			// that duplicate attributes are ignored (first one wins).
			attrList := make(map[string]bool)
			gotPragma := false

			const (
				dontKnow = iota
				doNeedPragma    // candidate came from content="...charset=..." and needs http-equiv="content-type"
				doNotNeedPragma // candidate came from an explicit charset attribute
			)
			needPragma := dontKnow

			name = ""
			e = nil
			for hasAttr {
				var key, val []byte
				key, val, hasAttr = z.TagAttr()
				ks := string(key)
				if attrList[ks] {
					continue
				}
				attrList[ks] = true
				// Lowercase the value in place (ASCII only) so the
				// comparisons below are case-insensitive.
				for i, c := range val {
					if 'A' <= c && c <= 'Z' {
						val[i] = c + 0x20
					}
				}

				switch ks {
				case "http-equiv":
					if bytes.Equal(val, []byte("content-type")) {
						gotPragma = true
					}

				case "content":
					// Only consider content="..." if no encoding candidate
					// has been found yet on this tag.
					if e == nil {
						name = fromMetaElement(string(val))
						if name != "" {
							e, name = Lookup(name)
							if e != nil {
								needPragma = doNeedPragma
							}
						}
					}

				case "charset":
					e, name = Lookup(string(val))
					needPragma = doNotNeedPragma
				}
			}

			// A content="" candidate is only valid when accompanied by
			// http-equiv="content-type" on the same tag.
			if needPragma == dontKnow || needPragma == doNeedPragma && !gotPragma {
				continue
			}

			// A utf-16 declaration cannot be trusted here (the bytes were
			// readable as ASCII-compatible markup), so treat it as utf-8
			// and pass the bytes through unchanged.
			if strings.HasPrefix(name, "utf-16") {
				name = "utf-8"
				e = encoding.Nop
			}

			if e != nil {
				return e, name
			}
		}
	}
}