Example #1
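This example is the core sanitization loop from the bluemonday HTML sanitizer: it walks the token stream and copies only allowlisted elements, attributes, and text to the output buffer.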
// sanitize performs the actual sanitization process, tokenizing the input
// and writing only allowlisted HTML to the returned buffer.
func (p *Policy) sanitize(r io.Reader) *bytes.Buffer {

	// It is possible that the developer has created the policy via:
	//   p := bluemonday.Policy{}
	// rather than:
	//   p := bluemonday.NewPolicy()
	// If this is the case, and if they haven't yet triggered an action that
	// would initialize the maps, then we need to do that.
	p.init()

	var buff bytes.Buffer
	tokenizer := html.NewTokenizer(r)

	skipElementContent := false
	skipClosingTag := false
	for {
		if tokenizer.Next() == html.ErrorToken {
			err := tokenizer.Err()
			if err == io.EOF {
				// End of input means end of processing
				return &buff
			}

			// Raw tokenizer error
			return &bytes.Buffer{}
		}

		token := tokenizer.Token()
		switch token.Type {
		case html.DoctypeToken:

			if p.allowDocType {
				buff.WriteString(token.String())
			}

		case html.CommentToken:

			// Comments are ignored by default

		case html.StartTagToken:

			// If the element is not allowlisted, drop the tag entirely; if it
			// is registered as skip-content, also suppress the text inside it
			// until the matching closing tag is seen.
			aps, ok := p.elsAndAttrs[token.Data]
			if !ok {
				if _, ok := p.setOfElementsToSkipContent[token.Data]; ok {
					skipElementContent = true
				}
				break
			}

			if len(token.Attr) != 0 {
				token.Attr = p.sanitizeAttrs(token.Data, token.Attr, aps)
			}

			if len(token.Attr) == 0 {
				// Sanitization removed every attribute; if the element may
				// not appear without attributes, drop it and remember to
				// drop its matching closing tag too.
				if !p.allowNoAttrs(token.Data) {
					skipClosingTag = true
					break
				}
			}

			buff.WriteString(token.String())

		case html.EndTagToken:

			if skipClosingTag {
				skipClosingTag = false
				break
			}

			if _, ok := p.elsAndAttrs[token.Data]; !ok {
				// Closing tag of a disallowed element; if its content was
				// being suppressed, stop suppressing now.
				if _, ok := p.setOfElementsToSkipContent[token.Data]; ok {
					skipElementContent = false
				}
				break
			}

			buff.WriteString(token.String())

		case html.SelfClosingTagToken:

			aps, ok := p.elsAndAttrs[token.Data]
			if !ok {
				break
			}

			if len(token.Attr) != 0 {
				token.Attr = p.sanitizeAttrs(token.Data, token.Attr, aps)
			}

			if len(token.Attr) == 0 && !p.allowNoAttrs(token.Data) {
				break
			}

			buff.WriteString(token.String())

		case html.TextToken:

			if !skipElementContent {
				buff.WriteString(token.String())
			}

		default:
			// A token that didn't exist in the html package when we wrote this
			return &bytes.Buffer{}
		}
	}
}
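A minimal, self-contained sketch of the same tokenizer loop follows. The stripTags helper and its hard-coded allowlist are illustrative stand-ins for the Policy machinery above, not part of bluemonday; unlike the real sanitizer it performs no attribute filtering, and it keeps the text inside disallowed elements, which is the gap the skipElementContent flag above closes.

package main

import (
	"bytes"
	"fmt"
	"io"
	"strings"

	"golang.org/x/net/html"
)

// stripTags copies only the tags named in allowed to the output,
// passing text tokens through unchanged.
func stripTags(r io.Reader, allowed map[string]bool) string {
	var buff bytes.Buffer
	tokenizer := html.NewTokenizer(r)
	for {
		if tokenizer.Next() == html.ErrorToken {
			if tokenizer.Err() == io.EOF {
				return buff.String() // end of input
			}
			return "" // raw tokenizer error
		}
		token := tokenizer.Token()
		switch token.Type {
		case html.StartTagToken, html.EndTagToken, html.SelfClosingTagToken:
			if allowed[token.Data] {
				buff.WriteString(token.String())
			}
		case html.TextToken:
			buff.WriteString(token.String())
		}
	}
}

func main() {
	in := strings.NewReader(`<p>Hello <script>alert(1)</script><b>world</b>!</p>`)
	fmt.Println(stripTags(in, map[string]bool{"p": true, "b": true}))
	// Prints: <p>Hello alert(1)<b>world</b>!</p>
	// The script tags are gone but their text survives, which is exactly
	// why sanitize tracks skipElementContent.
}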
Example #2
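This example is the meta-prescan step of HTML character-encoding detection; it matches the prescan function in golang.org/x/net/html/charset.

// prescan looks for a <meta> element that declares the document's
// character encoding and returns the matching encoding and its canonical
// name, or nil and "" when no usable declaration is found.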
func prescan(content []byte) (e encoding.Encoding, name string) {
	z := html.NewTokenizer(bytes.NewReader(content))
	for {
		switch z.Next() {
		case html.ErrorToken:
			return nil, ""

		case html.StartTagToken, html.SelfClosingTagToken:
			tagName, hasAttr := z.TagName()
			if !bytes.Equal(tagName, []byte("meta")) {
				continue
			}
			// Track attribute names already seen (duplicates are ignored)
			// and whether an http-equiv="content-type" pragma was
			// encountered.
			attrList := make(map[string]bool)
			gotPragma := false

			const (
				dontKnow = iota
				doNeedPragma
				doNotNeedPragma
			)
			needPragma := dontKnow

			// Reset any result carried over from a previous <meta> element.
			name = ""
			e = nil
			for hasAttr {
				var key, val []byte
				key, val, hasAttr = z.TagAttr()
				ks := string(key)
				if attrList[ks] {
					continue
				}
				attrList[ks] = true
				// Lower-case the attribute value in place (ASCII only) so
				// the comparisons below are case-insensitive.
				for i, c := range val {
					if 'A' <= c && c <= 'Z' {
						val[i] = c + 0x20
					}
				}

				switch ks {
				case "http-equiv":
					if bytes.Equal(val, []byte("content-type")) {
						gotPragma = true
					}

				case "content":
					if e == nil {
						name = fromMetaElement(string(val))
						if name != "" {
							e, name = Lookup(name)
							if e != nil {
								needPragma = doNeedPragma
							}
						}
					}

				case "charset":
					e, name = Lookup(string(val))
					needPragma = doNotNeedPragma
				}
			}

			if needPragma == dontKnow || (needPragma == doNeedPragma && !gotPragma) {
				continue
			}

			// A utf-16 label found by prescan is treated as utf-8: if the
			// document really were UTF-16, this byte-oriented scan could not
			// have matched the <meta> tag in the first place.
			if strings.HasPrefix(name, "utf-16") {
				name = "utf-8"
				e = encoding.Nop
			}

			if e != nil {
				return e, name
			}
		}
	}
}
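In normal use this prescan is not called directly; it sits behind the package's public entry points. A short sketch, assuming golang.org/x/net/html/charset, where DetermineEncoding falls back to a meta prescan like the one above when neither a byte-order mark nor a Content-Type charset settles the question:

package main

import (
	"fmt"

	"golang.org/x/net/html/charset"
)

func main() {
	page := []byte(`<html><head><meta charset="iso-8859-1"></head><body>hi</body></html>`)

	// No Content-Type is available, so pass "": DetermineEncoding falls
	// back to prescanning the document for a <meta> declaration.
	e, name, certain := charset.DetermineEncoding(page, "")

	// The WHATWG registry canonicalizes the iso-8859-1 label to
	// windows-1252; certain is false because the answer came from a
	// prescan rather than a BOM or an explicit header.
	fmt.Println(name, certain, e != nil)
}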