Example #1
// ProcessHTML parses the given HTML from an io.Reader and fills in the OpenGraph structure.
func (og *OpenGraph) ProcessHTML(buffer io.Reader) error {
	z := html.NewTokenizer(buffer)
	for {
		tt := z.Next()
		switch tt {
		case html.ErrorToken:
			if z.Err() == io.EOF {
				return nil
			}
			return z.Err()
		case html.StartTagToken, html.SelfClosingTagToken, html.EndTagToken:
			name, hasAttr := z.TagName()
			if atom.Lookup(name) == atom.Body {
				return nil // OpenGraph is only in head, so we don't need body
			}
			if atom.Lookup(name) != atom.Meta || !hasAttr {
				continue
			}
			m := make(map[string]string)
			var key, val []byte
			for hasAttr {
				key, val, hasAttr = z.TagAttr()
				m[atom.String(key)] = string(val)
			}
			og.ProcessMeta(m)
		}
	}
}
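For orientation, the method above can be fed straight from an HTTP response body. The following is a minimal usage sketch, not taken from the package itself: it assumes the zero value of OpenGraph is usable and that net/http is imported.

func fetchOpenGraph(url string) (*OpenGraph, error) {
	resp, err := http.Get(url)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	og := &OpenGraph{}
	// ProcessHTML stops as soon as it sees <body>, so only the
	// document head is tokenized.
	if err := og.ProcessHTML(resp.Body); err != nil {
		return nil, err
	}
	return og, nil
}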
Example #2
// Preprocess escapes disallowed tags in a cleaner way, but does not fix
// nesting problems. Use with Clean.
func Preprocess(config *Config, fragment string) string {
	if config == nil {
		config = DefaultConfig
	}

	var buf bytes.Buffer
	write := func(raw string) {
		_, err := buf.WriteString(raw)

		// The only possible error is running out of memory.
		expectError(err, nil)
	}

	t := html.NewTokenizer(strings.NewReader(fragment))
	for {
		switch tok := t.Next(); tok {
		case html.ErrorToken:
			err := t.Err()

			// The only possible errors are from the Reader or from
			// the buffer capacity being exceeded. Neither can
			// happen with strings.NewReader as the string must
			// already fit into memory.
			expectError(err, io.EOF)

			if err == io.EOF {
				write(html.EscapeString(string(t.Raw())))
				return buf.String()
			}
		case html.TextToken:
			write(string(t.Raw()))
		case html.StartTagToken, html.EndTagToken, html.SelfClosingTagToken:
			raw := string(t.Raw())
			tagName, _ := t.TagName()
			allowed := false
			if tag := atom.Lookup(tagName); tag != 0 {
				if _, ok := config.elem[tag]; ok {
					allowed = true
				}
			}
			if !allowed {
				if _, ok := config.elemCustom[string(tagName)]; ok {
					allowed = true
				}
			}
			if !allowed {
				raw = html.EscapeString(raw)
			}
			write(raw)
		case html.CommentToken:
			raw := string(t.Raw())
			if config.EscapeComments || !strings.HasPrefix(raw, "<!--") || !strings.HasSuffix(raw, "-->") {
				raw = html.EscapeString(raw)
			}
			write(raw)
		default:
			write(html.EscapeString(string(t.Raw())))
		}
	}
}
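A hedged usage sketch for Preprocess, assuming DefaultConfig and Clean live in the same package as shown above; the fragment is only illustrative.

func preprocessExample() string {
	fragment := `<p>Hello <blink>world</blink></p><script>alert(1)</script>`
	// With a nil config, Preprocess falls back to DefaultConfig.
	// Tags the config does not allow are not removed here; they come
	// back entity-escaped, so a later Clean pass can still fix nesting.
	return Preprocess(nil, fragment)
}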
Example #3
func atomize(s string) (atom.Atom, string) {
	// wasteful due to []byte allocs?
	if a := atom.Lookup([]byte(s)); a != 0 {
		return a, a.String()
	}
	return 0, s
}
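As a quick illustration of the helper above: atom.Lookup only recognizes names from the package's fixed table, so unknown names fall through to the second return value unchanged.

func atomizeExample() {
	a1, s1 := atomize("div")       // a1 == atom.Div, s1 == "div"
	a2, s2 := atomize("my-widget") // a2 == 0, s2 == "my-widget"
	fmt.Println(a1 != 0, s1, a2 == 0, s2) // true div true my-widget
}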
Example #4
// ParseFragment parses a fragment of HTML and returns the nodes that were
// found. If the fragment is the InnerHTML for an existing element, pass that
// element in context.
func ParseFragment(r io.Reader, context *Node) ([]*Node, error) {
	contextTag := ""
	if context != nil {
		if context.Type != ElementNode {
			return nil, errors.New("html: ParseFragment of non-element Node")
		}
		// The next check isn't just context.DataAtom.String() == context.Data because
		// it is valid to pass an element whose tag isn't a known atom. For example,
		// DataAtom == 0 and Data = "tagfromthefuture" is perfectly consistent.
		if context.DataAtom != a.Lookup([]byte(context.Data)) {
			return nil, fmt.Errorf("html: inconsistent Node: DataAtom=%q, Data=%q", context.DataAtom, context.Data)
		}
		contextTag = context.DataAtom.String()
	}
	p := &parser{
		tokenizer: NewTokenizerFragment(r, contextTag),
		doc: &Node{
			Type: DocumentNode,
		},
		scripting: true,
		fragment:  true,
		context:   context,
	}

	root := &Node{
		Type:     ElementNode,
		DataAtom: a.Html,
		Data:     a.Html.String(),
	}
	p.doc.AppendChild(root)
	p.oe = nodeStack{root}
	p.resetInsertionMode()

	for n := context; n != nil; n = n.Parent {
		if n.Type == ElementNode && n.DataAtom == a.Form {
			p.form = n
			break
		}
	}

	err := p.parse()
	if err != nil {
		return nil, err
	}

	parent := p.doc
	if context != nil {
		parent = root
	}

	var result []*Node
	for c := parent.FirstChild; c != nil; {
		next := c.NextSibling
		parent.RemoveChild(c)
		result = append(result, c)
		c = next
	}
	return result, nil
}
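The function above mirrors golang.org/x/net/html's ParseFragment. Called from outside the package, a typical invocation parses a fragment as if it were the InnerHTML of a <div>:

package main

import (
	"fmt"
	"log"
	"strings"

	"golang.org/x/net/html"
	"golang.org/x/net/html/atom"
)

func main() {
	div := &html.Node{
		Type:     html.ElementNode,
		DataAtom: atom.Div,
		Data:     "div",
	}
	nodes, err := html.ParseFragment(strings.NewReader("<p>one<p>two"), div)
	if err != nil {
		log.Fatal(err)
	}
	for _, n := range nodes {
		fmt.Println(n.Data) // two sibling <p> elements
	}
}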
Example #5
// ElemAttrMatch allows an attribute name on the specified element, but
// only if the value matches a regular expression. The receiver is returned to
// allow call chaining.
func (c *Config) ElemAttrMatch(elem, attr string, match *regexp.Regexp) *Config {
	if e, a := atom.Lookup([]byte(elem)), atom.Lookup([]byte(attr)); e != 0 && a != 0 {
		return c.ElemAttrAtomMatch(e, a, match)
	}

	if c.elemCustom == nil {
		c.elemCustom = make(map[string]map[string]*regexp.Regexp)
	}

	attrs := c.elemCustom[elem]
	if attrs == nil {
		attrs = make(map[string]*regexp.Regexp)
		c.elemCustom[elem] = attrs
	}

	attrs[attr] = match

	return c
}
Example #6
// GlobalAttr allows an attribute name on all allowed elements. The
// receiver is returned to allow call chaining.
func (c *Config) GlobalAttr(names ...string) *Config {
	for _, name := range names {
		if a := atom.Lookup([]byte(name)); a != 0 {
			c.GlobalAttrAtom(a)
			continue
		}

		if c.attrCustom == nil {
			c.attrCustom = make(map[string]struct{})
		}

		c.attrCustom[name] = struct{}{}
	}

	return c
}
Example #7
// WrapTextInside makes an element's children behave as if they are root nodes
// in the context of WrapText. The receiver is returned to allow call chaining.
func (c *Config) WrapTextInside(names ...string) *Config {
	if c.wrapCustom == nil {
		c.wrapCustom = make(map[string]struct{})
	}

	for _, name := range names {
		if a := atom.Lookup([]byte(name)); a != 0 {
			c.WrapTextInsideAtom(a)
			continue
		}

		c.wrapCustom[name] = struct{}{}
	}

	return c
}
Example #8
func cleanNode(c *Config, n *html.Node) *html.Node {
	allowedAttr, ok1 := c.elem[n.DataAtom]
	customAttr, ok2 := c.elemCustom[n.Data]
	if ok1 || ok2 {
		cleanChildren(c, n)

		haveSrc := false

		attrs := n.Attr
		n.Attr = make([]html.Attribute, 0, len(attrs))
		for _, attr := range attrs {
			a := atom.Lookup([]byte(attr.Key))

			re1, ok1 := allowedAttr[a]
			re2, ok2 := customAttr[attr.Key]
			_, ok3 := c.attr[a]
			_, ok4 := c.attrCustom[attr.Key]

			if attr.Namespace != "" || (!ok1 && !ok2 && !ok3 && !ok4) {
				continue
			}

			if !cleanURL(c, a, &attr) {
				continue
			}

			if re1 != nil && !re1.MatchString(attr.Val) {
				continue
			}
			if re2 != nil && !re2.MatchString(attr.Val) {
				continue
			}

			haveSrc = haveSrc || a == atom.Src

			n.Attr = append(n.Attr, attr)
		}

		if n.DataAtom == atom.Img && !haveSrc {
			// replace it with an empty text node
			return &html.Node{Type: html.TextNode}
		}

		return n
	}
	return text(html.UnescapeString(Render(n)))
}
Example #9
// Elem ensures an element name is allowed. The receiver is returned to
// allow call chaining.
func (c *Config) Elem(names ...string) *Config {
	for _, name := range names {
		if a := atom.Lookup([]byte(name)); a != 0 {
			c.ElemAtom(a)
			continue
		}

		if c.elemCustom == nil {
			c.elemCustom = make(map[string]map[string]*regexp.Regexp)
		}

		if _, ok := c.elemCustom[name]; !ok {
			c.elemCustom[name] = nil
		}
	}

	return c
}
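Taken together, the builder methods from Examples #5, #6, #7 and #9 let a whitelist be assembled by chaining. The sketch below is an assumption: it presumes Config's zero value is usable and that the Atom-based variants (ElemAtom, ElemAttrAtomMatch, GlobalAttrAtom) initialize their maps lazily, just like the string-based methods shown above.

func newLinkConfig() *Config {
	return (&Config{}).
		Elem("p", "em", "strong", "a").
		GlobalAttr("title").
		// Only allow http(s) URLs in <a href>.
		ElemAttrMatch("a", "href", regexp.MustCompile(`^https?://`))
}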
Example #10
// Token returns the next Token. The result's Data and Attr values remain valid
// after subsequent Next calls.
func (z *Tokenizer) Token() Token {
	t := Token{Type: z.tt}
	switch z.tt {
	case TextToken, CommentToken, DoctypeToken:
		t.Data = string(z.Text())
	case StartTagToken, SelfClosingTagToken, EndTagToken:
		name, moreAttr := z.TagName()
		for moreAttr {
			var key, val []byte
			key, val, moreAttr = z.TagAttr()
			t.Attr = append(t.Attr, Attribute{"", atom.String(key), string(val)})
		}
		if a := atom.Lookup(name); a != 0 {
			t.DataAtom, t.Data = a, a.String()
		} else {
			t.DataAtom, t.Data = 0, string(name)
		}
	}
	return t
}
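For reference, Token is what makes the simple tokenizer loop convenient: unlike the byte slices returned by Text, TagName and TagAttr, the Token's Data and Attr survive later Next calls. A small sketch collecting link targets with the public html and atom packages:

func collectHrefs(r io.Reader) []string {
	var hrefs []string
	z := html.NewTokenizer(r)
	for {
		if z.Next() == html.ErrorToken {
			return hrefs // io.EOF on normal end of input
		}
		t := z.Token()
		if t.Type != html.StartTagToken || t.DataAtom != atom.A {
			continue
		}
		for _, a := range t.Attr {
			if a.Key == "href" {
				hrefs = append(hrefs, a.Val)
			}
		}
	}
}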
Example #11
// Section 12.2.5.5.
func parseForeignContent(p *parser) bool {
	switch p.tok.Type {
	case TextToken:
		if p.framesetOK {
			p.framesetOK = strings.TrimLeft(p.tok.Data, whitespaceOrNUL) == ""
		}
		p.tok.Data = strings.Replace(p.tok.Data, "\x00", "\ufffd", -1)
		p.addText(p.tok.Data)
	case CommentToken:
		p.addChild(&Node{
			Type: CommentNode,
			Data: p.tok.Data,
		})
	case StartTagToken:
		b := breakout[p.tok.Data]
		if p.tok.DataAtom == a.Font {
		loop:
			for _, attr := range p.tok.Attr {
				switch attr.Key {
				case "color", "face", "size":
					b = true
					break loop
				}
			}
		}
		if b {
			for i := len(p.oe) - 1; i >= 0; i-- {
				n := p.oe[i]
				if n.Namespace == "" || htmlIntegrationPoint(n) || mathMLTextIntegrationPoint(n) {
					p.oe = p.oe[:i+1]
					break
				}
			}
			return false
		}
		switch p.top().Namespace {
		case "math":
			adjustAttributeNames(p.tok.Attr, mathMLAttributeAdjustments)
		case "svg":
			// Adjust SVG tag names. The tokenizer lower-cases tag names, but
			// SVG wants e.g. "foreignObject" with a capital second "O".
			if x := svgTagNameAdjustments[p.tok.Data]; x != "" {
				p.tok.DataAtom = a.Lookup([]byte(x))
				p.tok.Data = x
			}
			adjustAttributeNames(p.tok.Attr, svgAttributeAdjustments)
		default:
			panic("html: bad parser state: unexpected namespace")
		}
		adjustForeignAttributes(p.tok.Attr)
		namespace := p.top().Namespace
		p.addElement()
		p.top().Namespace = namespace
		if namespace != "" {
			// Don't let the tokenizer go into raw text mode in foreign content
			// (e.g. in an SVG <title> tag).
			p.tokenizer.NextIsNotRawText()
		}
		if p.hasSelfClosingToken {
			p.oe.pop()
			p.acknowledgeSelfClosingTag()
		}
	case EndTagToken:
		for i := len(p.oe) - 1; i >= 0; i-- {
			if p.oe[i].Namespace == "" {
				return p.im(p)
			}
			if strings.EqualFold(p.oe[i].Data, p.tok.Data) {
				p.oe = p.oe[:i]
				break
			}
		}
		return true
	default:
		// Ignore the token.
	}
	return true
}
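The effect of the foreign-content handling above is visible through the public API: elements parsed inside <svg> or <math> carry a non-empty Node.Namespace, while ordinary HTML elements report an empty one. A short sketch:

func printNamespaces(src string) error {
	doc, err := html.Parse(strings.NewReader(src))
	if err != nil {
		return err
	}
	var walk func(*html.Node)
	walk = func(n *html.Node) {
		if n.Type == html.ElementNode {
			fmt.Printf("%s %q\n", n.Data, n.Namespace)
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			walk(c)
		}
	}
	walk(doc)
	return nil
}

// printNamespaces(`<p><svg><circle r="1"/></svg>`) reports "svg" as the
// namespace of the svg and circle elements, and "" for html, head, body and p.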
Example #12
func main() {
	flag.Parse()

	words := make(map[dbutil.DBMS][]string)

	for dbms, url := range reservedWordsURL {
		resp, err := http.Get(url)
		if err != nil {
			log.Print(err)
			continue
		}
		if resp.StatusCode != http.StatusOK {
			log.Printf("bad GET status for %s: %d", url, resp.Status)
			continue
		}
		defer resp.Body.Close()

		doc := html.NewTokenizer(resp.Body)
		isStart := false

		switch dbms {
		case dbutil.MySQL:
			// <code class="literal">ACCESSIBLE</code>
			startText := []byte("ACCESSIBLE")

			// <code class="literal">ZEROFILL</code>
			endText := []byte("ZEROFILL")

			for {
				tokenType := doc.Next()
				if tokenType == html.ErrorToken {
					break
				}

				if tokenType == html.StartTagToken {
					name, hasAttr := doc.TagName()
					if !hasAttr || atom.Lookup(name) != atom.Code {
						continue
					}

					doc.Next()
					text := doc.Text()

					if !isStart {
						if !bytes.Equal(startText, text) {
							continue
						}
						isStart = true
					}
					words[dbms] = append(words[dbms], string(text))

					// Stop at the last reserved word so <code> elements further
					// down the page (such as in comments) are not picked up.
					if bytes.Equal(endText, text) {
						break
					}
				}
			}

		case dbutil.Postgres:
			// <tt class="TOKEN">A</tt>
			startText := []byte{'A'}

			for {
				tokenType := doc.Next()
				if tokenType == html.ErrorToken {
					break
				}

				if tokenType == html.StartTagToken {
					name, hasAttr := doc.TagName()
					if !hasAttr || atom.Lookup(name) != atom.Tt {
						continue
					}

					doc.Next()
					text := doc.Text()

					if !isStart {
						if !bytes.Equal(startText, text) {
							continue
						}
						isStart = true
					}
					words[dbms] = append(words[dbms], string(text))
				}
			}

		case dbutil.SQLite:
			// <td align="left" valign="top" width="20%">
			// ABORT<br>
			// CONSTRAINT<br></td>

			startText := []byte("\nABORT")

			for {
				tokenType := doc.Next()
				if tokenType == html.ErrorToken {
					break
				}

				if tokenType == html.StartTagToken {
					name, hasAttr := doc.TagName()
					if !hasAttr || atom.Lookup(name) != atom.Td {
						continue
					}

					doc.Next()
					text := doc.Text()

					if !isStart {
						if !bytes.Equal(startText, text) {
							continue
						}
						isStart = true
						text = bytes.TrimLeft(text, "\n")
					}
					words[dbms] = append(words[dbms], string(text))

					for {
						tt := doc.Next()
						// Stop at the closing </td>, but also bail out on an
						// ErrorToken so a truncated document cannot loop forever.
						if tt == html.EndTagToken || tt == html.ErrorToken {
							break
						}

						text := bytes.TrimLeft(doc.Text(), "\n")
						if len(text) == 0 {
							continue
						}
						words[dbms] = append(words[dbms], string(text))
					}
				}
			}
		}

		if err = doc.Err(); err != nil && err != io.EOF {
			log.Printf("%s: %v", dbms, err)
		}
	}

	if err := write("z-reserved", words); err != nil {
		log.Fatal(err)
	}
}
Example #13
// testParseCase tests one test case from the test files. If the test does not
// pass, it returns an error that explains the failure.
// text is the HTML to be parsed, want is a dump of the correct parse tree,
// and context is the name of the context node, if any.
func testParseCase(text, want, context string) (err error) {
	defer func() {
		if x := recover(); x != nil {
			switch e := x.(type) {
			case error:
				err = e
			default:
				err = fmt.Errorf("%v", e)
			}
		}
	}()

	var doc *Node
	if context == "" {
		doc, err = Parse(strings.NewReader(text))
		if err != nil {
			return err
		}
	} else {
		contextNode := &Node{
			Type:     ElementNode,
			DataAtom: atom.Lookup([]byte(context)),
			Data:     context,
		}
		nodes, err := ParseFragment(strings.NewReader(text), contextNode)
		if err != nil {
			return err
		}
		doc = &Node{
			Type: DocumentNode,
		}
		for _, n := range nodes {
			doc.AppendChild(n)
		}
	}

	if err := checkTreeConsistency(doc); err != nil {
		return err
	}

	got, err := dump(doc)
	if err != nil {
		return err
	}

	fmt.Println("got ", got)
	// Compare the parsed tree to the #document section.
	if got != want {
		return fmt.Errorf("got vs want:\n----\n%s----\n%s----", got, want)
	}

	if renderTestBlacklist[text] || context != "" {
		return nil
	}

	// Check that rendering and re-parsing results in an identical tree.
	pr, pw := io.Pipe()
	go func() {
		pw.CloseWithError(Render(pw, doc))
	}()
	doc1, err := Parse(pr)
	if err != nil {
		return err
	}
	got1, err := dump(doc1)
	if err != nil {
		return err
	}
	if got != got1 {
		return fmt.Errorf("got vs got1:\n----\n%s----\n%s----", got, got1)
	}

	return nil
}
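The render-and-reparse check near the end streams Render straight into Parse through an io.Pipe; the same round trip, written against the public golang.org/x/net/html API, looks like this:

func roundTrip(doc *html.Node) (*html.Node, error) {
	pr, pw := io.Pipe()
	go func() {
		// CloseWithError propagates Render's error (or nil) to the reader.
		pw.CloseWithError(html.Render(pw, doc))
	}()
	return html.Parse(pr)
}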