// ProcessHTML parses the given HTML from an io.Reader and fills in the
// OpenGraph structure.
func (og *OpenGraph) ProcessHTML(buffer io.Reader) error {
	z := html.NewTokenizer(buffer)
	for {
		tt := z.Next()
		switch tt {
		case html.ErrorToken:
			if z.Err() == io.EOF {
				return nil
			}
			return z.Err()
		case html.StartTagToken, html.SelfClosingTagToken, html.EndTagToken:
			name, hasAttr := z.TagName()
			if atom.Lookup(name) == atom.Body {
				return nil // OpenGraph is only in head, so we don't need body
			}
			if atom.Lookup(name) != atom.Meta || !hasAttr {
				continue
			}
			m := make(map[string]string)
			var key, val []byte
			for hasAttr {
				key, val, hasAttr = z.TagAttr()
				m[atom.String(key)] = string(val)
			}
			og.ProcessMeta(m)
		}
	}
	return nil
}
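// The following is a minimal, self-contained sketch (not part of the original
// package) of the same tokenizer pattern ProcessHTML uses: walk the document
// with html.NewTokenizer, stop once <body> starts, and collect the attributes
// of every <meta> tag. It assumes golang.org/x/net/html and
// golang.org/x/net/html/atom are imported as html and atom; the name
// collectMetaAttrs is purely illustrative.
func collectMetaAttrs(r io.Reader) ([]map[string]string, error) {
	var metas []map[string]string
	z := html.NewTokenizer(r)
	for {
		switch z.Next() {
		case html.ErrorToken:
			if z.Err() == io.EOF {
				return metas, nil
			}
			return metas, z.Err()
		case html.StartTagToken, html.SelfClosingTagToken:
			name, hasAttr := z.TagName()
			switch atom.Lookup(name) {
			case atom.Body:
				// <meta> only appears in <head>, so stop here.
				return metas, nil
			case atom.Meta:
				m := make(map[string]string)
				for hasAttr {
					var key, val []byte
					key, val, hasAttr = z.TagAttr()
					m[atom.String(key)] = string(val)
				}
				metas = append(metas, m)
			}
		}
	}
}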
// Preprocess escapes disallowed tags in a cleaner way, but does not fix
// nesting problems. Use with Clean.
func Preprocess(config *Config, fragment string) string {
	if config == nil {
		config = DefaultConfig
	}

	var buf bytes.Buffer
	write := func(raw string) {
		_, err := buf.WriteString(raw)
		// The only possible error is running out of memory.
		expectError(err, nil)
	}

	t := html.NewTokenizer(strings.NewReader(fragment))
	for {
		switch tok := t.Next(); tok {
		case html.ErrorToken:
			err := t.Err()
			// The only possible errors are from the Reader or from
			// the buffer capacity being exceeded. Neither can
			// happen with strings.NewReader as the string must
			// already fit into memory.
			expectError(err, io.EOF)
			if err == io.EOF {
				write(html.EscapeString(string(t.Raw())))
				return buf.String()
			}
		case html.TextToken:
			write(string(t.Raw()))
		case html.StartTagToken, html.EndTagToken, html.SelfClosingTagToken:
			raw := string(t.Raw())
			tagName, _ := t.TagName()
			allowed := false
			if tag := atom.Lookup(tagName); tag != 0 {
				if _, ok := config.elem[tag]; ok {
					allowed = true
				}
			}
			if !allowed {
				if _, ok := config.elemCustom[string(tagName)]; ok {
					allowed = true
				}
			}
			if !allowed {
				raw = html.EscapeString(raw)
			}
			write(raw)
		case html.CommentToken:
			raw := string(t.Raw())
			if config.EscapeComments || !strings.HasPrefix(raw, "<!--") || !strings.HasSuffix(raw, "-->") {
				raw = html.EscapeString(raw)
			}
			write(raw)
		default:
			write(html.EscapeString(string(t.Raw())))
		}
	}
}
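// A hedged usage sketch for Preprocess (not from the original source). It only
// leans on behaviour visible in this collection: the Elem builder (below)
// lazily fills elemCustom for names that are not known atoms, and Preprocess
// escapes every tag that is neither in elem nor in elemCustom. Whether a
// zero-value Config is otherwise usable is an assumption.
func preprocessSketch() string {
	cfg := (&Config{}).Elem("x-note") // allow only the custom <x-note> element
	return Preprocess(cfg, `<x-note>hi</x-note><script>alert(1)</script>`)
	// Expected under these assumptions:
	//   <x-note>hi</x-note>&lt;script&gt;alert(1)&lt;/script&gt;
	// i.e. disallowed tags are entity-escaped in place, nothing is reordered.
}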
func atomize(s string) (atom.Atom, string) {
	// wasteful due to []byte allocs?
	if a := atom.Lookup([]byte(s)); a != 0 {
		return a, a.String()
	}
	return 0, s
}
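// A quick illustration of atomize (not part of the original source): names in
// the HTML atom table come back as a non-zero atom.Atom plus its canonical
// string, anything else falls through unchanged. Assumes fmt is imported.
func atomizeDemo() {
	a1, s1 := atomize("div")           // a1 == atom.Div, s1 == "div"
	a2, s2 := atomize("my-custom-tag") // a2 == 0, s2 == "my-custom-tag"
	fmt.Println(a1, s1, a2, s2)
}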
// ParseFragment parses a fragment of HTML and returns the nodes that were
// found. If the fragment is the InnerHTML for an existing element, pass that
// element in context.
func ParseFragment(r io.Reader, context *Node) ([]*Node, error) {
	contextTag := ""
	if context != nil {
		if context.Type != ElementNode {
			return nil, errors.New("html: ParseFragment of non-element Node")
		}
		// The next check isn't just context.DataAtom.String() == context.Data because
		// it is valid to pass an element whose tag isn't a known atom. For example,
		// DataAtom == 0 and Data = "tagfromthefuture" is perfectly consistent.
		if context.DataAtom != a.Lookup([]byte(context.Data)) {
			return nil, fmt.Errorf("html: inconsistent Node: DataAtom=%q, Data=%q", context.DataAtom, context.Data)
		}
		contextTag = context.DataAtom.String()
	}
	p := &parser{
		tokenizer: NewTokenizerFragment(r, contextTag),
		doc: &Node{
			Type: DocumentNode,
		},
		scripting: true,
		fragment:  true,
		context:   context,
	}

	root := &Node{
		Type:     ElementNode,
		DataAtom: a.Html,
		Data:     a.Html.String(),
	}
	p.doc.AppendChild(root)
	p.oe = nodeStack{root}
	p.resetInsertionMode()

	for n := context; n != nil; n = n.Parent {
		if n.Type == ElementNode && n.DataAtom == a.Form {
			p.form = n
			break
		}
	}

	err := p.parse()
	if err != nil {
		return nil, err
	}

	parent := p.doc
	if context != nil {
		parent = root
	}

	var result []*Node
	for c := parent.FirstChild; c != nil; {
		next := c.NextSibling
		parent.RemoveChild(c)
		result = append(result, c)
		c = next
	}
	return result, nil
}
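// A usage sketch for ParseFragment, written from a client package that imports
// golang.org/x/net/html (as html) and golang.org/x/net/html/atom (as atom);
// the function name is illustrative. Parsing "<td>x</td>" in the context of a
// <tr> element keeps the cell, whereas the in-body rules would drop a stray
// <td> parsed without a table context.
func parseFragmentSketch() error {
	ctx := &html.Node{
		Type:     html.ElementNode,
		DataAtom: atom.Tr,
		Data:     "tr",
	}
	nodes, err := html.ParseFragment(strings.NewReader("<td>x</td>"), ctx)
	if err != nil {
		return err
	}
	for _, n := range nodes {
		fmt.Println(n.Data) // "td"
	}
	return nil
}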
// ElemAttrMatch allows an attribute name on the specified element, but
// only if the value matches a regular expression. The receiver is returned to
// allow call chaining.
func (c *Config) ElemAttrMatch(elem, attr string, match *regexp.Regexp) *Config {
	if e, a := atom.Lookup([]byte(elem)), atom.Lookup([]byte(attr)); e != 0 && a != 0 {
		return c.ElemAttrAtomMatch(e, a, match)
	}
	if c.elemCustom == nil {
		c.elemCustom = make(map[string]map[string]*regexp.Regexp)
	}
	attrs := c.elemCustom[elem]
	if attrs == nil {
		attrs = make(map[string]*regexp.Regexp)
		c.elemCustom[elem] = attrs
	}
	attrs[attr] = match
	return c
}
// GlobalAttr allows an attribute name on all allowed elements. The
// receiver is returned to allow call chaining.
func (c *Config) GlobalAttr(names ...string) *Config {
	for _, name := range names {
		if a := atom.Lookup([]byte(name)); a != 0 {
			c.GlobalAttrAtom(a)
			continue
		}
		if c.attrCustom == nil {
			c.attrCustom = make(map[string]struct{})
		}
		c.attrCustom[name] = struct{}{}
	}
	return c
}
// WrapTextInside makes an element's children behave as if they are root nodes
// in the context of WrapText. The receiver is returned to allow call chaining.
func (c *Config) WrapTextInside(names ...string) *Config {
	if c.wrapCustom == nil {
		c.wrapCustom = make(map[string]struct{})
	}
	for _, name := range names {
		if a := atom.Lookup([]byte(name)); a != 0 {
			c.WrapTextInsideAtom(a)
			continue
		}
		c.wrapCustom[name] = struct{}{}
	}
	return c
}
func cleanNode(c *Config, n *html.Node) *html.Node {
	allowedAttr, ok1 := c.elem[n.DataAtom]
	customAttr, ok2 := c.elemCustom[n.Data]
	if ok1 || ok2 {
		cleanChildren(c, n)

		haveSrc := false

		attrs := n.Attr
		n.Attr = make([]html.Attribute, 0, len(attrs))
		for _, attr := range attrs {
			a := atom.Lookup([]byte(attr.Key))

			re1, ok1 := allowedAttr[a]
			re2, ok2 := customAttr[attr.Key]
			_, ok3 := c.attr[a]
			_, ok4 := c.attrCustom[attr.Key]

			if attr.Namespace != "" || (!ok1 && !ok2 && !ok3 && !ok4) {
				continue
			}
			if !cleanURL(c, a, &attr) {
				continue
			}
			if re1 != nil && !re1.MatchString(attr.Val) {
				continue
			}
			if re2 != nil && !re2.MatchString(attr.Val) {
				continue
			}

			haveSrc = haveSrc || a == atom.Src

			n.Attr = append(n.Attr, attr)
		}

		if n.DataAtom == atom.Img && !haveSrc {
			// replace it with an empty text node
			return &html.Node{Type: html.TextNode}
		}
		return n
	}
	return text(html.UnescapeString(Render(n)))
}
// Elem ensures an element name is allowed. The receiver is returned to
// allow call chaining.
func (c *Config) Elem(names ...string) *Config {
	for _, name := range names {
		if a := atom.Lookup([]byte(name)); a != 0 {
			c.ElemAtom(a)
			continue
		}
		if c.elemCustom == nil {
			c.elemCustom = make(map[string]map[string]*regexp.Regexp)
		}
		if _, ok := c.elemCustom[name]; !ok {
			c.elemCustom[name] = nil
		}
	}
	return c
}
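// A hedged sketch of how the chaining builders compose (not from the original
// source). It assumes the atom-flavoured counterparts used above (ElemAtom,
// GlobalAttrAtom, ElemAttrAtomMatch, WrapTextInsideAtom) exist in the package
// and initialize their maps lazily, just as the string-flavoured methods shown
// here do.
func exampleConfig() *Config {
	return (&Config{}).
		Elem("p", "a", "x-widget"). // known names become atoms, "x-widget" lands in elemCustom
		GlobalAttr("title").        // allowed on every allowed element
		ElemAttrMatch("a", "href", regexp.MustCompile(`^https?://`)).
		WrapTextInside("blockquote")
}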
// Token returns the next Token. The result's Data and Attr values remain valid
// after subsequent Next calls.
func (z *Tokenizer) Token() Token {
	t := Token{Type: z.tt}
	switch z.tt {
	case TextToken, CommentToken, DoctypeToken:
		t.Data = string(z.Text())
	case StartTagToken, SelfClosingTagToken, EndTagToken:
		name, moreAttr := z.TagName()
		for moreAttr {
			var key, val []byte
			key, val, moreAttr = z.TagAttr()
			t.Attr = append(t.Attr, Attribute{"", atom.String(key), string(val)})
		}
		if a := atom.Lookup(name); a != 0 {
			t.DataAtom, t.Data = a, a.String()
		} else {
			t.DataAtom, t.Data = 0, string(name)
		}
	}
	return t
}
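// A small client-side sketch (assuming golang.org/x/net/html and its atom
// subpackage are imported as html and atom): because Token fills in DataAtom,
// callers can compare against atoms instead of tag-name strings. The function
// name is illustrative.
func countLinks(r io.Reader) int {
	n := 0
	z := html.NewTokenizer(r)
	for {
		if z.Next() == html.ErrorToken {
			return n
		}
		if t := z.Token(); t.Type == html.StartTagToken && t.DataAtom == atom.A {
			n++
		}
	}
}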
// Section 12.2.5.5.
func parseForeignContent(p *parser) bool {
	switch p.tok.Type {
	case TextToken:
		if p.framesetOK {
			p.framesetOK = strings.TrimLeft(p.tok.Data, whitespaceOrNUL) == ""
		}
		p.tok.Data = strings.Replace(p.tok.Data, "\x00", "\ufffd", -1)
		p.addText(p.tok.Data)
	case CommentToken:
		p.addChild(&Node{
			Type: CommentNode,
			Data: p.tok.Data,
		})
	case StartTagToken:
		b := breakout[p.tok.Data]
		if p.tok.DataAtom == a.Font {
		loop:
			for _, attr := range p.tok.Attr {
				switch attr.Key {
				case "color", "face", "size":
					b = true
					break loop
				}
			}
		}
		if b {
			for i := len(p.oe) - 1; i >= 0; i-- {
				n := p.oe[i]
				if n.Namespace == "" || htmlIntegrationPoint(n) || mathMLTextIntegrationPoint(n) {
					p.oe = p.oe[:i+1]
					break
				}
			}
			return false
		}
		switch p.top().Namespace {
		case "math":
			adjustAttributeNames(p.tok.Attr, mathMLAttributeAdjustments)
		case "svg":
			// Adjust SVG tag names. The tokenizer lower-cases tag names, but
			// SVG wants e.g. "foreignObject" with a capital second "O".
			if x := svgTagNameAdjustments[p.tok.Data]; x != "" {
				p.tok.DataAtom = a.Lookup([]byte(x))
				p.tok.Data = x
			}
			adjustAttributeNames(p.tok.Attr, svgAttributeAdjustments)
		default:
			panic("html: bad parser state: unexpected namespace")
		}
		adjustForeignAttributes(p.tok.Attr)
		namespace := p.top().Namespace
		p.addElement()
		p.top().Namespace = namespace
		if namespace != "" {
			// Don't let the tokenizer go into raw text mode in foreign content
			// (e.g. in an SVG <title> tag).
			p.tokenizer.NextIsNotRawText()
		}
		if p.hasSelfClosingToken {
			p.oe.pop()
			p.acknowledgeSelfClosingTag()
		}
	case EndTagToken:
		for i := len(p.oe) - 1; i >= 0; i-- {
			if p.oe[i].Namespace == "" {
				return p.im(p)
			}
			if strings.EqualFold(p.oe[i].Data, p.tok.Data) {
				p.oe = p.oe[:i]
				break
			}
		}
		return true
	default:
		// Ignore the token.
	}
	return true
}
func main() {
	flag.Parse()

	words := make(map[dbutil.DBMS][]string)
	for dbms, url := range reservedWordsURL {
		resp, err := http.Get(url)
		if err != nil {
			log.Print(err)
			continue
		}
		if resp.StatusCode != http.StatusOK {
			log.Printf("bad GET status for %s: %s", url, resp.Status)
			continue
		}
		defer resp.Body.Close()

		doc := html.NewTokenizer(resp.Body)
		isStart := false
		switch dbms {
		case dbutil.MySQL:
			// <code class="literal">ACCESSIBLE</code>
			startText := []byte("ACCESSIBLE")
			// <code class="literal">ZEROFILL</code>
			endText := []byte("ZEROFILL")
			for {
				tokenType := doc.Next()
				if tokenType == html.ErrorToken {
					break
				}
				if tokenType == html.StartTagToken {
					name, hasAttr := doc.TagName()
					if !hasAttr || atom.Lookup(name) != atom.Code {
						continue
					}
					doc.Next()
					text := doc.Text()
					if !isStart {
						if !bytes.Equal(startText, text) {
							continue
						}
						isStart = true
					}
					words[dbms] = append(words[dbms], string(text))
					// Avoid possible code that could be matched in comments.
					if bytes.Equal(endText, text) {
						break
					}
				}
			}
		case dbutil.Postgres:
			// <tt class="TOKEN">A</tt>
			startText := []byte{'A'}
			for {
				tokenType := doc.Next()
				if tokenType == html.ErrorToken {
					break
				}
				if tokenType == html.StartTagToken {
					name, hasAttr := doc.TagName()
					if !hasAttr || atom.Lookup(name) != atom.Tt {
						continue
					}
					doc.Next()
					text := doc.Text()
					if !isStart {
						if !bytes.Equal(startText, text) {
							continue
						}
						isStart = true
					}
					words[dbms] = append(words[dbms], string(text))
				}
			}
		case dbutil.SQLite:
			// <td align="left" valign="top" width="20%">
			// ABORT<br>
			// CONSTRAINT<br></td>
			startText := []byte("\nABORT")
			for {
				tokenType := doc.Next()
				if tokenType == html.ErrorToken {
					break
				}
				if tokenType == html.StartTagToken {
					name, hasAttr := doc.TagName()
					if !hasAttr || atom.Lookup(name) != atom.Td {
						continue
					}
					doc.Next()
					text := doc.Text()
					if !isStart {
						if !bytes.Equal(startText, text) {
							continue
						}
						isStart = true
						text = bytes.TrimLeft(text, "\n")
					}
					words[dbms] = append(words[dbms], string(text))
					for {
						if doc.Next() == html.EndTagToken {
							break
						}
						text := bytes.TrimLeft(doc.Text(), "\n")
						if len(text) == 0 {
							continue
						}
						words[dbms] = append(words[dbms], string(text))
					}
				}
			}
		}

		if err = doc.Err(); err != nil && err != io.EOF {
			log.Printf("%s: %v", dbms, err)
		}
	}

	if err := write("z-reserved", words); err != nil {
		log.Fatal(err)
	}
}
// testParseCase tests one test case from the test files. If the test does not
// pass, it returns an error that explains the failure.
// text is the HTML to be parsed, want is a dump of the correct parse tree,
// and context is the name of the context node, if any.
func testParseCase(text, want, context string) (err error) {
	defer func() {
		if x := recover(); x != nil {
			switch e := x.(type) {
			case error:
				err = e
			default:
				err = fmt.Errorf("%v", e)
			}
		}
	}()

	var doc *Node
	if context == "" {
		doc, err = Parse(strings.NewReader(text))
		if err != nil {
			return err
		}
	} else {
		contextNode := &Node{
			Type:     ElementNode,
			DataAtom: atom.Lookup([]byte(context)),
			Data:     context,
		}
		nodes, err := ParseFragment(strings.NewReader(text), contextNode)
		if err != nil {
			return err
		}
		doc = &Node{
			Type: DocumentNode,
		}
		for _, n := range nodes {
			doc.AppendChild(n)
		}
	}

	if err := checkTreeConsistency(doc); err != nil {
		return err
	}

	got, err := dump(doc)
	if err != nil {
		return err
	}
	fmt.Println("got ", got)

	// Compare the parsed tree to the #document section.
	if got != want {
		return fmt.Errorf("got vs want:\n----\n%s----\n%s----", got, want)
	}

	if renderTestBlacklist[text] || context != "" {
		return nil
	}

	// Check that rendering and re-parsing results in an identical tree.
	pr, pw := io.Pipe()
	go func() {
		pw.CloseWithError(Render(pw, doc))
	}()
	doc1, err := Parse(pr)
	if err != nil {
		return err
	}
	got1, err := dump(doc1)
	if err != nil {
		return err
	}
	if got != got1 {
		return fmt.Errorf("got vs got1:\n----\n%s----\n%s----", got, got1)
	}

	return nil
}