// Token returns the next Token. The result's Data and Attr values remain valid // after subsequent Next calls. func (z *Tokenizer) Token() Token { t := Token{Type: z.tt} switch z.tt { case TextToken, CommentToken, DoctypeToken: t.Data = string(z.Text()) case StartTagToken, SelfClosingTagToken: var attr []Attribute name, moreAttr := z.TagName() for moreAttr { var key, val []byte key, val, moreAttr = z.TagAttr() attr = append(attr, Attribute{"", atom.String(key), string(val)}) } if a := atom.Lookup(name); a != 0 { t.DataAtom, t.Data = a, a.String() } else { t.DataAtom, t.Data = 0, string(name) } t.Attr = attr case EndTagToken: name, _ := z.TagName() if a := atom.Lookup(name); a != 0 { t.DataAtom, t.Data = a, a.String() } else { t.DataAtom, t.Data = 0, string(name) } } return t }
// ParseFragment parses a fragment of HTML and returns the nodes that were // found. If the fragment is the InnerHTML for an existing element, pass that // element in context. func ParseFragment(r io.Reader, context *Node) ([]*Node, error) { contextTag := "" if context != nil { if context.Type != ElementNode { return nil, errors.New("html: ParseFragment of non-element Node") } // The next check isn't just context.DataAtom.String() == context.Data because // it is valid to pass an element whose tag isn't a known atom. For example, // DataAtom == 0 and Data = "tagfromthefuture" is perfectly consistent. if context.DataAtom != a.Lookup([]byte(context.Data)) { return nil, fmt.Errorf("html: inconsistent Node: DataAtom=%q, Data=%q", context.DataAtom, context.Data) } contextTag = context.DataAtom.String() } p := &parser{ tokenizer: NewTokenizerFragment(r, contextTag), doc: &Node{ Type: DocumentNode, }, scripting: true, fragment: true, context: context, } root := &Node{ Type: ElementNode, DataAtom: a.Html, Data: a.Html.String(), } p.doc.AppendChild(root) p.oe = nodeStack{root} p.resetInsertionMode() for n := context; n != nil; n = n.Parent { if n.Type == ElementNode && n.DataAtom == a.Form { p.form = n break } } err := p.parse() if err != nil { return nil, err } parent := p.doc if context != nil { parent = root } var result []*Node for c := parent.FirstChild; c != nil; { next := c.NextSibling parent.RemoveChild(c) result = append(result, c) c = next } return result, nil }
// ParseFragment parses a fragment of HTML and returns the nodes that were // found. If the fragment is the InnerHTML for an existing element, pass that // element in context. func ParseFragment(r io.Reader, context *Node) ([]*Node, error) { p := &parser{ tokenizer: NewTokenizer(r), doc: &Node{ Type: DocumentNode, }, scripting: true, context: context, } if context != nil { if context.Type != ElementNode { return nil, errors.New("html: ParseFragment of non-element Node") } // The next check isn't just context.DataAtom.String() == context.Data because // it is valid to pass an element whose tag isn't a known atom. For example, // DataAtom == 0 and Data = "tagfromthefuture" is perfectly consistent. if context.DataAtom != a.Lookup([]byte(context.Data)) { return nil, fmt.Errorf("html: inconsistent Node: DataAtom=%q, Data=%q", context.DataAtom, context.Data) } switch context.DataAtom { case a.Iframe, a.Noembed, a.Noframes, a.Noscript, a.Plaintext, a.Script, a.Style, a.Title, a.Textarea, a.Xmp: p.tokenizer.rawTag = context.DataAtom.String() } } root := &Node{ Type: ElementNode, DataAtom: a.Html, Data: a.Html.String(), } p.doc.Add(root) p.oe = nodeStack{root} p.resetInsertionMode() for n := context; n != nil; n = n.Parent { if n.Type == ElementNode && n.DataAtom == a.Form { p.form = n break } } err := p.parse() if err != nil { return nil, err } parent := p.doc if context != nil { parent = root } result := parent.Child parent.Child = nil for _, n := range result { n.Parent = nil } return result, nil }
// testParseCase tests one test case from the test files. If the test does not // pass, it returns an error that explains the failure. // text is the HTML to be parsed, want is a dump of the correct parse tree, // and context is the name of the context node, if any. func testParseCase(text, want, context string) (err error) { defer func() { if x := recover(); x != nil { switch e := x.(type) { case error: err = e default: err = fmt.Errorf("%v", e) } } }() var doc *Node if context == "" { doc, err = Parse(strings.NewReader(text)) if err != nil { return err } } else { contextNode := &Node{ Type: ElementNode, DataAtom: atom.Lookup([]byte(context)), Data: context, } nodes, err := ParseFragment(strings.NewReader(text), contextNode) if err != nil { return err } doc = &Node{ Type: DocumentNode, } for _, n := range nodes { doc.AppendChild(n) } } if err := checkTreeConsistency(doc); err != nil { return err } got, err := dump(doc) if err != nil { return err } // Compare the parsed tree to the #document section. if got != want { return fmt.Errorf("got vs want:\n----\n%s----\n%s----", got, want) } if renderTestBlacklist[text] || context != "" { return nil } // Check that rendering and re-parsing results in an identical tree. pr, pw := io.Pipe() go func() { pw.CloseWithError(Render(pw, doc)) }() doc1, err := Parse(pr) if err != nil { return err } got1, err := dump(doc1) if err != nil { return err } if got != got1 { return fmt.Errorf("got vs got1:\n----\n%s----\n%s----", got, got1) } return nil }
// Section 12.2.5.5. func parseForeignContent(p *parser) bool { switch p.tok.Type { case TextToken: if p.framesetOK { p.framesetOK = strings.TrimLeft(p.tok.Data, whitespaceOrNUL) == "" } p.tok.Data = strings.Replace(p.tok.Data, "\x00", "\ufffd", -1) p.addText(p.tok.Data) case CommentToken: p.addChild(&Node{ Type: CommentNode, Data: p.tok.Data, }) case StartTagToken: b := breakout[p.tok.Data] if p.tok.DataAtom == a.Font { loop: for _, attr := range p.tok.Attr { switch attr.Key { case "color", "face", "size": b = true break loop } } } if b { for i := len(p.oe) - 1; i >= 0; i-- { n := p.oe[i] if n.Namespace == "" || htmlIntegrationPoint(n) || mathMLTextIntegrationPoint(n) { p.oe = p.oe[:i+1] break } } return false } switch p.top().Namespace { case "math": adjustAttributeNames(p.tok.Attr, mathMLAttributeAdjustments) case "svg": // Adjust SVG tag names. The tokenizer lower-cases tag names, but // SVG wants e.g. "foreignObject" with a capital second "O". if x := svgTagNameAdjustments[p.tok.Data]; x != "" { p.tok.DataAtom = a.Lookup([]byte(x)) p.tok.Data = x } adjustAttributeNames(p.tok.Attr, svgAttributeAdjustments) default: panic("html: bad parser state: unexpected namespace") } adjustForeignAttributes(p.tok.Attr) namespace := p.top().Namespace p.addElement() p.top().Namespace = namespace if namespace != "" { // Don't let the tokenizer go into raw text mode in foreign content // (e.g. in an SVG <title> tag). p.tokenizer.NextIsNotRawText() } if p.hasSelfClosingToken { p.oe.pop() p.acknowledgeSelfClosingTag() } case EndTagToken: for i := len(p.oe) - 1; i >= 0; i-- { if p.oe[i].Namespace == "" { return p.im(p) } if strings.EqualFold(p.oe[i].Data, p.tok.Data) { p.oe = p.oe[:i] break } } return true default: // Ignore the token. } return true }
// testParseCase tests one test case from the test files. It returns a // parseTestResult indicating how much of the test passed. If the result // is not parseTestPassed, it also returns an error that explains the failure. // text is the HTML to be parsed, want is a dump of the correct parse tree, // and context is the name of the context node, if any. func testParseCase(text, want, context string) (result parseTestResult, err error) { defer func() { if x := recover(); x != nil { switch e := x.(type) { case error: err = e default: err = fmt.Errorf("%v", e) } } }() var doc *Node if context == "" { doc, err = Parse(strings.NewReader(text)) if err != nil { return parseTestFailed, err } } else { contextNode := &Node{ Type: ElementNode, DataAtom: atom.Lookup([]byte(context)), Data: context, } nodes, err := ParseFragment(strings.NewReader(text), contextNode) if err != nil { return parseTestFailed, err } doc = &Node{ Type: DocumentNode, } for _, n := range nodes { doc.Add(n) } } got, err := dump(doc) if err != nil { return parseTestFailed, err } // Compare the parsed tree to the #document section. if got != want { return parseTestFailed, fmt.Errorf("got vs want:\n----\n%s----\n%s----", got, want) } if renderTestBlacklist[text] || context != "" { return parseTestPassed, nil } // Set result so that if a panic occurs during the render and re-parse // the calling function will know that the parsing phase was successful. result = parseTestParseOnly // Check that rendering and re-parsing results in an identical tree. pr, pw := io.Pipe() go func() { pw.CloseWithError(Render(pw, doc)) }() doc1, err := Parse(pr) if err != nil { return parseTestParseOnly, err } got1, err := dump(doc1) if err != nil { return parseTestParseOnly, err } if got != got1 { return parseTestParseOnly, fmt.Errorf("got vs got1:\n----\n%s----\n%s----", got, got1) } return parseTestPassed, nil }