// ParseFragment parses a fragment of HTML and returns the nodes that were // found. If the fragment is the InnerHTML for an existing element, pass that // element in context. func ParseFragment(r io.Reader, context *Node) ([]*Node, error) { contextTag := "" if context != nil { if context.Type != ElementNode { return nil, errors.New("html: ParseFragment of non-element Node") } // The next check isn't just context.DataAtom.String() == context.Data because // it is valid to pass an element whose tag isn't a known atom. For example, // DataAtom == 0 and Data = "tagfromthefuture" is perfectly consistent. if context.DataAtom != a.Lookup([]byte(context.Data)) { return nil, fmt.Errorf("html: inconsistent Node: DataAtom=%q, Data=%q", context.DataAtom, context.Data) } contextTag = context.DataAtom.String() } p := &parser{ tokenizer: NewTokenizerFragment(r, contextTag), doc: &Node{ Type: DocumentNode, }, scripting: true, fragment: true, context: context, } root := &Node{ Type: ElementNode, DataAtom: a.Html, Data: a.Html.String(), } p.doc.AppendChild(root) p.oe = nodeStack{root} p.resetInsertionMode() for n := context; n != nil; n = n.Parent { if n.Type == ElementNode && n.DataAtom == a.Form { p.form = n break } } err := p.parse() if err != nil { return nil, err } parent := p.doc if context != nil { parent = root } var result []*Node for c := parent.FirstChild; c != nil; { next := c.NextSibling parent.RemoveChild(c) result = append(result, c) c = next } return result, nil }
// Token returns the next Token. The result's Data and Attr values remain valid // after subsequent Next calls. func (z *Tokenizer) Token() Token { t := Token{Type: z.tt} switch z.tt { case TextToken, CommentToken, DoctypeToken: t.Data = string(z.Text()) case StartTagToken, SelfClosingTagToken, EndTagToken: name, moreAttr := z.TagName() for moreAttr { var key, val []byte key, val, moreAttr = z.TagAttr() t.Attr = append(t.Attr, Attribute{"", atom.String(key), string(val)}) } if a := atom.Lookup(name); a != 0 { t.DataAtom, t.Data = a, a.String() } else { t.DataAtom, t.Data = 0, string(name) } } return t }
// Section 12.2.5.5. func parseForeignContent(p *parser) bool { switch p.tok.Type { case TextToken: if p.framesetOK { p.framesetOK = strings.TrimLeft(p.tok.Data, whitespaceOrNUL) == "" } p.tok.Data = strings.Replace(p.tok.Data, "\x00", "\ufffd", -1) p.addText(p.tok.Data) case CommentToken: p.addChild(&Node{ Type: CommentNode, Data: p.tok.Data, }) case StartTagToken: b := breakout[p.tok.Data] if p.tok.DataAtom == a.Font { loop: for _, attr := range p.tok.Attr { switch attr.Key { case "color", "face", "size": b = true break loop } } } if b { for i := len(p.oe) - 1; i >= 0; i-- { n := p.oe[i] if n.Namespace == "" || htmlIntegrationPoint(n) || mathMLTextIntegrationPoint(n) { p.oe = p.oe[:i+1] break } } return false } switch p.top().Namespace { case "math": adjustAttributeNames(p.tok.Attr, mathMLAttributeAdjustments) case "svg": // Adjust SVG tag names. The tokenizer lower-cases tag names, but // SVG wants e.g. "foreignObject" with a capital second "O". if x := svgTagNameAdjustments[p.tok.Data]; x != "" { p.tok.DataAtom = a.Lookup([]byte(x)) p.tok.Data = x } adjustAttributeNames(p.tok.Attr, svgAttributeAdjustments) default: panic("html: bad parser state: unexpected namespace") } adjustForeignAttributes(p.tok.Attr) namespace := p.top().Namespace p.addElement() p.top().Namespace = namespace if namespace != "" { // Don't let the tokenizer go into raw text mode in foreign content // (e.g. in an SVG <title> tag). p.tokenizer.NextIsNotRawText() } if p.hasSelfClosingToken { p.oe.pop() p.acknowledgeSelfClosingTag() } case EndTagToken: for i := len(p.oe) - 1; i >= 0; i-- { if p.oe[i].Namespace == "" { return p.im(p) } if strings.EqualFold(p.oe[i].Data, p.tok.Data) { p.oe = p.oe[:i] break } } return true default: // Ignore the token. } return true }