// findElement skips everything until we find a start element with one of the
// given names.
func findElement(d *xml.Decoder, names []xml.Name) (xml.StartElement, error) {
	for {
		t, err := d.RawToken()
		if err != nil {
			return xml.StartElement{}, err
		}
		if start, ok := t.(xml.StartElement); ok {
			for _, v := range names {
				if v == start.Name {
					return start, nil
				}
			}
		}
	}
	panic("unreachable")
}

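// exampleFindElement is a minimal, hypothetical usage sketch (not part of the
// original source). It assumes findElement above is in scope and that
// encoding/xml, fmt, and strings are imported; it scans a small document for
// the first <item> or <entry> start element.
func exampleFindElement() {
	doc := `<root><skip/><item id="1">hello</item></root>`
	d := xml.NewDecoder(strings.NewReader(doc))
	names := []xml.Name{{Local: "item"}, {Local: "entry"}}
	start, err := findElement(d, names)
	if err != nil {
		fmt.Println("no matching element:", err)
		return
	}
	fmt.Println("found:", start.Name.Local) // found: item
}
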
// parseList returns all child nodes with one of the given names, plus CharData.
func parseList(d *xml.Decoder, names, stack []xml.Name) ([]Node, error) {
	var c []Node
	for len(stack) > 0 {
		t, err := d.RawToken()
		if err != nil {
			return nil, fmt.Errorf("unclosed tags: %v", stack)
		}
		// A token can be of the following types:
		//
		//	xml.CharData
		//	xml.Comment
		//	xml.Directive
		//	xml.EndElement
		//	xml.ProcInst
		//	xml.StartElement
		switch t := t.(type) {
		case xml.StartElement:
			found := false
			for _, v := range names {
				if v == t.Name {
					found = true
					list, err := parseList(d, names, []xml.Name{t.Name})
					if err != nil {
						return nil, err
					}
					c = append(c, Node{Token: t, List: list})
				}
			}
			if !found {
				stack = append(stack, t.Name)
			}
		case xml.EndElement:
			if stack, err = popName(stack, t.Name); err != nil {
				return nil, err
			}
		case xml.CharData:
			if b := bytes.TrimSpace(t); len(b) > 0 {
				// Need to make a copy of b.
				b1 := make(xml.CharData, len(b))
				copy(b1, b)
				c = append(c, Node{Token: b1})
			}
		}
	}
	return c, nil
}

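// popName is referenced by parseList but not shown in this section. Its
// signature can be inferred from the call site (a stack of open tag names in,
// an updated stack and an error out). The following is only an illustrative
// sketch, assuming it pops the innermost open tag and reports a mismatched
// close; the real implementation may differ.
func popNameSketch(stack []xml.Name, name xml.Name) ([]xml.Name, error) {
	if len(stack) == 0 {
		return nil, fmt.Errorf("unexpected end tag </%s>", name.Local)
	}
	if top := stack[len(stack)-1]; top != name {
		return nil, fmt.Errorf("end tag </%s> does not close <%s>", name.Local, top.Local)
	}
	return stack[:len(stack)-1], nil
}
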
// xmlToTreeParser - load a 'clean' XML doc into a tree of *node.
func xmlToTreeParser(skey string, a []xml.Attr, p *xml.Decoder) (*node, error) {
	n := new(node)
	n.nodes = make([]*node, 0)
	var seq int // for includeTagSeqNum

	if skey != "" {
		n.key = skey
		if len(a) > 0 {
			for _, v := range a {
				na := new(node)
				na.attr = true
				na.key = v.Name.Local
				na.val = v.Value
				n.nodes = append(n.nodes, na)
			}
		}
	}
	for {
		t, err := p.RawToken()
		if err != nil {
			if err != io.EOF {
				return nil, errors.New("xml.Decoder.Token() - " + err.Error())
			}
			return nil, err
		}
		switch t.(type) {
		case xml.StartElement:
			tt := t.(xml.StartElement)
			var key string
			if tt.Name.Space != "" {
				key = tt.Name.Space + ":" + tt.Name.Local
			} else {
				key = tt.Name.Local
			}
			// handle root
			if n.key == "" {
				n.key = key
				if len(tt.Attr) > 0 {
					for _, v := range tt.Attr {
						na := new(node)
						na.attr = true
						na.key = v.Name.Local
						na.val = v.Value
						n.nodes = append(n.nodes, na)
					}
				}
			} else {
				nn, nnerr := xmlToTreeParser(key, tt.Attr, p)
				if nnerr != nil {
					return nil, nnerr
				}
				n.nodes = append(n.nodes, nn)
				if includeTagSeqNum { // 2014.11.09
					sn := &node{false, false, "_seq", strconv.Itoa(seq), nil}
					nn.nodes = append(nn.nodes, sn)
					seq++
				}
			}
		case xml.EndElement:
			// scan n.nodes for duplicate n.key values
			n.markDuplicateKeys()
			return n, nil
		case xml.CharData:
			tt := string(t.(xml.CharData))
			// clean up possible noise
			tt = strings.Trim(tt, "\t\r\b\n ")
			if len(n.nodes) > 0 && len(tt) > 0 { // if len(n.nodes) > 0 {
				nn := new(node)
				nn.key = "_"
				nn.val = tt
				n.nodes = append(n.nodes, nn)
			} else {
				n.val = tt
			}
			if includeTagSeqNum { // 2014.11.09
				if len(n.nodes) == 0 { // treat like a simple element with attributes
					nn := new(node)
					nn.key = "_"
					nn.val = tt
					n.nodes = append(n.nodes, nn)
				}
				sn := &node{false, false, "_seq", strconv.Itoa(seq), nil}
				n.nodes = append(n.nodes, sn)
				seq++
			}
		default:
			// noop
		}
	}
	// Logically we can't get here, but provide an error message anyway.
	return nil, fmt.Errorf("Unknown parse error in xmlToTree() for: %s", n.key)
}

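// The node type and markDuplicateKeys are not shown in this section. From the
// field accesses above and the positional literal
// &node{false, false, "_seq", strconv.Itoa(seq), nil}, the struct presumably
// looks roughly like the sketch below; the names of the two leading bool fields
// are guesses (the first is likely what markDuplicateKeys sets), not the
// package's actual declaration:
//
//	type node struct {
//		dup   bool    // key appears more than once among siblings
//		attr  bool    // node was built from an xml.Attr
//		key   string  // element tag or attribute name
//		val   string  // element or attribute value
//		nodes []*node // child nodes
//	}
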
// xmlSeqToMapParser - load a 'clean' XML doc into a map[string]interface{} directly.
// Add #seq tag value for each element decoded - to be used for Encoding later.
func xmlSeqToMapParser(skey string, a []xml.Attr, p *xml.Decoder, r bool) (map[string]interface{}, error) {
	// NOTE: all attributes and sub-elements are parsed into 'na';
	// 'na' is returned as the value for 'skey' in 'n'.
	var n, na map[string]interface{}
	var seq int // for including seq num when decoding

	// Allocate maps and load attributes, if any.
	// NOTE: on entry from NewMapXml(), etc., skey == "", and we fall through
	// to get the StartElement, then recurse with skey == xml.StartElement.Name.Local,
	// at which point we begin allocating the map[string]interface{} values 'n' and 'na'.
	if skey != "" {
		// 'n' only needs one slot - saves a call to runtime·hashGrow();
		// for 'na' we don't know.
		n = make(map[string]interface{}, 1)
		na = make(map[string]interface{})
		if len(a) > 0 {
			// xml.Attr is decoded into: map["#attr"]map[<attr_label>]interface{}
			// where interface{} is map[string]interface{}{"#text":<attr_val>, "#seq":<attr_seq>}
			aa := make(map[string]interface{}, len(a))
			for i, v := range a {
				if len(v.Name.Space) > 0 {
					aa[v.Name.Space+`:`+v.Name.Local] = map[string]interface{}{"#text": cast(v.Value, r), "#seq": i}
				} else {
					aa[v.Name.Local] = map[string]interface{}{"#text": cast(v.Value, r), "#seq": i}
				}
			}
			na["#attr"] = aa
		}
	}

	for {
		t, err := p.RawToken()
		if err != nil {
			if err != io.EOF {
				return nil, errors.New("xml.Decoder.Token() - " + err.Error())
			}
			return nil, err
		}
		switch t.(type) {
		case xml.StartElement:
			tt := t.(xml.StartElement)

			// The first call to xmlSeqToMapParser() doesn't pass an xml.StartElement - the map key.
			// So when the loop is first entered, the first token is the root tag along
			// with any attributes, which we process here.
			//
			// Subsequent calls to xmlSeqToMapParser() pass in tag+attributes for
			// processing before getting the next token, which is the element value;
			// that is handled above.
			if skey == "" {
				if len(tt.Name.Space) > 0 {
					return xmlSeqToMapParser(tt.Name.Space+`:`+tt.Name.Local, tt.Attr, p, r)
				} else {
					return xmlSeqToMapParser(tt.Name.Local, tt.Attr, p, r)
				}
			}

			// If not initializing the map, parse the element.
			// len(nn) == 1, necessarily - it is just an 'n'.
			var nn map[string]interface{}
			if len(tt.Name.Space) > 0 {
				nn, err = xmlSeqToMapParser(tt.Name.Space+`:`+tt.Name.Local, tt.Attr, p, r)
			} else {
				nn, err = xmlSeqToMapParser(tt.Name.Local, tt.Attr, p, r)
			}
			if err != nil {
				return nil, err
			}

			// The nn map[string]interface{} value is a na[nn_key] value.
			// We need to see if nn_key already exists - if so, we're parsing a list.
			// This may require converting the na[nn_key] value into []interface{}.
			// First, extract the key:val for the map - it's a singleton.
			var key string
			var val interface{}
			for key, val = range nn {
				break
			}

			// Add the "#seq" k:v pair.
			// A sequence number is included even in list elements - this should allow us
			// to properly resequence even something goofy like:
			//	<list>item 1</list>
			//	<subelement>item 2</subelement>
			//	<list>item 3</list>
			// where all the "list" subelements are decoded into an array.
			switch val.(type) {
			case map[string]interface{}:
				val.(map[string]interface{})["#seq"] = seq
				seq++
			case interface{}: // a non-nil simple element: string, float64, bool
				v := map[string]interface{}{"#text": val, "#seq": seq}
				seq++
				val = v
			}

			// 'na' holds the sub-elements of n.
			// See if 'key' already exists.
			// If 'key' exists, this is a list; if not, just add key:val to na.
			if v, ok := na[key]; ok {
				var a []interface{}
				switch v.(type) {
				case []interface{}:
					a = v.([]interface{})
				default: // anything else - note: v.(type) != nil
					a = []interface{}{v}
				}
				a = append(a, val)
				na[key] = a
			} else {
				na[key] = val // save it as a singleton
			}
		case xml.EndElement:
			if skey != "" {
				tt := t.(xml.EndElement)
				var name string
				if len(tt.Name.Space) > 0 {
					name = tt.Name.Space + `:` + tt.Name.Local
				} else {
					name = tt.Name.Local
				}
				if skey != name {
					return nil, fmt.Errorf("element %s not properly terminated, got %s at #%d",
						skey, name, p.InputOffset())
				}
			}
			// len(n) > 0 if this is a simple element w/o xml.Attrs - see the xml.CharData case.
			if len(n) == 0 {
				// If len(na) == 0 we have an empty element == "";
				// it has no xml.Attr nor xml.CharData.
				// Empty element content will be map["etag"]map["#text"]""
				// after #seq injection - map["etag"]map["#seq"]seq - after return.
				if len(na) > 0 {
					n[skey] = na
				} else {
					n[skey] = "" // empty element
				}
			}
			return n, nil
		case xml.CharData:
			// clean up possible noise
			tt := strings.Trim(string(t.(xml.CharData)), "\t\r\b\n ")
			if skey == "" {
				// Per Adrian (http://www.adrianlungu.com/) - catch stray text
				// in the decoder stream:
				// https://github.com/clbanning/mxj/pull/14#issuecomment-182816374
				// NOTE: CharSetReader must be set to a non-UTF-8 CharSet or you'll get
				// a p.Token() decoding error when the BOM is UTF-16 or UTF-32.
				continue
			}
			if len(tt) > 0 {
				// every simple element is a #text and has a #seq associated with it
				na["#text"] = cast(tt, r)
				na["#seq"] = seq
				seq++
			}
		case xml.Comment:
			if n == nil { // no root 'key'
				n = map[string]interface{}{"#comment": string(t.(xml.Comment))}
				return n, NoRoot
			}
			cm := make(map[string]interface{}, 2)
			cm["#text"] = string(t.(xml.Comment))
			cm["#seq"] = seq
			seq++
			na["#comment"] = cm
		case xml.Directive:
			if n == nil { // no root 'key'
				n = map[string]interface{}{"#directive": string(t.(xml.Directive))}
				return n, NoRoot
			}
			dm := make(map[string]interface{}, 2)
			dm["#text"] = string(t.(xml.Directive))
			dm["#seq"] = seq
			seq++
			na["#directive"] = dm
		case xml.ProcInst:
			if n == nil {
				na = map[string]interface{}{"#target": t.(xml.ProcInst).Target, "#inst": string(t.(xml.ProcInst).Inst)}
				n = map[string]interface{}{"#procinst": na}
				return n, NoRoot
			}
			pm := make(map[string]interface{}, 3)
			pm["#target"] = t.(xml.ProcInst).Target
			pm["#inst"] = string(t.(xml.ProcInst).Inst)
			pm["#seq"] = seq
			seq++
			na["#procinst"] = pm
		default:
			// noop - shouldn't ever get here, now, since we handle all token types
		}
	}
}

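// exampleXmlSeqToMap is a minimal, hypothetical driver (not part of the original
// source) showing how xmlSeqToMapParser is entered with skey == "" as the NOTE
// above describes. It assumes encoding/xml, fmt, and strings are imported and
// that cast() and NoRoot are defined elsewhere in the package.
func exampleXmlSeqToMap() {
	doc := `<doc attr="x"><a>1</a><a>2</a><b/></doc>`
	p := xml.NewDecoder(strings.NewReader(doc))
	m, err := xmlSeqToMapParser("", nil, p, false)
	if err != nil {
		fmt.Println("err:", err)
		return
	}
	// Roughly (assuming cast returns the raw string when r is false):
	//	map[doc:map[#attr:map[attr:map[#seq:0 #text:x]]
	//	            a:[map[#seq:0 #text:1] map[#seq:1 #text:2]]
	//	            b:map[#seq:2 #text:]]]
	fmt.Printf("%v\n", m)
}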