func (this *flowdocument_maker) convert_flowdocument(frag *html.Node) { if frag.Type == html.TextNode { return } ignore_children := false switch frag.Data { case "img": frag.Type = html.CommentNode node_clear_children(frag) frag.Attr = nil case "a": frag.Data = "Hyperlink" frag.Attr = extract_ahref_attr(frag.Attr) case "article": frag.Data = "FlowDocument" // set namespace dont work frag.Attr = []html.Attribute{html.Attribute{Key: "xmlns", Val: fdocns}} case "object", "video", "audio", "embed": frag.Type = html.CommentNode node_clear_children(frag) frag.Attr = nil case "p": fallthrough default: frag.Data = "Paragraph" frag.Attr = nil if this.first_paragraph == nil { this.first_paragraph = frag } } for child := frag.FirstChild; ignore_children == false && child != nil; child = child.NextSibling { this.convert_flowdocument(child) } }
func addFiles(form uint8, parent *html.Node, files []string) { for _, file := range files { node := html.Node{ Type: html.ElementNode, } switch form { case SCRIPT: node.Data = "script" node.Attr = []html.Attribute{ html.Attribute{ Key: "src", Val: file, }, } case STYLE: node.Data = "link" node.Attr = []html.Attribute{ html.Attribute{ Key: "rel", Val: "stylesheet", }, html.Attribute{ Key: "href", Val: file, }, } default: panic("Type not understood") } parent.AppendChild(&node) } }
func try_update_class_attr(b *html.Node, class string) { if len(class) > 0 { ca := make([]html.Attribute, len(b.Attr)+1) copy(ca, b.Attr) ca[len(b.Attr)] = html.Attribute{Key: "class", Val: class} b.Attr = ca } }
// CloneNode makes a copy of a Node with all descendants. func CloneNode(n *exphtml.Node) *exphtml.Node { clone := new(exphtml.Node) clone.Type = n.Type clone.DataAtom = n.DataAtom clone.Data = n.Data clone.Attr = make([]exphtml.Attribute, len(n.Attr)) copy(clone.Attr, n.Attr) for c := n.FirstChild; c != nil; c = c.NextSibling { nc := CloneNode(c) clone.AppendChild(nc) } return clone }
// Remove all attributes on the provided node // that are not contained within this whitelist func (w *Whitelist) sanitizeAttributes(n *html.Node) { attributes := make([]html.Attribute, len(n.Attr)) i := 0 for _, attribute := range n.Attr { if w.HasAttributeForElement(n.Data, attribute.Key) { attributes[i] = attribute i += 1 } } n.Attr = attributes[0:i] }
// reserve id, class, href, src func (this *HtmlCleaner) clean_attributes(n *html.Node) { for child := n.FirstChild; child != nil; child = child.NextSibling { this.clean_attributes(child) } attrs := []html.Attribute{} for _, attr := range n.Attr { if attr.Key == "id" || attr.Key == "class" || attr.Key == "href" || attr.Key == "src" { attrs = append(attrs, attr) } } if len(attrs) != len(n.Attr) { n.Attr = attrs } }
func setAttributeValue(attrName string, val string, n *html.Node) { if n == nil { return } for i := range n.Attr { if attr := &n.Attr[i]; attr.Key == attrName { attr.Val = val return } } n.Attr = append(n.Attr, html.Attribute{Key: attrName, Val: val}) }
// reserve id, class, href, src, width, height, alt // class,id会用于后面正文内容的判定 // width/height/alt会用于判定image时候是正文 func (this *html_cleaner) clean_attributes(n *html.Node) { for child := n.FirstChild; child != nil; child = child.NextSibling { this.clean_attributes(child) } var attrs []html.Attribute for _, attr := range n.Attr { switch attr.Key { case "id", "class", "href", "src", "width", "height", "alt": attrs = append(attrs, attr) } } if len(attrs) != len(n.Attr) { n.Attr = attrs } }
func node_set_attribute(n *html.Node, name, val string) { v := node_get_attribute(n, name) if v == "" { n.Attr = append(n.Attr, html.Attribute{Key: name, Val: val}) } }
func init() { fCondenseNode = func(n *html.Node, depth int) (ret string) { if n.Type == html.ElementNode && n.Data == "script" { ret += fmt.Sprintf(" var script%v = '[script]'; ", nums) nums++ return } if n.Type == html.ElementNode && n.Data == "style" { ret += fmt.Sprintf(" .xxx {margin:2px;} ") return } if n.Type == html.ElementNode && n.Data == "img" { ret += fmt.Sprintf(" [img] %v %v | ", getAttrVal(n.Attr, "alt"), getAttrVal(n.Attr, "src")) } if n.Type == html.ElementNode && n.Data == "a" { ret += "[a]" } if n.Type == html.TextNode { s := n.Data // s = replTabsNewline.Replace(s) // s = strings.TrimSpace(s) if len(s) < 4 { ret += s } else if s != "" { if depth > 0 { ret += fmt.Sprintf(" [txt%v] %v", depth, s) } else { ret += " [txt] " + s } } } for c := n.FirstChild; c != nil; c = c.NextSibling { ret += fCondenseNode(c, depth+1) } return } fRecurse = func(n *html.Node) { if n.Type == html.ElementNode && n.Data == "form" { hidFld := new(html.Node) hidFld.Type = html.ElementNode hidFld.Data = "input" hidFld.Attr = []html.Attribute{html.Attribute{Key: "name", Val: "redirect-to"}, html.Attribute{Key: "value", Val: absolutize(getAttrVal(n.Attr, "action"))}} n.AppendChild(hidFld) n.Attr = rewriteAttributes(n.Attr, UnsyncedGlobalReq) } if n.Type == html.ElementNode && n.Data == "script" { for i := 0; i < len(n.Attr); i++ { if n.Attr[i].Key == "src" { n.Attr[i].Val = emptySrc } } } if n.Type == html.ElementNode && (n.Data == "a" || n.Data == "img" || n.Data == "script" || n.Data == "style") { s := fCondenseNode(n, 0) //fmt.Printf("found %v\n", s) textReplacement := new(html.Node) textReplacement.Type = html.TextNode textReplacement.Data = s if n.Data == "a" || n.Data == "img" { n.Attr = rewriteAttributes(n.Attr, UnsyncedGlobalReq) } // We want to remove all existing children. // Direct loop impossible, since "NextSibling" is set to nil by Remove(). // Therefore first assembling separately, then removing. 
children := make(map[*html.Node]struct{}) for c := n.FirstChild; c != nil; c = c.NextSibling { children[c] = struct{}{} } for k, _ := range children { n.RemoveChild(k) } // we can't put our replacement "under" an image, since img cannot have children if n.Type == html.ElementNode && n.Data == "img" { // n.Parent.InsertBefore(textReplacement,n) InsertAfter(n, textReplacement) RemoveNode(n) } else { n.AppendChild(textReplacement) } if n.Data == "a" { prev := n.PrevSibling if prev != nil { breaker0 := new(html.Node) breaker0.Type = html.TextNode breaker0.Data = " || " n.Parent.InsertBefore(breaker0, prev) breaker1 := new(html.Node) breaker1.Type = html.ElementNode // breaker1.Data = "||<br>\n" breaker1.Data = "br" n.Parent.InsertBefore(breaker1, prev) breaker2 := new(html.Node) breaker2.Type = html.TextNode breaker2.Data = "\n" n.Parent.InsertBefore(breaker2, prev) } } } for c := n.FirstChild; c != nil; c = c.NextSibling { fRecurse(c) } } }
// parseAttrs scans the attribute string attrs from left to right and appends
// one html.Attribute to node.Attr for every token it finds.
//
// Two token shapes are recognised:
//   - key=value pairs, where key is a run of isWord bytes and value is either
//     quoted with " or ' or an unquoted run ending at a space or quote
//     character. The attribute key is the parsed key text.
//   - bare values, either double-quoted or a run of non-space bytes. These
//     are stored under the synthetic keys "@0", "@1", ... in order of
//     appearance.
//
// A token that starts like a key=value pair but yields no value (for example
// an unterminated quote) is re-parsed from its start as a bare value.
func parseAttrs(node *html.Node, attrs string) {
	keyIdx := 0 // index used for the next synthetic "@N" key
	pos := 0    // current scan position in attrs
	for pos < len(attrs) {
		// skip leading whitespace; stop if only whitespace was left
		pos += countSpaces(attrs[pos:])
		if pos >= len(attrs) {
			break
		}
		// try to parse key name
		keyStart, keyEnd := pos, pos
		for keyEnd < len(attrs) && isWord(attrs[keyEnd]) {
			keyEnd++
		}
		// spaces are tolerated between the key and the '='
		eqPos := keyEnd + countSpaces(attrs[keyEnd:])
		if eqPos < len(attrs) && attrs[eqPos] == '=' {
			// looks like a key-value pair
			var innerPos, innerEnd int
			// spaces are tolerated between the '=' and the value
			valPos := eqPos + 1 + countSpaces(attrs[eqPos+1:])
			// valEnd stays equal to valPos until a value has been recognised
			valEnd := valPos
			if valPos < len(attrs) {
				// quoted attrib?
				if attrs[valPos] == '"' || attrs[valPos] == '\'' {
					// IndexRune returns -1 when there is no closing quote,
					// which makes innerEnd == valPos and fails the check below
					innerEnd = valPos + 1 + strings.IndexRune(attrs[valPos+1:], rune(attrs[valPos]))
					if innerEnd > valPos {
						// value is the text between the quotes; resume after the closing quote
						innerPos, valEnd = valPos+1, innerEnd+1
					}
				}
				// unquoted attrib?
				if valEnd == valPos {
					for valEnd < len(attrs) && !isSpace(attrs[valEnd]) && attrs[valEnd] != '"' && attrs[valEnd] != '\'' {
						valEnd++
					}
					innerPos = valPos
					innerEnd = valEnd
				}
				// if we have a value, add it to the node!
				if valEnd != valPos {
					node.Attr = append(node.Attr, html.Attribute{
						Key: attrs[keyStart:keyEnd],
						Val: attrs[innerPos:innerEnd],
					})
					pos = valEnd
					continue
				}
			}
		}
		// key-value pair didn't work out. try parsing as indexed value
		// (pos is unchanged here, so the token is re-read from its start)
		var innerPos, innerEnd int
		end := pos
		// quoted value?
		if attrs[pos] == '"' {
			// as above: a missing closing quote leaves innerEnd == pos
			innerEnd = pos + 1 + strings.IndexRune(attrs[pos+1:], '"')
			if innerEnd > pos {
				innerPos, end = pos+1, innerEnd+1
			}
		}
		// if all else fails, try regular value
		if end == pos {
			for end < len(attrs) && !isSpace(attrs[end]) {
				end++
			}
			innerPos, innerEnd = pos, end
		}
		node.Attr = append(node.Attr, html.Attribute{
			Key: fmt.Sprintf("@%d", keyIdx),
			Val: attrs[innerPos:innerEnd],
		})
		pos = end
		keyIdx++
	}
}