func (m *minificationHTML) toDiv(node *html.Node) (*html.Node, error) { node.DataAtom = atom.Div node.Data = "div" node.Attr = nil return m.parseChildren(node) }
// Attache attaches these attributes to an html.Node. func (g HTML) Attach(node *html.Node) { attrs := []html.Attribute{} if g.ContentEditable > 0 { if g.ContentEditable == OTrue { attrs = attr(attrs, "contenteditable", "true") } else { attrs = attr(attrs, "contenteditable", "false") } } if g.Hidden > 0 { if g.Hidden == OTrue { attrs = attr(attrs, "hidden", "true") } else { attrs = attr(attrs, "hidden", "false") } } if len(g.Data) > 0 { for k, v := range g.Data { attrs = attr(attrs, k, v) } } if len(g.Class) > 0 { v := strings.Join(g.Class, " ") attrs = attr(attrs, "class", v) } s := []string{"AccessKey", "Id", "Dir", "Lang", "Style", "TabIndex", "Title", "Translate"} attrs = append(attrs, structToAttrs(g, s...)...) node.Attr = append(node.Attr, attrs...) }
// cleansDom performs brute reduction and simplification // func cleanseDom(n *html.Node, lvl int) { n.Attr = removeAttr(n.Attr, unwantedAttrs) // Children for c := n.FirstChild; c != nil; c = c.NextSibling { cleanseDom(c, lvl+1) } if directlyRemoveUnwanted { removeUnwanted(n) } else { convertUnwanted(n) } // --- convertExotic(n) // one time text normalization if n.Type == html.TextNode { n.Data = stringspb.NormalizeInnerWhitespace(n.Data) } }
func RemoveAttributes(n *html.Node, keepAttrs []string) { attrs := make([]html.Attribute, 0, len(n.Attr)) dataAttrs := make(map[string]html.Attribute) originalAttrs := make(map[string]html.Attribute) for _, a := range n.Attr { if indexOfString(keepAttrs, a.Key) >= 0 { attrs = append(attrs, a) originalAttrs[a.Key] = a } else if i := strings.Index(a.Key, "data-"); i == 0 { a.Key = a.Key[5:] if len(a.Key) > 0 && indexOfString(keepAttrs, a.Key) >= 0 { dataAttrs[a.Key] = a } } } for k, v := range dataAttrs { if _, found := originalAttrs[k]; !found { attrs = append(attrs, v) } } n.Attr = attrs for c := n.FirstChild; c != nil; c = c.NextSibling { RemoveAttributes(c, keepAttrs) } }
func img2Link(img *html.Node) { if img.Data == "img" { img.Data = "a" for i := 0; i < len(img.Attr); i++ { if img.Attr[i].Key == "src" { img.Attr[i].Key = "href" } } double := closureTextNodeExists(img) imgContent := "" title := attrX(img.Attr, "title") if double { imgContent = fmt.Sprintf("[img] %v %v | ", "[ctdr]", // content title double removed urlBeautify(attrX(img.Attr, "href"))) } else { imgContent = fmt.Sprintf("[img] %v %v | ", title, urlBeautify(attrX(img.Attr, "href"))) } img.Attr = attrSet(img.Attr, "cfrom", "img") nd := dom.Nd("text", imgContent) img.AppendChild(nd) } }
func copyNode(to, from *html.Node) { to.Attr = from.Attr to.Data = from.Data to.DataAtom = from.DataAtom to.Namespace = from.Namespace to.Type = from.Type }
func cleanNode(c *Config, n *html.Node) *html.Node { allowedAttr, ok1 := c.elem[n.DataAtom] customAttr, ok2 := c.elemCustom[n.Data] if ok1 || ok2 { cleanChildren(c, n) haveSrc := false attrs := n.Attr n.Attr = make([]html.Attribute, 0, len(attrs)) for _, attr := range attrs { a := atom.Lookup([]byte(attr.Key)) re1, ok1 := allowedAttr[a] re2, ok2 := customAttr[attr.Key] _, ok3 := c.attr[a] _, ok4 := c.attrCustom[attr.Key] if attr.Namespace != "" || (!ok1 && !ok2 && !ok3 && !ok4) { continue } if !cleanURL(c, a, &attr) { continue } if re1 != nil && !re1.MatchString(attr.Val) { continue } if re2 != nil && !re2.MatchString(attr.Val) { continue } haveSrc = haveSrc || a == atom.Src n.Attr = append(n.Attr, attr) } if n.DataAtom == atom.Img && !haveSrc { // replace it with an empty text node return &html.Node{Type: html.TextNode} } return n } return text(html.UnescapeString(Render(n))) }
func setAttr(key, val string, n *html.Node) { attr := getAttr(key, n) if attr != nil { attr.Val = val } else { n.Attr = append(n.Attr, html.Attribute{Key: key, Val: val}) } }
func removeAttr(key string, n *html.Node) { for i := 0; i < len(n.Attr); i++ { if n.Attr[i].Key == key { n.Attr = append(n.Attr[:i], n.Attr[i+1:]...) return } } return }
func filterAttrs(n *html.Node, fn func(*html.Attribute) bool) { var out = make([]html.Attribute, 0) for _, a := range n.Attr { if fn(&a) { out = append(out, a) } } n.Attr = out }
// clean normalises styles/colspan and removes any CleanTags specified, along with newlines; // but also makes all the character handling (for example " " as utf-8) the same. // It returns the estimated number of treeRunes that will be used. // TODO more cleaning of the input HTML, as required. func (c *Config) clean(n *html.Node) int { size := 1 switch n.Type { case html.ElementNode: for ai := 0; ai < len(n.Attr); ai++ { a := n.Attr[ai] switch { case strings.ToLower(a.Key) == "style": if strings.TrimSpace(a.Val) == "" { // delete empty styles n.Attr = delAttr(n.Attr, ai) ai-- } else { // tidy non-empty styles // TODO there could be more here to make sure the style entries are in the same order etc. n.Attr[ai].Val = strings.Replace(a.Val, " ", "", -1) if !strings.HasSuffix(n.Attr[ai].Val, ";") { n.Attr[ai].Val += ";" } } case n.DataAtom == atom.Td && strings.ToLower(a.Key) == "colspan" && strings.TrimSpace(a.Val) == "1": n.Attr = delAttr(n.Attr, ai) ai-- } } case html.TextNode: n.Data = htm.UnescapeString(n.Data) size += utf8.RuneCountInString(n.Data) - 1 // len(n.Data) would be faster, but use more memory } searchChildren: for ch := n.FirstChild; ch != nil; ch = ch.NextSibling { switch ch.Type { case html.ElementNode: for _, rr := range c.CleanTags { if rr == ch.Data { n.RemoveChild(ch) goto searchChildren } } } size += c.clean(ch) } return size }
func RemoveSomeAttributes(n *html.Node, someAttrs []string) { attrMap := make(map[string]bool, len(someAttrs)) for _, attr := range someAttrs { attrMap[attr] = true } for i := len(n.Attr) - 1; i >= 0; i-- { a := n.Attr[i] flag, _ := attrMap[a.Key] if flag { if i < len(n.Attr)-1 { n.Attr = append(n.Attr[0:i], n.Attr[i+1:]...) } else { n.Attr = n.Attr[0:i] } } } for c := n.FirstChild; c != nil; c = c.NextSibling { RemoveSomeAttributes(n, someAttrs) } }
func TestIsStylesheetNode(t *testing.T) { n := html.Node{Data: "link"} if isStylesheetNode(&n) { t.Errorf("Exepected node to not be a stylesheet node") } n.Attr = []html.Attribute{ html.Attribute{Key: "rel", Val: "stylesheet"}, } if !isStylesheetNode(&n) { t.Errorf("Exepected node to be a stylesheet node") } }
// CloneNode makes a copy of a Node with all descendants. func CloneNode(n *exphtml.Node) *exphtml.Node { clone := new(exphtml.Node) clone.Type = n.Type clone.DataAtom = n.DataAtom clone.Data = n.Data clone.Attr = make([]exphtml.Attribute, len(n.Attr)) copy(clone.Attr, n.Attr) for c := n.FirstChild; c != nil; c = c.NextSibling { nc := CloneNode(c) clone.AppendChild(nc) } return clone }
func TestIsIcoNode(t *testing.T) { n := html.Node{Data: "link"} if isIcoNode(&n) { t.Errorf("Exepected node to not be a ico node") } n.Attr = []html.Attribute{ html.Attribute{Key: "rel", Val: "icon"}, } if !isIcoNode(&n) { t.Errorf("Exepected node to be a ico node") } }
func TestParseLinkTagNoRel(t *testing.T) { node := new(html.Node) node.Data = "link" attr1 := html.Attribute{"", "href", "1.css"} node.Attr = []html.Attribute{attr1} page := newWebPage(startUrl) page.parseLinkTag(node) expected1 := 0 val1 := page.styleSheets.Len() if val1 != expected1 { t.Error("Expected:", expected1, " Got:", val1) } }
func TestParseATagAbsoluteDiffHost(t *testing.T) { node := new(html.Node) node.Data = "a" attr := html.Attribute{"", "href", "http://www.google.com"} node.Attr = []html.Attribute{attr} page := newWebPage(startUrl) page.parseATag(node) expected := 0 val := page.links.Len() if val != expected { t.Error("Expected:", expected, " Got:", val) } }
func TestParseATagInvalidUrl(t *testing.T) { node := new(html.Node) node.Data = "a" attr := html.Attribute{"", "href", "%gh&%ij"} node.Attr = []html.Attribute{attr} page := newWebPage(startUrl) page.parseATag(node) expected1 := 0 val1 := page.links.Len() if val1 != expected1 { t.Error("Expected:", expected1, " Got:", val1) } }
func TestParseLinkTagInvalidUrl(t *testing.T) { node := new(html.Node) node.Data = "link" attr1 := html.Attribute{"", "href", "%gh&%ij"} attr2 := html.Attribute{"", "rel", "stylesheet"} node.Attr = []html.Attribute{attr1, attr2} page := newWebPage(startUrl) page.parseLinkTag(node) expected1 := 0 val1 := page.styleSheets.Len() if val1 != expected1 { t.Error("Expected:", expected1, " Got:", val1) } }
func TestParseScriptTagAbsolute(t *testing.T) { node := new(html.Node) node.Data = "script" attr := html.Attribute{"", "src", startUrl + "1.js"} node.Attr = []html.Attribute{attr} page := newWebPage(startUrl) page.parseScriptTag(node) expected1 := 1 val1 := page.scriptFiles.Len() if val1 != expected1 { t.Error("Expected:", expected1, " Got:", val1) } expected2 := startUrl + "1.js" val2 := page.scriptFiles.Front().Value if val2 != expected2 { t.Error("Expected:", expected2, " Got:", val2) } }
func TestParseATagRelative(t *testing.T) { node := new(html.Node) node.Data = "a" attr := html.Attribute{"", "href", "1.html"} node.Attr = []html.Attribute{attr} page := newWebPage(startUrl) page.parseATag(node) expected1 := 1 val1 := page.links.Len() if val1 != expected1 { t.Error("Expected:", expected1, " Got:", val1) } expected2 := startUrl + "1.html" val2 := page.links.Front().Value if val2 != expected2 { t.Error("Expected:", expected2, " Got:", val2) } }
func forceMaxDepth(n *html.Node, depth int) { if depth == 0 { n.Type = html.TextNode n.FirstChild, n.LastChild = nil, nil n.Attr = nil n.Data = "[omitted]" for n.NextSibling != nil { n.Parent.RemoveChild(n.NextSibling) } return } if n.Type != html.ElementNode { return } for c := n.FirstChild; c != nil; c = c.NextSibling { forceMaxDepth(c, depth-1) } }
func TestParseLinkTagRelative(t *testing.T) { node := new(html.Node) node.Data = "link" attr1 := html.Attribute{"", "href", "1.css"} attr2 := html.Attribute{"", "rel", "stylesheet"} node.Attr = []html.Attribute{attr1, attr2} page := newWebPage(startUrl) page.parseLinkTag(node) expected1 := 1 val1 := page.styleSheets.Len() if val1 != expected1 { t.Error("Expected:", expected1, " Got:", val1) } expected2 := startUrl + "1.css" val2 := page.styleSheets.Front().Value if val2 != expected2 { t.Error("Expected:", expected2, " Got:", val2) } }
// Get and normalize the "class" attribute from the node. func getClassesAndAttr(n *html.Node, create bool) (classes string, attr *html.Attribute) { // Applies only to element nodes if n.Type == html.ElementNode { attr = getAttributePtr("class", n) if attr == nil && create { n.Attr = append(n.Attr, html.Attribute{ Key: "class", Val: "", }) attr = &n.Attr[len(n.Attr)-1] } } if attr == nil { classes = " " } else { classes = rxClassTrim.ReplaceAllString(" "+attr.Val+" ", " ") } return }
func addIdAttr(n *html.Node, lvl int, argNum int) (num int) { num = argNum if lvl > cScaffoldLvls { if n.Type == html.ElementNode { attr := html.Attribute{"", "id", spf("%v", num)} prep := []html.Attribute{attr} n.Attr = append(prep, n.Attr...) } } // Children for c := n.FirstChild; c != nil; c = c.NextSibling { num = addIdAttr(c, lvl+1, num+1) } return }
func TestGetNodeAttrValue(t *testing.T) { var n html.Node n.Attr = []html.Attribute{ html.Attribute{Key: "key1", Val: "value1"}, html.Attribute{Key: "key2", Val: "value2"}, html.Attribute{Key: "key3", Val: "value3"}, } for _, attr := range n.Attr { val, err := getNodeAttrValue(&n, attr.Key) if err != nil { t.Errorf("Unexpected error %v", err) } else if val != attr.Val { t.Errorf("Exepected %s, got %s", attr.Val, val) } } _, err := getNodeAttrValue(&n, "bad-key") if err != errNodeAttrNotFound { t.Errorf("Expected error: %v", errNodeAttrNotFound) } }
func addOutlineAttr(n *html.Node, lvl int, argOutline []int) (outline []int) { outline = argOutline if n.Type == html.ElementNode && lvl > cScaffoldLvls { outline[len(outline)-1]++ s := "" for _, v := range outline { s = fmt.Sprintf("%v%v.", s, v) } if strings.HasSuffix(s, ".") { s = s[:len(s)-1] } attr := html.Attribute{"", "ol", s} newAttrs := make([]html.Attribute, 0, len(n.Attr)+2) // make space for outline now - and id later newAttrs = append(newAttrs, attr) newAttrs = append(newAttrs, n.Attr...) n.Attr = newAttrs outline = append(outline, 0) // add children lvl } // Children for c := n.FirstChild; c != nil; c = c.NextSibling { outline = addOutlineAttr(c, lvl+1, outline) } if n.Type == html.ElementNode && lvl > cScaffoldLvls { outline = outline[:len(outline)-1] // reset children lvl } return }
func (this *EpubMaker) checkNewChapter(node *html.Node) *Chapter { if node.Type != html.ElementNode { return nil } var c *Chapter = nil if c = this.checkChapterNode(node); c == nil { if c = checkHeaderNode(node); c == nil { return nil } if this.skip { this.skip = false return nil } if c.Level < this.by_header || hasClass(node, makeepub_not_chapter) { return nil } } // only chapters in TOC need a Link if c.Level > 0 && c.Level <= this.toc { id := findAttribute(node, "id") if id == nil { node.Attr = append(node.Attr, html.Attribute{Key: "id"}) id = &node.Attr[len(node.Attr)-1] } if len(id.Val) == 0 { id.Val = fmt.Sprintf(makeepub_chapter_id, this.chapter_id) this.chapter_id++ } c.Link = "#" + id.Val } c.Title = strings.TrimSpace(c.Title) return c }
// codebeat:disable[LOC,ABC] func (m *minificationHTML) parseElements(node *html.Node) (*html.Node, error) { switch node.DataAtom { case atom.A: return m.openNode(node, false) case atom.Abbr: title := m.getAttrValLower(node, "title") if title != "" { m.addChildTextNodeToBegining(node, " "+title+" ") } return m.openNode(node, true) case atom.Address: return m.openNode(node, true) case atom.Applet: return m.removeNode(node, true) case atom.Area: return m.removeNode(node, true) case atom.Article: return m.toDiv(node) case atom.Aside: return m.toDiv(node) case atom.Audio: return m.removeNode(node, true) case atom.B: return m.openNode(node, false) case atom.Base: return m.removeNode(node, false) case atom.Basefont: return m.removeNode(node, false) case atom.Bdi: return m.openNode(node, false) case atom.Bdo: return m.openNode(node, false) case atom.Bgsound: return m.removeNode(node, false) case atom.Blockquote: return m.toDiv(node) case atom.Big: return m.openNode(node, false) case atom.Body: node.Attr = nil return m.parseChildren(node) case atom.Blink: return m.openNode(node, false) case atom.Br: return m.removeNode(node, true) case atom.Button: return m.removeNode(node, true) case atom.Canvas: return m.removeNode(node, true) case atom.Caption: return m.openNode(node, true) case atom.Center: return m.toDiv(node) case atom.Cite: return m.openNode(node, false) case atom.Code: return m.openNode(node, false) case atom.Colgroup, atom.Col: return m.removeNode(node, true) case atom.Command: return m.removeNode(node, true) case atom.Data: return m.removeNode(node, false) case atom.Datalist: return m.removeNode(node, false) case atom.Dd: return m.openNode(node, true) case atom.Del: return m.openNode(node, false) case atom.Details: return m.toDiv(node) case atom.Dfn: return m.openNode(node, false) case atom.Dialog: return m.toDiv(node) case atom.Div: return m.toDiv(node) case atom.Dl: return m.toDiv(node) case atom.Dt: return m.openNode(node, true) case atom.Em: return m.openNode(node, false) case atom.Embed: return m.removeNode(node, true) case atom.Figcaption: return m.openNode(node, true) case atom.Figure: return m.toDiv(node) case atom.Font: return m.openNode(node, false) case atom.Footer: return m.toDiv(node) case atom.Form: return m.removeNode(node, true) case atom.Frame, atom.Frameset, atom.Noframes: return m.removeNode(node, false) case atom.H1: return m.toDiv(node) case atom.H2: return m.toDiv(node) case atom.H3: return m.toDiv(node) case atom.H4: return m.toDiv(node) case atom.H5: return m.toDiv(node) case atom.H6: return m.toDiv(node) case atom.Head: node.Attr = nil return m.parseChildren(node) case atom.Header: return m.toDiv(node) case atom.Hr: return m.removeNode(node, true) case atom.Html: node.Attr = nil return m.parseChildren(node) case atom.I: return m.openNode(node, false) case atom.Iframe: return m.removeNode(node, true) case atom.Img: return m.removeNode(node, true) case atom.Input: return m.removeNode(node, true) case atom.Ins: return m.openNode(node, false) case atom.Label: return m.toDiv(node) case atom.Li: return m.openNode(node, true) case atom.Link: return m.removeNode(node, false) case atom.Listing: return m.toDiv(node) case atom.Marquee: return m.openNode(node, true) case atom.Meta: return m.removeNode(node, false) case atom.Name: return m.openNode(node, false) case atom.Nav: return m.toDiv(node) case atom.Nobr: return m.openNode(node, false) case atom.Noscript: return m.removeNode(node, true) case atom.Object: return m.removeNode(node, true) case atom.Ol: return m.toDiv(node) case atom.P: return m.toDiv(node) case atom.Param: return m.removeNode(node, true) case atom.Pre: return m.toDiv(node) case atom.Q: return m.openNode(node, false) case atom.S: return m.openNode(node, false) case atom.Script: return m.removeNode(node, true) case atom.Section: return m.toDiv(node) case atom.Select: return m.removeNode(node, true) case atom.Small: return m.openNode(node, false) case atom.Span: return m.openNode(node, false) case atom.Strike: return m.openNode(node, false) case atom.Strong: return m.openNode(node, false) case atom.Style: return m.removeNode(node, true) case atom.Sub: return m.openNode(node, false) case atom.Sup: return m.openNode(node, false) case atom.Svg: return m.removeNode(node, true) case atom.Table: return m.toDiv(node) case atom.Tbody: return m.openNode(node, true) case atom.Td: return m.toDiv(node) case atom.Textarea: return m.removeNode(node, true) case atom.Tfoot: return m.openNode(node, true) case atom.Th: return m.toDiv(node) case atom.Thead: return m.openNode(node, true) case atom.Time: return m.openNode(node, false) case atom.Title: return m.removeNode(node, false) case atom.Tr: return m.openNode(node, true) case atom.Tt: return m.openNode(node, false) case atom.U: return m.openNode(node, false) case atom.Ul: return m.toDiv(node) case atom.Var: return m.openNode(node, false) case atom.Video: return m.removeNode(node, true) case atom.Wbr: return m.removeNode(node, false) default: if node.Data == "noindex" { return m.removeNode(node, true) } return m.toDiv(node) } }
//从nodes中找到node 根据index 和 属性 先index func findNodeformNodesbyIndexOrPro(nodes []*goquery.Selection, index *int, m map[string]string, Type string, visible bool) { switch { case Type == OPTION || Type == RADIO: for _, v := range nodes { for _, vv := range v.Get(0).Attr { if vv.Key == VALUE { if vv.Val == m[VALUE] { if Type == RADIO { v.SetAttr("checked", "checked") } else { v.SetAttr("selected", "selected") } return } } } } if visible { var node html.Node node.Data = nodes[0].Get(0).Data node.Type = nodes[0].Get(0).Type attr := make([]html.Attribute, 0, 2) var tr html.Attribute tr.Key = VALUE tr.Val = m[VALUE] attr = append(attr, tr) if Type == RADIO { tr.Key = "checked" tr.Val = "checked" } else { tr.Key = "selected" tr.Val = "selected" } attr = append(attr, tr) tr.Key = TYPE tr.Val = Type attr = append(attr, tr) node.Attr = attr nodes[0].Parent().AppendNodes(&node) } return default: } if len(nodes) <= *index { return } for k, v := range m { nodes[*index].SetAttr(k, v) } *index++ }