func copyNode(to, from *html.Node) { to.Attr = from.Attr to.Data = from.Data to.DataAtom = from.DataAtom to.Namespace = from.Namespace to.Type = from.Type }
func (this *contentExtractor) getSiblingsContent(currentSibling *goquery.Selection, baselinescoreSiblingsPara float64) []*goquery.Selection { ps := make([]*goquery.Selection, 0) if currentSibling.Get(0).DataAtom.String() == "p" && len(currentSibling.Text()) > 0 { ps = append(ps, currentSibling) return ps } else { potentialParagraphs := currentSibling.Find("p") potentialParagraphs.Each(func(i int, s *goquery.Selection) { text := s.Text() if len(text) > 0 { ws := this.config.stopWords.stopWordsCount(this.config.targetLanguage, text) paragraphScore := ws.stopWordCount siblingBaselineScore := 0.30 highLinkDensity := this.isHighLinkDensity(s) score := siblingBaselineScore * baselinescoreSiblingsPara if score < float64(paragraphScore) && !highLinkDensity { node := new(html.Node) node.Type = html.TextNode node.Data = text node.DataAtom = atom.P nodes := make([]*html.Node, 1) nodes[0] = node newSelection := new(goquery.Selection) newSelection.Nodes = nodes ps = append(ps, newSelection) } } }) } return ps }
func Nd(ntype string, content ...string) *html.Node { nd0 := new(html.Node) if ntype == "text" { nd0.Type = html.TextNode if len(content) > 0 { nd0.Data = content[0] } } else { nd0.Type = html.ElementNode nd0.Data = ntype if len(content) > 0 { runtimepb.StackTrace(4) log.Printf("Element nodes can't have content") } } return nd0 }
// CloneNode makes a copy of a Node with all descendants. func CloneNode(n *exphtml.Node) *exphtml.Node { clone := new(exphtml.Node) clone.Type = n.Type clone.DataAtom = n.DataAtom clone.Data = n.Data clone.Attr = make([]exphtml.Attribute, len(n.Attr)) copy(clone.Attr, n.Attr) for c := n.FirstChild; c != nil; c = c.NextSibling { nc := CloneNode(c) clone.AppendChild(nc) } return clone }
func removeEmptyNodes(n *html.Node, lvl int) { // children cc := []*html.Node{} for c := n.FirstChild; c != nil; c = c.NextSibling { cc = append(cc, c) } for _, c := range cc { removeEmptyNodes(c, lvl+1) } // processing // empty element nodes if n.Type == html.ElementNode && n.Data == "img" { src := attrX(n.Attr, "src") if src == "" { n.Parent.RemoveChild(n) } } if n.Type == html.ElementNode && n.FirstChild == nil && n.Data == "a" { href := attrX(n.Attr, "href") if href == "#" || href == "" { n.Parent.RemoveChild(n) } } if n.Type == html.ElementNode && n.FirstChild == nil && (n.Data == "em" || n.Data == "strong") { n.Parent.RemoveChild(n) } if n.Type == html.ElementNode && n.FirstChild == nil && (n.Data == "div" || n.Data == "span" || n.Data == "li" || n.Data == "p") { n.Parent.RemoveChild(n) } // spans with less than 2 characters inside => flatten to text only1Child := n.FirstChild != nil && n.FirstChild == n.LastChild if n.Type == html.ElementNode && n.Data == "span" && only1Child && n.FirstChild.Type == html.TextNode && len(strings.TrimSpace(n.FirstChild.Data)) < 3 { n.Type = html.TextNode n.Data = n.FirstChild.Data n.RemoveChild(n.FirstChild) } }
func dedupApply(n *html.Node, dedups map[string]bool) { // Children for c := n.FirstChild; c != nil; c = c.NextSibling { dedupApply(c, dedups) } if n.Type == html.ElementNode { outline := attrX(n.Attr, "ol") + "." if dedups[outline] { n.Type = html.CommentNode n.Data = n.Data + " replaced" } } }
func forceMaxDepth(n *html.Node, depth int) { if depth == 0 { n.Type = html.TextNode n.FirstChild, n.LastChild = nil, nil n.Attr = nil n.Data = "[omitted]" for n.NextSibling != nil { n.Parent.RemoveChild(n.NextSibling) } return } if n.Type != html.ElementNode { return } for c := n.FirstChild; c != nil; c = c.NextSibling { forceMaxDepth(c, depth-1) } }
func (c *Cleaner) convertDivsToParagraphs(doc *goquery.Document, domType string) *goquery.Document { if c.config.debug { log.Println("Starting to replace bad divs...") } badDivs := 0 convertedTextNodes := 0 divs := doc.Find(domType) divs.Each(func(i int, div *goquery.Selection) { divHTML, _ := div.Html() if divToPElementsPattern.Match([]byte(divHTML)) { c.replaceWithPara(div) badDivs++ } else { var replacementText []string nodesToRemove := list.New() children := div.Contents() if c.config.debug { log.Printf("Found %d children of div\n", children.Size()) } children.EachWithBreak(func(i int, kid *goquery.Selection) bool { text := kid.Text() kidNode := kid.Get(0) tag := kidNode.Data if tag == text { tag = "#text" } if tag == "#text" { text = strings.Replace(text, "\n", "", -1) text = tabsRegEx.ReplaceAllString(text, "") if text == "" { return true } if len(text) > 1 { prev := kidNode.PrevSibling if c.config.debug { log.Printf("PARENT CLASS: %s NODENAME: %s\n", c.config.parser.name("class", div), tag) log.Printf("TEXTREPLACE: %s\n", strings.Replace(text, "\n", "", -1)) } if prev != nil && prev.DataAtom == atom.A { nodeSelection := kid.HasNodes(prev) html, _ := nodeSelection.Html() replacementText = append(replacementText, html) if c.config.debug { log.Printf("SIBLING NODENAME ADDITION: %s TEXT: %s\n", prev.Data, html) } } replacementText = append(replacementText, text) nodesToRemove.PushBack(kidNode) convertedTextNodes++ } } return true }) newNode := new(html.Node) newNode.Type = html.ElementNode newNode.Data = strings.Join(replacementText, "") newNode.DataAtom = atom.P div.First().AddNodes(newNode) for s := nodesToRemove.Front(); s != nil; s = s.Next() { node := s.Value.(*html.Node) if node != nil && node.Parent != nil { node.Parent.RemoveChild(node) } } } }) if c.config.debug { log.Printf("Found %d total divs with %d bad divs replaced and %d textnodes converted inside divs", divs.Size(), badDivs, convertedTextNodes) } return doc }
//从nodes中找到node 根据index 和 属性 先index func findNodeformNodesbyIndexOrPro(nodes []*goquery.Selection, index *int, m map[string]string, Type string, visible bool) { switch { case Type == OPTION || Type == RADIO: for _, v := range nodes { for _, vv := range v.Get(0).Attr { if vv.Key == VALUE { if vv.Val == m[VALUE] { if Type == RADIO { v.SetAttr("checked", "checked") } else { v.SetAttr("selected", "selected") } return } } } } if visible { var node html.Node node.Data = nodes[0].Get(0).Data node.Type = nodes[0].Get(0).Type attr := make([]html.Attribute, 0, 2) var tr html.Attribute tr.Key = VALUE tr.Val = m[VALUE] attr = append(attr, tr) if Type == RADIO { tr.Key = "checked" tr.Val = "checked" } else { tr.Key = "selected" tr.Val = "selected" } attr = append(attr, tr) tr.Key = TYPE tr.Val = Type attr = append(attr, tr) node.Attr = attr nodes[0].Parent().AppendNodes(&node) } return default: } if len(nodes) <= *index { return } for k, v := range m { nodes[*index].SetAttr(k, v) } *index++ }
// convertUnwanted neutralizes a node. // Note: We can not directly Remove() nor Replace() // Since that breaks the recursion one step above! // At a later stage we employ horizontal traversal // to actually remove unwanted nodes. // // Meanwhile we have devised removeUnwanted() which // makes convertUnwanted-removeComment obsolete. // func convertUnwanted(n *html.Node) { if unwanteds[n.Data] { n.Type = html.CommentNode n.Data = n.Data + " replaced" } }
// r is the request to the proxy // u is the url, that the proxy has called func ModifyHTML(r *http.Request, u *url.URL, s string) string { var nums int // counter // needed to get the current request into the // "static" recursive functions var PackageProxyHost = r.Host // port included! var PackageRemoteHost = fetch.HostFromUrl(u) fCondenseNode = func(n *html.Node, depth int) (ret string) { if n.Type == html.ElementNode && n.Data == "script" { ret += fmt.Sprintf(" var script%v = '[script]'; ", nums) nums++ return } if n.Type == html.ElementNode && n.Data == "style" { ret += fmt.Sprintf(" .xxx {margin:2px;} ") return } if n.Type == html.ElementNode && n.Data == "img" { ret += fmt.Sprintf(" [img] %v %v | ", getAttrVal(n.Attr, "alt"), getAttrVal(n.Attr, "src")) } if n.Type == html.ElementNode && n.Data == "a" { ret += "[a]" } if n.Type == html.TextNode { s := n.Data // s = replTabsNewline.Replace(s) // s = strings.TrimSpace(s) if len(s) < 4 { ret += s } else if s != "" { if depth > 0 { ret += fmt.Sprintf(" [txt%v] %v", depth, s) } else { ret += " [txt] " + s } } } for c := n.FirstChild; c != nil; c = c.NextSibling { ret += fCondenseNode(c, depth+1) } return } // -------------------------- // ---------------------- fRecurse = func(n *html.Node) { if n.Type == html.ElementNode && n.Data == "form" { hidFld := new(html.Node) hidFld.Type = html.ElementNode hidFld.Data = "input" hidFld.Attr = []html.Attribute{ html.Attribute{Key: "name", Val: "redirect-to"}, html.Attribute{Key: "value", Val: absolutize(getAttrVal(n.Attr, "action"), PackageRemoteHost)}, } n.AppendChild(hidFld) submt := new(html.Node) submt.Type = html.ElementNode submt.Data = "input" submt.Attr = []html.Attribute{ html.Attribute{Key: "type", Val: "submit"}, html.Attribute{Key: "value", Val: "subm"}, html.Attribute{Key: "accesskey", Val: "f"}, } n.AppendChild(submt) n.Attr = rewriteAttributes(n.Attr, PackageProxyHost, PackageRemoteHost) } if n.Type == html.ElementNode && n.Data == "script" { for i := 0; i < len(n.Attr); i++ { if n.Attr[i].Key == "src" { n.Attr[i].Val = emptySrc } } } if n.Type == html.ElementNode && (n.Data == "a" || n.Data == "img" || n.Data == "script" || n.Data == "style") { s := fCondenseNode(n, 0) //fmt.Printf("found %v\n", s) textReplacement := new(html.Node) textReplacement.Type = html.TextNode textReplacement.Data = s attrStore := []html.Attribute{} if n.Data == "a" || n.Data == "img" { attrStore = rewriteAttributes(n.Attr, PackageProxyHost, PackageRemoteHost) } if n.Data == "img" { n.Data = "a" } if n.Data == "a" { n.Attr = attrStore } // We want to remove all existing children. // Direct loop impossible, since "NextSibling" is set to nil by Remove(). // Therefore first assembling separately, then removing. children := make(map[*html.Node]struct{}) for c := n.FirstChild; c != nil; c = c.NextSibling { children[c] = struct{}{} } for k, _ := range children { n.RemoveChild(k) } // we can't put our replacement "under" an image, since img cannot have children if n.Type == html.ElementNode && n.Data == "img" { // n.Parent.InsertBefore(textReplacement,n) dom.InsertAfter(n, textReplacement) dom.RemoveNode(n) } else { n.AppendChild(textReplacement) } // Insert a || and a newline before every <a...> if n.Data == "a" { prev := n breaker0 := dom.Nd("text", "||") n.Parent.InsertBefore(breaker0, prev) breaker1 := dom.Nd("br") n.Parent.InsertBefore(breaker1, prev) breaker2 := dom.Nd("text", "\n") n.Parent.InsertBefore(breaker2, prev) } } for c := n.FirstChild; c != nil; c = c.NextSibling { fRecurse(c) } } // -------------------------- // ---------------------- var docRoot *html.Node var err error rdr := strings.NewReader(s) docRoot, err = html.Parse(rdr) if err != nil { panic(fmt.Sprintf("3 %v \n", err)) } fRecurse(docRoot) var b bytes.Buffer err = html.Render(&b, docRoot) if err != nil { panic(fmt.Sprintf("4 %v \n", err)) } // log.Printf("len is %v\n", b.Len()) return b.String() }