func (this *flowdocument_maker) convert_flowdocument(frag *html.Node) { if frag.Type == html.TextNode { return } ignore_children := false switch frag.Data { case "img": frag.Type = html.CommentNode node_clear_children(frag) frag.Attr = nil case "a": frag.Data = "Hyperlink" frag.Attr = extract_ahref_attr(frag.Attr) case "article": frag.Data = "FlowDocument" // set namespace dont work frag.Attr = []html.Attribute{html.Attribute{Key: "xmlns", Val: fdocns}} case "object", "video", "audio", "embed": frag.Type = html.CommentNode node_clear_children(frag) frag.Attr = nil case "p": fallthrough default: frag.Data = "Paragraph" frag.Attr = nil if this.first_paragraph == nil { this.first_paragraph = frag } } for child := frag.FirstChild; ignore_children == false && child != nil; child = child.NextSibling { this.convert_flowdocument(child) } }
func (this *contentExtractor) getSiblingsContent(currentSibling *goquery.Selection, baselinescoreSiblingsPara float64) []*goquery.Selection { ps := make([]*goquery.Selection, 0) if currentSibling.Get(0).DataAtom.String() == "p" && len(currentSibling.Text()) > 0 { ps = append(ps, currentSibling) return ps } else { potentialParagraphs := currentSibling.Find("p") potentialParagraphs.Each(func(i int, s *goquery.Selection) { text := s.Text() if len(text) > 0 { ws := this.config.stopWords.stopWordsCount(this.config.targetLanguage, text) paragraphScore := ws.stopWordCount siblingBaselineScore := 0.30 highLinkDensity := this.isHighLinkDensity(s) score := siblingBaselineScore * baselinescoreSiblingsPara if score < float64(paragraphScore) && !highLinkDensity { node := new(html.Node) node.Type = html.TextNode node.Data = text node.DataAtom = atom.P nodes := make([]*html.Node, 1) nodes[0] = node newSelection := new(goquery.Selection) newSelection.Nodes = nodes ps = append(ps, newSelection) } } }) } return ps }
// CloneNode makes a copy of a Node with all descendants. func CloneNode(n *exphtml.Node) *exphtml.Node { clone := new(exphtml.Node) clone.Type = n.Type clone.DataAtom = n.DataAtom clone.Data = n.Data clone.Attr = make([]exphtml.Attribute, len(n.Attr)) copy(clone.Attr, n.Attr) for c := n.FirstChild; c != nil; c = c.NextSibling { nc := CloneNode(c) clone.AppendChild(nc) } return clone }
func (this *cleaner) convertDivsToParagraphs(doc *goquery.Document, domType string) *goquery.Document { if this.config.debug { log.Println("Starting to replace bad divs...") } badDivs := 0 convertedTextNodes := 0 divs := doc.Find(domType) tags := []string{"a", "blockquote", "dl", "div", "img", "ol", "p", "pre", "table", "ul"} divs.Each(func(i int, div *goquery.Selection) { if this.config.parser.getElementsByTags(div, tags).Size() == 0 { this.replaceWithPara(div) badDivs++ } else { replacementText := make([]string, 0) nodesToRemove := list.New() children := div.Contents() if this.config.debug { log.Printf("Found %d children of div\n", children.Size()) } children.EachWithBreak(func(i int, kid *goquery.Selection) bool { text := kid.Text() kidNode := kid.Get(0) tag := kidNode.Data if tag == text { tag = "#text" } if tag == "#text" { text = strings.Replace(text, "\n", "", -1) text = tabsRegEx.ReplaceAllString(text, "") if text == "" { return true } if len(text) > 1 { prev := kidNode.PrevSibling if this.config.debug { log.Printf("PARENT CLASS: %s NODENAME: %s\n", this.config.parser.name("class", div), tag) log.Printf("TEXTREPLACE: %s\n", strings.Replace(text, "\n", "", -1)) } if prev != nil && prev.DataAtom == atom.A { nodeSelection := kid.HasNodes(prev) html, _ := nodeSelection.Html() replacementText = append(replacementText, html) if this.config.debug { log.Printf("SIBLING NODENAME ADDITION: %s TEXT: %s\n", prev.Data, html) } } replacementText = append(replacementText, text) nodesToRemove.PushBack(kidNode) convertedTextNodes++ } } return true }) newNode := new(html.Node) newNode.Type = html.ElementNode newNode.Data = strings.Join(replacementText, "") newNode.DataAtom = atom.P div.First().AddNodes(newNode) for s := nodesToRemove.Front(); s != nil; s = s.Next() { node := s.Value.(*html.Node) if node != nil && node.Parent != nil { node.Parent.RemoveChild(node) } } } }) if this.config.debug { log.Printf("Found %d total divs with %d bad divs replaced and %d textnodes converted inside divs", divs.Size(), badDivs, convertedTextNodes) } return doc }
func init() { fCondenseNode = func(n *html.Node, depth int) (ret string) { if n.Type == html.ElementNode && n.Data == "script" { ret += fmt.Sprintf(" var script%v = '[script]'; ", nums) nums++ return } if n.Type == html.ElementNode && n.Data == "style" { ret += fmt.Sprintf(" .xxx {margin:2px;} ") return } if n.Type == html.ElementNode && n.Data == "img" { ret += fmt.Sprintf(" [img] %v %v | ", getAttrVal(n.Attr, "alt"), getAttrVal(n.Attr, "src")) } if n.Type == html.ElementNode && n.Data == "a" { ret += "[a]" } if n.Type == html.TextNode { s := n.Data // s = replTabsNewline.Replace(s) // s = strings.TrimSpace(s) if len(s) < 4 { ret += s } else if s != "" { if depth > 0 { ret += fmt.Sprintf(" [txt%v] %v", depth, s) } else { ret += " [txt] " + s } } } for c := n.FirstChild; c != nil; c = c.NextSibling { ret += fCondenseNode(c, depth+1) } return } fRecurse = func(n *html.Node) { if n.Type == html.ElementNode && n.Data == "form" { hidFld := new(html.Node) hidFld.Type = html.ElementNode hidFld.Data = "input" hidFld.Attr = []html.Attribute{html.Attribute{Key: "name", Val: "redirect-to"}, html.Attribute{Key: "value", Val: absolutize(getAttrVal(n.Attr, "action"))}} n.AppendChild(hidFld) n.Attr = rewriteAttributes(n.Attr, UnsyncedGlobalReq) } if n.Type == html.ElementNode && n.Data == "script" { for i := 0; i < len(n.Attr); i++ { if n.Attr[i].Key == "src" { n.Attr[i].Val = emptySrc } } } if n.Type == html.ElementNode && (n.Data == "a" || n.Data == "img" || n.Data == "script" || n.Data == "style") { s := fCondenseNode(n, 0) //fmt.Printf("found %v\n", s) textReplacement := new(html.Node) textReplacement.Type = html.TextNode textReplacement.Data = s if n.Data == "a" || n.Data == "img" { n.Attr = rewriteAttributes(n.Attr, UnsyncedGlobalReq) } // We want to remove all existing children. // Direct loop impossible, since "NextSibling" is set to nil by Remove(). // Therefore first assembling separately, then removing. children := make(map[*html.Node]struct{}) for c := n.FirstChild; c != nil; c = c.NextSibling { children[c] = struct{}{} } for k, _ := range children { n.RemoveChild(k) } // we can't put our replacement "under" an image, since img cannot have children if n.Type == html.ElementNode && n.Data == "img" { // n.Parent.InsertBefore(textReplacement,n) InsertAfter(n, textReplacement) RemoveNode(n) } else { n.AppendChild(textReplacement) } if n.Data == "a" { prev := n.PrevSibling if prev != nil { breaker0 := new(html.Node) breaker0.Type = html.TextNode breaker0.Data = " || " n.Parent.InsertBefore(breaker0, prev) breaker1 := new(html.Node) breaker1.Type = html.ElementNode // breaker1.Data = "||<br>\n" breaker1.Data = "br" n.Parent.InsertBefore(breaker1, prev) breaker2 := new(html.Node) breaker2.Type = html.TextNode breaker2.Data = "\n" n.Parent.InsertBefore(breaker2, prev) } } } for c := n.FirstChild; c != nil; c = c.NextSibling { fRecurse(c) } } }
func ParseHtmlFiles() { testDataDir := "./" testFiles, err := filepath.Glob(testDataDir + "test*.html") if err != nil { pf("%v \n", err) } for _, tf := range testFiles { pf("%v\n", tf) f, err := os.Open(tf) if err != nil { pf("1 %v \n", err) } defer f.Close() r1 := bufio.NewReader(f) var docRoot *html.Node docRoot, err = html.Parse(r1) if err != nil { pf("3 %v \n", err) } fRecurse = func(n *html.Node) { if n.Type == html.ElementNode && n.Data == "a" { s := strings.TrimSpace(fNodeModify(n)) //pf("found %v\n", s) nNew := new(html.Node) nNew.Type = html.TextNode nNew.Data = s // We want to remove all children. // Direct loop impossible, since "NextSibling" is set to nil // during Remove(). // Therefore first assembling separately, then removing. children := map[*html.Node]string{} for c := n.FirstChild; c != nil; c = c.NextSibling { children[c] = "xx" } for k, _ := range children { n.RemoveChild(k) // pf(" removed %q\n", strings.TrimSpace(k.Data)) } n.AppendChild(nNew) } for c := n.FirstChild; c != nil; c = c.NextSibling { fRecurse(c) } } fRecurse(docRoot) var b bytes.Buffer html.Render(&b, docRoot) util.WriteBytesToFilename("yy_"+tf, &b) //fixedHtml := b.String() //fmt.Printf("%s \n", spew.Sdump(docRoot)) } }