Golang Node.Data Examples

Programming Language: Golang

Namespace/Package Name: golang.org/x/net/html

Class/Type: Node

Method/Function: Data

Examples at hotexamples.com: 30

The "golang.org/x/net/html" package provides tools for parsing and manipulating HTML documents in Go. One of its most frequently used features is the Data field of the Node type: for an element node it holds the tag name, and for a text node it holds the (already unescaped) text content of that node.

For example, if you have an HTML document that includes a paragraph element like this:

<p>Hello, World!</p>

You can access and manipulate the content of this element through the Data field of its child text node. Here's an example of how you might use Node.Data to extract the text content of the paragraph element in Go:

package main

import (
    "fmt"
    "strings"

    "golang.org/x/net/html"
)

// main parses a small HTML fragment and prints the text inside its first <p>
// element by reading the Data field of the paragraph's text node.
func main() {
    // The fragment has to contain a real <p> element, otherwise the lookup
    // below finds nothing and the program prints nothing.
    doc := "<p>Hello, World!</p>"
    n, err := html.Parse(strings.NewReader(doc))
    if err != nil {
        fmt.Println("parse error:", err)
        return
    }

    // Find the paragraph element and make sure it starts with a text node
    // before touching FirstChild.Data.
    p := findFirstElementByTag(n, "p")
    if p == nil || p.FirstChild == nil || p.FirstChild.Type != html.TextNode {
        return
    }

    // Data of a text node is already unescaped by the parser, so it must not
    // be passed through html.UnescapeString a second time.
    content := strings.TrimSpace(p.FirstChild.Data)
    fmt.Println(content) // Output: Hello, World!
}

// findFirstElementByTag performs a depth-first, document-order search of the
// tree rooted at n and returns the first element node whose tag name equals
// tag, or nil when no such element exists.
func findFirstElementByTag(n *html.Node, tag string) *html.Node {
    isMatch := n.Type == html.ElementNode && n.Data == tag
    if isMatch {
        return n
    }

    child := n.FirstChild
    for child != nil {
        found := findFirstElementByTag(child, tag)
        if found != nil {
            return found
        }
        child = child.NextSibling
    }
    return nil
}

In this example, we first parse the HTML document using the "html.Parse" function and then use our "findFirstElementByTag" function to locate the "p" element. Once we have a reference to the "p" element, we read the Data field of its first child (a text node) to obtain the text content. Overall, the "golang.org/x/net/html" package provides a powerful set of tools for working with HTML documents in Go, and the Node.Data field is an important part of it.

Golang Node.Data - 30 examples found. These are the top rated real world Golang examples of golang.org/x/net/html.Node.Data extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

Attr(30)

Data(30)

RemoveChild(21)

AppendChild(18)

Type(11)

DataAtom(5)

InsertBefore(5)

FirstChild(4)

LastChild(1)

Namespace(1)

NextSibling(1)

Parent(1)

Example #1

Show file

File: 01_cleanse.go Project: aarzilli/tools

// cleanseDom performs brute reduction and simplification of the subtree
// rooted at n. Unwanted attributes are stripped from n first, then every
// child is cleansed recursively, and only afterwards is n itself removed or
// converted, made less "exotic", and (for text nodes) whitespace-normalized.
// lvl is the depth of n; it is only passed on to the recursive calls.
func cleanseDom(n *html.Node, lvl int) {
	n.Attr = removeAttr(n.Attr, unwantedAttrs)

	// Children are handled before the node-level steps below.
	child := n.FirstChild
	for child != nil {
		cleanseDom(child, lvl+1)
		child = child.NextSibling
	}

	// The package-level switch decides between removing and converting.
	if !directlyRemoveUnwanted {
		convertUnwanted(n)
	} else {
		removeUnwanted(n)
	}

	convertExotic(n)

	// Text normalization happens exactly once, here.
	if n.Type == html.TextNode {
		n.Data = stringspb.NormalizeInnerWhitespace(n.Data)
	}
}

Example #2

Show file

File: minification_html.go Project: ReanGD/go-web-search

// toDiv rewrites node in place as a bare <div> (tag name and atom replaced,
// all attributes dropped) and then returns whatever parseChildren yields for
// the rewritten node.
func (m *minificationHTML) toDiv(node *html.Node) (*html.Node, error) {
	node.Attr = nil
	node.Data, node.DataAtom = "div", atom.Div

	return m.parseChildren(node)
}

Example #3

Show file

File: cleaner.go Project: Vetcher/pagedownloader

// FindTitleAndBody_MK searches the subtree rooted at node for an article's
// title and body.
//
// An element carrying class="content" is renamed to "body" and taken as the
// body; its title is looked up with FindTitleMK. The children are then
// searched recursively; a title found below is renamed to "title". The child
// scan stops as soon as both a title and a body are known. Either result may
// be nil.
func FindTitleAndBody_MK(node *html.Node) (*html.Node, *html.Node) {
	var title, fulltext *html.Node

	if node.Type == html.ElementNode {
		for _, attr := range node.Attr {
			if attr.Key != "class" || attr.Val != "content" {
				continue
			}
			title = FindTitleMK(node)
			node.Data = "body"
			fulltext = node
			break
		}
	}

	child := node.FirstChild
	for child != nil {
		childTitle, childBody := FindTitleAndBody_MK(child)
		if childTitle != nil {
			title = childTitle
			title.Data = "title"
		}
		if childBody != nil {
			fulltext = childBody
		}
		if title != nil && fulltext != nil {
			break
		}
		child = child.NextSibling
	}
	return title, fulltext
}

Example #4

Show file

File: parser_utils.go Project: ReanGD/go-web-search

// mergeNodes joins the text around the boundary between prev and next, both
// of which may be nil. When addSeparator is set a single space is placed at
// the boundary.
//
//   - both are text nodes: next is folded into prev and removed from parent;
//     the node now following prev is returned.
//   - only prev is text: the separator is appended to prev.
//   - only next is text: the separator is prepended to next.
//   - neither is text: with addSeparator, a new text node holding the
//     separator is inserted before next.
//
// In every case but the first, next is returned.
func (u *parserUtils) mergeNodes(parent, prev, next *html.Node, addSeparator bool) *html.Node {
	var delim string
	if addSeparator {
		delim = " "
	}
	prevIsText := prev != nil && prev.Type == html.TextNode
	nextIsText := next != nil && next.Type == html.TextNode

	switch {
	case prevIsText && nextIsText:
		prev.Data += delim + next.Data
		parent.RemoveChild(next)
		return prev.NextSibling
	case prevIsText:
		prev.Data += delim
	case nextIsText:
		next.Data = delim + next.Data
	case addSeparator:
		parent.InsertBefore(&html.Node{Type: html.TextNode, Data: delim}, next)
	}

	return next
}

Example #5

Show file

File: cleaner.go Project: Vetcher/pagedownloader

// FindTitleAndBody_Ria finds an article's title and body in ria.ru-style
// HTML (noted as working cleanly on 15.12.2015).
//
// An element with itemprop="articleBody" is renamed to "body" and taken as
// the body; an element with itemprop="name" is renamed to "title" and taken
// as the title. Children are searched recursively and the scan stops once
// both nodes are known. Either result may be nil.
func FindTitleAndBody_Ria(node *html.Node) (*html.Node, *html.Node) {
	var title, fulltext *html.Node

	if node.Type == html.ElementNode {
		for _, attr := range node.Attr {
			if attr.Key != "itemprop" {
				continue
			}
			if attr.Val == "articleBody" {
				node.Data = "body"
				fulltext = node
				break
			}
			if attr.Val == "name" {
				node.Data = "title"
				title = node
				break
			}
		}
	}

	child := node.FirstChild
	for child != nil {
		childTitle, childBody := FindTitleAndBody_Ria(child)
		if childTitle != nil {
			title = childTitle
		}
		if childBody != nil {
			fulltext = childBody
		}
		if title != nil && fulltext != nil {
			break
		}
		child = child.NextSibling
	}
	return title, fulltext
}

Example #6

Show file

File: 09_reformat_indent.go Project: aarzilli/tools

// reIndent walks the subtree rooted at n and adds whitespace text nodes so
// that the tree is laid out one element per line, indented with tabs.
// lvl is the depth of n; nodes at or above cScaffoldLvls are not indented.
//
// The function mutates the tree in place: it inserts new text nodes around
// elements and comments and rewrites the Data of existing text nodes.
func reIndent(n *html.Node, lvl int) {

	// A node below the scaffold levels without a parent cannot be indented
	// (the code below dereferences n.Parent), so log it and bail out.
	if lvl > cScaffoldLvls && n.Parent == nil {
		bb := dom.PrintSubtree(n)
		_ = bb
		// log.Printf("%s", bb.Bytes())
		hint := ""
		if ml3[n] > 0 {
			hint = "   from ml3"
		}
		log.Print("reIndent: no parent ", hint)
		return
	}

	// Before children processing
	switch n.Type {
	case html.ElementNode:
		// Newline plus lvl-2 tabs, placed via dom.InsertBefore
		// (presumably directly in front of n; verify against dom).
		if lvl > cScaffoldLvls && n.Parent.Type == html.ElementNode {
			ind := strings.Repeat("\t", lvl-2)
			dom.InsertBefore(n, &html.Node{Type: html.TextNode, Data: "\n" + ind})
		}
	case html.CommentNode:
		dom.InsertBefore(n, &html.Node{Type: html.TextNode, Data: "\n"})
	case html.TextNode:
		// Normalize to exactly one trailing space, and one leading space
		// unless the text starts with "," or ".".
		n.Data = strings.TrimSpace(n.Data) + " "
		if !strings.HasPrefix(n.Data, ",") && !strings.HasPrefix(n.Data, ".") {
			n.Data = " " + n.Data
		}
		// link texts without trailing space
		if n.Parent != nil && n.Parent.Data == "a" {
			n.Data = strings.TrimSpace(n.Data)
		}
	}

	// Children
	for c := n.FirstChild; c != nil; c = c.NextSibling {
		reIndent(c, lvl+1)
	}

	// After children processing
	switch n.Type {
	case html.ElementNode:
		// The original author notes that, for reasons unknown to them,
		// this step has to happen AFTER the children were processed.
		if lvl > cScaffoldLvls && n.Parent.Type == html.ElementNode {
			ind := strings.Repeat("\t", lvl-2)
			ind = "\n" + ind
			// link texts without new line
			if n.Data == "a" {
				ind = ""
			}
			// Only elements that have children get the extra text node
			// after their last child.
			if n.LastChild != nil {
				dom.InsertAfter(n.LastChild, &html.Node{Type: html.TextNode, Data: ind})
			}
		}
	}

}

Example #7

Show file

File: extractor.go Project: hotei/GoOse

// getSiblingsContent collects paragraph selections from currentSibling.
//
// If the sibling's first node is itself a non-empty <p>, it is returned as
// the only result. Otherwise every non-empty <p> below it is scored by its
// stop-word count; a paragraph is kept when 0.30*baselinescoreSiblingsPara is
// below that count and the paragraph is not link-heavy. Each kept paragraph
// is returned as a fresh selection wrapping a new text node with its text.
func (this *contentExtractor) getSiblingsContent(currentSibling *goquery.Selection, baselinescoreSiblingsPara float64) []*goquery.Selection {
	ps := []*goquery.Selection{}

	if currentSibling.Get(0).DataAtom.String() == "p" && len(currentSibling.Text()) > 0 {
		return append(ps, currentSibling)
	}

	currentSibling.Find("p").Each(func(_ int, para *goquery.Selection) {
		text := para.Text()
		if len(text) == 0 {
			return
		}

		stopWords := this.config.stopWords.stopWordsCount(this.config.targetLanguage, text)
		paragraphScore := stopWords.stopWordCount
		linkHeavy := this.isHighLinkDensity(para)
		threshold := 0.30 * baselinescoreSiblingsPara

		if threshold < float64(paragraphScore) && !linkHeavy {
			textNode := &html.Node{
				Type:     html.TextNode,
				Data:     text,
				DataAtom: atom.P,
			}
			ps = append(ps, &goquery.Selection{Nodes: []*html.Node{textNode}})
		}
	})
	return ps
}

Example #8

Show file

File: 06_img2link.go Project: aarzilli/tools

// img2Link turns an <img> node into an <a> node in place. Nodes whose Data is
// not "img" are left untouched.
//
// Every "src" attribute key becomes "href", a "cfrom"="img" attribute is set,
// and a text child of the form "[img] <label> <url> | " is appended. The
// label is the title attribute, or "[ctdr]" when closureTextNodeExists
// reports true for the node.
func img2Link(img *html.Node) {
	if img.Data != "img" {
		return
	}

	img.Data = "a"
	for i := range img.Attr {
		if img.Attr[i].Key == "src" {
			img.Attr[i].Key = "href"
		}
	}

	double := closureTextNodeExists(img)
	label := attrX(img.Attr, "title")
	if double {
		label = "[ctdr]" // content title double removed
	}
	imgContent := fmt.Sprintf("[img] %v %v | ",
		label,
		urlBeautify(attrX(img.Attr, "href")))

	img.Attr = attrSet(img.Attr, "cfrom", "img")
	img.AppendChild(dom.Nd("text", imgContent))
}

Example #9

Show file

File: nodes.go Project: documize/html-diff

// copyNode copies the node-local fields (type, data, atom, namespace and
// attributes) from from to to. Tree links (parent, siblings, children) are
// not touched. The attribute slice is assigned, not duplicated, so both nodes
// share it afterwards.
func copyNode(to, from *html.Node) {
	to.Type, to.Namespace = from.Type, from.Namespace
	to.Data, to.DataAtom = from.Data, from.DataAtom
	to.Attr = from.Attr
}

Example #10

Show file

File: minification_text.go Project: ReanGD/go-web-search

// parseText runs node's text through processText. A non-empty result
// replaces the node's Data; an empty result removes the node from its
// parent. The node that followed the input node is returned, together with a
// nil error.
func (m *minificationText) parseText(node *html.Node) (*html.Node, error) {
	// Remember the successor before node is possibly detached from the tree.
	following := node.NextSibling

	if processed := m.processText(node.Data); len(processed) == 0 {
		node.Parent.RemoveChild(node)
	} else {
		node.Data = processed
	}
	return following, nil
}

Example #11

Show file

File: mini_go_query.go Project: aarzilli/tools

// Nd builds a detached html.Node.
//
// With ntype "text" the result is a text node whose Data is content[0] (or
// empty when no content is given). Any other ntype yields an element node
// with that tag name; content supplied for an element is ignored, and the
// misuse is reported through a stack trace and a log line.
func Nd(ntype string, content ...string) *html.Node {
	if ntype == "text" {
		textNode := &html.Node{Type: html.TextNode}
		if len(content) > 0 {
			textNode.Data = content[0]
		}
		return textNode
	}

	if len(content) > 0 {
		runtimepb.StackTrace(4)
		log.Printf("Element nodes can't have content")
	}
	return &html.Node{Type: html.ElementNode, Data: ntype}
}

Example #12

Show file

File: utils.go Project: chzyer/pocket

// walkPrint writes a debug rendering of n and all of its following siblings
// (and, recursively, their children) to w, indenting each line with i tabs.
//
// Element nodes are printed as an escaped tag name followed by the MaxChild
// and Count statistics from getData, their ratio as a percentage, and the
// attribute list. Other nodes are printed as a quoted, HTML-escaped string.
// Whitespace-only text nodes are skipped.
//
// NOTE(review): the function truncates n.Data to 40 runes in place, so the
// tree is modified by printing. Errors returned by w are ignored.
func walkPrint(w io.Writer, i int, n *html.Node) {
	for ; n != nil; n = n.NextSibling {
		if n.Type == html.TextNode && strings.TrimSpace(n.Data) == "" {
			continue
		}

		d := getData(n)
		// True when the parent's data record names n as its Child.
		// NOTE(review): n.Parent may be nil for a root node; this relies
		// on getData coping with that — confirm.
		isMostChild := getData(n.Parent).Child == n
		if isMostChild {
			w.Write([]byte(`<div style="background: rgba(0, 0, 100, 0.1)">`))
		}
		// Chosen / ChosenBy nodes get a highlighted wrapper; ChosenBy
		// takes precedence for the colour.
		if d.Chosen || d.ChosenBy {
			color := "rgb(40, 79, 40)"
			if d.ChosenBy {
				color = "rgba(90, 60, 30, 0.8)"
			}
			w.Write([]byte(`<div id="chosen" style="background: ` + color + `;color: #fff">`))
		}
		// Integer percentage of MaxChild relative to Count; stays 0 when
		// Count is not positive, which also avoids dividing by zero.
		factor := 0
		if d.Count > 0 {
			factor = d.MaxChild * 100 / d.Count
		}

		// Cut long data down to 40 runes (rune-wise, so multi-byte
		// characters are never split). This overwrites the node's Data.
		if len([]rune(n.Data)) > 40 {
			n.Data = string([]rune(n.Data)[:40])
		}
		if n.Type == html.ElementNode {
			fmt.Fprintf(w, "%v&lt;%v&gt;", strings.Repeat("\t", i), n.Data)
			fmt.Fprintf(w, " (%v/%v = <b>%v%%</b>) - %v\n",
				d.MaxChild,
				d.Count,
				factor,

				n.Attr,
			)
		} else {
			fmt.Fprintf(w, "%v%v\n", strings.Repeat("\t", i), strconv.Quote(ghtml.EscapeString(n.Data)))
		}

		// Children are printed one level deeper, inside any wrappers
		// opened above.
		if n.FirstChild != nil {
			walkPrint(w, i+1, n.FirstChild)
		}
		if isMostChild {
			w.Write([]byte(`</div>`))
		}

		if d.Chosen || d.ChosenBy {
			w.Write([]byte("</div>"))
		}

	}
}

Example #13

Show file

File: go_crawl_test.go Project: zlisinski/go_crawl

// TestParseATagNoHref checks that parsing an <a> node without any attributes
// leaves the page's link list empty.
func TestParseATagNoHref(t *testing.T) {
	anchor := &html.Node{Data: "a"}

	page := newWebPage(startUrl)
	page.parseATag(anchor)

	want := 0
	if got := page.links.Len(); got != want {
		t.Error("Expected:", want, " Got:", got)
	}
}

Example #14

Show file

File: node.go Project: kristofer/go-html-transform

// CloneNode returns a deep copy of n: the node's type, atom, data and
// attributes are duplicated, and every descendant is cloned recursively and
// appended in the original order. The clone has no parent or siblings.
func CloneNode(n *exphtml.Node) *exphtml.Node {
	dup := &exphtml.Node{
		Type:     n.Type,
		DataAtom: n.DataAtom,
		Data:     n.Data,
		Attr:     make([]exphtml.Attribute, len(n.Attr)),
	}
	copy(dup.Attr, n.Attr)

	child := n.FirstChild
	for child != nil {
		dup.AppendChild(CloneNode(child))
		child = child.NextSibling
	}
	return dup
}

Example #15

Show file

File: go_crawl_test.go Project: zlisinski/go_crawl

// TestParseScriptTagNoSrc checks that parsing a <script> node without any
// attributes leaves the page's script-file list empty.
func TestParseScriptTagNoSrc(t *testing.T) {
	script := &html.Node{Data: "script"}

	page := newWebPage(startUrl)
	page.parseScriptTag(script)

	want := 0
	if got := page.scriptFiles.Len(); got != want {
		t.Error("Expected:", want, " Got:", got)
	}
}

Example #16

Show file

File: 03_condense_top_down.go Project: aarzilli/tools

// removeEmptyNodes prunes empty elements from the subtree rooted at n,
// children first. lvl is the depth of n and is only passed on to the
// recursive calls.
//
// After the children have been processed, n itself is handled by tag name:
//   - img with an empty src attribute: removed from its parent
//   - childless a whose href is "" or "#": removed
//   - childless em, strong, div, span, li, p: removed
//   - span whose only child is a text node that is shorter than 3 bytes
//     after trimming: converted into a text node holding that text
func removeEmptyNodes(n *html.Node, lvl int) {
	// Take a snapshot of the children first: the recursive calls may detach
	// a child, which would otherwise cut the sibling chain mid-iteration.
	var kids []*html.Node
	for kid := n.FirstChild; kid != nil; kid = kid.NextSibling {
		kids = append(kids, kid)
	}
	for i := range kids {
		removeEmptyNodes(kids[i], lvl+1)
	}

	if n.Type != html.ElementNode {
		return
	}
	childless := n.FirstChild == nil

	switch n.Data {
	case "img":
		if attrX(n.Attr, "src") == "" {
			n.Parent.RemoveChild(n)
		}

	case "a":
		if childless {
			if href := attrX(n.Attr, "href"); href == "#" || href == "" {
				n.Parent.RemoveChild(n)
			}
		}

	case "em", "strong", "div", "li", "p":
		if childless {
			n.Parent.RemoveChild(n)
		}

	case "span":
		switch {
		case childless:
			n.Parent.RemoveChild(n)
		case n.FirstChild == n.LastChild &&
			n.FirstChild.Type == html.TextNode &&
			len(strings.TrimSpace(n.FirstChild.Data)) < 3:
			// Flatten the span into a plain text node.
			n.Type = html.TextNode
			n.Data = n.FirstChild.Data
			n.RemoveChild(n.FirstChild)
		}
	}
}

Example #17

Show file

File: go_crawl_test.go Project: zlisinski/go_crawl

// TestParseATagAbsoluteDiffHost checks that an <a> node whose href is an
// absolute URL (http://www.google.com) adds nothing to the page's link list.
func TestParseATagAbsoluteDiffHost(t *testing.T) {
	anchor := &html.Node{
		Data: "a",
		Attr: []html.Attribute{{Key: "href", Val: "http://www.google.com"}},
	}

	page := newWebPage(startUrl)
	page.parseATag(anchor)

	want := 0
	if got := page.links.Len(); got != want {
		t.Error("Expected:", want, " Got:", got)
	}
}

Example #18

Show file

File: go_crawl_test.go Project: zlisinski/go_crawl

// TestParseLinkTagNoRel checks that a <link> node with an href but no rel
// attribute adds nothing to the page's style-sheet list.
func TestParseLinkTagNoRel(t *testing.T) {
	link := &html.Node{
		Data: "link",
		Attr: []html.Attribute{{Key: "href", Val: "1.css"}},
	}

	page := newWebPage(startUrl)
	page.parseLinkTag(link)

	want := 0
	if got := page.styleSheets.Len(); got != want {
		t.Error("Expected:", want, " Got:", got)
	}
}

Example #19

Show file

File: go_crawl_test.go Project: zlisinski/go_crawl

// TestParseATagInvalidUrl checks that an <a> node whose href is the
// malformed value "%gh&%ij" adds nothing to the page's link list.
func TestParseATagInvalidUrl(t *testing.T) {
	anchor := &html.Node{
		Data: "a",
		Attr: []html.Attribute{{Key: "href", Val: "%gh&%ij"}},
	}

	page := newWebPage(startUrl)
	page.parseATag(anchor)

	want := 0
	if got := page.links.Len(); got != want {
		t.Error("Expected:", want, " Got:", got)
	}
}

Example #20

Show file

File: go_crawl_test.go Project: zlisinski/go_crawl

// TestParseLinkTagInvalidUrl checks that a stylesheet <link> node whose href
// is the malformed value "%gh&%ij" adds nothing to the page's style-sheet
// list.
func TestParseLinkTagInvalidUrl(t *testing.T) {
	link := &html.Node{
		Data: "link",
		Attr: []html.Attribute{
			{Key: "href", Val: "%gh&%ij"},
			{Key: "rel", Val: "stylesheet"},
		},
	}

	page := newWebPage(startUrl)
	page.parseLinkTag(link)

	want := 0
	if got := page.styleSheets.Len(); got != want {
		t.Error("Expected:", want, " Got:", got)
	}
}

Example #21

Show file

File: 6_apply_dedup.go Project: aarzilli/tools

// dedupApply walks the subtree rooted at n, children first, and blanks out
// every element flagged in dedups. The lookup key is the element's "ol"
// attribute followed by a dot. A flagged element is turned into a comment
// node and " replaced" is appended to its Data.
func dedupApply(n *html.Node, dedups map[string]bool) {
	for child := n.FirstChild; child != nil; child = child.NextSibling {
		dedupApply(child, dedups)
	}

	if n.Type != html.ElementNode {
		return
	}
	if key := attrX(n.Attr, "ol") + "."; dedups[key] {
		n.Type = html.CommentNode
		n.Data += " replaced"
	}
}

Example #22

Show file

File: clean.go Project: documize/html-diff

// clean normalises styles/colspan and removes any CleanTags specified, along with newlines;
// but also makes all the character handling (for example "&#160;" as utf-8) the same.
// It returns the estimated number of treeRunes that will be used: one per
// node, with a text node counted as one per rune of its unescaped text.
//
// Each child is visited exactly once. Restarting the child scan after a
// removal would clean the preceding siblings again, which counts their size
// twice and unescapes their text a second time.
// TODO more cleaning of the input HTML, as required.
func (c *Config) clean(n *html.Node) int {
	size := 1
	switch n.Type {
	case html.ElementNode:
		for ai := 0; ai < len(n.Attr); ai++ {
			a := n.Attr[ai]
			switch {
			case strings.ToLower(a.Key) == "style":
				if strings.TrimSpace(a.Val) == "" { // delete empty styles
					n.Attr = delAttr(n.Attr, ai)
					ai-- // re-examine the attribute that moved into this slot
				} else { // tidy non-empty styles
					// TODO there could be more here to make sure the style entries are in the same order etc.
					n.Attr[ai].Val = strings.Replace(a.Val, " ", "", -1)
					if !strings.HasSuffix(n.Attr[ai].Val, ";") {
						n.Attr[ai].Val += ";"
					}
				}
			case n.DataAtom == atom.Td &&
				strings.ToLower(a.Key) == "colspan" &&
				strings.TrimSpace(a.Val) == "1":
				// colspan="1" is the default, so it carries no information.
				n.Attr = delAttr(n.Attr, ai)
				ai--
			}
		}
	case html.TextNode:
		n.Data = htm.UnescapeString(n.Data)
		size += utf8.RuneCountInString(n.Data) - 1 // len(n.Data) would be faster, but use more memory
	}

	for ch := n.FirstChild; ch != nil; {
		// Capture the successor first: RemoveChild clears ch's sibling links.
		next := ch.NextSibling

		remove := false
		if ch.Type == html.ElementNode {
			for _, tag := range c.CleanTags {
				if tag == ch.Data {
					remove = true
					break
				}
			}
		}

		if remove {
			n.RemoveChild(ch)
		} else {
			size += c.clean(ch)
		}
		ch = next
	}
	return size
}

Example #23

Show file

File: go_crawl_test.go Project: zlisinski/go_crawl

// TestParseATagRelative checks that an <a> node with the relative href
// "1.html" yields exactly one link, resolved to startUrl + "1.html".
func TestParseATagRelative(t *testing.T) {
	anchor := &html.Node{
		Data: "a",
		Attr: []html.Attribute{{Key: "href", Val: "1.html"}},
	}

	page := newWebPage(startUrl)
	page.parseATag(anchor)

	wantCount := 1
	if gotCount := page.links.Len(); gotCount != wantCount {
		t.Error("Expected:", wantCount, " Got:", gotCount)
	}

	wantURL := startUrl + "1.html"
	if gotURL := page.links.Front().Value; gotURL != wantURL {
		t.Error("Expected:", wantURL, " Got:", gotURL)
	}
}

Example #24

Show file

File: go_crawl_test.go Project: zlisinski/go_crawl

// TestParseScriptTagAbsolute checks that a <script> node whose src is the
// absolute URL startUrl + "1.js" yields exactly one script file with that
// same URL.
func TestParseScriptTagAbsolute(t *testing.T) {
	script := &html.Node{
		Data: "script",
		Attr: []html.Attribute{{Key: "src", Val: startUrl + "1.js"}},
	}

	page := newWebPage(startUrl)
	page.parseScriptTag(script)

	wantCount := 1
	if gotCount := page.scriptFiles.Len(); gotCount != wantCount {
		t.Error("Expected:", wantCount, " Got:", gotCount)
	}

	wantURL := startUrl + "1.js"
	if gotURL := page.scriptFiles.Front().Value; gotURL != wantURL {
		t.Error("Expected:", wantURL, " Got:", gotURL)
	}
}

Example #25

Show file

File: cleaner.go Project: BenLubar/htmlcleaner

// forceMaxDepth limits the depth of the tree below n to depth levels.
//
// While depth is non-zero, element nodes simply pass depth-1 on to their
// children and other node types are left alone. A node reached with depth 0
// is replaced by the text "[omitted]": its children and attributes are
// dropped and every sibling that follows it is removed from the parent.
func forceMaxDepth(n *html.Node, depth int) {
	if depth != 0 {
		if n.Type == html.ElementNode {
			for child := n.FirstChild; child != nil; child = child.NextSibling {
				forceMaxDepth(child, depth-1)
			}
		}
		return
	}

	n.Type, n.Data, n.Attr = html.TextNode, "[omitted]", nil
	n.FirstChild, n.LastChild = nil, nil
	for sib := n.NextSibling; sib != nil; sib = n.NextSibling {
		n.Parent.RemoveChild(sib)
	}
}

Example #26

Show file

File: append.go Project: documize/html-diff

// append1 does the actual appending to the merged HTML node tree.
//
// proto is the node to reproduce. Nothing happens when proto is nil, when
// lastMatchingLeaf finds no append point or ancestor, or when the two differ
// in DataAtom. Otherwise a copy of proto is made (text nodes take text as
// their Data). For any action other than '=' the copy is wrapped in a <span>
// whose attributes come from the configured inserted ('+'), deleted ('-') or
// replaced ('~') span. The result is then wrapped in copies of proto's
// ancestors up to, but not including, the matched ancestor, and appended at
// the append point.
func (ap *appendContext) append1(action rune, text string, proto *html.Node, pos posT) {
	if proto == nil {
		return
	}
	appendPoint, protoAncestor := ap.lastMatchingLeaf(proto, action, pos)
	if appendPoint == nil || protoAncestor == nil || appendPoint.DataAtom != protoAncestor.DataAtom {
		return
	}

	leaf := &html.Node{}
	copyNode(leaf, proto)
	if proto.Type == html.TextNode {
		leaf.Data = text
	}

	subtree := leaf
	if action != '=' {
		marker := &html.Node{Type: html.ElementNode, DataAtom: atom.Span, Data: "span"}
		switch action {
		case '+':
			marker.Attr = convertAttributes(ap.c.InsertedSpan)
		case '-':
			marker.Attr = convertAttributes(ap.c.DeletedSpan)
		case '~':
			marker.Attr = convertAttributes(ap.c.ReplacedSpan)
		}
		marker.AppendChild(leaf)
		subtree = marker
	}

	// Rebuild the chain of ancestors between proto and the matched ancestor.
	for anc := proto.Parent; anc != nil && anc != protoAncestor; anc = anc.Parent {
		wrapper := &html.Node{}
		copyNode(wrapper, anc)
		wrapper.AppendChild(subtree)
		subtree = wrapper
	}
	appendPoint.AppendChild(subtree)
}

Example #27

Show file

File: go_crawl_test.go Project: zlisinski/go_crawl

// TestParseLinkTagRelative checks that a stylesheet <link> node with the
// relative href "1.css" yields exactly one style sheet, resolved to
// startUrl + "1.css".
func TestParseLinkTagRelative(t *testing.T) {
	link := &html.Node{
		Data: "link",
		Attr: []html.Attribute{
			{Key: "href", Val: "1.css"},
			{Key: "rel", Val: "stylesheet"},
		},
	}

	page := newWebPage(startUrl)
	page.parseLinkTag(link)

	wantCount := 1
	if gotCount := page.styleSheets.Len(); gotCount != wantCount {
		t.Error("Expected:", wantCount, " Got:", gotCount)
	}

	wantURL := startUrl + "1.css"
	if gotURL := page.styleSheets.Front().Value; gotURL != wantURL {
		t.Error("Expected:", wantURL, " Got:", gotURL)
	}
}

Example #28

Show file

File: 01_cleanse.go Project: aarzilli/tools

// convertExotic renames a node whose Data has an entry in exotics (for
// example <section> or <header>) to the replacement listed there, typically
// <div>. The original name is preserved in an added "cfrm" attribute. Nodes
// without an entry are left unchanged.
func convertExotic(n *html.Node) {
	replacement, isExotic := exotics[n.Data]
	if !isExotic {
		return
	}
	n.Attr = append(n.Attr, html.Attribute{Key: "cfrm", Val: n.Data})
	n.Data = replacement
}

Example #29

Show file

File: cleaner.go Project: ejamesc/GoOse

// convertDivsToParagraphs processes every element in doc matched by the
// selector domType and returns the same document.
//
// An element whose inner HTML matches divToPElementsPattern is handed to
// replaceWithPara. For every other element, its direct text children longer
// than one byte (after newlines and tabsRegEx matches are stripped) are
// collected, joined, stored in a newly created node, and then removed from
// the tree. With config.debug set, progress is logged.
func (c *Cleaner) convertDivsToParagraphs(doc *goquery.Document, domType string) *goquery.Document {
	if c.config.debug {
		log.Println("Starting to replace bad divs...")
	}
	badDivs := 0
	convertedTextNodes := 0
	divs := doc.Find(domType)

	divs.Each(func(i int, div *goquery.Selection) {
		// The error from Html() is ignored; on failure divHTML is empty.
		divHTML, _ := div.Html()
		if divToPElementsPattern.Match([]byte(divHTML)) {
			c.replaceWithPara(div)
			badDivs++
		} else {
			var replacementText []string
			// Removal is deferred until after the iteration so the child
			// list is not modified while it is being walked.
			nodesToRemove := list.New()
			children := div.Contents()
			if c.config.debug {
				log.Printf("Found %d children of div\n", children.Size())
			}
			children.EachWithBreak(func(i int, kid *goquery.Selection) bool {
				text := kid.Text()
				kidNode := kid.Get(0)
				tag := kidNode.Data
				// A node whose Data equals its own text is treated as a
				// text node.
				if tag == text {
					tag = "#text"
				}
				if tag == "#text" {
					text = strings.Replace(text, "\n", "", -1)
					text = tabsRegEx.ReplaceAllString(text, "")
					if text == "" {
						return true
					}
					if len(text) > 1 {
						prev := kidNode.PrevSibling
						if c.config.debug {
							log.Printf("PARENT CLASS: %s NODENAME: %s\n", c.config.parser.name("class", div), tag)
							log.Printf("TEXTREPLACE: %s\n", strings.Replace(text, "\n", "", -1))
						}
						// When an <a> precedes the text, the HTML obtained
						// via kid.HasNodes(prev) is collected as well.
						if prev != nil && prev.DataAtom == atom.A {
							nodeSelection := kid.HasNodes(prev)
							// NOTE: this local shadows the html package
							// for the rest of this if-block.
							html, _ := nodeSelection.Html()
							replacementText = append(replacementText, html)
							if c.config.debug {
								log.Printf("SIBLING NODENAME ADDITION: %s TEXT: %s\n", prev.Data, html)
							}
						}
						replacementText = append(replacementText, text)
						nodesToRemove.PushBack(kidNode)
						convertedTextNodes++
					}

				}
				// Always continue with the next child.
				return true
			})

			// NOTE(review): the node is an ElementNode whose Data (normally
			// the tag name) holds the collected text, with DataAtom set
			// to P — confirm this is the intended representation.
			newNode := new(html.Node)
			newNode.Type = html.ElementNode
			newNode.Data = strings.Join(replacementText, "")
			newNode.DataAtom = atom.P
			div.First().AddNodes(newNode)

			// Detach the collected text nodes that are still in the tree.
			for s := nodesToRemove.Front(); s != nil; s = s.Next() {
				node := s.Value.(*html.Node)
				if node != nil && node.Parent != nil {
					node.Parent.RemoveChild(node)
				}
			}
		}
	})
	if c.config.debug {
		log.Printf("Found %d total divs with %d bad divs replaced and %d textnodes converted inside divs", divs.Size(), badDivs, convertedTextNodes)
	}
	return doc

}

Example #30

Show file

File: outputdealwith.go Project: slygo/360baosdk

// findNodeformNodesbyIndexOrPro picks a node out of nodes, by index or by
// attributes, and applies the values in m to it.
//
// For Type OPTION or RADIO, the first selection whose VALUE attribute equals
// m[VALUE] is marked "selected" (OPTION) or "checked" (RADIO). If none
// matches and visible is true, a new node modelled on nodes[0] is appended to
// nodes[0]'s parent, carrying the VALUE, the selected/checked marker and the
// TYPE attribute. index is not used or advanced in this case.
//
// For every other Type, each key/value pair of m is set as an attribute on
// nodes[*index] and *index is incremented. Nothing happens when *index is
// past the end of nodes.
func findNodeformNodesbyIndexOrPro(nodes []*goquery.Selection, index *int, m map[string]string, Type string, visible bool) {
	if Type == OPTION || Type == RADIO {
		for _, sel := range nodes {
			for _, a := range sel.Get(0).Attr {
				if a.Key != VALUE || a.Val != m[VALUE] {
					continue
				}
				if Type == RADIO {
					sel.SetAttr("checked", "checked")
				} else {
					sel.SetAttr("selected", "selected")
				}
				return
			}
		}

		// No existing node matched. A replacement can only be modelled on
		// nodes[0], so an empty nodes slice must be skipped rather than
		// indexed.
		if visible && len(nodes) > 0 {
			template := nodes[0].Get(0)

			state := "selected"
			if Type == RADIO {
				state = "checked"
			}

			node := &html.Node{
				Type: template.Type,
				Data: template.Data,
				Attr: []html.Attribute{
					{Key: VALUE, Val: m[VALUE]},
					{Key: state, Val: state},
					{Key: TYPE, Val: Type},
				},
			}
			nodes[0].Parent().AppendNodes(node)
		}
		return
	}

	if len(nodes) <= *index {
		return
	}
	for k, v := range m {
		nodes[*index].SetAttr(k, v)
	}

	*index++
}