Beispiel #1
0
// closureTextNodeExists reports whether the title attribute of img also
// occurs as text in the vicinity of the image.
//
// The title is read with attrX, whitespace-normalized and trimmed. Titles
// shorter than five bytes - before or after that cleanup - are never
// searched for and yield false. Checking the cleaned title matters: a
// whitespace-only title would otherwise shrink to the empty string, which
// strings.Contains finds in every text node.
//
// The search does not cover the entire document, only the subtree rooted at
// the ancestor up to four levels above img (fewer if the tree is shallower).
//
// Side effect: the Data of each text node visited, up to and including the
// matching one, is replaced by its whitespace-normalized form.
func closureTextNodeExists(img *html.Node) (found bool) {

	const minTitleLen = 5

	raw := attrX(img.Attr, "title")
	if len(raw) < minTitleLen {
		return false
	}
	txt := strings.TrimSpace(stringspb.NormalizeInnerWhitespace(raw))
	if len(txt) < minTitleLen {
		// Title consisted mostly of whitespace; too short to be a meaningful needle.
		return false
	}

	// Climb at most four levels; stop early at the root of the tree.
	root := img
	for i := 0; i < 4 && root.Parent != nil; i++ {
		root = root.Parent
	}

	var recurseTextNodes func(n *html.Node)
	recurseTextNodes = func(n *html.Node) {
		if found {
			return
		}

		// Children first; the tree structure is not modified here,
		// so the sibling chain can be walked directly.
		for c := n.FirstChild; c != nil && !found; c = c.NextSibling {
			recurseTextNodes(c)
		}
		if found || n.Type != html.TextNode {
			return
		}

		n.Data = stringspb.NormalizeInnerWhitespace(n.Data)
		if strings.Contains(n.Data, txt) {
			found = true
		}
	}
	recurseTextNodes(root)

	return found
}
Beispiel #2
0
// cleanseDom performs a brute-force reduction and simplification of the
// subtree rooted at n.
//
// For every node it filters the attributes through removeAttr using
// unwantedAttrs, recurses into the children, applies removeUnwanted or
// convertUnwanted depending on directlyRemoveUnwanted, applies
// convertExotic, and finally whitespace-normalizes the Data of text nodes.
//
// lvl is the depth of n; it is only handed on, incremented by one, to the
// recursive calls.
func cleanseDom(n *html.Node, lvl int) {

	n.Attr = removeAttr(n.Attr, unwantedAttrs)

	// Descend first, so the children are already cleansed when n itself is
	// handled. The next sibling is looked up only after the current child
	// has been processed.
	child := n.FirstChild
	for child != nil {
		cleanseDom(child, lvl+1)
		child = child.NextSibling
	}

	switch {
	case directlyRemoveUnwanted:
		removeUnwanted(n)
	default:
		convertUnwanted(n)
	}

	convertExotic(n)

	// Text is normalized once, here, for the whole tree.
	if n.Type != html.TextNode {
		return
	}
	n.Data = stringspb.NormalizeInnerWhitespace(n.Data)
}
Beispiel #3
0
// removeAttr returns a cleaned copy of attributes.
//
// Each key and value is trimmed and the value is additionally
// whitespace-normalized (title and alt values containing newlines have been
// encountered). Attributes whose key is set in removeKeys or starts with
// "data" are dropped; every kept key is counted in attrDistinct.
//
// Afterwards alt and title are reconciled, based on the last kept value of
// each:
//   - equal and non-empty: alt is dropped, title stays
//   - both empty or absent: alt and title are both dropped
//
// The returned slice is never nil.
func removeAttr(attributes []html.Attribute, removeKeys map[string]bool) []html.Attribute {

	kept := make([]html.Attribute, 0, len(attributes))
	altVal, titleVal := "", ""

	for _, attr := range attributes {
		attr.Key = strings.TrimSpace(attr.Key)
		attr.Val = stringspb.NormalizeInnerWhitespace(strings.TrimSpace(attr.Val))

		if removeKeys[attr.Key] || strings.HasPrefix(attr.Key, "data") {
			continue
		}

		switch attr.Key {
		case "alt":
			altVal = attr.Val
		case "title":
			titleVal = attr.Val
		}
		attrDistinct[attr.Key]++
		kept = append(kept, attr)
	}

	// Differing alt and title: nothing to reconcile.
	if altVal != titleVal {
		return kept
	}

	// alt duplicates title, so alt always goes; when both are empty the
	// title carries no information either and goes as well.
	dropTitle := titleVal == ""
	reconciled := []html.Attribute{}
	for _, attr := range kept {
		if attr.Key == "alt" || (dropTitle && attr.Key == "title") {
			continue
		}
		reconciled = append(reconciled, attr)
	}
	return reconciled
}
// textifyNodeSubtree collapses the content of an element node into a single
// child node holding its text.
//
// Nodes of any other type are left untouched. For an element node the text
// is obtained from textifySubtreeBruteForce and whitespace-normalized, all
// existing children are removed, a node created by dom.Nd("text") carrying
// that text becomes the only child, and a node created by dom.Nd("br") is
// passed to dom.InsertAfter together with n.
func textifyNodeSubtree(n *html.Node) {

	if n.Type != html.ElementNode {
		return
	}

	// Gather the text while the subtree is still intact.
	textNode := dom.Nd("text")
	textNode.Data = stringspb.NormalizeInnerWhitespace(textifySubtreeBruteForce(n, 0))

	// Detach every child, then install the text node as the sole child.
	for n.FirstChild != nil {
		n.RemoveChild(n.FirstChild)
	}
	n.AppendChild(textNode)

	dom.InsertAfter(n, dom.Nd("br"))
}