Ejemplo n.º 1
1
// convert_flowdocument walks the subtree rooted at frag and rewrites it, in
// place, from HTML element names to FlowDocument element names.
//
// Text nodes are returned untouched. Media elements (img, object, video,
// audio, embed) become comment nodes, are passed to node_clear_children and
// lose their attributes. "a" is renamed to "Hyperlink" with its attributes
// filtered through extract_ahref_attr, "article" is renamed to "FlowDocument"
// and given an xmlns attribute, and every other node (including "p") is
// renamed to "Paragraph" with no attributes. The first node renamed to
// "Paragraph" is recorded in first_paragraph.
func (maker *flowdocument_maker) convert_flowdocument(frag *html.Node) {
	if frag.Type == html.TextNode {
		return
	}

	switch frag.Data {
	case "img", "object", "video", "audio", "embed":
		// Media cannot be represented; leave a comment placeholder behind.
		frag.Type = html.CommentNode
		node_clear_children(frag)
		frag.Attr = nil
	case "a":
		frag.Data = "Hyperlink"
		frag.Attr = extract_ahref_attr(frag.Attr)
	case "article":
		frag.Data = "FlowDocument"
		// set namespace dont work
		frag.Attr = []html.Attribute{{Key: "xmlns", Val: fdocns}}
	default:
		frag.Data = "Paragraph"
		frag.Attr = nil
		if maker.first_paragraph == nil {
			maker.first_paragraph = frag
		}
	}

	// Recurse into whatever children remain after the rewrite above.
	child := frag.FirstChild
	for child != nil {
		maker.convert_flowdocument(child)
		child = child.NextSibling
	}
}
Ejemplo n.º 2
0
Archivo: deploy.go Proyecto: gdb/Stout
// addFiles appends one new element node to parent for every entry in files.
//
// When form is SCRIPT each entry becomes <script src="...">; when form is
// STYLE each entry becomes <link rel="stylesheet" href="...">. Any other
// form value panics.
func addFiles(form uint8, parent *html.Node, files []string) {
	for _, path := range files {
		var (
			tag   string
			attrs []html.Attribute
		)

		switch form {
		case SCRIPT:
			tag = "script"
			attrs = []html.Attribute{{Key: "src", Val: path}}
		case STYLE:
			tag = "link"
			attrs = []html.Attribute{
				{Key: "rel", Val: "stylesheet"},
				{Key: "href", Val: path},
			}
		default:
			panic("Type not understood")
		}

		parent.AppendChild(&html.Node{
			Type: html.ElementNode,
			Data: tag,
			Attr: attrs,
		})
	}
}
Ejemplo n.º 3
0
// trim_small_image renames img to "input" and reports drop == true when the
// image is considered too small to keep. Two cases qualify:
//   - both dimensions are positive, the area is below small_image_t squared,
//     and the image's parent is an "a" element;
//   - the image is exactly 1x1.
//
// Images without a parent are never touched.
func trim_small_image(img *html.Node) (drop bool) {
	w, h, _ := media_get_dim(img)

	if img.Parent == nil {
		return false
	}

	small_inside_link := w > 0 && h > 0 && w*h < small_image_t*small_image_t && img.Parent.Data == "a"
	single_pixel := w == 1 && h == 1
	if !small_inside_link && !single_pixel {
		return false
	}

	img.Data = "input"
	return true
}
Ejemplo n.º 4
0
// getSiblingsContent collects paragraph selections from currentSibling.
//
// If currentSibling's first node is itself a non-empty <p>, it is returned as
// the only result. Otherwise every non-empty <p> found below it is scored by
// its stop-word count; a paragraph is kept when that count exceeds 30% of
// baselinescoreSiblingsPara and isHighLinkDensity reports false for it. Each
// kept paragraph is returned as a fresh single-node selection wrapping a new
// text node that carries the paragraph's text.
func (extractor *contentExtractor) getSiblingsContent(currentSibling *goquery.Selection, baselinescoreSiblingsPara float64) []*goquery.Selection {
	if currentSibling.Get(0).DataAtom.String() == "p" && len(currentSibling.Text()) > 0 {
		return []*goquery.Selection{currentSibling}
	}

	const siblingBaselineWeight = 0.30
	threshold := siblingBaselineWeight * baselinescoreSiblingsPara

	kept := make([]*goquery.Selection, 0)
	currentSibling.Find("p").Each(func(_ int, para *goquery.Selection) {
		text := para.Text()
		if len(text) == 0 {
			return
		}

		stopWords := extractor.config.stopWords.stopWordsCount(extractor.config.targetLanguage, text)
		linkHeavy := extractor.isHighLinkDensity(para)
		if threshold < float64(stopWords.stopWordCount) && !linkHeavy {
			textNode := &html.Node{
				Type:     html.TextNode,
				Data:     text,
				DataAtom: atom.P,
			}
			wrapped := new(goquery.Selection)
			wrapped.Nodes = []*html.Node{textNode}
			kept = append(kept, wrapped)
		}
	})
	return kept
}
Ejemplo n.º 5
0
// trim_display_none renames n to "input" when its inline style attribute
// sets the display property to none.
//
// The style value is split into declarations and only the "display" property
// is inspected. A plain substring test for "display" and "none" would also
// match unrelated declarations such as "display:block; border:none". When
// display is declared more than once, the last declaration decides. Property
// and value are compared case-insensitively and a trailing "!important" is
// ignored.
func trim_display_none(n *html.Node) {
	st := get_attribute(n, "style")

	hidden := false
	for _, decl := range strings.Split(st, ";") {
		colon := strings.Index(decl, ":")
		if colon < 0 {
			continue // not a "property: value" pair
		}
		prop := strings.ToLower(strings.TrimSpace(decl[:colon]))
		if prop != "display" {
			continue
		}
		val := strings.ToLower(strings.TrimSpace(decl[colon+1:]))
		val = strings.TrimSpace(strings.TrimSuffix(val, "!important"))
		hidden = val == "none"
	}

	if hidden {
		//		log.Println("hide-node display:none", n.Data)
		n.Data = "input"
	}
}
Ejemplo n.º 6
0
// processTextNode scans node.Data rune by rune looking for '[' and hands
// whatever follows it to parseShortcode.
//
//   - If parseShortcode reports size 0, the '[' is ordinary text and scanning
//     continues after it.
//   - If it reports a non-zero size with an empty tag (an escape code), the
//     bracketed span is replaced in node.Data by rest and scanning resumes
//     after the replacement.
//   - Otherwise the result of handleShortcode is returned directly.
//
// When the end of the text is reached, tags is returned unchanged together
// with node.NextSibling and a nil error.
func processTextNode(node *html.Node, tags []openTag) (outTags []openTag, next *html.Node, err error) {
	for pos := 0; pos < len(node.Data); {
		r, width := utf8.DecodeRuneInString(node.Data[pos:])
		if r != '[' {
			pos += width
			continue
		}

		size, openClose, tag, rest := parseShortcode(node.Data[pos+1:])
		switch {
		case size == 0:
			// Not a shortcode after all; step over the bracket.
			pos += width
		case tag == "":
			// Escape code: remove the outer [] and continue after the
			// unescaped text.
			node.Data = node.Data[:pos] + rest + node.Data[pos+1+size:]
			pos += len(rest)
		default:
			// looks like we found a shortcode!
			return handleShortcode(node, tags, pos, pos+1+size, openClose, tag, rest)
		}
	}

	// default: no shortcode found
	return tags, node.NextSibling, nil
}
Ejemplo n.º 7
0
// splitTextNode cuts the html.TextNode "node" in two, dropping the bytes in
// Data[splitBefore:splitAfter]. "node" is shortened in place so that it holds
// Data[:splitBefore]; a new text node holding Data[splitAfter:] is inserted
// directly after it under the same parent and returned.
func splitTextNode(node *html.Node, splitBefore, splitAfter int) *html.Node {
	tail := node.Data[splitAfter:]
	node.Data = node.Data[:splitBefore]

	rest := &html.Node{Type: html.TextNode, Data: tail}
	node.Parent.InsertBefore(rest, node.NextSibling)
	return rest
}
Ejemplo n.º 8
0
// CloneNode returns a deep copy of n: a new node with the same Type,
// DataAtom and Data, its own copy of the attribute slice, and recursively
// cloned children appended in their original order. The copy has no parent
// or siblings.
func CloneNode(n *exphtml.Node) *exphtml.Node {
	attrs := make([]exphtml.Attribute, len(n.Attr))
	copy(attrs, n.Attr)

	dup := &exphtml.Node{
		Type:     n.Type,
		DataAtom: n.DataAtom,
		Data:     n.Data,
		Attr:     attrs,
	}

	child := n.FirstChild
	for child != nil {
		dup.AppendChild(CloneNode(child))
		child = child.NextSibling
	}
	return dup
}
Ejemplo n.º 9
0
// trim_invisible_image renames img to "input" and reports drop == true when
// its width or height attribute is explicitly zero. Nothing happens if
// either attribute fails to parse as an integer or if img has no parent.
func trim_invisible_image(img *html.Node) (drop bool) {
	w, w_err := strconv.ParseInt(node_get_attribute(img, "width"), 0, 32)
	h, h_err := strconv.ParseInt(node_get_attribute(img, "height"), 0, 32)

	switch {
	case w_err != nil, h_err != nil, img.Parent == nil:
		return false
	case w != 0 && h != 0:
		// Both dimensions are non-zero, so the image is visible.
		return false
	}

	// set width height explicit zero
	img.Data = "input"
	return true
}
Ejemplo n.º 10
0
Archivo: cleaner.go Proyecto: ngs/GoOse
// convertDivsToParagraphs visits every element in doc matching the selector
// domType and returns the same doc.
//
// An element for which getElementsByTags finds none of the listed tags is
// handed to replaceWithPara. For every other element, its direct text
// children longer than one character (after stripping newlines and
// tabsRegEx matches) are gathered into one string and queued for removal
// from the tree. Progress is logged when config.debug is set.
func (this *cleaner) convertDivsToParagraphs(doc *goquery.Document, domType string) *goquery.Document {
	if this.config.debug {
		log.Println("Starting to replace bad divs...")
	}
	// Counters used only for the debug summary at the end.
	badDivs := 0
	convertedTextNodes := 0
	divs := doc.Find(domType)
	tags := []string{"a", "blockquote", "dl", "div", "img", "ol", "p", "pre", "table", "ul"}

	divs.Each(func(i int, div *goquery.Selection) {
		if this.config.parser.getElementsByTags(div, tags).Size() == 0 {
			// None of the listed tags were found for this div.
			this.replaceWithPara(div)
			badDivs++
		} else {
			replacementText := make([]string, 0)
			nodesToRemove := list.New()
			children := div.Contents()
			if this.config.debug {
				log.Printf("Found %d children of div\n", children.Size())
			}
			// The callback always returns true, so every child is visited.
			children.EachWithBreak(func(i int, kid *goquery.Selection) bool {
				text := kid.Text()
				kidNode := kid.Get(0)
				tag := kidNode.Data
				// A node whose Data equals its own text is treated as a text
				// node (presumably because a text node stores its text in Data).
				if tag == text {
					tag = "#text"
				}
				if tag == "#text" {
					text = strings.Replace(text, "\n", "", -1)
					text = tabsRegEx.ReplaceAllString(text, "")
					if text == "" {
						return true
					}
					if len(text) > 1 {
						prev := kidNode.PrevSibling
						if this.config.debug {
							log.Printf("PARENT CLASS: %s NODENAME: %s\n", this.config.parser.name("class", div), tag)
							log.Printf("TEXTREPLACE: %s\n", strings.Replace(text, "\n", "", -1))
						}
						// When the text directly follows an <a>, the Html() of
						// kid.HasNodes(prev) is added ahead of the text.
						if prev != nil && prev.DataAtom == atom.A {
							nodeSelection := kid.HasNodes(prev)
							// NOTE: the local "html" shadows the html package
							// for the rest of this if-block.
							html, _ := nodeSelection.Html()
							replacementText = append(replacementText, html)
							if this.config.debug {
								log.Printf("SIBLING NODENAME ADDITION: %s TEXT: %s\n", prev.Data, html)
							}
						}
						replacementText = append(replacementText, text)
						nodesToRemove.PushBack(kidNode)
						convertedTextNodes++
					}

				}
				return true
			})

			// Build an element node whose Data is the joined text and whose
			// DataAtom is P.
			// NOTE(review): the value returned by AddNodes is discarded here;
			// confirm that newNode actually ends up attached to the document.
			newNode := new(html.Node)
			newNode.Type = html.ElementNode
			newNode.Data = strings.Join(replacementText, "")
			newNode.DataAtom = atom.P
			div.First().AddNodes(newNode)

			// Detach the text nodes collected above; removal is deferred to
			// this point so it happens after the children have been walked.
			for s := nodesToRemove.Front(); s != nil; s = s.Next() {
				node := s.Value.(*html.Node)
				if node != nil && node.Parent != nil {
					node.Parent.RemoveChild(node)
				}
			}
		}
	})
	if this.config.debug {
		log.Printf("Found %d total divs with %d bad divs replaced and %d textnodes converted inside divs", divs.Size(), badDivs, convertedTextNodes)
	}
	return doc

}
Ejemplo n.º 11
0
// trim_display_none renames n to "input" when its inline style attribute
// sets the display property to none.
//
// The style value is split into declarations and only the "display" property
// is inspected. A plain substring test for "display" and "none" would also
// match unrelated declarations such as "display:block; border:none". When
// display is declared more than once, the last declaration decides. Property
// and value are compared case-insensitively and a trailing "!important" is
// ignored.
func trim_display_none(n *html.Node) {
	st := node_get_attribute(n, "style")

	hidden := false
	for _, decl := range strings.Split(st, ";") {
		colon := strings.Index(decl, ":")
		if colon < 0 {
			continue // not a "property: value" pair
		}
		prop := strings.ToLower(strings.TrimSpace(decl[:colon]))
		if prop != "display" {
			continue
		}
		val := strings.ToLower(strings.TrimSpace(decl[colon+1:]))
		val = strings.TrimSpace(strings.TrimSuffix(val, "!important"))
		hidden = val == "none"
	}

	if hidden {
		n.Data = "input"
	}
}
Ejemplo n.º 12
0
// init installs the two package-level node walkers, fCondenseNode and
// fRecurse. Both are assigned as closures so that each can call itself
// recursively through its package-level variable.
func init() {

	// fCondenseNode returns a short textual summary of n and its
	// descendants. depth is 0 for the node it was first called on and grows
	// by one per level of recursion.
	fCondenseNode = func(n *html.Node, depth int) (ret string) {

		// Scripts collapse to a numbered placeholder statement; their
		// children are not visited.
		if n.Type == html.ElementNode && n.Data == "script" {
			ret += fmt.Sprintf(" var script%v = '[script]'; ", nums)
			nums++
			return
		}
		// Styles collapse to a fixed placeholder rule; their children are
		// not visited.
		if n.Type == html.ElementNode && n.Data == "style" {
			ret += fmt.Sprintf(" .xxx {margin:2px;} ")
			return
		}

		// Images contribute their alt and src attribute values.
		if n.Type == html.ElementNode && n.Data == "img" {
			ret += fmt.Sprintf(" [img] %v %v | ", getAttrVal(n.Attr, "alt"), getAttrVal(n.Attr, "src"))
		}

		if n.Type == html.ElementNode && n.Data == "a" {
			ret += "[a]"
		}

		// Text shorter than 4 bytes is appended as-is; longer text gets a
		// "[txt]" marker, which carries the depth when depth > 0.
		if n.Type == html.TextNode {
			s := n.Data
			// s = replTabsNewline.Replace(s)
			// s = strings.TrimSpace(s)
			if len(s) < 4 {
				ret += s
			} else if s != "" {
				if depth > 0 {
					ret += fmt.Sprintf(" [txt%v] %v", depth, s)
				} else {
					ret += " [txt] " + s
				}
			}
		}

		for c := n.FirstChild; c != nil; c = c.NextSibling {
			ret += fCondenseNode(c, depth+1)
		}
		return
	}

	// fRecurse rewrites the tree rooted at n in place and then descends
	// into n's (possibly replaced) children.
	fRecurse = func(n *html.Node) {
		// Forms get an extra <input name="redirect-to"> child whose value is
		// absolutize() of the form's action attribute, taken before the
		// form's attributes are rewritten.
		if n.Type == html.ElementNode && n.Data == "form" {

			hidFld := new(html.Node)
			hidFld.Type = html.ElementNode
			hidFld.Data = "input"
			hidFld.Attr = []html.Attribute{html.Attribute{Key: "name", Val: "redirect-to"}, html.Attribute{Key: "value", Val: absolutize(getAttrVal(n.Attr, "action"))}}
			n.AppendChild(hidFld)

			n.Attr = rewriteAttributes(n.Attr, UnsyncedGlobalReq)

		}
		// Every src attribute of a script is overwritten with emptySrc.
		if n.Type == html.ElementNode && n.Data == "script" {
			for i := 0; i < len(n.Attr); i++ {
				if n.Attr[i].Key == "src" {
					n.Attr[i].Val = emptySrc
				}
			}
		}
		// a, img, script and style elements have their content replaced by
		// the condensed text produced by fCondenseNode.
		if n.Type == html.ElementNode &&
			(n.Data == "a" || n.Data == "img" || n.Data == "script" || n.Data == "style") {

			// The summary must be computed before the children are removed
			// below.
			s := fCondenseNode(n, 0)
			//fmt.Printf("found %v\n", s)
			textReplacement := new(html.Node)
			textReplacement.Type = html.TextNode
			textReplacement.Data = s

			if n.Data == "a" || n.Data == "img" {
				n.Attr = rewriteAttributes(n.Attr, UnsyncedGlobalReq)
			}

			// We want to remove all existing children.
			// Direct loop impossible, since "NextSibling" is set to nil by Remove().
			// Therefore first assembling separately, then removing.
			children := make(map[*html.Node]struct{})
			for c := n.FirstChild; c != nil; c = c.NextSibling {
				children[c] = struct{}{}
			}
			for k, _ := range children {
				n.RemoveChild(k)
			}

			// we can't put our replacement "under" an image, since img cannot have children
			if n.Type == html.ElementNode && n.Data == "img" {
				// n.Parent.InsertBefore(textReplacement,n)
				InsertAfter(n, textReplacement)
				RemoveNode(n)

			} else {
				n.AppendChild(textReplacement)
			}

			// For a link that has a previous sibling, three separator nodes
			// (" || ", <br>, "\n") are inserted in front of that sibling.
			if n.Data == "a" {
				prev := n.PrevSibling
				if prev != nil {

					breaker0 := new(html.Node)
					breaker0.Type = html.TextNode
					breaker0.Data = " || "
					n.Parent.InsertBefore(breaker0, prev)

					breaker1 := new(html.Node)
					breaker1.Type = html.ElementNode
					// breaker1.Data =  "||<br>\n"
					breaker1.Data = "br"
					n.Parent.InsertBefore(breaker1, prev)

					breaker2 := new(html.Node)
					breaker2.Type = html.TextNode
					breaker2.Data = "\n"
					n.Parent.InsertBefore(breaker2, prev)

				}
			}

		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			fRecurse(c)
		}
	}

}
Ejemplo n.º 13
0
// handleText trims leading and trailing whitespace from the text node n,
// but only when the whitelist was configured with StripWhitespace.
func (w *Whitelist) handleText(n *html.Node) {
	if !w.StripWhitespace {
		return
	}
	n.Data = strings.TrimSpace(n.Data)
}
Ejemplo n.º 14
0
// ParseHtmlFiles processes every file matching "test*.html" in the current
// directory. Each file is parsed as HTML, the children of every <a> element
// are replaced by a single text node holding the trimmed result of
// fNodeModify, and the rendered document is written to "yy_<name>".
//
// A file that cannot be opened, parsed or rendered is reported through pf
// and skipped; the remaining files are still processed. Each input file is
// closed as soon as it has been parsed rather than when the function returns.
func ParseHtmlFiles() {

	testDataDir := "./"
	testFiles, err := filepath.Glob(testDataDir + "test*.html")
	if err != nil {
		pf("%v \n", err)
		return
	}

	for _, tf := range testFiles {
		pf("%v\n", tf)

		f, err := os.Open(tf)
		if err != nil {
			pf("1 %v \n", err)
			continue
		}

		docRoot, err := html.Parse(bufio.NewReader(f))
		// The parser has consumed the whole input, so release the handle
		// now; a defer here would keep every file open until the loop ends.
		f.Close()
		if err != nil {
			pf("3 %v \n", err)
			continue
		}

		fRecurse = func(n *html.Node) {
			if n.Type == html.ElementNode && n.Data == "a" {
				// Compute the replacement text while the original children
				// are still attached.
				s := strings.TrimSpace(fNodeModify(n))
				//pf("found %v\n", s)
				nNew := new(html.Node)
				nNew.Type = html.TextNode
				nNew.Data = s

				// Remove all children. Always detaching the current first
				// child avoids walking NextSibling links that RemoveChild
				// resets.
				for n.FirstChild != nil {
					n.RemoveChild(n.FirstChild)
				}
				n.AppendChild(nNew)

			}
			for c := n.FirstChild; c != nil; c = c.NextSibling {
				fRecurse(c)
			}
		}
		fRecurse(docRoot)

		var b bytes.Buffer
		if err := html.Render(&b, docRoot); err != nil {
			pf("4 %v \n", err)
			continue
		}
		util.WriteBytesToFilename("yy_"+tf, &b)
		//fixedHtml := b.String()

		//fmt.Printf("%s \n", spew.Sdump(docRoot))

	}
}