Exemple #1
1
// convert_flowdocument rewrites the subtree rooted at frag in place, renaming
// HTML nodes to their FlowDocument counterparts.
//
// Text nodes are returned untouched. Media-like elements (img, object, video,
// audio, embed) are turned into comment nodes with their children and
// attributes dropped. "a" is renamed to "Hyperlink" and keeps whatever
// extract_ahref_attr returns, "article" is renamed to "FlowDocument" with a
// single xmlns attribute, and every other node is renamed to "Paragraph" with
// no attributes. The first node renamed to "Paragraph" during the walk is
// recorded in first_paragraph.
func (maker *flowdocument_maker) convert_flowdocument(frag *html.Node) {
	if frag.Type == html.TextNode {
		return
	}

	switch frag.Data {
	case "img", "object", "video", "audio", "embed":
		// No mapping for media: neutralise the node by making it a comment.
		frag.Type = html.CommentNode
		node_clear_children(frag)
		frag.Attr = nil
	case "a":
		frag.Data = "Hyperlink"
		frag.Attr = extract_ahref_attr(frag.Attr)
	case "article":
		frag.Data = "FlowDocument"
		// Setting the node's namespace does not work, so the namespace is
		// emitted as a plain xmlns attribute instead.
		frag.Attr = []html.Attribute{{Key: "xmlns", Val: fdocns}}
	default:
		// "p" and every tag without a dedicated mapping become paragraphs.
		frag.Data = "Paragraph"
		frag.Attr = nil
		if maker.first_paragraph == nil {
			maker.first_paragraph = frag
		}
	}

	for child := frag.FirstChild; child != nil; child = child.NextSibling {
		maker.convert_flowdocument(child)
	}
}
Exemple #2
0
// addFiles appends one element to parent for every entry in files.
//
// When form is SCRIPT each file becomes a <script src=file> element; when it
// is STYLE each file becomes a <link rel="stylesheet" href=file> element.
// Any other form panics as soon as the first file is processed.
func addFiles(form uint8, parent *html.Node, files []string) {
	for _, file := range files {
		var (
			tag   string
			attrs []html.Attribute
		)

		switch form {
		case SCRIPT:
			tag = "script"
			attrs = []html.Attribute{
				{Key: "src", Val: file},
			}
		case STYLE:
			tag = "link"
			attrs = []html.Attribute{
				{Key: "rel", Val: "stylesheet"},
				{Key: "href", Val: file},
			}
		default:
			panic("Type not understood")
		}

		parent.AppendChild(&html.Node{
			Type: html.ElementNode,
			Data: tag,
			Attr: attrs,
		})
	}
}
Exemple #3
0
// try_update_class_attr adds a class attribute with the given value to b.
//
// An empty class is a no-op. The attribute is always appended to a freshly
// allocated copy of b.Attr, so the previous attribute slice is never written
// to; an already present class attribute is not replaced.
func try_update_class_attr(b *html.Node, class string) {
	if class == "" {
		return
	}

	count := len(b.Attr)
	attrs := make([]html.Attribute, count, count+1)
	copy(attrs, b.Attr)
	b.Attr = append(attrs, html.Attribute{Key: "class", Val: class})
}
// CloneNode returns a deep copy of n: the node itself plus all of its
// descendants, in the same order.
//
// Only Type, DataAtom, Data and Attr are copied for each node. The attribute
// slice is duplicated, so the copy shares no attribute storage with n. The
// returned root has no parent and no siblings.
func CloneNode(n *exphtml.Node) *exphtml.Node {
	attrs := make([]exphtml.Attribute, len(n.Attr))
	copy(attrs, n.Attr)

	dup := &exphtml.Node{
		Type:     n.Type,
		DataAtom: n.DataAtom,
		Data:     n.Data,
		Attr:     attrs,
	}

	for child := n.FirstChild; child != nil; child = child.NextSibling {
		dup.AppendChild(CloneNode(child))
	}
	return dup
}
Exemple #5
0
// sanitizeAttributes drops every attribute of n that the whitelist does not
// allow for n's element name, keeping the remaining attributes in their
// original order. The survivors are stored in a newly allocated slice.
func (w *Whitelist) sanitizeAttributes(n *html.Node) {
	kept := make([]html.Attribute, 0, len(n.Attr))

	for _, attr := range n.Attr {
		if !w.HasAttributeForElement(n.Data, attr.Key) {
			continue
		}
		kept = append(kept, attr)
	}

	n.Attr = kept
}
Exemple #6
0
// clean_attributes strips every attribute except id, class, href and src
// from n and from all of its descendants. Children are processed before the
// node itself. A node whose attributes all survive keeps its original slice.
func (cleaner *HtmlCleaner) clean_attributes(n *html.Node) {
	for child := n.FirstChild; child != nil; child = child.NextSibling {
		cleaner.clean_attributes(child)
	}

	kept := []html.Attribute{}
	for _, attr := range n.Attr {
		switch attr.Key {
		case "id", "class", "href", "src":
			kept = append(kept, attr)
		}
	}

	if len(kept) == len(n.Attr) {
		// Nothing was filtered out; leave the node untouched.
		return
	}
	n.Attr = kept
}
Exemple #7
0
// setAttributeValue sets the attribute named attrName on n to val.
//
// The first attribute whose key equals attrName is overwritten in place; if
// there is none, a new attribute is appended. A nil node is ignored.
func setAttributeValue(attrName string, val string, n *html.Node) {
	if n == nil {
		return
	}

	for idx, existing := range n.Attr {
		if existing.Key != attrName {
			continue
		}
		n.Attr[idx].Val = val
		return
	}

	n.Attr = append(n.Attr, html.Attribute{Key: attrName, Val: val})
}
Exemple #8
0
// clean_attributes strips every attribute except id, class, href, src,
// width, height and alt from n and from all of its descendants.
//
// class and id are used later on to identify the main content;
// width, height and alt are used to decide whether an image belongs to the
// main content.
//
// Children are processed before the node itself. A node whose attributes all
// survive keeps its original slice.
func (cleaner *html_cleaner) clean_attributes(n *html.Node) {
	for child := n.FirstChild; child != nil; child = child.NextSibling {
		cleaner.clean_attributes(child)
	}

	var kept []html.Attribute
	for i := range n.Attr {
		key := n.Attr[i].Key
		if key == "id" || key == "class" || key == "href" || key == "src" ||
			key == "width" || key == "height" || key == "alt" {
			kept = append(kept, n.Attr[i])
		}
	}

	if len(kept) != len(n.Attr) {
		n.Attr = kept
	}
}
Exemple #9
0
// node_set_attribute gives n the attribute name=val unless n already has a
// non-empty value for name (as reported by node_get_attribute), in which case
// the existing value is left untouched.
//
// If an attribute with key name is already present but node_get_attribute
// reported it as empty, that attribute is updated in place rather than a
// second attribute with the same key being appended, so the node never ends
// up with duplicate keys because of this call.
func node_set_attribute(n *html.Node, name, val string) {
	if node_get_attribute(n, name) != "" {
		return
	}

	// Reuse an existing (empty) attribute slot when there is one.
	for i := range n.Attr {
		if n.Attr[i].Key == name {
			n.Attr[i].Val = val
			return
		}
	}

	n.Attr = append(n.Attr, html.Attribute{Key: name, Val: val})
}
Exemple #10
0
// init assigns the package-level function variables fCondenseNode and
// fRecurse. They are assigned here rather than at declaration because each
// closure refers to a package-level function variable (fCondenseNode calls
// itself, fRecurse calls both).
func init() {

	// fCondenseNode flattens the subtree rooted at n into a single string.
	//
	//   - script elements yield a numbered placeholder (using and incrementing
	//     the package-level counter nums); their children are not visited.
	//   - style elements yield a fixed placeholder; children are not visited.
	//   - img elements contribute their alt and src attribute values.
	//   - a elements contribute the marker "[a]".
	//   - text nodes shorter than 4 bytes are added verbatim, longer ones are
	//     prefixed with " [txt] " (depth 0) or " [txt<depth>] ".
	//
	// depth is the distance from the node the condensation started at.
	fCondenseNode = func(n *html.Node, depth int) (ret string) {

		if n.Type == html.ElementNode && n.Data == "script" {
			ret += fmt.Sprintf(" var script%v = '[script]'; ", nums)
			nums++
			return
		}
		if n.Type == html.ElementNode && n.Data == "style" {
			ret += " .xxx {margin:2px;} "
			return
		}

		if n.Type == html.ElementNode && n.Data == "img" {
			ret += fmt.Sprintf(" [img] %v %v | ", getAttrVal(n.Attr, "alt"), getAttrVal(n.Attr, "src"))
		}

		if n.Type == html.ElementNode && n.Data == "a" {
			ret += "[a]"
		}

		if n.Type == html.TextNode {
			s := n.Data
			// s = replTabsNewline.Replace(s)
			// s = strings.TrimSpace(s)
			switch {
			case len(s) < 4:
				ret += s
			case depth > 0:
				ret += fmt.Sprintf(" [txt%v] %v", depth, s)
			default:
				ret += " [txt] " + s
			}
		}

		for c := n.FirstChild; c != nil; c = c.NextSibling {
			ret += fCondenseNode(c, depth+1)
		}
		return
	}

	// fRecurse rewrites the subtree rooted at n in place:
	//
	//   - form: an <input name="redirect-to"> carrying the absolutized action
	//     is appended and the form's attributes are rewritten.
	//   - script: every src attribute is replaced by emptySrc.
	//   - a, img, script, style: the node's children are replaced by a single
	//     text node holding the condensed representation of the node; an img
	//     is itself replaced by that text node. Anchors additionally get a
	//     " || ", <br>, "\n" separator sequence inserted into their parent.
	fRecurse = func(n *html.Node) {
		if n.Type == html.ElementNode && n.Data == "form" {

			// NOTE(review): the name suggests a hidden field, but no
			// type attribute is set here — confirm this is intended.
			hidFld := new(html.Node)
			hidFld.Type = html.ElementNode
			hidFld.Data = "input"
			hidFld.Attr = []html.Attribute{
				{Key: "name", Val: "redirect-to"},
				{Key: "value", Val: absolutize(getAttrVal(n.Attr, "action"))},
			}
			n.AppendChild(hidFld)

			n.Attr = rewriteAttributes(n.Attr, UnsyncedGlobalReq)

		}
		if n.Type == html.ElementNode && n.Data == "script" {
			for i := range n.Attr {
				if n.Attr[i].Key == "src" {
					n.Attr[i].Val = emptySrc
				}
			}
		}
		if n.Type == html.ElementNode &&
			(n.Data == "a" || n.Data == "img" || n.Data == "script" || n.Data == "style") {

			// Condense before the children are removed below.
			s := fCondenseNode(n, 0)
			//fmt.Printf("found %v\n", s)
			textReplacement := new(html.Node)
			textReplacement.Type = html.TextNode
			textReplacement.Data = s

			if n.Data == "a" || n.Data == "img" {
				n.Attr = rewriteAttributes(n.Attr, UnsyncedGlobalReq)
			}

			// Remove all existing children. RemoveChild clears the removed
			// node's NextSibling, so always detach the current first child
			// instead of walking the sibling chain.
			for c := n.FirstChild; c != nil; c = n.FirstChild {
				n.RemoveChild(c)
			}

			// we can't put our replacement "under" an image, since img cannot have children
			if n.Data == "img" {
				// n.Parent.InsertBefore(textReplacement,n)
				InsertAfter(n, textReplacement)
				RemoveNode(n)

			} else {
				n.AppendChild(textReplacement)
			}

			if n.Data == "a" {
				prev := n.PrevSibling
				if prev != nil {
					// The three separator nodes are inserted in front of
					// the anchor's previous sibling (not directly in front
					// of the anchor itself).

					breaker0 := new(html.Node)
					breaker0.Type = html.TextNode
					breaker0.Data = " || "
					n.Parent.InsertBefore(breaker0, prev)

					breaker1 := new(html.Node)
					breaker1.Type = html.ElementNode
					// breaker1.Data =  "||<br>\n"
					breaker1.Data = "br"
					n.Parent.InsertBefore(breaker1, prev)

					breaker2 := new(html.Node)
					breaker2.Type = html.TextNode
					breaker2.Data = "\n"
					n.Parent.InsertBefore(breaker2, prev)

				}
			}

		}

		// Fetch the next sibling BEFORE recursing: recursing into an img
		// detaches it from the tree, which would otherwise end the walk
		// early and leave all following siblings unprocessed.
		for c := n.FirstChild; c != nil; {
			next := c.NextSibling
			fRecurse(c)
			c = next
		}
	}

}
Exemple #11
0
// parseAttrs scans the attribute string attrs from left to right and appends
// one html.Attribute to node.Attr per item found.
//
// Two kinds of items are recognised:
//
//   - key=value pairs, where value may be wrapped in single or double quotes
//     (the quotes are stripped) or be unquoted; these are stored under the
//     parsed key.
//   - bare values, optionally wrapped in double quotes; these are stored
//     under the synthetic keys "@0", "@1", ... in order of appearance.
//
// Anything that starts like a key=value pair but yields no value is re-parsed
// from the same position as a bare value.
func parseAttrs(node *html.Node, attrs string) {
	// keyIdx numbers the bare (positional) values; pos is the scan cursor.
	keyIdx := 0
	pos := 0
	for pos < len(attrs) {
		// Skip the separator between items (as counted by countSpaces).
		pos += countSpaces(attrs[pos:])
		if pos >= len(attrs) {
			break
		}

		// try to parse key name
		keyStart, keyEnd := pos, pos
		for keyEnd < len(attrs) && isWord(attrs[keyEnd]) {
			keyEnd++
		}

		// An '=' may be separated from the key by spaces.
		eqPos := keyEnd + countSpaces(attrs[keyEnd:])
		if eqPos < len(attrs) && attrs[eqPos] == '=' {
			// looks like a key-value pair
			// innerPos/innerEnd delimit the value without quotes;
			// valPos/valEnd delimit it including quotes.
			var innerPos, innerEnd int
			valPos := eqPos + 1 + countSpaces(attrs[eqPos+1:])
			valEnd := valPos

			if valPos < len(attrs) {
				// quoted attrib?
				if attrs[valPos] == '"' || attrs[valPos] == '\'' {
					// Look for the matching closing quote. When IndexRune
					// returns -1 innerEnd equals valPos, so the check below
					// fails and valEnd is left unchanged.
					innerEnd = valPos + 1 + strings.IndexRune(attrs[valPos+1:], rune(attrs[valPos]))
					if innerEnd > valPos {
						innerPos, valEnd = valPos+1, innerEnd+1
					}
				}

				// unquoted attrib?
				if valEnd == valPos {
					// Consume up to the next space or quote character.
					for valEnd < len(attrs) && !isSpace(attrs[valEnd]) && attrs[valEnd] != '"' && attrs[valEnd] != '\'' {
						valEnd++
					}
					innerPos = valPos
					innerEnd = valEnd
				}

				// if we have a value, add it to the node!
				if valEnd != valPos {
					node.Attr = append(node.Attr, html.Attribute{
						Key: attrs[keyStart:keyEnd],
						Val: attrs[innerPos:innerEnd],
					})
					pos = valEnd
					continue
				}
			}
		}

		// key-value pair didn't work out. try parsing as indexed value
		// (pos has not moved, so parsing restarts at the item's first byte)
		var innerPos, innerEnd int
		end := pos

		// quoted value?
		if attrs[pos] == '"' {
			// Same closing-quote search as above; only double quotes are
			// recognised for bare values.
			innerEnd = pos + 1 + strings.IndexRune(attrs[pos+1:], '"')
			if innerEnd > pos {
				innerPos, end = pos+1, innerEnd+1
			}
		}

		// if all else fails, try regular value
		if end == pos {
			// Take everything up to the next space.
			for end < len(attrs) && !isSpace(attrs[end]) {
				end++
			}
			innerPos, innerEnd = pos, end
		}

		node.Attr = append(node.Attr, html.Attribute{
			Key: fmt.Sprintf("@%d", keyIdx),
			Val: attrs[innerPos:innerEnd],
		})
		pos = end
		keyIdx++
	}
}