// removeCommentsAndIntertagWhitespace employs horizontal traversal using a queue
func removeCommentsAndIntertagWhitespace(lp interface{}) {

	var queue = util.NewQueue(10)

	for lp != nil {

		lpn := lp.(NdX).Nd
		lvl := lp.(NdX).Lvl

		for c := lpn.FirstChild; c != nil; c = c.NextSibling {
			queue.EnQueue(NdX{c, lvl + 1})
		}

		// processing
		if lpn.Type == html.CommentNode {
			dom.RemoveNode(lpn)
		}

		// extinguish textnodes that do only formatting (spaces, tabs, line breaks)
		if lpn.Type == html.TextNode && isSpacey(lpn.Data) {
			dom.RemoveNode(lpn)
		}

		// next node
		lp = queue.DeQueue()
	}
}
示例#2
0
// r is the request to the proxy
// u is the url, that the proxy has called
func ModifyHTML(r *http.Request, u *url.URL, s string) string {

	var nums int // counter

	// needed to get the current request into the
	// "static" recursive functions
	var PackageProxyHost = r.Host // port included!
	var PackageRemoteHost = fetch.HostFromUrl(u)

	fCondenseNode = func(n *html.Node, depth int) (ret string) {

		if n.Type == html.ElementNode && n.Data == "script" {
			ret += fmt.Sprintf(" var script%v = '[script]'; ", nums)
			nums++
			return
		}
		if n.Type == html.ElementNode && n.Data == "style" {
			ret += fmt.Sprintf(" .xxx {margin:2px;} ")
			return
		}

		if n.Type == html.ElementNode && n.Data == "img" {
			ret += fmt.Sprintf(" [img] %v %v | ", getAttrVal(n.Attr, "alt"), getAttrVal(n.Attr, "src"))
		}

		if n.Type == html.ElementNode && n.Data == "a" {
			ret += "[a]"
		}

		if n.Type == html.TextNode {
			s := n.Data
			// s = replTabsNewline.Replace(s)
			// s = strings.TrimSpace(s)
			if len(s) < 4 {
				ret += s
			} else if s != "" {
				if depth > 0 {
					ret += fmt.Sprintf(" [txt%v] %v", depth, s)
				} else {
					ret += " [txt] " + s
				}
			}
		}

		for c := n.FirstChild; c != nil; c = c.NextSibling {
			ret += fCondenseNode(c, depth+1)
		}
		return
	}

	// --------------------------
	// ----------------------

	fRecurse = func(n *html.Node) {

		if n.Type == html.ElementNode && n.Data == "form" {
			hidFld := new(html.Node)
			hidFld.Type = html.ElementNode
			hidFld.Data = "input"
			hidFld.Attr = []html.Attribute{
				html.Attribute{Key: "name", Val: "redirect-to"},
				html.Attribute{Key: "value", Val: absolutize(getAttrVal(n.Attr, "action"), PackageRemoteHost)},
			}
			n.AppendChild(hidFld)

			submt := new(html.Node)
			submt.Type = html.ElementNode
			submt.Data = "input"
			submt.Attr = []html.Attribute{
				html.Attribute{Key: "type", Val: "submit"},
				html.Attribute{Key: "value", Val: "subm"},
				html.Attribute{Key: "accesskey", Val: "f"},
			}
			n.AppendChild(submt)

			n.Attr = rewriteAttributes(n.Attr, PackageProxyHost, PackageRemoteHost)

		}
		if n.Type == html.ElementNode && n.Data == "script" {
			for i := 0; i < len(n.Attr); i++ {
				if n.Attr[i].Key == "src" {
					n.Attr[i].Val = emptySrc
				}
			}
		}
		if n.Type == html.ElementNode &&
			(n.Data == "a" || n.Data == "img" || n.Data == "script" || n.Data == "style") {

			s := fCondenseNode(n, 0)
			//fmt.Printf("found %v\n", s)
			textReplacement := new(html.Node)
			textReplacement.Type = html.TextNode
			textReplacement.Data = s

			attrStore := []html.Attribute{}
			if n.Data == "a" || n.Data == "img" {
				attrStore = rewriteAttributes(n.Attr, PackageProxyHost, PackageRemoteHost)
			}
			if n.Data == "img" {
				n.Data = "a"
			}
			if n.Data == "a" {
				n.Attr = attrStore
			}

			// We want to remove all existing children.
			// Direct loop impossible, since "NextSibling" is set to nil by Remove().
			// Therefore first assembling separately, then removing.
			children := make(map[*html.Node]struct{})
			for c := n.FirstChild; c != nil; c = c.NextSibling {
				children[c] = struct{}{}
			}
			for k, _ := range children {
				n.RemoveChild(k)
			}

			// we can't put our replacement "under" an image, since img cannot have children
			if n.Type == html.ElementNode && n.Data == "img" {
				// n.Parent.InsertBefore(textReplacement,n)
				dom.InsertAfter(n, textReplacement)
				dom.RemoveNode(n)

			} else {
				n.AppendChild(textReplacement)
			}

			// Insert a  || and a newline before every <a...>
			if n.Data == "a" {
				prev := n

				breaker0 := dom.Nd("text", "||")
				n.Parent.InsertBefore(breaker0, prev)

				breaker1 := dom.Nd("br")
				n.Parent.InsertBefore(breaker1, prev)

				breaker2 := dom.Nd("text", "\n")
				n.Parent.InsertBefore(breaker2, prev)
			}

		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			fRecurse(c)
		}
	}

	// --------------------------
	// ----------------------
	var docRoot *html.Node
	var err error
	rdr := strings.NewReader(s)
	docRoot, err = html.Parse(rdr)
	if err != nil {
		panic(fmt.Sprintf("3 %v \n", err))
	}

	fRecurse(docRoot)

	var b bytes.Buffer
	err = html.Render(&b, docRoot)
	if err != nil {
		panic(fmt.Sprintf("4 %v \n", err))
	}
	// log.Printf("len is %v\n", b.Len())

	return b.String()
}