Exemple #1
0
// closuredProxifier returns a node visitor that rewrites a parsed HTML tree
// so that its forms, links and images are routed through the proxy.
//
// argProxyHostPort is the host of the proxy, port included.
// urlSrc is the url that the proxy has called; the host derived from it
// by fetch.HostFromUrl is handed to attrsAbsoluteAndProxified as remote host.
//
// The returned function visits n and then, recursively, all its descendants:
//   - form: receives an additional "redirect-to" input carrying the original
//     action, an additional submit button, method="post" and the marker
//     was="rewritten"; finally its attributes are passed through
//     attrsAbsoluteAndProxified.
//   - a, img: the attributes are passed through attrsAbsoluteAndProxified.
func closuredProxifier(argProxyHostPort string, urlSrc *url.URL) FuncType2 {

	// Both values are captured by the closure below,
	// so the recursive visitor needs no extra parameters.
	proxyHostPort := argProxyHostPort // port included!
	remoteHost := fetch.HostFromUrl(urlSrc)

	var fRecurse FuncType2
	fRecurse = func(n *html.Node) {

		if n.Type == html.ElementNode {
			switch n.Data {

			case "form":
				// The original action is read before any attribute is rewritten.
				hidFld := dom.Nd("input")
				hidFld.Attr = []html.Attribute{
					{Key: "name", Val: "redirect-to"},
					{Key: "value", Val: attrX(n.Attr, "action")},
				}
				n.AppendChild(hidFld)

				submt := dom.Nd("input")
				submt.Attr = []html.Attribute{
					{Key: "type", Val: "submit"},
					{Key: "value", Val: "subm"},
					{Key: "accesskey", Val: "f"},
				}
				n.AppendChild(submt)

				n.Attr = attrSet(n.Attr, "method", "post")
				n.Attr = attrSet(n.Attr, "was", "rewritten")
				n.Attr = attrsAbsoluteAndProxified(n.Attr, proxyHostPort, remoteHost)

			case "a", "img":
				n.Attr = attrsAbsoluteAndProxified(n.Attr, proxyHostPort, remoteHost)
			}
		}

		// Nodes appended to a form above are visited as well.
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			fRecurse(c)
		}
	}

	return fRecurse
}
Exemple #2
0
// img2Link converts an <img> element in place into an <a> element.
//
// Every src attribute is renamed to href, the attribute cfrom="img" is set,
// and a text child of the form "[img] <label> <beautified href> | " is
// appended. The label is the title attribute, or "[ctdr]" when
// closureTextNodeExists(img) reports true.
//
// Nodes that are not <img> elements are left untouched. This includes
// text or comment nodes whose content merely happens to read "img".
func img2Link(img *html.Node) {

	if img.Type != html.ElementNode || img.Data != "img" {
		return
	}

	img.Data = "a"
	for i := range img.Attr {
		if img.Attr[i].Key == "src" {
			img.Attr[i].Key = "href"
		}
	}

	double := closureTextNodeExists(img)
	label := attrX(img.Attr, "title")
	if double {
		label = "[ctdr]" // content title double removed
	}
	imgContent := fmt.Sprintf("[img] %v %v | ",
		label,
		urlBeautify(attrX(img.Attr, "href")))

	img.Attr = attrSet(img.Attr, "cfrom", "img")
	img.AppendChild(dom.Nd("text", imgContent))
}
// textifyNodeSubtree replaces all children of an element node by a single
// text node and inserts a <br> after the element.
//
// The text is whatever textifySubtreeBruteForce(n, 0) yields, run through
// stringspb.NormalizeInnerWhitespace. Non-element nodes are ignored.
func textifyNodeSubtree(n *html.Node) {

	if n.Type != html.ElementNode {
		return
	}

	// The text has to be computed while the subtree is still intact.
	flat := dom.Nd("text")
	flat.Data = stringspb.NormalizeInnerWhitespace(textifySubtreeBruteForce(n, 0))

	// Detach the current first child until none is left;
	// this avoids following sibling links of already removed nodes.
	for n.FirstChild != nil {
		n.RemoveChild(n.FirstChild)
	}
	n.AppendChild(flat)

	dom.InsertAfter(n, dom.Nd("br"))
}
// flattenSubtreeV3Inner copies the children of n into nClone,
// dropping <span> wrappers on the way.
//
// For each child of n:
//   - elements listed in the package-level map standard, and <a>:
//     a clone is appended to nClone and the child's subtree is processed
//     recursively into that clone.
//   - <img>: a clone is appended, without descending.
//   - <span>: the span itself is skipped; clones of its children are appended
//     directly to nClone, each followed by a " " text node, and each of those
//     children is processed recursively.
//   - non-empty text: a clone is appended whose text is trimmed and
//     terminated by a single space.
//   - everything else is logged as unhandled and omitted.
//
// lvl is the current recursion depth; it is only passed along.
func flattenSubtreeV3Inner(n, nClone *html.Node, lvl int) {

	for ch := n.FirstChild; ch != nil; ch = ch.NextSibling {

		chClone := dom.CloneNode(ch)
		isElem := ch.Type == html.ElementNode

		switch {

		case isElem && (standard[ch.Data] || ch.Data == "a"):
			nClone.AppendChild(chClone)
			flattenSubtreeV3Inner(ch, chClone, lvl+1)

		case isElem && ch.Data == "img":
			nClone.AppendChild(chClone)

		// Only real <span> elements are unwrapped. Without the type check
		// a text node reading "span" would land here and be dropped.
		case isElem && ch.Data == "span":
			for cch := ch.FirstChild; cch != nil; cch = cch.NextSibling {
				cchClone := dom.CloneNode(cch)
				nClone.AppendChild(cchClone)
				nClone.AppendChild(dom.Nd("text", " "))
				flattenSubtreeV3Inner(cch, cchClone, lvl+1)
			}

		case ch.Type == html.TextNode && ch.Data != "":
			chClone.Data = strings.TrimSpace(chClone.Data) + " "
			nClone.AppendChild(chClone)

		default:
			log.Printf("unhandled %s %s\n", dom.NodeTypeStr(ch.Type), ch.Data)

		}

	}

}
Exemple #5
0
// ModifyHTML parses the HTML document s, rewrites it for delivery through
// the proxy and returns the re-rendered markup.
//
// r is the request to the proxy; r.Host (port included) serves as proxy host.
// u is the url that the proxy has called; its host serves as remote host.
//
// Rewrites applied while walking the tree:
//   - form: receives a "redirect-to" input holding the absolutized action
//     and an additional submit button; its attributes are passed through
//     rewriteAttributes.
//   - script: every src attribute is set to emptySrc.
//   - a, img, script, style: all children are replaced by one text node
//     holding a condensed summary of the former subtree. img is turned
//     into a; a and img get their attributes passed through
//     rewriteAttributes. Every resulting a is preceded by "||", a <br>
//     and a newline.
//
// ModifyHTML panics if s cannot be parsed or the tree cannot be rendered.
func ModifyHTML(r *http.Request, u *url.URL, s string) string {

	var nums int // counts the script elements condensed so far

	// Captured by the closures below.
	proxyHost := r.Host // port included!
	remoteHost := fetch.HostFromUrl(u)

	// Both walkers are declared locally on purpose: they capture state of
	// the current call, so they must not be shared between concurrent calls.
	var fCondenseNode func(n *html.Node, depth int) string
	var fRecurse func(n *html.Node)

	// fCondenseNode summarizes the subtree below n as a short string.
	fCondenseNode = func(n *html.Node, depth int) string {

		if n.Type == html.ElementNode {
			switch n.Data {
			case "script":
				placeholder := fmt.Sprintf(" var script%v = '[script]'; ", nums)
				nums++
				return placeholder
			case "style":
				return " .xxx {margin:2px;} "
			}
		}

		var sb strings.Builder

		if n.Type == html.ElementNode {
			switch n.Data {
			case "img":
				fmt.Fprintf(&sb, " [img] %v %v | ", getAttrVal(n.Attr, "alt"), getAttrVal(n.Attr, "src"))
			case "a":
				sb.WriteString("[a]")
			}
		}

		if n.Type == html.TextNode {
			txt := n.Data
			switch {
			case len(txt) < 4:
				// very short texts are taken over without a marker
				sb.WriteString(txt)
			case depth > 0:
				fmt.Fprintf(&sb, " [txt%v] %v", depth, txt)
			default:
				sb.WriteString(" [txt] " + txt)
			}
		}

		for c := n.FirstChild; c != nil; c = c.NextSibling {
			sb.WriteString(fCondenseNode(c, depth+1))
		}
		return sb.String()
	}

	// fRecurse rewrites n in place and then descends into its children.
	fRecurse = func(n *html.Node) {

		isElem := n.Type == html.ElementNode

		if isElem && n.Data == "form" {
			hidFld := &html.Node{Type: html.ElementNode, Data: "input"}
			hidFld.Attr = []html.Attribute{
				{Key: "name", Val: "redirect-to"},
				{Key: "value", Val: absolutize(getAttrVal(n.Attr, "action"), remoteHost)},
			}
			n.AppendChild(hidFld)

			submt := &html.Node{Type: html.ElementNode, Data: "input"}
			submt.Attr = []html.Attribute{
				{Key: "type", Val: "submit"},
				{Key: "value", Val: "subm"},
				{Key: "accesskey", Val: "f"},
			}
			n.AppendChild(submt)

			n.Attr = rewriteAttributes(n.Attr, proxyHost, remoteHost)
		}

		if isElem && n.Data == "script" {
			for i := range n.Attr {
				if n.Attr[i].Key == "src" {
					n.Attr[i].Val = emptySrc
				}
			}
		}

		if isElem &&
			(n.Data == "a" || n.Data == "img" || n.Data == "script" || n.Data == "style") {

			// The summary must be taken before the subtree is torn down.
			textReplacement := &html.Node{Type: html.TextNode, Data: fCondenseNode(n, 0)}

			// Images become links. From here on n.Data is never "img",
			// so the replacement can always be appended as a child.
			if n.Data == "a" || n.Data == "img" {
				n.Attr = rewriteAttributes(n.Attr, proxyHost, remoteHost)
				n.Data = "a"
			}

			// Remove all existing children. Following NextSibling of a
			// removed node is impossible, since RemoveChild resets it;
			// therefore always detach the current first child.
			for n.FirstChild != nil {
				n.RemoveChild(n.FirstChild)
			}
			n.AppendChild(textReplacement)

			// Insert a || and a newline before every <a...>
			if n.Data == "a" {
				n.Parent.InsertBefore(dom.Nd("text", "||"), n)
				n.Parent.InsertBefore(dom.Nd("br"), n)
				n.Parent.InsertBefore(dom.Nd("text", "\n"), n)
			}
		}

		for c := n.FirstChild; c != nil; c = c.NextSibling {
			fRecurse(c)
		}
	}

	// --------------------------
	// parse, rewrite, render
	docRoot, err := html.Parse(strings.NewReader(s))
	if err != nil {
		panic(fmt.Sprintf("3 %v \n", err))
	}

	fRecurse(docRoot)

	var b bytes.Buffer
	if err = html.Render(&b, docRoot); err != nil {
		panic(fmt.Sprintf("4 %v \n", err))
	}

	return b.String()
}
// breakoutImagesFromAnchorTrees processes the tree bottom-up (children first)
// and moves the image that searchImg finds below an <a> element out of that
// anchor, so that it becomes the anchor's following sibling.
//
// Two situations are distinguished:
//   - The image is reported at level 1 and the anchor has exactly one child:
//     the image is moved behind the anchor, and the anchor receives its
//     beautified href as text (prefixed with "[was img] " if that text is
//     shorter than six bytes).
//   - Otherwise the anchor is cloned with its subtree; closureDeleter(true)
//     is applied to the original and closureDeleter(false) to the clone.
//     Then image and clone are inserted behind the anchor, in that order.
//     NOTE(review): the two deleters presumably prune complementary parts
//     of the subtree and detach the image - confirm against closureDeleter.
//
// Turning the image itself into a link happens elsewhere.
func breakoutImagesFromAnchorTrees(n *html.Node) {

	for c := n.FirstChild; c != nil; c = c.NextSibling {
		breakoutImagesFromAnchorTrees(c)
	}

	if n.Type != html.ElementNode || n.Data != "a" {
		return
	}

	img, lvl := searchImg(n, nil, 0)
	if img == nil {
		return // no img in a
	}

	soleChild := n.FirstChild != nil && n.FirstChild == n.LastChild
	if lvl == 1 && soleChild {
		n.RemoveChild(img)
		// "insert after"; if n.NextSibling==nil => insert at the end
		n.Parent.InsertBefore(img, n.NextSibling)

		label := urlBeautify(attrX(n.Attr, "href"))
		if len(label) < 6 {
			label = "[was img] " + label
		}
		n.AppendChild(dom.Nd("text", label))
		return
	}

	if debugBreakOut {
		log.Printf("\n%s\n", dom.PrintSubtree(n))
	}

	twin := dom.CloneNodeWithSubtree(n)

	pruneOrig := closureDeleter(true)
	pruneOrig(n, 0, false)
	if debugBreakOut {
		log.Printf("\n%s\n", dom.PrintSubtree(n))
	}

	pruneTwin := closureDeleter(false)
	pruneTwin(twin, 0, false)
	if debugBreakOut {
		log.Printf("\n%s\n", dom.PrintSubtree(twin))
		log.Printf("--------------------\n")
	}

	// Insert directly behind n. Appending to the end of the parent instead
	// corrupted the sequence whenever n had rightwise siblings.
	n.Parent.InsertBefore(img, n.NextSibling)
	n.Parent.InsertBefore(twin, img.NextSibling)
}
// condenseBottomUpV2 descends from n to the depth lvlDo and, at that depth,
// replaces children of the parent by the flattened form of every element
// whose tag is contained in types.
//
// n is the node currently visited, lvl its depth, lvlDo the depth at which
// to act. types is the set of element names that are to be condensed.
//
// Nodes above lvlDo are only traversed. Nodes at lvlDo that are not
// elements, are not listed in types, or have no parent are left alone.
func condenseBottomUpV2(n *html.Node, lvl, lvlDo int, types map[string]bool) {

	if lvl < lvlDo {

		// Not deep enough yet: recurse into a snapshot of the children,
		// since the recursion may modify the child list.
		cs := []*html.Node{}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			cs = append(cs, c)
		}
		for _, c := range cs {
			condenseBottomUpV2(c, lvl+1, lvlDo, types)
		}

	} else {

		// log.Printf("action on %v %v\n", lvl, lvlDo)

		switch {

		case n.Type == html.ElementNode && types[n.Data]:

			// A detached node has nowhere to put the result.
			oldPar := n.Parent
			if oldPar == nil {
				return
			}

			// b is only consumed via b.String() below;
			// newPar carries the flattened nodes as children.
			b, newPar := flattenSubtreeV2(n, nil, 0, nil)

			// placeholder := dom.Nd("div")
			// par := n.Parent
			// par.InsertBefore(placeholder, n.NextSibling)
			// par.RemoveChild(n)
			// par.InsertBefore(n2, placeholder)

			// NOTE(review): as the comment in the disabled block below
			// points out, RemoveChild sets NextSibling to nil. The loop
			// header reads c.NextSibling after the removal, so this loop
			// looks like it stops after the first child of oldPar.
			// Confirm whether removing only the first child is intended.
			for c := oldPar.FirstChild; c != nil; c = c.NextSibling {
				oldPar.RemoveChild(c)
			}

			// NOTE(review): same pattern - presumably only the first child
			// of newPar is moved over to oldPar. Confirm before changing;
			// callers may rely on the current outcome.
			for c := newPar.FirstChild; c != nil; c = c.NextSibling {
				newPar.RemoveChild(c)
				oldPar.AppendChild(c)
			}

			// Debug output to stdout, for deep levels only.
			if lvlDo > 4 {
				bx := dom.PrintSubtree(newPar)
				fmt.Printf("%s", bx)
			}

			// n = n2

			// nodeRepl is only consumed by the disabled block below.
			nodeRepl := dom.Nd("text", b.String())

			// Disabled earlier approach: replace the children of n
			// by one text node holding the flattened content.
			if false {

				// Remove all existing children.
				// Direct loop impossible, since "NextSibling" is set to nil by Remove().
				children := []*html.Node{}
				for c := n.FirstChild; c != nil; c = c.NextSibling {
					children = append(children, c) //  assembling separately, before removing.
				}
				for _, c := range children {
					log.Printf("c %4v rem from %4v ", c.Data, n.Data)
					n.RemoveChild(c)
				}

				// we can't put our replacement "under" an image, since img cannot have children
				if n.Type == html.ElementNode && n.Data == "img" {
					n.Parent.InsertBefore(nodeRepl, n.NextSibling) // if n.NextSibling==nil => insert at the end
					n.Parent.RemoveChild(n)
				} else {
					n.AppendChild(nodeRepl)
				}

				// Insert a  || and a newline before every <a...>
				// if n.Data == "a" {
				// 	n.Parent.InsertBefore(dom.Nd("text", " || "), n)
				// }
			}

		default:
		}

	}

}
Exemple #8
0
// topDownV3 is the third implementation; it condenses *selectively*
// and works from the top, pulling lower levels up.
//
// l1 is the node to condense into. For every child l2 of l1 that is an
// element listed in l2Types and whose children are *all* elements listed in
// l3Types, those grandchildren are hoisted into l1 at the position of l2,
// and l2 itself is removed. Hoisted <a> and <span> elements are wrapped
// into a <p cfrm="noth"> first. An l2 without any children qualifies too.
//
// Nothing happens if l1 is neither an element nor a document node,
// or if l1 is a <span> or an <a>.
func topDownV3(l1 *html.Node, l2Types map[string]bool, l3Types map[string]bool) {

	switch l1.Type {
	case html.ElementNode, html.DocumentNode:
		// may have children - go on
	default:
		return // such nodes are unable to have children
	}
	if l1.Data == "span" || l1.Data == "a" {
		return // we do not want to condense into these
	}

	// Snapshot two levels below l1,
	// since the tree is restructured while we work on it.
	var children []*html.Node
	grandChildren := map[*html.Node][]*html.Node{}
	for l2 := l1.FirstChild; l2 != nil; l2 = l2.NextSibling {
		children = append(children, l2)
		for l3 := l2.FirstChild; l3 != nil; l3 = l3.NextSibling {
			grandChildren[l2] = append(grandChildren[l2], l3)
		}
	}

	// l2 is still needed as insertion mark while hoisting;
	// detaching it from l1 is therefore postponed to the very end.
	var emptied []*html.Node

	// check types for each l2 subtree distinctively
	for _, l2 := range children {

		if l2.Type != html.ElementNode || !l2Types[l2.Data] {
			continue
		}

		kids := grandChildren[l2]
		hoistable := true
		for _, l3 := range kids {
			if l3.Type != html.ElementNode || !l3Types[l3.Data] {
				hoistable = false
				break
			}
		}
		if !hoistable {
			continue
		}

		// detach l3 from l2
		for _, l3 := range kids {
			l2.RemoveChild(l3)
		}
		emptied = append(emptied, l2)

		// attach l3 to l1
		for _, l3 := range kids {
			if l3.Data == "a" || l3.Data == "span" {
				wrap := dom.Nd("p")
				wrap.Attr = []html.Attribute{html.Attribute{Key: "cfrm", Val: "noth"}}
				wrap.AppendChild(l3) // NOT  wrap.FirstChild = l3
				l1.InsertBefore(wrap, l2)
				continue
			}
			l1.InsertBefore(l3, l2)
		}
	}

	for _, l2 := range emptied {
		l1.RemoveChild(l2) // detach l2 from l1
	}

}