// r is the request to the proxy // u is the url, that the proxy has called func closuredProxifier(argProxyHostPort string, urlSrc *url.URL) FuncType2 { // needed to get the current request into the // "static" recursive functions var closProxyHostPort = argProxyHostPort // port included! var closRemoteHost = fetch.HostFromUrl(urlSrc) // log.Printf("ProxyHost %v, RemoteHost %v (%s)", closProxyHostPort, closRemoteHost, urlSrc) // -------------------------- // ---------------------- var fRecurse FuncType2 fRecurse = func(n *html.Node) { switch { case n.Type == html.ElementNode && n.Data == "form": hidFld := dom.Nd("input") hidFld.Attr = []html.Attribute{ html.Attribute{Key: "name", Val: "redirect-to"}, html.Attribute{Key: "value", Val: attrX(n.Attr, "action")}, } n.AppendChild(hidFld) submt := dom.Nd("input") submt.Attr = []html.Attribute{ html.Attribute{Key: "type", Val: "submit"}, html.Attribute{Key: "value", Val: "subm"}, html.Attribute{Key: "accesskey", Val: "f"}, } n.AppendChild(submt) n.Attr = attrSet(n.Attr, "method", "post") n.Attr = attrSet(n.Attr, "was", "rewritten") n.Attr = attrsAbsoluteAndProxified(n.Attr, closProxyHostPort, closRemoteHost) case n.Type == html.ElementNode && (n.Data == "a" || n.Data == "img"): if n.Data == "a" || n.Data == "img" { attrStore := attrsAbsoluteAndProxified(n.Attr, closProxyHostPort, closRemoteHost) n.Attr = attrStore } default: } for c := n.FirstChild; c != nil; c = c.NextSibling { fRecurse(c) } } return fRecurse }
func img2Link(img *html.Node) { if img.Data == "img" { img.Data = "a" for i := 0; i < len(img.Attr); i++ { if img.Attr[i].Key == "src" { img.Attr[i].Key = "href" } } double := closureTextNodeExists(img) imgContent := "" title := attrX(img.Attr, "title") if double { imgContent = fmt.Sprintf("[img] %v %v | ", "[ctdr]", // content title double removed urlBeautify(attrX(img.Attr, "href"))) } else { imgContent = fmt.Sprintf("[img] %v %v | ", title, urlBeautify(attrX(img.Attr, "href"))) } img.Attr = attrSet(img.Attr, "cfrom", "img") nd := dom.Nd("text", imgContent) img.AppendChild(nd) } }
func textifyNodeSubtree(n *html.Node) { if n.Type == html.ElementNode { nd := dom.Nd("text") nd.Data = textifySubtreeBruteForce(n, 0) nd.Data = stringspb.NormalizeInnerWhitespace(nd.Data) cc := []*html.Node{} for c := n.FirstChild; c != nil; c = c.NextSibling { cc = append(cc, c) } for _, c := range cc { n.RemoveChild(c) } n.AppendChild(nd) nd2 := dom.Nd("br") dom.InsertAfter(n, nd2) } }
func flattenSubtreeV3Inner(n, nClone *html.Node, lvl int) { // log.Printf("fsbi\n") for ch := n.FirstChild; ch != nil; ch = ch.NextSibling { chClone := dom.CloneNode(ch) switch { case ch.Type == html.ElementNode && standard[ch.Data]: nClone.AppendChild(chClone) flattenSubtreeV3Inner(ch, chClone, lvl+1) case ch.Type == html.ElementNode && ch.Data == "a": nClone.AppendChild(chClone) flattenSubtreeV3Inner(ch, chClone, lvl+1) case ch.Type == html.ElementNode && ch.Data == "img": nClone.AppendChild(chClone) case ch.Data == "span": // log.Printf(strings.Repeat(" ", lvl) + "span \n") for cch := ch.FirstChild; cch != nil; cch = cch.NextSibling { // log.Printf(strings.Repeat(" ", lvl)+"span child %v", cch.Data) cchClone := dom.CloneNode(cch) nClone.AppendChild(cchClone) nClone.AppendChild(dom.Nd("text", " ")) flattenSubtreeV3Inner(cch, cchClone, lvl+1) } case ch.Type == html.TextNode && ch.Data != "": chClone.Data = strings.TrimSpace(chClone.Data) chClone.Data += " " nClone.AppendChild(chClone) default: // nClone.AppendChild(chClone) log.Printf("unhandled %s %s\n", dom.NodeTypeStr(ch.Type), ch.Data) } } }
// r is the request to the proxy // u is the url, that the proxy has called func ModifyHTML(r *http.Request, u *url.URL, s string) string { var nums int // counter // needed to get the current request into the // "static" recursive functions var PackageProxyHost = r.Host // port included! var PackageRemoteHost = fetch.HostFromUrl(u) fCondenseNode = func(n *html.Node, depth int) (ret string) { if n.Type == html.ElementNode && n.Data == "script" { ret += fmt.Sprintf(" var script%v = '[script]'; ", nums) nums++ return } if n.Type == html.ElementNode && n.Data == "style" { ret += fmt.Sprintf(" .xxx {margin:2px;} ") return } if n.Type == html.ElementNode && n.Data == "img" { ret += fmt.Sprintf(" [img] %v %v | ", getAttrVal(n.Attr, "alt"), getAttrVal(n.Attr, "src")) } if n.Type == html.ElementNode && n.Data == "a" { ret += "[a]" } if n.Type == html.TextNode { s := n.Data // s = replTabsNewline.Replace(s) // s = strings.TrimSpace(s) if len(s) < 4 { ret += s } else if s != "" { if depth > 0 { ret += fmt.Sprintf(" [txt%v] %v", depth, s) } else { ret += " [txt] " + s } } } for c := n.FirstChild; c != nil; c = c.NextSibling { ret += fCondenseNode(c, depth+1) } return } // -------------------------- // ---------------------- fRecurse = func(n *html.Node) { if n.Type == html.ElementNode && n.Data == "form" { hidFld := new(html.Node) hidFld.Type = html.ElementNode hidFld.Data = "input" hidFld.Attr = []html.Attribute{ html.Attribute{Key: "name", Val: "redirect-to"}, html.Attribute{Key: "value", Val: absolutize(getAttrVal(n.Attr, "action"), PackageRemoteHost)}, } n.AppendChild(hidFld) submt := new(html.Node) submt.Type = html.ElementNode submt.Data = "input" submt.Attr = []html.Attribute{ html.Attribute{Key: "type", Val: "submit"}, html.Attribute{Key: "value", Val: "subm"}, html.Attribute{Key: "accesskey", Val: "f"}, } n.AppendChild(submt) n.Attr = rewriteAttributes(n.Attr, PackageProxyHost, PackageRemoteHost) } if n.Type == html.ElementNode && n.Data == "script" { for i := 0; i < len(n.Attr); i++ { if n.Attr[i].Key == "src" { n.Attr[i].Val = emptySrc } } } if n.Type == html.ElementNode && (n.Data == "a" || n.Data == "img" || n.Data == "script" || n.Data == "style") { s := fCondenseNode(n, 0) //fmt.Printf("found %v\n", s) textReplacement := new(html.Node) textReplacement.Type = html.TextNode textReplacement.Data = s attrStore := []html.Attribute{} if n.Data == "a" || n.Data == "img" { attrStore = rewriteAttributes(n.Attr, PackageProxyHost, PackageRemoteHost) } if n.Data == "img" { n.Data = "a" } if n.Data == "a" { n.Attr = attrStore } // We want to remove all existing children. // Direct loop impossible, since "NextSibling" is set to nil by Remove(). // Therefore first assembling separately, then removing. children := make(map[*html.Node]struct{}) for c := n.FirstChild; c != nil; c = c.NextSibling { children[c] = struct{}{} } for k, _ := range children { n.RemoveChild(k) } // we can't put our replacement "under" an image, since img cannot have children if n.Type == html.ElementNode && n.Data == "img" { // n.Parent.InsertBefore(textReplacement,n) dom.InsertAfter(n, textReplacement) dom.RemoveNode(n) } else { n.AppendChild(textReplacement) } // Insert a || and a newline before every <a...> if n.Data == "a" { prev := n breaker0 := dom.Nd("text", "||") n.Parent.InsertBefore(breaker0, prev) breaker1 := dom.Nd("br") n.Parent.InsertBefore(breaker1, prev) breaker2 := dom.Nd("text", "\n") n.Parent.InsertBefore(breaker2, prev) } } for c := n.FirstChild; c != nil; c = c.NextSibling { fRecurse(c) } } // -------------------------- // ---------------------- var docRoot *html.Node var err error rdr := strings.NewReader(s) docRoot, err = html.Parse(rdr) if err != nil { panic(fmt.Sprintf("3 %v \n", err)) } fRecurse(docRoot) var b bytes.Buffer err = html.Render(&b, docRoot) if err != nil { panic(fmt.Sprintf("4 %v \n", err)) } // log.Printf("len is %v\n", b.Len()) return b.String() }
func breakoutImagesFromAnchorTrees(n *html.Node) { for c := n.FirstChild; c != nil; c = c.NextSibling { breakoutImagesFromAnchorTrees(c) } if n.Type == html.ElementNode && n.Data == "a" { img, lvl := searchImg(n, nil, 0) if img != nil { only1Child := n.FirstChild != nil && n.FirstChild == n.LastChild if lvl == 1 && only1Child { // log.Printf("only child image lvl %v a\n", lvl) n.RemoveChild(img) n.Parent.InsertBefore(img, n.NextSibling) // "insert after; if n.NextSibling==nil => insert at the end" contnt := urlBeautify(attrX(n.Attr, "href")) if len(contnt) < 6 { contnt = "[was img] " + contnt } n.AppendChild(dom.Nd("text", contnt)) } else { if debugBreakOut { b0 := dom.PrintSubtree(n) log.Printf("\n%s\n", b0) } // log.Printf(" got it %v\n", img.Data) a1 := dom.CloneNodeWithSubtree(n) fc1 := closureDeleter(true) fc1(n, 0, false) if debugBreakOut { b1 := dom.PrintSubtree(n) log.Printf("\n%s\n", b1) } fc2 := closureDeleter(false) fc2(a1, 0, false) if debugBreakOut { b2 := dom.PrintSubtree(a1) log.Printf("\n%s\n", b2) log.Printf("--------------------\n") } if true { n.Parent.InsertBefore(img, n.NextSibling) // "insert after; if n.NextSibling==nil => insert at the end" n.Parent.InsertBefore(a1, img.NextSibling) } else { // old way ; sequence corrpution if n had rightwise siblings. n.Parent.AppendChild(img) n.Parent.AppendChild(a1) } } // changing image to link later } else { // log.Printf("no img in a\n") } } }
func condenseBottomUpV2(n *html.Node, lvl, lvlDo int, types map[string]bool) { if lvl < lvlDo { cs := []*html.Node{} for c := n.FirstChild; c != nil; c = c.NextSibling { cs = append(cs, c) } for _, c := range cs { condenseBottomUpV2(c, lvl+1, lvlDo, types) } } else { // log.Printf("action on %v %v\n", lvl, lvlDo) switch { case n.Type == html.ElementNode && types[n.Data]: oldPar := n.Parent if oldPar == nil { return } b, newPar := flattenSubtreeV2(n, nil, 0, nil) // placeholder := dom.Nd("div") // par := n.Parent // par.InsertBefore(placeholder, n.NextSibling) // par.RemoveChild(n) // par.InsertBefore(n2, placeholder) for c := oldPar.FirstChild; c != nil; c = c.NextSibling { oldPar.RemoveChild(c) } for c := newPar.FirstChild; c != nil; c = c.NextSibling { newPar.RemoveChild(c) oldPar.AppendChild(c) } if lvlDo > 4 { bx := dom.PrintSubtree(newPar) fmt.Printf("%s", bx) } // n = n2 nodeRepl := dom.Nd("text", b.String()) if false { // Remove all existing children. // Direct loop impossible, since "NextSibling" is set to nil by Remove(). children := []*html.Node{} for c := n.FirstChild; c != nil; c = c.NextSibling { children = append(children, c) // assembling separately, before removing. } for _, c := range children { log.Printf("c %4v rem from %4v ", c.Data, n.Data) n.RemoveChild(c) } // we can't put our replacement "under" an image, since img cannot have children if n.Type == html.ElementNode && n.Data == "img" { n.Parent.InsertBefore(nodeRepl, n.NextSibling) // if n.NextSibling==nil => insert at the end n.Parent.RemoveChild(n) } else { n.AppendChild(nodeRepl) } // Insert a || and a newline before every <a...> // if n.Data == "a" { // n.Parent.InsertBefore(dom.Nd("text", " || "), n) // } } default: } } }
// Now this third implementation finally condenses *selectively*. // Not all boats from each pond are lifted equally. // We achieve tremendous structural simplification. // It also starts from top, pulling lower levels up. // Unlike implementation #1, that started from the middle. func topDownV3(l1 *html.Node, l2Types map[string]bool, l3Types map[string]bool) { if l1.Type != html.ElementNode && l1.Type != html.DocumentNode { return // cannot assign to - do not unable to have children } if l1.Data == "span" || l1.Data == "a" { return // want not condense into } // dig two levels deep // isolate l2,l3 l2s := []*html.Node{} l3s := map[*html.Node][]*html.Node{} for l2 := l1.FirstChild; l2 != nil; l2 = l2.NextSibling { l2s = append(l2s, l2) // l2s = append([]*html.Node{l2}, l2s...) // order inversion for l3 := l2.FirstChild; l3 != nil; l3 = l3.NextSibling { l3s[l2] = append(l3s[l2], l3) // l3s[l2] = append(map[*html.Node][]*html.Node{l2: []*html.Node{l3}}, l3s[l2]...) // order inversion } } postponedRemoval := map[*html.Node]bool{} // // // check types for each l2 subtree distinctively for _, l2 := range l2s { l2Match := l2.Type == html.ElementNode && l2Types[l2.Data] // l2 is a div l3Match := true for _, l3 := range l3s[l2] { l3Match = l3Match && (l3.Type == html.ElementNode && l3Types[l3.Data]) } // act if l2Match && l3Match { // detach l3 from l2 for _, l3 := range l3s[l2] { // if ml3[l3] > 0 { // fmt.Printf("rmd_%v_%v ", ml3[l3], l3.Data) // } l2.RemoveChild(l3) // ml3[l3]++ } // Since we still need l2 below // We have to postpone detaching l2 from l1 // to the bottom // NOT HERE: l1.RemoveChild(l2) postponedRemoval[l2] = true for _, l3 := range l3s[l2] { // attach l3 to l1 if l3.Data != "a" && l3.Data != "span" { l1.InsertBefore(l3, l2) } else { wrap := dom.Nd("p") wrap.Attr = []html.Attribute{html.Attribute{Key: "cfrm", Val: "noth"}} wrap.AppendChild(l3) // NOT wrap.FirstChild = l3 l1.InsertBefore(wrap, l2) } } } } for k, _ := range postponedRemoval { l1.RemoveChild(k) // detach l2 from l1 } }