func Test2(t *testing.T) {

	start := time.Now()

	lg, b := loghttp.BuffLoggerUniversal(nil, nil)
	_ = b
	// closureOverBuf := func(bUnused *bytes.Buffer) {
	// 	loghttp.Pf(nil, nil, b.String())
	// }
	// defer closureOverBuf(b) // the argument is ignored

	var c aetest.Context
	if false {
		var err error
		c, err = aetest.NewContext(nil)
		lg(err)
		if err != nil {
			return
		}
		defer c.Close()
	}

	fs := GetFS(c, 2)

	lg("took1 %4.2f secs", time.Since(start).Seconds())

	least3Files := FetchAndDecodeJSON(nil, URLs[0], "", lg, fs)
	lg("took2 %4.2f secs", time.Since(start).Seconds())

	// Dedup expects the remote URL as its first argument;
	// derive it from the URL we just fetched from.
	oURL, err := fetch.URLFromString(URLs[0])
	lg(err)
	doc := Dedup(oURL, least3Files, lg, fs)

	fNamer := domclean2.FileNamer(logDir, 0)
	fNamer() // first call yields key

	fsPerm := GetFS(c, 2)
	fileDump(lg, fsPerm, doc, fNamer, "_fin.html")

	pf("MapSimilarCompares: %v SimpleCompares: %v LevenshteinComp: %v\n",
		breakMapsTooDistinct, appliedLevenshtein, appliedCompare)
	pf("Finish\n")
}
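// The "fNamer() // first call yields key" idiom above recurs throughout
// this file: domclean2.FileNamer apparently returns a closure whose first
// call yields a key for the article, and whose subsequent calls yield
// numbered file names. Below is a minimal sketch of such a closure-based
// namer; the key format and counter scheme are illustrative assumptions,
// and the real domclean2.FileNamer may differ.
func fileNamerSketch(dir string, articleID int) func() string {
	cnt := -1
	return func() string {
		cnt++
		if cnt == 0 {
			// first call: a stable key identifying the article
			return fmt.Sprintf("art%02v", articleID)
		}
		// subsequent calls: incrementing file basenames under dir
		return path.Join(dir, fmt.Sprintf("art%02v_%02v", articleID, cnt))
	}
}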
// Dedup puts it all together:
// DOM cleaning, textification, similarity weeding, and the final dedup pass.
func Dedup(oURL *url.URL, least3Files []repo.FullArticle, lg loghttp.FuncBufUniv, fs fsi.FileSystem) *html.Node {

	opts := domclean2.CleaningOptions{Proxify: true, Beautify: true}
	// opts.FNamer = fNamer
	opts.AddOutline = true
	// opts.RemoteHost = fetch.HostFromStringUrl(least3Files[0].Url)
	opts.RemoteHost = oURL.Host

	//
	// domclean
	for i := 0; i < len(least3Files); i++ {

		fNamer := domclean2.FileNamer(logDir, i)
		fNamer() // first call yields key

		lg("cleaning %4.1fkB from %v", float64(len(least3Files[i].Body))/1024,
			stringspb.ToLenR(least3Files[i].Url, 60))

		doc, err := domclean2.DomClean(least3Files[i].Body, opts)
		lg(err)

		fileDump(lg, fs, doc, fNamer, ".html")
	}

	if false {
		//
		// Textify with brute force
		for i := 0; i < len(least3Files); i++ {

			fNamer := domclean2.FileNamer(logDir, i)
			fNamer() // first call yields key

			bts, err := fs.ReadFile(fNamer() + ".html")
			lg(err)
			doc, err := html.Parse(bytes.NewReader(bts))
			lg(err)

			textifyBruteForce(doc)

			var buf bytes.Buffer
			err = html.Render(&buf, doc)
			lg(err)

			b := buf.Bytes()
			b = bytes.Replace(b, []byte("[br]"), []byte("\n"), -1)

			fileDump(lg, fs, b, fNamer, "_raw.txt")
		}
	}

	//
	// Textify with more fine-tuning.
	// Save result to memory.
	textsByArticOutl := map[string][]*TextifiedTree{}
	for i := 0; i < len(least3Files); i++ {

		fNamer := domclean2.FileNamer(logDir, i)
		fnKey := fNamer() // first call yields key

		bts, err := fs.ReadFile(fNamer() + ".html")
		lg(err)
		doc, err := html.Parse(bytes.NewReader(bts))
		lg(err)

		fNamer() // one more

		//
		mp, bts := BubbledUpTextExtraction(doc, fnKey)
		fileDump(lg, fs, bts, fNamer, ".txt")

		mpSorted, dump := orderByOutline(mp)
		fileDump(lg, fs, dump, fNamer, ".txt")
		textsByArticOutl[fnKey] = mpSorted

		// for k, v := range mpSorted {
		// 	if k%33 != 0 {
		// 		continue
		// 	}
		// 	log.Printf("%3v: %v %14v %v\n", k, v.SourceID, v.Outline, v.Lvl)
		// }
	}

	//
	// We progress from level 1 downwards.
	// Lower levels skip weeded-out higher levels,
	// to save expensive Levenshtein comparisons.
	var skipPrefixes = map[string]bool{}
	for weedStage := 1; weedStage <= stageMax; weedStage++ {

		fNamer := domclean2.FileNamer(logDir, 0)
		fnKey := fNamer() // first call yields key

		levelsToProcess = map[int]bool{weedStage: true}
		frags := similarTextifiedTrees(textsByArticOutl, skipPrefixes, map[string]bool{fnKey: true})

		similaritiesToFile(fs, logDir, frags, weedStage)

		for _, frag := range frags {
			if len(frag.Similars) >= numTotal-1 &&
				frag.SumRelLevenshtein/(numTotal-1) < 0.2 {
				skipPrefixes[frag.Outline+"."] = true
			}
		}

		b := new(bytes.Buffer)
		for k := range skipPrefixes {
			b.WriteString(k)
			b.WriteByte(32) // space
		}
		// log.Printf("%v\n", b.String())
	}

	//
	// Apply dedup
	fNamer := domclean2.FileNamer(logDir, 0)
	fNamer() // first call yields key

	bts, err := fs.ReadFile(fNamer() + ".html")
	lg(err)
	doc, err := html.Parse(bytes.NewReader(bts))
	lg(err)

	dedupApply(doc, skipPrefixes)

	// A special after-dedup cleaning:
	// remove ol and cfrm attributes.
	var fr func(*html.Node)
	fr = func(n *html.Node) {
		if n.Type == html.ElementNode {
			attr2 := make([]html.Attribute, 0, len(n.Attr))
			for _, attr := range n.Attr {
				if attr.Key != "ol" && attr.Key != "cfrm" {
					attr2 = append(attr2, attr)
				}
			}
			n.Attr = attr2
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			fr(c)
		}
	}
	fr(doc)

	if false { // does not add value
		var b7 bytes.Buffer
		err := html.Render(&b7, doc)
		lg(err)
		doc, err = domclean2.DomClean(b7.Bytes(), opts)
		lg(err)
	} else {
		domclean2.DomFormat(doc)
	}

	return doc
}
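// The weed loop above marks a fragment's outline prefix as boilerplate when
// it appears in at least numTotal-1 other articles and its average relative
// Levenshtein distance stays below 0.2. Below is a minimal sketch of a
// relative (length-normalized) Levenshtein distance: 0 means identical,
// 1 means maximally different. It is a hypothetical helper for illustration;
// the distance actually accumulated in frag.SumRelLevenshtein may be
// computed differently inside similarTextifiedTrees.
func relLevenshteinSketch(a, b string) float64 {
	ra, rb := []rune(a), []rune(b)
	if len(ra) == 0 && len(rb) == 0 {
		return 0
	}
	// two-row dynamic programming over edit operations
	prev := make([]int, len(rb)+1)
	curr := make([]int, len(rb)+1)
	for j := range prev {
		prev[j] = j
	}
	for i := 1; i <= len(ra); i++ {
		curr[0] = i
		for j := 1; j <= len(rb); j++ {
			cost := 1
			if ra[i-1] == rb[j-1] {
				cost = 0
			}
			curr[j] = min3(curr[j-1]+1, prev[j]+1, prev[j-1]+cost)
		}
		prev, curr = curr, prev
	}
	longer := len(ra)
	if len(rb) > longer {
		longer = len(rb)
	}
	return float64(prev[len(rb)]) / float64(longer)
}

func min3(a, b, c int) int {
	if b < a {
		a = b
	}
	if c < a {
		a = c
	}
	return a
}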
// dedupHTTP wraps Dedup().
func dedupHTTP(w http.ResponseWriter, r *http.Request, m map[string]interface{}) {

	lg, b := loghttp.BuffLoggerUniversal(w, r)
	closureOverBuf := func(bUnused *bytes.Buffer) {
		loghttp.Pf(w, r, b.String())
	}
	defer closureOverBuf(b) // the argument is ignored

	r.Header.Set("X-Custom-Header-Counter", "nocounter")

	wpf(b, tplx.ExecTplHelper(tplx.Head, map[string]interface{}{"HtmlTitle": "Deduplicating redundant stuff"}))
	defer wpf(b, tplx.Foot)

	wpf(b, "<pre>")
	defer wpf(b, "</pre>")

	err := r.ParseForm()
	lg(err)

	surl := r.FormValue(routes.URLParamKey)
	ourl, err := fetch.URLFromString(surl)
	lg(err)
	if err != nil {
		return
	}
	if ourl.Host == "" {
		lg("host is empty (%v)", surl)
		return
	}

	knownProtocol := ""
	if r.FormValue("prot") != "" {
		knownProtocol = r.FormValue("prot")
	}

	lg("Host %q, Path %q", ourl.Host, ourl.Path)

	fs := GetFS(appengine.NewContext(r), 0)

	least3Files := FetchAndDecodeJSON(r, ourl.String(), knownProtocol, lg, fs)
	lg("Fetched and decoded; found %v", len(least3Files))

	if len(least3Files) > 0 {
		doc := Dedup(ourl, least3Files, lg, fs)

		fNamer := domclean2.FileNamer(logDir, 0)
		fNamer() // first call yields key

		fsPerm := GetFS(appengine.NewContext(r), 0)
		fileDump(lg, fsPerm, doc, fNamer, "_fin.html")

		lg("MapSimilarCompares: %v SimpleCompares: %v LevenshteinComp: %v\n",
			breakMapsTooDistinct, appliedLevenshtein, appliedCompare)
		lg("Finish\n")

		var b2 bytes.Buffer
		err := html.Render(&b2, doc)
		lg(err)
		if err != nil {
			return
		}

		// Reset the log buffer, so the deferred closure does not
		// append the accumulated log output after the rendered page.
		b = new(bytes.Buffer)
		// w.Write([]byte("aa"))
		w.Header().Set("Content-type", "text/html; charset=utf-8")
		w.Write(b2.Bytes())
	}
}
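// dedupHTTP takes an extra map parameter, so it cannot be registered with
// http.HandleFunc directly. A minimal adapter sketch, assuming a plain
// http.ServeMux, an empty options map, and a hypothetical "/dedup" route;
// the project's actual router may wire this differently.
func registerDedupSketch(mux *http.ServeMux) {
	mux.HandleFunc("/dedup", func(w http.ResponseWriter, r *http.Request) {
		dedupHTTP(w, r, map[string]interface{}{})
	})
}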
func Test1(t *testing.T) {

	lg, b := loghttp.BuffLoggerUniversal(nil, nil)
	_ = b

	c, err := aetest.NewContext(nil)
	lg(err)
	if err != nil {
		return
	}
	defer c.Close()

	fs := GetFS(c, 2)

	remoteHostname := "www.welt.de"
	remoteHostname = "www.welt.de/politik/ausland"

	dirs1, _, msg, err := fileserver.GetDirContents(repo.RepoURL, remoteHostname)
	if err != nil {
		lg(err)
		lg("%s", msg)
	}

	lg("dirs1")
	for _, v := range dirs1 {
		lg(" %v", v)
	}

	least3URLs := []string{}
	for _, v1 := range dirs1 {
		p := path.Join(remoteHostname, v1)
		dirs2, fils2, msg, err := fileserver.GetDirContents(repo.RepoURL, p)
		_ = dirs2
		if err != nil {
			lg(err)
			lg("%s", msg)
		}
		// lg(" dirs2 %v", stringspb.IndentedDump(dirs2))
		// lg(" fils2 %v", stringspb.IndentedDump(fils2))
		for _, v2 := range fils2 {
			least3URLs = append(least3URLs, path.Join(remoteHostname, v1, v2))
		}
	}

	if len(least3URLs) < numTotal {
		lg("not enough files in rss fetcher cache")
		return
	}
	least3URLs = least3URLs[:numTotal+1]

	lg("fils2")
	for _, v := range least3URLs {
		lg(" %v", v)
	}

	//
	// domclean
	least3Files := make([]repo.FullArticle, 0, len(least3URLs))
	for i := 0; i < len(least3URLs); i++ {

		surl := spf("%v/%v", repo.RepoURL, least3URLs[i])

		fNamer := domclean2.FileNamer(logDir, i)
		fNamer() // first call yields key

		resBytes, inf, err := fetch.UrlGetter(nil, fetch.Options{URL: surl})
		if err != nil {
			lg(err)
			return
		}
		lg("fetched %4.1fkB from %v", float64(len(resBytes))/1024,
			stringspb.ToLenR(inf.URL.String(), 60))

		fa := repo.FullArticle{}
		fa.Url = inf.URL.String()
		fa.Mod = inf.Mod
		fa.Body = resBytes
		least3Files = append(least3Files, fa)
	}

	// Dedup expects the remote URL as its first argument;
	// derive it from the first fetched article.
	oURL, err := fetch.URLFromString(least3Files[0].Url)
	lg(err)
	doc := Dedup(oURL, least3Files, lg, fs)

	fNamer := domclean2.FileNamer(logDir, 0)
	fNamer() // first call yields key

	fsPerm := GetFS(c, 2)
	fileDump(lg, fsPerm, doc, fNamer, "_fin.html")

	pf("MapSimilarCompares: %v SimpleCompares: %v LevenshteinComp: %v\n",
		breakMapsTooDistinct, appliedLevenshtein, appliedCompare)
	pf("correct finish\n")
}