// dedupHTTP wraps Dedup()
func dedupHTTP(w http.ResponseWriter, r *http.Request, m map[string]interface{}) {

	lg, b := loghttp.BuffLoggerUniversal(w, r)
	closureOverBuf := func(bUnused *bytes.Buffer) {
		loghttp.Pf(w, r, b.String())
	}
	defer closureOverBuf(b) // the argument is ignored

	r.Header.Set("X-Custom-Header-Counter", "nocounter")

	wpf(b, tplx.ExecTplHelper(tplx.Head, map[string]interface{}{"HtmlTitle": "Deduplicating redundant stuff"}))
	defer wpf(b, tplx.Foot)

	wpf(b, "<pre>")
	defer wpf(b, "</pre>")

	err := r.ParseForm()
	lg(err)

	surl := r.FormValue(routes.URLParamKey)
	ourl, err := fetch.URLFromString(surl)
	lg(err)
	if err != nil {
		return
	}
	if ourl.Host == "" {
		lg("host is empty (%v)", surl)
		return
	}

	knownProtocol := ""
	if r.FormValue("prot") != "" {
		knownProtocol = r.FormValue("prot")
	}

	lg("Host %q, Path %q", ourl.Host, ourl.Path)

	fs := GetFS(appengine.NewContext(r), 0)

	least3Files := FetchAndDecodeJSON(r, ourl.String(), knownProtocol, lg, fs)
	lg("Fetched and decoded; found %v", len(least3Files))

	if len(least3Files) > 0 {
		doc := Dedup(ourl, least3Files, lg, fs)

		fNamer := domclean2.FileNamer(logDir, 0)
		fNamer() // first call yields key

		fsPerm := GetFS(appengine.NewContext(r), 0)
		fileDump(lg, fsPerm, doc, fNamer, "_fin.html")

		lg("MapSimilarCompares: %v SimpleCompares: %v LevenshteinComp: %v\n", breakMapsTooDistinct, appliedLevenshtein, appliedCompare)
		lg("Finish\n")

		var b2 bytes.Buffer
		err := html.Render(&b2, doc)
		lg(err)
		if err != nil {
			return
		}

		b = new(bytes.Buffer) // swap in a fresh buffer, so the deferred flush does not append the log to the rendered document
		// w.Write([]byte("aa"))
		w.Header().Set("Content-Type", "text/html; charset=utf-8")
		w.Write(b2.Bytes())
	}
}
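
// Wiring note - a minimal sketch, not taken from this package's route setup:
// dedupHTTP has the three-argument handler signature used throughout this file,
// so hooking it into the standard library mux needs a small adapter. The path
// "/dedup" below is an assumption; the final map argument is passed as nil
// because dedupHTTP does not read it in the body above.
//
//	http.HandleFunc("/dedup", func(w http.ResponseWriter, r *http.Request) {
//		dedupHTTP(w, r, nil)
//	})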

// FetchSimilar is an extended version of Fetch.
// It uses a DirTree of crawled *links*, not actual files.
// As it moves up the DOM, it crawls every document for additional links.
// It first moves up to find similar URLs on the same depth:
//
//	      /\
//	   /\ / \
//	/\ / \ / \
//
// It then moves up the ladder again - to accept higher URLs:
//
//	/\
//	 /\
//	  /\
func FetchSimilar(w http.ResponseWriter, r *http.Request, m map[string]interface{}) {

	lg, b := loghttp.BuffLoggerUniversal(w, r)
	closureOverBuf := func(bUnused *bytes.Buffer) {
		loghttp.Pf(w, r, b.String())
	}
	defer closureOverBuf(b) // the argument is ignored

	r.Header.Set("X-Custom-Header-Counter", "nocounter")

	start := time.Now()

	wpf(b, tplx.ExecTplHelper(tplx.Head, map[string]interface{}{"HtmlTitle": "Find similar HTML URLs"}))
	defer wpf(b, tplx.Foot)

	wpf(b, "<pre>")
	defer wpf(b, "</pre>")

	fs1 := GetFS(appengine.NewContext(r))

	err := r.ParseForm()
	lg(err)

	countSimilar := 3
	sCountSimilar := r.FormValue("cnt")
	if sCountSimilar != "" {
		i, err := strconv.Atoi(strings.TrimSpace(sCountSimilar))
		if err == nil {
			countSimilar = i
		}
	}

	surl := r.FormValue(routes.URLParamKey)
	ourl, err := fetch.URLFromString(surl)
	lg(err)
	if err != nil {
		return
	}
	if ourl.Host == "" {
		lg("host is empty (%v)", surl)
		return
	}

	knownProtocol := ""
	if r.FormValue("prot") != "" {
		knownProtocol = r.FormValue("prot")
	}

	numWorkers := 0
	sNumWorkers := r.FormValue("numworkers")
	if sNumWorkers != "" {
		i, err := strconv.Atoi(strings.TrimSpace(sNumWorkers))
		if err == nil {
			numWorkers = i
		}
	}

	srcDepth := strings.Count(ourl.Path, "/")

	cmd := FetchCommand{}
	cmd.Host = ourl.Host
	cmd.SearchPrefix = ourl.Path
	cmd = addDefaults(cmd)

	dirTree := &DirTree{Name: "/", Dirs: map[string]DirTree{}, EndPoint: true}
	fnDigest := path.Join(docRoot, cmd.Host, "digest2.json")
	loadDigest(w, r, lg, fs1, fnDigest, dirTree) // previous

	lg("dirtree 400 chars is %v end of dirtree\t\t", stringspb.ToLen(dirTree.String(), 400))

	m1 := new(MyWorker)
	m1.r = r
	m1.lg = lg
	m1.fs1 = fs1
	m1.SURL = path.Join(cmd.Host, ourl.Path)
	m1.Protocol = knownProtocol

	btsSrc, modSrc, usedExisting, err := fetchSave(m1)
	if !usedExisting {
		addAnchors(lg, cmd.Host, btsSrc, dirTree)
	}
	lg(err)
	if err != nil {
		return
	}

	lg("\t\t%4.2v secs so far 1", time.Now().Sub(start).Seconds())

	// earlier hard-coded test paths; only the last assignment takes effect
	var treePath string
	treePath = "/blogs/freeexchange"
	treePath = "/news/europe"
	treePath = path.Dir(ourl.Path)

	opt := LevelWiseDeeperOptions{}
	opt.Rump = treePath
	opt.ExcludeDir = "/news/americas"
	opt.ExcludeDir = "/blogs/buttonwood"
	opt.ExcludeDir = "/something-impossible"
	opt.MinDepthDiff = 1
	opt.MaxDepthDiff = 1
	opt.CondenseTrailingDirs = cmd.CondenseTrailingDirs
	opt.MaxNumber = cmd.DesiredNumber + 1  // one more for "self"
	opt.MaxNumber = cmd.DesiredNumber + 40 // collect more, 'cause we filter out those too old later

	var subtree *DirTree
	links := []FullArticle{}

	alreadyCrawled := map[string]struct{}{}

MarkOuter:
	for j := 0; j < srcDepth; j++ {
		treePath = path.Dir(ourl.Path)
	MarkInner:
		// for i := 1; i < srcDepth; i++ {
		for i := 1; i < (srcDepth + 5); i++ {

			subtree, treePath = DiveToDeepestMatch(dirTree, treePath)

			lg("Looking from height %v to level %v - %v", srcDepth-i, srcDepth-j, treePath)

			if _, ok := alreadyCrawled[treePath]; ok {
				// lg("\t already digested %v", treePath)
				continue
			}

			m2 := new(MyWorker)
			m2.r = r
			m2.lg = lg
			m2.fs1 = fs1
			m2.SURL = path.Join(cmd.Host, treePath)
			m2.Protocol = knownProtocol

			btsPar, _, usedExisting, err := fetchSave(m2)
			lg(err)
			if err != nil {
				return
			}
			alreadyCrawled[treePath] = struct{}{}

			if !usedExisting {
				addAnchors(lg, cmd.Host, btsPar, dirTree)
			}

			if subtree == nil {
				lg("\n#%v treePath %q ; subtree is nil", i, treePath)
			} else {
				// lg("\n#%v treePath %q ; subtree exists", i, treePath)

				opt.Rump = treePath
				opt.MinDepthDiff = i - j
				opt.MaxDepthDiff = i - j
				lvlLinks := LevelWiseDeeper(nil, nil, subtree, opt)
				links = append(links, lvlLinks...)
				for _, art := range lvlLinks {
					_ = art
					// lg("#%v fnd %v", i, stringspb.ToLen(art.Url, 100))
				}

				if len(links) >= opt.MaxNumber {
					lg("found enough links")
					break MarkOuter
				}

				pathPrev := treePath
				treePath = path.Dir(treePath)
				// lg("#%v bef %v - aft %v", i, pathPrev, treePath)

				if pathPrev == "." && treePath == "." ||
					pathPrev == "/" && treePath == "/" ||
					pathPrev == "" && treePath == "." {
					lg("break to inner")
					break MarkInner
				}
			}
		}
	}

	lg("%v links after %4.2v secs", len(links), time.Now().Sub(start).Seconds())
	lg("============================")
	lg("Now reading/fetching actual similar files - not just the links")

	tried := 0
	selecteds := []FullArticle{}
	nonExisting := []FullArticle{}
	nonExistFetched := []FullArticle{}

	for _, art := range links {

		if art.Url == ourl.Path {
			lg("skipping self\t%v", art.Url)
			continue
		}

		tried++

		useExisting := false

		semanticUri := condenseTrailingDir(art.Url, cmd.CondenseTrailingDirs)
		p := path.Join(docRoot, cmd.Host, semanticUri)
		f, err := fs1.Open(p)
		// lg(err) // it's no error if the file does not exist
		if err != nil {
			// lg("!nstore %q", semanticUri)
		} else {
			// lg("reading %q", semanticUri)

			// let's put this into a func, so that f.Close() is called at the end of *this* func;
			// otherwise defer f.Close() would span the entire outer func and prevent
			// overwrites/chmods further down
			f := func() {
				defer f.Close()
				fi, err := f.Stat()
				lg(err)
				if err == nil {
					age := time.Now().Sub(fi.ModTime())
					if age.Hours() < 10 {
						lg("\t\tusing existing file with age %4.2v hrs", age.Hours())
						art.Mod = fi.ModTime()
						bts, err := ioutil.ReadAll(f)
						lg(err)
						art.Body = bts
						if len(bts) < 200 {
							if bytes.Contains(bts, []byte(fetch.MsgNoRdirects)) {
								return
							}
						}
						selecteds = append(selecteds, art)
						useExisting = true
					}
				}
			}
			f()
		}

		if !useExisting {
			nonExisting = append(nonExisting, art)
		}

		if len(selecteds) >= countSimilar {
			break
		}
	}
	lg("============================")
	lg("tried %v links - yielding %v existing similars; not existing in datastore: %v, %v were requested.",
		tried, len(selecteds), len(nonExisting), countSimilar)

	if len(selecteds) < countSimilar {

		jobs := make([]distrib.Worker, 0, len(nonExisting))
		for _, art := range nonExisting {
			surl := path.Join(cmd.Host, art.Url)
			wrkr := MyWorker{SURL: surl}
			wrkr.Protocol = knownProtocol
			wrkr.r = r
			wrkr.lg = lg
			wrkr.fs1 = fs1
			job := distrib.Worker(&wrkr)
			jobs = append(jobs, job)
		}

		opt := distrib.NewDefaultOptions()
		opt.TimeOutDur = 3500 * time.Millisecond
		opt.Want = int32(countSimilar - len(selecteds) + 4) // get some more, in case we have "redirected" bodies
		opt.NumWorkers = int(opt.Want)                      // 5s query limit; => hurry; spawn as many as we want
		if numWorkers > 0 {
			opt.NumWorkers = numWorkers
		}
		lg("Preparing %v simultaneous, wanting %v fetches; at %4.2v secs.", opt.NumWorkers, opt.Want, time.Now().Sub(start).Seconds())
		opt.CollectRemainder = false // 5s query limit; => hurry; don't wait for stragglers

		ret, msg := distrib.Distrib(jobs, opt)
		lg("Distrib returned at %4.2v secs with %v results.", time.Now().Sub(start).Seconds(), len(ret))
		lg("\n" + msg.String())

		for _, v := range ret {
			v1, _ := v.Worker.(*MyWorker)
			if v1.FA != nil {
				age := time.Now().Sub(v1.FA.Mod)
				if age.Hours() < 10 {
					lg("\t\tusing fetched file with age %4.2v hrs", age.Hours())
					nonExistFetched = append(nonExistFetched, *v1.FA)
					if len(nonExistFetched) > (countSimilar - len(selecteds)) {
						break
					}
				}
			}
			if v1.err != nil {
				lg(v1.err)
			}
		}
		lg("tried %v links - yielding %v fetched - jobs %v", len(nonExisting), len(nonExistFetched), len(jobs))

		selecteds = append(selecteds, nonExistFetched...)

		// Extract links
		for _, v := range nonExistFetched {
			// lg("links -> memory dirtree for %q", v.Url)
			addAnchors(lg, cmd.Host, v.Body, dirTree)
		}
	}

	if time.Now().Sub(dirTree.LastFound).Seconds() < 10 {
		lg("saving accumulated (new) links to digest")
		saveDigest(lg, fs1, fnDigest, dirTree)
	}

	lg("\t\t%4.2v secs so far 3", time.Now().Sub(start).Seconds())

	mp := map[string][]byte{}
	mp["msg"] = b.Bytes()
	mp["url_self"] = []byte(condenseTrailingDir(ourl.Path, cmd.CondenseTrailingDirs))
	mp["mod_self"] = []byte(modSrc.Format(http.TimeFormat))
	mp["bod_self"] = btsSrc

	for i, v := range selecteds {
		mp["url__"+spf("%02v", i)] = []byte(v.Url)
		mp["mod__"+spf("%02v", i)] = []byte(v.Mod.Format(http.TimeFormat))
		mp["bod__"+spf("%02v", i)] = v.Body
	}

	mp["lensimilar"] = []byte(spf("%02v", len(selecteds)))

	smp, err := json.MarshalIndent(mp, "", "\t")
	if err != nil {
		lg("marshalling mp to []byte failed: %v", err)
		return
	}

	r.Header.Set("X-Custom-Header-Counter", "nocounter")
	w.Header().Set("Content-Type", "application/json")
	w.Write(smp)

	b.Reset()             // this keeps the buf pointer intact; outgoing defers are still heeded
	b = new(bytes.Buffer) // creates a *new* buf pointer; outgoing defers write into the *old* buf

	lg("\t\t%4.2v secs so far 4 (json resp written as []byte)", time.Now().Sub(start).Seconds())

	return
}
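
// Client-side sketch - an assumption about how the JSON written above is consumed;
// the request URL and the query parameter name "url-x" are placeholders (the real
// key is routes.URLParamKey). FetchSimilar marshals a map[string][]byte, so the
// values arrive base64-encoded and encoding/json transparently decodes them back
// into []byte. Keys follow the scheme built above: "url_self"/"mod_self"/"bod_self",
// "url__00"/"mod__00"/"bod__00" per similar document, and "lensimilar" for the count.
//
//	resp, err := http.Get("http://localhost:8080/fetch-similar?url-x=www.example.com/news/europe/some-article")
//	if err != nil {
//		log.Fatal(err)
//	}
//	defer resp.Body.Close()
//	mp := map[string][]byte{}
//	if err := json.NewDecoder(resp.Body).Decode(&mp); err != nil {
//		log.Fatal(err)
//	}
//	log.Printf("self %s, %s similar docs, log:\n%s", mp["url_self"], mp["lensimilar"], mp["msg"])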

// fetchSave fetches the URL if the local file is outdated
// and saves the fetched file.
// Link extraction and link addition to treeX are now accumulated one level higher.
// Bool return value: use existing => true.
func fetchSave(m *MyWorker) ([]byte, time.Time, bool, error) {

	// w http.ResponseWriter,
	// r *http.Request,

	// Determine file name
	ourl, err := fetch.URLFromString(m.SURL)

	fc := FetchCommand{}
	fc.Host = ourl.Host
	fc = addDefaults(fc)

	semanticUri := condenseTrailingDir(m.SURL, fc.CondenseTrailingDirs)
	fn := path.Join(docRoot, semanticUri)

	m.lg("crawling %q", m.SURL)

	// File already exists?
	// Open file for age check.
	var bts []byte
	var mod time.Time

	f := func() error {
		file1, err := m.fs1.Open(fn)
		// m.lg(err) // file may simply not exist
		if err != nil {
			return err // file may simply not exist
		}
		defer file1.Close() // file closes *fast* at the end of *this* anonymous func

		fi, err := file1.Stat()
		m.lg(err)
		if err != nil {
			return err
		}

		if fi.IsDir() {
			m.lg("\t\t file is a directory, skipping - %v", fn)
			return fmt.Errorf("is directory: %v", fn)
		}

		mod = fi.ModTime()
		age := time.Now().Sub(mod)
		if age.Hours() > 10 {
			m.lg("\t\t file %4.2v hours old, refetch ", age.Hours())
			return fmt.Errorf("too old: %v", fn)
		}

		m.lg("\t\t file only %4.2v hours old, take %4.2vkB from datastore", age.Hours(), fi.Size()/1024)
		bts, err = ioutil.ReadAll(file1)
		if err != nil {
			return err
		}
		return nil
	}

	err = f()
	if err == nil {
		return bts, mod, true, err
	}

	//
	// Fetch
	bts, inf, err := fetch.UrlGetter(m.r, fetch.Options{URL: m.SURL, KnownProtocol: m.Protocol, RedirectHandling: 1})
	m.lg(err)
	if err != nil {
		if inf.Status != http.StatusNotFound {
			m.lg("tried to fetch %v, %v", m.SURL, inf.URL)
			m.lg("msg %v", inf.Msg)
			return []byte{}, inf.Mod, false, err
		}
		// In our traversing upwards, we might encounter "directory links" that have no index.html.
		// For a *derived* URL, this is no error.
		bts = []byte(" ... not found ... ")
	}

	if inf.Mod.IsZero() {
		inf.Mod = time.Now().Add(-75 * time.Minute)
	}

	//
	// main request still exists?
	if false {
		var cx context.Context
		cx = util_appengine.SafelyExtractGaeContext(m.r)
		if cx == nil {
			m.lg("timed out - returning")
			return bts, inf.Mod, false, fmt.Errorf("req timed out")
		}
	}

	m.lg("retrieved+saved %q; %vkB ", inf.URL.Host+inf.URL.Path, len(bts)/1024)

	if len(bts) > 1024*1024-1 {
		bts = removeScriptsAndComments(m.lg, bts)
		m.lg("size reduced_1 to %vkB ", len(bts)/1024)
		// if len(bts) > 1024*1024-1 {
		// 	bts = snappy.Encode(nil, bts)
		// 	fn = strings.Replace(fn, ".html", ".snap.html", -1)
		// 	m.lg("size reduced_2 to %vkB ", len(bts)/1024)
		// }
	}

	//
	// Save
	dir := path.Dir(fn)
	err = m.fs1.MkdirAll(dir, 0755)
	m.lg(err)
	err = m.fs1.Chtimes(dir, time.Now(), time.Now())
	m.lg(err)
	err = m.fs1.WriteFile(fn, bts, 0644)
	m.lg(err)
	err = m.fs1.Chtimes(fn, inf.Mod, inf.Mod)
	m.lg(err)

	return bts, inf.Mod, false, nil
}
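
// Usage sketch - mirrors how FetchSimilar builds its worker above; the host and
// path are example values, and "http" as the known protocol is an assumption:
//
//	m := new(MyWorker)
//	m.r = r
//	m.lg = lg
//	m.fs1 = GetFS(appengine.NewContext(r))
//	m.SURL = path.Join("www.example.com", "/news/europe/some-article")
//	m.Protocol = "http"
//	bts, mod, usedExisting, err := fetchSave(m)
//	lg(err)
//	lg("got %vkB, modified %v, from cache: %v", len(bts)/1024, mod.Format(http.TimeFormat), usedExisting)
//
// The 10-hour age check inside fetchSave decides whether the cached copy in the
// filesystem/datastore is reused (usedExisting == true) or the URL is refetched.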