func Test1(t *testing.T) {

	lg, b := loghttp.BuffLoggerUniversal(nil, nil)
	_ = b

	c, err := aetest.NewContext(nil)
	lg(err)
	if err != nil {
		return
	}
	defer c.Close()

	fs := GetFS(c, 2)

	remoteHostname := "www.welt.de"
	remoteHostname = "www.welt.de/politik/ausland" // narrow to a subdirectory of the cached host

	dirs1, _, msg, err := fileserver.GetDirContents(repo.RepoURL, remoteHostname)
	if err != nil {
		lg(err)
		lg("%s", msg)
	}

	lg("dirs1")
	for _, v := range dirs1 {
		lg("    %v", v)
	}

	// Collect the cached file URLs from every subdirectory.
	least3URLs := []string{}
	for _, v1 := range dirs1 {
		p := path.Join(remoteHostname, v1)
		dirs2, fils2, msg, err := fileserver.GetDirContents(repo.RepoURL, p)
		_ = dirs2
		if err != nil {
			lg(err)
			lg("%s", msg)
		}
		// lg("  dirs2 %v", stringspb.IndentedDump(dirs2))
		// lg("  fils2 %v", stringspb.IndentedDump(fils2))
		for _, v2 := range fils2 {
			least3URLs = append(least3URLs, path.Join(remoteHostname, v1, v2))
		}
	}

	if len(least3URLs) < numTotal {
		lg("not enough files in rss fetcher cache")
		return
	}
	// Cap at numTotal entries; the guard above only guarantees numTotal
	// are present, so slicing to numTotal+1 could panic.
	least3URLs = least3URLs[:numTotal]

	lg("fils2")
	for _, v := range least3URLs {
		lg("    %v", v)
	}

	// domclean
	least3Files := make([]repo.FullArticle, 0, len(least3URLs))
	for i := 0; i < len(least3URLs); i++ {

		surl := spf("%v/%v", repo.RepoURL, least3URLs[i])

		fNamer := domclean2.FileNamer(logDir, i)
		fNamer() // first call yields key

		resBytes, inf, err := fetch.UrlGetter(nil, fetch.Options{URL: surl})
		if err != nil {
			lg(err)
			return
		}
		lg("fetched %4.1fkB from %v", float64(len(resBytes))/1024, stringspb.ToLenR(inf.URL.String(), 60))

		fa := repo.FullArticle{}
		fa.Url = inf.URL.String()
		fa.Mod = inf.Mod
		fa.Body = resBytes
		least3Files = append(least3Files, fa)
	}

	doc := Dedup(least3Files, lg, fs)

	fNamer := domclean2.FileNamer(logDir, 0)
	fNamer() // first call yields key
	fsPerm := GetFS(c, 2)
	fileDump(lg, fsPerm, doc, fNamer, "_fin.html")

	pf("MapSimilarCompares: %v SimpleCompares: %v LevenshteinComp: %v\n", breakMapsTooDistinct, appliedLevenshtein, appliedCompare)
	pf("correct finish\n")
}
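// The test above relies on several package-level identifiers that are not
// shown in this listing: spf, pf, numTotal and logDir. A minimal sketch of
// plausible definitions, assuming spf/pf follow the common convention of
// aliasing fmt.Sprintf/fmt.Printf; the names appear in the source, but the
// concrete values below are assumptions, and the real definitions live
// elsewhere in the package:

var (
	spf = fmt.Sprintf // shorthand for composing fetch URLs
	pf  = fmt.Printf  // shorthand for the final console summary
)

// numTotal is the number of cached articles the test requires; 3 would
// match the "least3..." naming, but the value is assumed.
const numTotal = 3

// logDir receives the intermediate domclean dumps (assumed path).
var logDir = "outp"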
func Test1(t *testing.T) {

	lg, lge := loghttp.Logger(nil, nil)
	// c := prepare(t)
	// defer c.Close()

	lg("waiting for webserver")
	time.Sleep(2 * time.Millisecond)

	remoteHostname := "www.welt.de"

	dirs1, _, msg, err := fileserver.GetDirContents(hostWithPref, remoteHostname)
	if err != nil {
		lge(err)
		lg("%s", msg)
	}

	lg("dirs1")
	for _, v := range dirs1 {
		lg("    %v", v)
	}

	// Take the first subdirectory that holds at least numTotal files.
	least3Files := []string{}
	for _, v1 := range dirs1 {
		dirs2, fils2, msg, err := fileserver.GetDirContents(hostWithPref, path.Join(remoteHostname, v1))
		_ = dirs2
		if err != nil {
			lge(err)
			lg("%s", msg)
		}
		// lg("  dirs2 %v", stringspb.IndentedDump(dirs2))
		// lg("  fils2 %v", stringspb.IndentedDump(fils2))
		if len(fils2) > numTotal-1 {
			for i2, v2 := range fils2 {
				least3Files = append(least3Files, path.Join(remoteHostname, v1, v2))
				if i2 == numTotal-1 {
					break
				}
			}
			break
		}
	}

	if len(least3Files) < numTotal {
		lg("not enough files in rss fetcher cache")
		return
	}

	lg("fils2")
	for _, v := range least3Files {
		lg("    %v", v)
	}

	logdir := prepareLogDir()

	for i := 0; i < numTotal; i++ {

		surl := spf("%v/%v", hostWithPref, least3Files[i])

		fNamer := FileNamer(logdir, i)
		fnKey := fNamer() // first call yields key
		_ = fnKey

		resBytes, effUrl, err := fetch.UrlGetter(nil, fetch.Options{URL: surl})
		if err != nil {
			lge(err)
			return
		}
		lg("fetched %4.1fkB from %v", float64(len(resBytes))/1024, stringspb.ToLenR(effUrl.String(), 60))

		opts := CleaningOptions{Proxify: true}
		opts.FNamer = fNamer
		opts.RemoteHost = remoteHostname
		doc, err := DomClean(resBytes, opts)
		lge(err)
		_ = doc
	}

	// statistics on elements and attributes
	sorted1 := sortmap.SortMapByCount(attrDistinct)
	sorted1.Print(6)
	fmt.Println()
	sorted2 := sortmap.SortMapByCount(nodeDistinct)
	sorted2.Print(6)

	pf("correct finish\n")
}
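// prepareLogDir, hostWithPref and the attrDistinct/nodeDistinct counters
// are referenced above but defined elsewhere. A minimal sketch, assuming
// prepareLogDir merely ensures a scratch directory exists and that the
// counters are plain occurrence maps filled during cleaning; the address,
// path and types here are assumptions, not taken from the source:

// hostWithPref would point at the local test webserver serving the
// rss fetcher cache (assumed address).
var hostWithPref = "localhost:8085/mnt02"

// attrDistinct and nodeDistinct would count distinct attribute and
// element names seen during cleaning, for sortmap.SortMapByCount.
var (
	attrDistinct = map[string]int{}
	nodeDistinct = map[string]int{}
)

// prepareLogDir ensures a directory for the cleaned DOM dumps exists
// and returns its path.
func prepareLogDir() string {
	logdir := "outp"
	if err := os.MkdirAll(logdir, 0755); err != nil {
		panic(err) // test helper: fail hard if the dir cannot be created
	}
	return logdir
}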