// FetchUsingRSS takes an RSS XML URI and fetches some of its documents.
// It uses a three-stage pipeline for parallel fetching.
// Results are stored into the given filesystem fs.
// Config points to the source of the RSS XML
// and has some rules for conflating URI directories.
// uriPrefix and config.DesiredNumber tell the func
// which subdirs of the RSS dir should be fetched - and how many at most.
func FetchUsingRSS(w http.ResponseWriter, r *http.Request,
	fs fsi.FileSystem, config FetchCommand,
) {

	lg, b := loghttp.BuffLoggerUniversal(w, r)
	closureOverBuf := func(bUnused *bytes.Buffer) {
		loghttp.Pf(w, r, b.String())
	}
	defer closureOverBuf(b) // the argument is ignored

	if config.Host == "" {
		lg(" empty host; returning")
		return
	}

	config = addDefaults(config)

	// Fetching the rssXML takes time.
	// We do it before the timeouts of the pipeline stages are set off.
	lg(" ")
	lg(config.Host)
	if config.Host == "test.economist.com" {
		switchTData(w, r)
	}

	// lg(stringspb.IndentedDump(config))
	dirTree := &DirTree{Name: "/", Dirs: map[string]DirTree{}, EndPoint: true}

	fnDigest := path.Join(docRoot, config.Host, "digest2.json")
	loadDigest(w, r, lg, fs, fnDigest, dirTree) // load previous digest

	age := time.Since(dirTree.LastFound)
	lg("DirTree is %5.2v hours old (%v)", age.Hours(), dirTree.LastFound.Format(time.ANSIC))
	if age.Hours() > 0.001 {

		rssUrl := matchingRSSURI(w, r, config)
		if rssUrl == "" {
			m := new(MyWorker)
			m.r = r
			m.lg = lg
			m.fs1 = fs
			m.SURL = path.Join(config.Host, config.SearchPrefix)
			_, _, _, err := fetchSave(m)
			lg(err)
			if err != nil {
				return
			}
		} else {
			rssUrl = path.Join(config.Host, rssUrl)
			rssDoc, rssUrlObj := rssXMLFile(w, r, fs, rssUrl)
			_ = rssUrlObj
			rssDoc2DirTree(w, r, dirTree, rssDoc, config.Host)
		}

		saveDigest(lg, fs, fnDigest, dirTree)
	}

	// lg(dirTree.String())

	//
	// setting up a three-stage pipeline from bottom up
	//
	var fullArticles []FullArticle

	var inn chan *FullArticle = make(chan *FullArticle) // jobs are stuffed in here
	var out chan *FullArticle = make(chan *FullArticle) // completed jobs are delivered here
	var fin chan struct{} = make(chan struct{})         // downstream signals end to upstream
	var stage3Wait sync.WaitGroup

	// stage 3
	// fire up the "collector", a fan-in
	stage3Wait.Add(1) // Add before starting the goroutine, so Wait below cannot race it
	go func() {
		// 400 is a good value; critical point at 35
		// economist.com required 800 ms
		const delayInitial = 1200
		const delayRefresh = 800
		cout := time.After(time.Millisecond * delayInitial)
		for {
			select {
			case fa := <-out:
				fullArticles = append(fullArticles, *fa)
				pth := fetch.PathFromStringUrl(fa.Url)
				lg(" fetched %v - %v ", fa.Mod.Format("15:04:05"), stringspb.Ellipsoider(pth, 50))
				cout = time.After(time.Millisecond * delayRefresh) // refresh timeout
			case <-cout:
				lg("timeout after %v articles", len(fullArticles))
				// we are using channel == nil / channel closed combinations
				// inspired by http://dave.cheney.net/2013/04/30/curious-channels
				out = nil // not close(out) => case above is now blocked
				close(fin)
				lg("fin closed; out nilled")
				stage3Wait.Done()
				return
			}
		}
	}()

	//
	// stage 2
	for i := 0; i < numWorkers; i++ {
		// fire up a dedicated fetcher routine, a worker
		// we are using channel == nil / channel closed combinations
		// inspired by http://dave.cheney.net/2013/04/30/curious-channels
		go func() {
			var a *FullArticle
			for {
				select {
				case a = <-inn:
					var err error
					var inf fetch.Info
					a.Body, inf, err = fetch.UrlGetter(r, fetch.Options{URL: a.Url})
					lg(err)
					if a.Mod.IsZero() {
						a.Mod = inf.Mod
					}
					select {
					case out <- a:
					case <-fin:
						lg(" worker spinning down; branch 1; abandoning %v", a.Url)
						return
					}
					a = new(FullArticle) // reset, so the fin branch does not report a delivered article as abandoned
				case <-fin:
					if a != nil && a.Url != "" {
						u, _ := url.Parse(a.Url)
						lg(" abandoned %v", u.Path)
					} else {
						lg(" worker spinning down; branch 2")
					}
					return
				}
			}
		}()
	}

	//
	// loading stage 1
	uriPrefix := config.SearchPrefix
	found := 0
	uriPrefixExcl := "impossible"
	for i := 0; i < 15; i++ {
		lg(" searching for prefix %v - excl %q - %v of %v", uriPrefix, uriPrefixExcl, found, config.DesiredNumber)
		found += stuffStage1(w, r, config, inn, fin, dirTree,
			uriPrefixExcl, uriPrefix, config.DesiredNumber-found)

		if found >= config.DesiredNumber {
			break
		}

		if uriPrefix == "/" || uriPrefix == "." {
			lg(" root exhausted")
			break
		}

		newPrefix := path.Dir(uriPrefix)
		uriPrefixExcl = uriPrefix
		uriPrefix = newPrefix
	}
	lg(" found %v of %v", found, config.DesiredNumber)

	//
	lg("stage3Wait.Wait() before")
	stage3Wait.Wait()
	lg("stage3Wait.Wait() after")

	// workers spin down earlier -
	// but the ae log writer and response writer need some time
	// to record the spin-down messages
	time.Sleep(120 * time.Millisecond)

	// compile directory statistics of the fetched articles
	histoDir := map[string]int{}
	for _, a := range fullArticles {
		u, err := url.Parse(a.Url)
		lg(err)
		if err != nil {
			continue
		}
		semanticUri := condenseTrailingDir(u.Path, config.CondenseTrailingDirs)
		dir := path.Dir(semanticUri)
		histoDir[dir]++
	}
	sr := sortmap.SortMapByCount(histoDir)
	_ = sr

	// Create dirs
	for k := range histoDir {
		dir := path.Join(docRoot, k) // config.Host is already contained in k
		err := fs.MkdirAll(dir, 0755)
		lg(err)
		err = fs.Chtimes(dir, time.Now(), time.Now())
		lg(err)
	}

	// Save articles as files
	for _, a := range fullArticles {
		if len(a.Body) == 0 {
			continue
		}
		u, err := url.Parse(a.Url)
		lg(err)
		if err != nil {
			continue
		}
		u.Fragment = ""
		u.RawQuery = ""
		semanticUri := condenseTrailingDir(u.RequestURI(), config.CondenseTrailingDirs)
		p := path.Join(docRoot, semanticUri)
		err = fs.WriteFile(p, a.Body, 0644)
		lg(err)
		err = fs.Chtimes(p, a.Mod, a.Mod)
		lg(err)
	}

	{
		b, err := json.MarshalIndent(histoDir, " ", "\t")
		lg(err)
		fnDigest := path.Join(docRoot, config.Host, "fetchDigest.json")
		err = fs.WriteFile(fnDigest, b, 0755)
		lg(err)
	}

	// fsm, ok := memfs.Unwrap(fs)
	// if ok {
	// 	fsm.Dump()
	// }

}
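
// The pipeline above leans on two channel idioms from
// http://dave.cheney.net/2013/04/30/curious-channels: a nil channel
// blocks its select case forever, and closing a channel broadcasts
// shutdown to every receiver. The function below is a minimal,
// self-contained sketch of that shutdown pattern under simplified
// assumptions (no HTTP fetching, fixed timeouts, no logging);
// demoJob and demoPipeline are illustrative names only and are not
// part of this package's API.
type demoJob struct{ ID int }

func demoPipeline(jobs []demoJob, numWorkers int) []demoJob {

	inn := make(chan demoJob)  // stage 1 -> stage 2: jobs are stuffed in here
	out := make(chan demoJob)  // stage 2 -> stage 3: completed jobs are delivered here
	fin := make(chan struct{}) // downstream signals end to upstream

	var done []demoJob
	var collectorWait sync.WaitGroup

	// stage 2 - workers; channels are passed as parameters,
	// so each worker keeps its own (never nilled) view of them
	worker := func(inn <-chan demoJob, out chan<- demoJob, fin <-chan struct{}) {
		for {
			select {
			case j := <-inn:
				select {
				case out <- j: // deliver downstream
				case <-fin: // shutdown: abandon the current job
					return
				}
			case <-fin:
				return
			}
		}
	}
	for i := 0; i < numWorkers; i++ {
		go worker(inn, out, fin)
	}

	// stage 3 - fan-in collector with a refreshing timeout
	collectorWait.Add(1)
	go func(out chan demoJob) {
		defer collectorWait.Done()
		const delay = 50 * time.Millisecond
		timeout := time.After(delay)
		for {
			select {
			case j := <-out:
				done = append(done, j)
				timeout = time.After(delay) // refresh the timeout after each result
			case <-timeout:
				// not close(out): senders may still be active and would panic;
				// a nil channel simply blocks the receive case above
				out = nil
				close(fin) // broadcast shutdown to the workers and the feeder
				return
			}
		}
	}(out)

	// stage 1 - feed jobs until everything is handed off or shutdown arrives
	for _, j := range jobs {
		select {
		case inn <- j:
		case <-fin:
		}
	}

	collectorWait.Wait()
	return done
}

// A hypothetical call site:
//
//	got := demoPipeline([]demoJob{{ID: 1}, {ID: 2}, {ID: 3}}, 2)
//	fmt.Println(len(got))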
func Test1(t *testing.T) {

	lg, lge := loghttp.Logger(nil, nil)
	// c := prepare(t)
	// defer c.Close()
	lg("waiting for webserver")
	time.Sleep(2 * time.Millisecond)

	remoteHostname := "www.welt.de"

	dirs1, _, msg, err := fileserver.GetDirContents(hostWithPref, remoteHostname)
	if err != nil {
		lge(err)
		lg("%s", msg)
	}

	lg("dirs1")
	for _, v := range dirs1 {
		lg(" %v", v)
	}

	least3Files := []string{}
	for _, v1 := range dirs1 {
		dirs2, fils2, msg, err := fileserver.GetDirContents(hostWithPref, path.Join(remoteHostname, v1))
		_ = dirs2
		if err != nil {
			lge(err)
			lg("%s", msg)
		}
		// lg(" dirs2 %v", stringspb.IndentedDump(dirs2))
		// lg(" fils2 %v", stringspb.IndentedDump(fils2))

		if len(fils2) > numTotal-1 {
			for i2, v2 := range fils2 {
				least3Files = append(least3Files, path.Join(remoteHostname, v1, v2))
				if i2 == numTotal-1 {
					break
				}
			}
			break
		}
	}

	if len(least3Files) < numTotal {
		lg("not enough files in rss fetcher cache")
		return
	}

	lg("fils2")
	for _, v := range least3Files {
		lg(" %v", v)
	}

	logdir := prepareLogDir()

	iter := make([]int, numTotal)
	for i := range iter {

		surl := spf("%v/%v", hostWithPref, least3Files[i])

		fNamer := FileNamer(logdir, i)
		fnKey := fNamer() // first call yields the key
		_ = fnKey

		resBytes, effUrl, err := fetch.UrlGetter(nil, fetch.Options{URL: surl})
		if err != nil {
			lge(err)
			return
		}
		lg("fetched %4.1fkB from %v", float64(len(resBytes))/1024, stringspb.ToLenR(effUrl.String(), 60))

		opts := CleaningOptions{Proxify: true}
		opts.FNamer = fNamer
		opts.RemoteHost = remoteHostname
		doc, err := DomClean(resBytes, opts)
		lge(err)
		_ = doc

	}

	// statistics on elements and attributes
	sorted1 := sortmap.SortMapByCount(attrDistinct)
	sorted1.Print(6)
	fmt.Println()
	sorted2 := sortmap.SortMapByCount(nodeDistinct)
	sorted2.Print(6)

	pf("correct finish\n")

}
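
// Both FetchUsingRSS (histoDir) and the statistics at the end of Test1
// follow the same pattern: count occurrences into a map, then sort and
// print the map by count via sortmap.SortMapByCount. The helper below is
// a minimal, standard-library-only sketch of the per-directory counting
// step; histoFromUrls is a hypothetical name, not part of this package.
func histoFromUrls(rawUrls []string) map[string]int {
	histo := map[string]int{}
	for _, ru := range rawUrls {
		u, err := url.Parse(ru)
		if err != nil {
			continue // skip unparsable URLs
		}
		histo[path.Dir(u.Path)]++ // one count per directory
	}
	return histo
}

// A hypothetical call site, mirroring the usage above:
//
//	histo := histoFromUrls([]string{"http://www.welt.de/politik/a.html", "http://www.welt.de/politik/b.html"})
//	sortmap.SortMapByCount(histo).Print(6)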