// WriteFile ensures the directory of fn exists,
// then writes b to fn via the given filesystem abstraction.
func WriteFile(fs fsi.FileSystem, fn string, b []byte) error {
	dir, _ := fs.SplitX(fn)
	err := fs.MkdirAll(dir, os.ModePerm)
	if err != nil && err != fsi.ErrFileExists {
		return err
	}
	return fs.WriteFile(fn, b, 0)
}
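// Usage sketch - fs can be any fsi.FileSystem implementation
// (for instance the in-memory one unwrapped via memfs.Unwrap
// elsewhere in this package); the path below is purely illustrative:
//
//	if err := WriteFile(fs, "dir/sub/report.txt", []byte("hello")); err != nil {
//		log.Fatal(err)
//	}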
// saveDigest serializes treeX to JSON, snappy-compresses it
// and writes it to fnDigest (with the extension .json.snappy).
func saveDigest(lg loghttp.FuncBufUniv, fs fsi.FileSystem, fnDigest string, treeX *DirTree) {
	treeX.LastFound = time.Now()

	b, err := json.MarshalIndent(treeX, "", "\t")
	lg(err)

	// Compression is currently forced by "|| true";
	// the size check alone would only compress digests approaching 1 MB.
	if len(b) > 1024*1024-1 || true {
		b1 := snappy.Encode(nil, b)
		lg("digest encoded from %vkB to %vkB ", len(b)/1024, len(b1)/1024)
		b = b1
		fnDigest = strings.Replace(fnDigest, ".json", ".json.snappy", -1)
	}

	err = fs.MkdirAll(path.Dir(fnDigest), 0755)
	lg(err)

	err = fs.WriteFile(fnDigest, b, 0755)
	lg(err)
}
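// A minimal sketch of the read side, for symmetry with saveDigest above.
// The package's actual loadDigest (called from FetchUsingRSS) has a
// different signature; the name loadDigestSketch and the error handling
// here are illustrative only. snappy.Decode is the documented inverse of
// snappy.Encode in github.com/golang/snappy.
func loadDigestSketch(fs fsi.FileSystem, fnDigest string, treeX *DirTree) error {
	fn := strings.Replace(fnDigest, ".json", ".json.snappy", -1)
	b, err := fs.ReadFile(fn)
	if err != nil {
		return err
	}
	b, err = snappy.Decode(nil, b) // inverse of snappy.Encode in saveDigest
	if err != nil {
		return err
	}
	return json.Unmarshal(b, treeX)
}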
// rssXMLFile fetches the RSS.xml file from rssUrl,
// parses it and saves an indented dump to the filesystem.
func rssXMLFile(w http.ResponseWriter, r *http.Request,
	fs fsi.FileSystem, rssUrl string) (RSS, *url.URL) {

	lg, lge := loghttp.Logger(w, r)

	bts, respInf, err := fetch.UrlGetter(r, fetch.Options{URL: rssUrl})
	lge(err)

	// hack: encoding/xml cannot easily address the namespaced tag
	// <content:encoded>; rename it so it unmarshals like an ordinary element
	bts = bytes.Replace(bts, []byte("content:encoded>"), []byte("content-encoded>S"), -1)

	rssDoc := RSS{}
	err = xml.Unmarshal(bts, &rssDoc)
	lge(err)

	// save it
	bdmp := stringspb.IndentedDumpBytes(rssDoc)
	err = fs.MkdirAll(path.Join(docRoot, respInf.URL.Host), 0755)
	lge(err)
	err = fs.WriteFile(path.Join(docRoot, respInf.URL.Host, "outp_rss.xml"), bdmp, 0755)
	lge(err)
	lg("RSS resp size %5.2vkB, saved to %v", len(bdmp)/1024, respInf.URL.Host+"/outp_rss.xml")

	return rssDoc, respInf.URL
}
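// Sketch of how the renamed element can then be picked up by
// encoding/xml struct tags. The real RSS type lives elsewhere in the
// package; the struct and field names here are illustrative only:
type itemSketch struct {
	Title   string `xml:"title"`
	Link    string `xml:"link"`
	Content string `xml:"content-encoded"` // matches the renamed <content:encoded>
}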
// CreateSys builds a small directory tree with a few files on fs
// and reads everything back, comparing found counts and sizes
// against expectations.
// It returns a log buffer and an empty string on success,
// otherwise a description of the mismatches.
func CreateSys(fs fsi.FileSystem) (*bytes.Buffer, string) {

	bb := new(bytes.Buffer)
	wpf(bb, "--------create-dirs---------\n")

	fc1 := func(p []string) {
		path := pth.Join(p...)
		err := fs.MkdirAll(relOpt+path, os.ModePerm)
		if err != nil {
			wpf(bb, "MkdirAll failed %v\n", err)
		}
	}
	fc1([]string{"ch1"})
	fc1([]string{"ch1", "ch2"})
	fc1([]string{"ch1", "ch2", "ch3"})
	fc1([]string{"ch1", "ch2", "ch3", "ch4"})
	fc1([]string{"ch1", "ch2a"})
	fc1([]string{"ch1A"})
	fc1([]string{"ch1B"})
	fc1([]string{"d1", "d2", "d3_secretdir", "neverwalked"})
	fc1([]string{"d1", "d2", "d3a", "willwalk"})

	wpf(bb, "\n--------retrieve-dirs---------\n")

	// retrieval
	gotByPath := 0
	wntByPath := 5
	fc2 := func(p []string) {
		path := pth.Join(p...)
		wpf(bb, "searching... %q\n", path)
		f, err := fs.Lstat(relOpt + path)
		if err != nil {
			wpf(bb, "   nothing retrieved - err %v\n", err)
		} else {
			wpf(bb, "   fnd %v \n", pth.Join(path, f.Name()))
			gotByPath++
		}
	}
	fc2([]string{"ch1"})
	fc2([]string{"ch1", "ch2"})
	fc2([]string{"ch1", "non-exist-dir"})
	fc2([]string{"ch1", "ch2", "ch3"})
	fc2([]string{"ch1A"})
	fc2([]string{rel})

	wpf(bb, "\nfnd %v of %v dirs \n", gotByPath, wntByPath)

	wpf(bb, "\n-------create and save some files----\n")

	fc4a := func(name, content string) {
		err := fs.WriteFile(relOpt+name, []byte(content), os.ModePerm)
		if err != nil {
			wpf(bb, "WriteFile %v failed %v\n", name, err)
		}
	}
	fc4b := func(name, content string) {
		f, err := fs.Create(relOpt + name)
		if err != nil {
			wpf(bb, "Create %v failed %v\n", name, err)
			return
		}
		_, err = f.WriteString(content)
		if err != nil {
			wpf(bb, "WriteString %v failed %v\n", name, err)
		}
		err = f.Close()
		if err != nil {
			wpf(bb, "Close %v failed %v\n", name, err)
		}
	}
	fc4a("ch1/ch2/file_1", "content 1")
	fc4b("ch1/ch2/file_2", "content 2")
	fc4a("ch1/ch2/ch3/file3", "another content")
	fc4b(relPsep+"file4", "chq content 2")

	// fsc, ok := memfs.Unwrap(fs)
	// if ok {
	// 	fsc.Dump()
	// }
	// return bb, ""

	wpf(bb, "\n-------retrieve files again----\n\n")

	gotNumFiles := 0
	wntNumFiles := 4
	gotSizeFiles := 0
	wntSizeFiles := 9 + 9 + 15 + 13

	fc5 := func(path string) {
		files, err := fs.ReadDir(relOpt + path)
		if err != nil {
			wpf(bb, "filesByPath %v failed %v\n", path, err)
		}
		wpf(bb, " srch %-20q yielded %v dirs+files\n", relOpt+path, len(files))
		for k, v := range files {
			if v.IsDir() {
				wpf(bb, "   skip dir %v \n", v.Name())
				continue
			}
			data, err := fs.ReadFile(pth.Join(path, v.Name()))
			if err != nil {
				wpf(bb, "could not get content of %v => %v\n", pth.Join(path, v.Name()), err)
			}
			wpf(bb, "   %v - %v %s\n", k, pth.Join(path, v.Name()), data)
			gotNumFiles++
			gotSizeFiles += len(data)
		}
	}
	fc5("ch1/ch2")
	fc5("ch1/ch2/ch3")
	fc5(rel)

	wpf(bb, "\n")
	wpf(bb, "fnd %2v of %2v fils \n", gotNumFiles, wntNumFiles)
	wpf(bb, "fnd %2v of %2v fsize \n", gotSizeFiles, wntSizeFiles)
	wpf(bb, "\n")

	testRes := ""
	if gotNumFiles != wntNumFiles {
		testRes += spf("Create files num : wnt %2v - got %v\n", wntNumFiles, gotNumFiles)
	}
	if gotSizeFiles != wntSizeFiles {
		testRes += spf("Create files size: wnt %2v - got %v\n", wntSizeFiles, gotSizeFiles)
	}
	return bb, testRes
}
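// Minimal driver sketch for CreateSys, assuming an in-memory
// implementation such as this package's memfs (the constructor name
// New is an assumption, only memfs.Unwrap appears above):
//
//	fs := memfs.New()
//	bb, res := CreateSys(fs)
//	fmt.Print(bb.String())
//	if res != "" {
//		log.Fatalf("filesystem self-test failed:\n%s", res)
//	}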
// FetchUsingRSS takes an RSS XML URI and fetches some of its documents.
// It uses a three-staged pipeline for parallel fetching.
// Results are stored into the given filesystem fs.
// Config points to the source of the RSS XML
// and has some rules for conflating URI directories.
// uriPrefix and config.DesiredNumber tell the func
// which subdirs of the RSS dir should be fetched - and how many at most.
func FetchUsingRSS(w http.ResponseWriter, r *http.Request,
	fs fsi.FileSystem, config FetchCommand,
) {

	lg, b := loghttp.BuffLoggerUniversal(w, r)
	closureOverBuf := func(bUnused *bytes.Buffer) {
		loghttp.Pf(w, r, b.String())
	}
	defer closureOverBuf(b) // the argument is ignored

	if config.Host == "" {
		lg(" empty host; returning")
		return
	}

	config = addDefaults(config)

	// Fetching the rssXML takes time.
	// We do it before the timeouts of the pipeline stages are set off.
	lg(" ")
	lg(config.Host)
	if config.Host == "test.economist.com" {
		switchTData(w, r)
	}

	// lg(stringspb.IndentedDump(config))

	dirTree := &DirTree{Name: "/", Dirs: map[string]DirTree{}, EndPoint: true}

	fnDigest := path.Join(docRoot, config.Host, "digest2.json")
	loadDigest(w, r, lg, fs, fnDigest, dirTree) // previous

	age := time.Since(dirTree.LastFound)
	lg("DirTree is %5.2v hours old (%v)", age.Hours(), dirTree.LastFound.Format(time.ANSIC))
	if age.Hours() > 0.001 {

		rssUrl := matchingRSSURI(w, r, config)
		if rssUrl == "" {
			m := new(MyWorker)
			m.r = r
			m.lg = lg
			m.fs1 = fs
			m.SURL = path.Join(config.Host, config.SearchPrefix)
			_, _, _, err := fetchSave(m)
			lg(err)
			if err != nil {
				return
			}
		} else {
			rssUrl = path.Join(config.Host, rssUrl)
			rssDoc, rssUrlObj := rssXMLFile(w, r, fs, rssUrl)
			_ = rssUrlObj
			rssDoc2DirTree(w, r, dirTree, rssDoc, config.Host)
		}

		saveDigest(lg, fs, fnDigest, dirTree)
	}

	// lg(dirTree.String())

	//
	// setting up a 3-staged pipeline from bottom up
	//
	var fullArticles []FullArticle

	var inn chan *FullArticle = make(chan *FullArticle) // jobs are stuffed in here
	var out chan *FullArticle = make(chan *FullArticle) // completed jobs are delivered here
	var fin chan struct{} = make(chan struct{})         // downstream signals end to upstream

	var stage3Wait sync.WaitGroup
	stage3Wait.Add(1) // registered before the goroutine starts, so Wait below cannot pass early

	// stage 3
	// fire up the "collector", a fan-in
	go func() {
		// 400 good value; critical point at 35
		// economist.com required 800 ms
		const delayInitial = 1200
		const delayRefresh = 800
		cout := time.After(time.Millisecond * delayInitial)
		for {
			select {
			case fa := <-out:
				fullArticles = append(fullArticles, *fa)
				pth := fetch.PathFromStringUrl(fa.Url)
				lg("    fetched %v - %v ", fa.Mod.Format("15:04:05"), stringspb.Ellipsoider(pth, 50))
				cout = time.After(time.Millisecond * delayRefresh) // refresh timeout
			case <-cout:
				lg("timeout after %v articles", len(fullArticles))
				// we are using  channel == nil  -  channel closed  combinations
				// inspired by http://dave.cheney.net/2013/04/30/curious-channels
				out = nil // not close(out) => case above is now blocked
				close(fin)
				lg("fin closed; out nilled")
				stage3Wait.Done()
				return
			}
		}
	}()

	//
	// stage 2
	for i := 0; i < numWorkers; i++ {
		// fire up a dedicated fetcher routine, a worker
		// we are using  channel == nil  -  channel closed  combinations
		// inspired by http://dave.cheney.net/2013/04/30/curious-channels
		go func() {
			var a *FullArticle
			for {
				select {
				case a = <-inn:
					var err error
					var inf fetch.Info
					a.Body, inf, err = fetch.UrlGetter(r, fetch.Options{URL: a.Url})
					lg(err)
					if a.Mod.IsZero() {
						a.Mod = inf.Mod
					}
					select {
					case out <- a:
					case <-fin:
						lg("   worker spinning down; branch 1; abandoning %v", a.Url)
						return
					}
					a = new(FullArticle)
				case <-fin:
					if a != nil && a.Url != "" {
						u, _ := url.Parse(a.Url)
						lg("   abandoned %v", u.Path)
					} else {
						lg("   worker spinning down; branch 2")
					}
					return
				}
			}
		}()
	}

	//
	// loading stage 1
	uriPrefix := config.SearchPrefix
	found := 0
	uriPrefixExcl := "impossible"
	for i := 0; i < 15; i++ {
		lg("  searching for prefix %v - excl %q - %v of %v",
			uriPrefix, uriPrefixExcl, found, config.DesiredNumber)
		found += stuffStage1(w, r, config, inn, fin, dirTree,
			uriPrefixExcl, uriPrefix, config.DesiredNumber-found)

		if found >= config.DesiredNumber {
			break
		}

		if uriPrefix == "/" || uriPrefix == "." {
			lg("  root exhausted")
			break
		}

		newPrefix := path.Dir(uriPrefix)
		uriPrefixExcl = uriPrefix
		uriPrefix = newPrefix
	}
	lg("  found %v of %v", found, config.DesiredNumber)

	//
	lg("stage3Wait.Wait() before")
	stage3Wait.Wait()
	lg("stage3Wait.Wait() after")

	// workers spin down earlier -
	// but the app engine log writer and the response writer
	// need some time to record the spin-down messages
	time.Sleep(120 * time.Millisecond)

	// compile directory statistics of the fetched articles
	histoDir := map[string]int{}
	for _, a := range fullArticles {
		u, err := url.Parse(a.Url)
		lg(err)
		semanticUri := condenseTrailingDir(u.Path, config.CondenseTrailingDirs)
		dir := path.Dir(semanticUri)
		histoDir[dir]++
	}
	sr := sortmap.SortMapByCount(histoDir)
	_ = sr

	// Create dirs
	for k := range histoDir {
		dir := path.Join(docRoot, k) // config.Host already contained in k
		err := fs.MkdirAll(dir, 0755)
		lg(err)
		err = fs.Chtimes(dir, time.Now(), time.Now())
		lg(err)
	}

	// Saving as files
	for _, a := range fullArticles {
		if len(a.Body) == 0 {
			continue
		}
		u, err := url.Parse(a.Url)
		lg(err)
		if err != nil {
			continue // avoid dereferencing a nil URL below
		}
		u.Fragment = ""
		u.RawQuery = ""
		semanticUri := condenseTrailingDir(u.RequestURI(), config.CondenseTrailingDirs)
		p := path.Join(docRoot, semanticUri)
		err = fs.WriteFile(p, a.Body, 0644)
		lg(err)
		err = fs.Chtimes(p, a.Mod, a.Mod)
		lg(err)
	}

	{
		b, err := json.MarshalIndent(histoDir, " ", "\t")
		lg(err)
		fnDigest := path.Join(docRoot, config.Host, "fetchDigest.json")
		err = fs.WriteFile(fnDigest, b, 0755)
		lg(err)
	}

	// fsm, ok := memfs.Unwrap(fs)
	// if ok {
	// 	fsm.Dump()
	// }
}
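// The collector/worker shutdown in FetchUsingRSS relies on two channel
// properties: receiving from a nil channel blocks forever, and receiving
// from a closed channel returns immediately. Setting  out = nil  mutes
// the delivery case, while  close(fin)  wakes every worker at once
// (see http://dave.cheney.net/2013/04/30/curious-channels).
// A minimal self-contained sketch of the pattern; all names here are
// illustrative, not part of the package:
func curiousChannelsSketch() {
	out := make(chan int)
	fin := make(chan struct{})

	go func() { // a single producer standing in for the workers
		for i := 0; ; i++ {
			select {
			case out <- i:
			case <-fin: // downstream told us to stop
				return
			}
		}
	}()

	received := 0
	timeout := time.After(50 * time.Millisecond)
	for out != nil {
		select {
		case v := <-out:
			received++
			_ = v
			timeout = time.After(50 * time.Millisecond) // refresh, as in stage 3
		case <-timeout:
			out = nil  // mutes the receive case above
			close(fin) // broadcasts shutdown to the producer
		}
	}
}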