func testReaddirnames(fs fsi.FileSystem, dir string, contents []string, t *testing.T) {
	file, err := fs.Open(dir)
	if err != nil {
		t.Fatalf("open %q failed: %v", dir, err)
	}
	defer file.Close()

	s, err2 := file.Readdirnames(-1)
	if err2 != nil {
		t.Fatalf("readdirnames %q failed: %v", dir, err2)
	}

	for _, m := range contents {
		found := false
		for _, n := range s {
			if n == "." || n == ".." {
				t.Errorf("got %s in directory", n)
			}
			if equal(m, n) {
				if found {
					t.Error("present twice:", m)
				}
				found = true
			}
		}
		if !found {
			t.Error("could not find", m)
		}
	}
}
func loadDigest(w http.ResponseWriter, r *http.Request, lg loghttp.FuncBufUniv, fs fsi.FileSystem, fnDigest string, treeX *DirTree) {

	fnDigestSnappied := strings.Replace(fnDigest, ".json", ".json.snappy", -1)

	bts, err := fs.ReadFile(fnDigestSnappied)
	if err == nil {
		lg("encoded digest loaded, size %vkB", len(bts)/1024)
		// errDec is distinct from err, so the outer err keeps
		// signalling whether a digest file was read at all.
		btsDec, errDec := snappy.Decode(nil, bts)
		if errDec != nil {
			lg(errDec)
			return
		}
		lg("digest decoded from %vkB to %vkB", len(bts)/1024, len(btsDec)/1024)
		bts = btsDec
	} else {
		bts, err = fs.ReadFile(fnDigest)
		lg(err)
	}

	if err == nil {
		err = json.Unmarshal(bts, &treeX)
		lg(err)
	}

	lg("DirTree %5.2vkB loaded for %v", len(bts)/1024, fnDigest)
}
// Walk walks the file tree rooted at root, calling walkFn for each file or
// directory in the tree, including root.
//
// It requires only the fsi.FileSystem interface and is therefore
// implementation independent.
//
// It is similar to filepath.Walk(root string, walkFn WalkFunc).
//
// Directories are crawled in the order returned by fs.ReadDir().
// Walk crawls directories first, files second.
//
// Errors that arise when visiting directories can be filtered by walkFn.
//
// Walk does not follow symbolic links.
func Walk(fs fsi.FileSystem, root string, walkFn WalkFunc) error {
	info, err := fs.Lstat(root)
	if err != nil {
		// log.Printf("walk start error %10v %v", root, err)
		return walkFn(root, nil, err)
	}
	// log.Printf("walk start fnd %v", info.Name())
	return walk(fs, root, info, walkFn)
}
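// Usage sketch for Walk (illustrative addition, not part of the original
// source). The memfs.New() constructor is an assumption standing in for any
// concrete fsi.FileSystem; the WalkFunc below prints every path and prunes
// directories ending in "_secretdir" via SkipDir.
func ExampleWalk() {
	fs := memfs.New() // hypothetical in-memory implementation
	err := Walk(fs, "/", func(path string, info os.FileInfo, err error) error {
		if err != nil {
			return err
		}
		if info.IsDir() && strings.HasSuffix(info.Name(), "_secretdir") {
			return SkipDir // prune this subtree
		}
		fmt.Println(path)
		return nil
	})
	fmt.Println(err)
}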
func RemoveSubtree(fs fsi.FileSystem) (*bytes.Buffer, string) {
	bb := new(bytes.Buffer)
	wpf(bb, "-------removedir----\n\n")

	err := fs.RemoveAll("ch1/ch2/ch3")
	wpf(bb, "fs.RemoveAll() returned %v\n\n", err)

	testRes := ""
	if err != nil {
		testRes = spf("RemoveTree: %v", err)
	}
	return bb, testRes
}
// walk recursively descends path, calling walkFn.
func walk(fs fsi.FileSystem, path string, info os.FileInfo, walkFn WalkFunc) error {

	// cntr++
	// if cntr > 20 {
	// 	return fmt.Errorf("too many recursions")
	// }

	err := walkFn(path, info, nil)
	if err != nil {
		if info.IsDir() && err == SkipDir {
			return nil
		}
		return err
	}

	if !info.IsDir() {
		return nil
	}

	fis, err := fs.ReadDir(path)
	// fnd := ""
	// for i := 0; i < len(fis); i++ {
	// 	fnd += fis[i].Name() + ", "
	// }
	// log.Printf("readdir of %-26v => %v, %v", path, len(fis), fnd)
	if err != nil && err != fsi.EmptyQueryResult {
		return walkFn(path, info, err)
	}

	//
	for _, fi := range fis {
		filename := pth.Join(path, pth.Base(fi.Name()))
		fileInfo, err := fs.Lstat(filename)
		if err != nil {
			if err := walkFn(filename, fileInfo, err); err != nil && err != SkipDir {
				return err
			}
		} else {
			err = walk(fs, filename, fileInfo, walkFn)
			if err != nil {
				if !fileInfo.IsDir() || err != SkipDir {
					return err
				}
			}
		}
	}
	return nil
}
func writeFile(t *testing.T, fs fsi.FileSystem, fname string, flag int, text string) string {
	f, err := fs.OpenFile(fname, flag, 0666)
	if err != nil {
		t.Fatalf("Open: %v", err)
	}
	n, err := io.WriteString(f, text)
	if err != nil {
		t.Fatalf("WriteString: %d, %v", n, err)
	}
	f.Close()

	// Read back through the same filesystem the write went to
	// (ioutil.ReadFile would bypass non-OS implementations).
	data, err := fs.ReadFile(fname)
	if err != nil {
		t.Fatalf("ReadFile: %v", err)
	}
	return string(data)
}
func saveDigest(lg loghttp.FuncBufUniv, fs fsi.FileSystem, fnDigest string, treeX *DirTree) {

	treeX.LastFound = time.Now()

	b, err := json.MarshalIndent(treeX, "", "\t")
	lg(err)

	// The "|| true" currently forces the snappy branch for every digest,
	// not only those close to the 1MB limit.
	if len(b) > 1024*1024-1 || true {
		b1 := snappy.Encode(nil, b)
		lg("digest encoded from %vkB to %vkB", len(b)/1024, len(b1)/1024)
		b = b1
		fnDigest = strings.Replace(fnDigest, ".json", ".json.snappy", -1)
	}

	err = fs.MkdirAll(path.Dir(fnDigest), 0755)
	lg(err)

	err = fs.WriteFile(fnDigest, b, 0755)
	lg(err)
}
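// Round-trip sketch for the compression used by saveDigest and loadDigest
// (illustrative addition): snappy.Encode returns the compressed bytes,
// snappy.Decode reverses it; passing nil lets snappy allocate the buffers.
func exampleDigestRoundTrip() {
	raw := []byte(`{"Name":"/","Dirs":{}}`)
	enc := snappy.Encode(nil, raw)
	dec, err := snappy.Decode(nil, enc)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(bytes.Equal(raw, dec)) // true
}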
// rssXMLFile fetches the RSS XML file from rssUrl,
// parses it, and saves a dump of the result to fs.
func rssXMLFile(w http.ResponseWriter, r *http.Request, fs fsi.FileSystem, rssUrl string) (RSS, *url.URL) {

	lg, lge := loghttp.Logger(w, r)

	bts, respInf, err := fetch.UrlGetter(r, fetch.Options{URL: rssUrl})
	lge(err)

	bts = bytes.Replace(bts, []byte("content:encoded>"), []byte("content-encoded>S"), -1) // hack

	rssDoc := RSS{}
	err = xml.Unmarshal(bts, &rssDoc)
	lge(err)

	// save it
	bdmp := stringspb.IndentedDumpBytes(rssDoc)
	err = fs.MkdirAll(path.Join(docRoot, respInf.URL.Host), 0755)
	lge(err)
	err = fs.WriteFile(path.Join(docRoot, respInf.URL.Host, "outp_rss.xml"), bdmp, 0755)
	lge(err)
	lg("RSS resp size %5.2vkB, saved to %v", len(bdmp)/1024, respInf.URL.Host+"/outp_rss.xml")

	return rssDoc, respInf.URL
}
func RetrieveByReadDir(fs fsi.FileSystem) (*bytes.Buffer, string) {
	bb := new(bytes.Buffer)
	wpf(bb, "--------retrieve by readDir---------\n\n")

	wnt1 := []int{2, 3, 2, 5}
	wnt2 := []int{2, 2, 5}
	got := []int{}

	fc3 := func(path string) {
		wpf(bb, "searching %q\n", path)
		children, err := fs.ReadDir(path)
		if err != nil {
			wpf(bb, "  nothing retrieved - err %v\n", err)
		} else {
			for k, v := range children {
				wpf(bb, "  child #%-2v  %-24v\n", k, pth.Join(path, v.Name()))
			}
			got = append(got, len(children))
		}
		wpf(bb, "\n")
	}

	fc3(`ch1/ch2/ch3`)
	fc3(`ch1/ch2`)
	fc3(`ch1`)
	fc3(rel)

	testRes := ""
	if spf("%+v", wnt1) != spf("%+v", got) && spf("%+v", wnt2) != spf("%+v", got) {
		testRes = spf("ReadDir: wnt %v or %v - got %v", wnt1, wnt2, got)
	}
	return bb, testRes
}
func newFile(testName string, fs fsi.FileSystem, t *testing.T) (f fsi.File) {
	// Use a local file system, not NFS.
	// On Unix, override $TMPDIR in case the user
	// has it set to an NFS-mounted directory.
	dir := ""
	if runtime.GOOS != "windows" {
		dir = "/tmp"
	}
	fs.MkdirAll(dir, 0777)
	f, err := fs.Create(path.Join(dir, testName))
	if err != nil {
		t.Fatalf("%v: open %s: %s", fs.Name(), testName, err)
	}
	return f
}
// WriteFile creates any missing parent directories of fn,
// then writes b to the file.
func WriteFile(fs fsi.FileSystem, fn string, b []byte) error {
	dir, _ := fs.SplitX(fn)

	err := fs.MkdirAll(dir, os.ModePerm)
	if err != nil && err != fsi.ErrFileExists {
		return err
	}

	err = fs.WriteFile(fn, b, 0)
	if err != nil {
		return err
	}
	return nil
}
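// Usage sketch (illustrative addition): unlike a bare fs.WriteFile,
// WriteFile first ensures the parent directory chain exists.
func exampleWriteFile(fs fsi.FileSystem) error {
	return WriteFile(fs, "ch1/ch2/new_file.txt", []byte("payload"))
}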
func CreateSys(fs fsi.FileSystem) (*bytes.Buffer, string) {
	bb := new(bytes.Buffer)
	wpf(bb, "--------create-dirs---------\n")

	fc1 := func(p []string) {
		path := pth.Join(p...)
		err := fs.MkdirAll(relOpt+path, os.ModePerm)
		if err != nil {
			wpf(bb, "MkdirAll failed %v\n", err)
		}
	}

	fc1([]string{"ch1"})
	fc1([]string{"ch1", "ch2"})
	fc1([]string{"ch1", "ch2", "ch3"})
	fc1([]string{"ch1", "ch2", "ch3", "ch4"})
	fc1([]string{"ch1", "ch2a"})
	fc1([]string{"ch1A"})
	fc1([]string{"ch1B"})
	fc1([]string{"d1", "d2", "d3_secretdir", "neverwalked"})
	fc1([]string{"d1", "d2", "d3a", "willwalk"})

	wpf(bb, "\n--------retrieve-dirs---------\n")

	// retrieval
	gotByPath := 0
	wntByPath := 5
	fc2 := func(p []string) {
		path := pth.Join(p...)
		wpf(bb, "searching... %q\n", path)
		f, err := fs.Lstat(relOpt + path)
		if err != nil {
			wpf(bb, "   nothing retrieved - err %v\n", err)
		} else {
			wpf(bb, "   fnd %v \n", pth.Join(path, f.Name()))
			gotByPath++
		}
	}
	fc2([]string{"ch1"})
	fc2([]string{"ch1", "ch2"})
	fc2([]string{"ch1", "non-exist-dir"})
	fc2([]string{"ch1", "ch2", "ch3"})
	fc2([]string{"ch1A"})
	fc2([]string{rel})

	wpf(bb, "\nfnd %v of %v dirs \n", gotByPath, wntByPath)

	wpf(bb, "\n-------create and save some files----\n")

	fc4a := func(name, content string) {
		err := fs.WriteFile(relOpt+name, []byte(content), os.ModePerm)
		if err != nil {
			wpf(bb, "WriteFile %v failed %v\n", name, err)
		}
	}
	fc4b := func(name, content string) {
		f, err := fs.Create(relOpt + name)
		if err != nil {
			wpf(bb, "Create %v failed %v\n", name, err)
			return
		}
		_, err = f.WriteString(content)
		if err != nil {
			wpf(bb, "WriteString %v failed %v\n", name, err)
		}
		err = f.Close()
		if err != nil {
			wpf(bb, "Close %v failed %v\n", name, err)
		}
	}

	fc4a("ch1/ch2/file_1", "content 1")
	fc4b("ch1/ch2/file_2", "content 2")
	fc4a("ch1/ch2/ch3/file3", "another content")
	fc4b(relPsep+"file4", "chq content 2")

	// fsc, ok := memfs.Unwrap(fs)
	// if ok {
	// 	fsc.Dump()
	// }
	// return bb, ""

	wpf(bb, "\n-------retrieve files again----\n\n")

	gotNumFiles := 0
	wntNumFiles := 4
	gotSizeFiles := 0
	wntSizeFiles := 9 + 9 + 15 + 13

	fc5 := func(path string) {
		files, err := fs.ReadDir(relOpt + path)
		if err != nil {
			wpf(bb, "filesByPath %v failed %v\n", path, err)
		}

		wpf(bb, " srch %-20q yielded %v dirs+files\n", relOpt+path, len(files))
		for k, v := range files {
			if v.IsDir() {
				wpf(bb, "   skip dir %v \n", v.Name())
				continue
			}
			data, err := fs.ReadFile(pth.Join(path, v.Name()))
			if err != nil {
				wpf(bb, "could not get content of %v => %v\n", pth.Join(path, v.Name()), err)
			}
			wpf(bb, "   %v  -  %v %s\n", k, pth.Join(path, v.Name()), data)
			gotNumFiles++
			gotSizeFiles += len(data)
		}
	}
	fc5("ch1/ch2")
	fc5("ch1/ch2/ch3")
	fc5(rel)

	wpf(bb, "\n")
	wpf(bb, "fnd %2v of %2v fils \n", gotNumFiles, wntNumFiles)
	wpf(bb, "fnd %2v of %2v fsize \n", gotSizeFiles, wntSizeFiles)
	wpf(bb, "\n")

	testRes := ""
	if gotNumFiles != wntNumFiles {
		testRes += spf("Create files num : wnt %2v - got %v\n", wntNumFiles, gotNumFiles)
	}
	if gotSizeFiles != wntSizeFiles {
		testRes += spf("Create files size: wnt %2v - got %v\n", wntSizeFiles, gotSizeFiles)
	}
	return bb, testRes
}
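// Fixture tree built by CreateSys (sketch; the relOpt/rel prefixes are
// omitted, file sizes match wntSizeFiles = 9+9+15+13):
//
//	ch1/
//		ch2/
//			ch3/
//				ch4/
//				file3      "another content"
//			file_1         "content 1"
//			file_2         "content 2"
//		ch2a/
//	ch1A/
//	ch1B/
//	d1/d2/d3_secretdir/neverwalked/
//	d1/d2/d3a/willwalk/
//	<rel>/file4            "chq content 2"
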
// Dedup puts it all together:
// it cleans and textifies the given articles,
// weeds out repeated fragments level by level,
// and returns the deduplicated DOM of the first article.
func Dedup(oURL *url.URL, least3Files []repo.FullArticle, lg loghttp.FuncBufUniv, fs fsi.FileSystem) *html.Node {

	opts := domclean2.CleaningOptions{Proxify: true, Beautify: true}
	// opts.FNamer = fNamer
	opts.AddOutline = true
	// opts.RemoteHost = fetch.HostFromStringUrl(least3Files[0].Url)
	opts.RemoteHost = oURL.Host

	//
	// domclean
	for i := 0; i < len(least3Files); i++ {
		fNamer := domclean2.FileNamer(logDir, i)
		fNamer() // first call yields key

		lg("cleaning %4.1fkB from %v", float64(len(least3Files[i].Body))/1024, stringspb.ToLenR(least3Files[i].Url, 60))
		doc, err := domclean2.DomClean(least3Files[i].Body, opts)
		lg(err)

		fileDump(lg, fs, doc, fNamer, ".html")
	}

	if false {
		//
		// Textify with brute force
		for i := 0; i < len(least3Files); i++ {
			fNamer := domclean2.FileNamer(logDir, i)
			fNamer() // first call yields key

			bts, err := fs.ReadFile(fNamer() + ".html")
			lg(err)
			doc, err := html.Parse(bytes.NewReader(bts))
			lg(err)

			textifyBruteForce(doc)

			var buf bytes.Buffer
			err = html.Render(&buf, doc)
			lg(err)

			b := buf.Bytes()
			b = bytes.Replace(b, []byte("[br]"), []byte("\n"), -1)

			fileDump(lg, fs, b, fNamer, "_raw.txt")
		}
	}

	//
	// Textify with more fine-tuning.
	// Save result to memory.
	textsByArticOutl := map[string][]*TextifiedTree{}
	for i := 0; i < len(least3Files); i++ {
		fNamer := domclean2.FileNamer(logDir, i)
		fnKey := fNamer() // first call yields key

		bts, err := fs.ReadFile(fNamer() + ".html")
		lg(err)
		doc, err := html.Parse(bytes.NewReader(bts))
		lg(err)

		fNamer() // one more

		//
		mp, bts := BubbledUpTextExtraction(doc, fnKey)
		fileDump(lg, fs, bts, fNamer, ".txt")

		mpSorted, dump := orderByOutline(mp)
		fileDump(lg, fs, dump, fNamer, ".txt")

		textsByArticOutl[fnKey] = mpSorted

		// for k, v := range mpSorted {
		// 	if k%33 != 0 {
		// 		continue
		// 	}
		// 	log.Printf("%3v: %v %14v %v\n", k, v.SourceID, v.Outline, v.Lvl)
		// }
	}

	//
	// We progress from level 1 downwards.
	// Lower levels skip weeded-out higher levels,
	// to save expensive levenshtein comparisons.
	var skipPrefixes = map[string]bool{}
	for weedStage := 1; weedStage <= stageMax; weedStage++ {

		fNamer := domclean2.FileNamer(logDir, 0)
		fnKey := fNamer() // first call yields key

		levelsToProcess = map[int]bool{weedStage: true}
		frags := similarTextifiedTrees(textsByArticOutl, skipPrefixes, map[string]bool{fnKey: true})

		similaritiesToFile(fs, logDir, frags, weedStage)

		for _, frag := range frags {
			if len(frag.Similars) >= numTotal-1 && frag.SumRelLevenshtein/(numTotal-1) < 0.2 {
				skipPrefixes[frag.Outline+"."] = true
			}
		}

		b := new(bytes.Buffer)
		for k := range skipPrefixes {
			b.WriteString(k)
			b.WriteByte(32)
		}
		// log.Printf("%v\n", b.String())
	}

	//
	// Apply dedup
	fNamer := domclean2.FileNamer(logDir, 0)
	fNamer() // first call yields key

	bts, err := fs.ReadFile(fNamer() + ".html")
	lg(err)
	doc, err := html.Parse(bytes.NewReader(bts))
	lg(err)

	dedupApply(doc, skipPrefixes)

	// A special after-dedup cleaning:
	// remove ol and cfrm attributes.
	var fr func(*html.Node)
	fr = func(n *html.Node) {
		if n.Type == html.ElementNode {
			attr2 := make([]html.Attribute, 0, len(n.Attr))
			for _, attr := range n.Attr {
				if attr.Key != "ol" && attr.Key != "cfrm" {
					attr2 = append(attr2, attr)
				}
			}
			n.Attr = attr2
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			fr(c)
		}
	}
	fr(doc)

	if false { // does not add value
		var b7 bytes.Buffer
		err := html.Render(&b7, doc)
		lg(err)
		doc, err = domclean2.DomClean(b7.Bytes(), opts)
		lg(err)
	} else {
		domclean2.DomFormat(doc)
	}

	return doc
}
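// Sketch of the prefix test that dedupApply is assumed to perform per node
// (illustrative addition; dedupApply itself is defined elsewhere): a node
// whose outline is "1.2.3" is dropped once "1.2." was collected above.
func hasSkipPrefix(outline string, skipPrefixes map[string]bool) bool {
	for p := range skipPrefixes {
		if strings.HasPrefix(outline+".", p) {
			return true
		}
	}
	return false
}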
// FetchUsingRSS takes an RSS XML URI and fetches some of its documents.
// It uses a three-staged pipeline for parallel fetching.
// Results are stored into the given filesystem fs.
// Config points to the source of RSS XML,
// and has some rules for conflating URI directories.
// uriPrefix and config.DesiredNumber tell the func
// which subdirs of the RSS dir should be fetched - and how many at max.
func FetchUsingRSS(w http.ResponseWriter, r *http.Request,
	fs fsi.FileSystem, config FetchCommand,
) {

	lg, b := loghttp.BuffLoggerUniversal(w, r)
	closureOverBuf := func(bUnused *bytes.Buffer) {
		loghttp.Pf(w, r, b.String())
	}
	defer closureOverBuf(b) // the argument is ignored

	if config.Host == "" {
		lg(" empty host; returning")
		return
	}

	config = addDefaults(config)

	// Fetching the rssXML takes time.
	// We do it before the timeouts of the pipeline stages are set off.
	lg(" ")
	lg(config.Host)
	if config.Host == "test.economist.com" {
		switchTData(w, r)
	}

	// lg(stringspb.IndentedDump(config))
	dirTree := &DirTree{Name: "/", Dirs: map[string]DirTree{}, EndPoint: true}

	fnDigest := path.Join(docRoot, config.Host, "digest2.json")
	loadDigest(w, r, lg, fs, fnDigest, dirTree) // previous

	age := time.Now().Sub(dirTree.LastFound)
	lg("DirTree is %5.2v hours old (%v)", age.Hours(), dirTree.LastFound.Format(time.ANSIC))

	if age.Hours() > 0.001 {

		rssUrl := matchingRSSURI(w, r, config)
		if rssUrl == "" {
			m := new(MyWorker)
			m.r = r
			m.lg = lg
			m.fs1 = fs
			m.SURL = path.Join(config.Host, config.SearchPrefix)
			_, _, _, err := fetchSave(m)
			lg(err)
			if err != nil {
				return
			}
		} else {
			rssUrl = path.Join(config.Host, rssUrl)
			rssDoc, rssUrlObj := rssXMLFile(w, r, fs, rssUrl)
			_ = rssUrlObj
			rssDoc2DirTree(w, r, dirTree, rssDoc, config.Host)
		}

		saveDigest(lg, fs, fnDigest, dirTree)
	}

	// lg(dirTree.String())

	//
	//
	// setting up a 3 staged pipeline from bottom up
	//
	var fullArticles []FullArticle

	var inn chan *FullArticle = make(chan *FullArticle) // jobs are stuffed in here
	var out chan *FullArticle = make(chan *FullArticle) // completed jobs are delivered here
	var fin chan struct{} = make(chan struct{})         // downstream signals end to upstream
	var stage3Wait sync.WaitGroup

	// stage 3
	// fire up the "collector", a fan-in
	stage3Wait.Add(1) // register before the goroutine starts, so Wait() cannot run ahead of Add()
	go func() {
		// 400 is a good value; critical point at 35.
		// economist.com required 800 ms.
		const delayInitial = 1200
		const delayRefresh = 800
		cout := time.After(time.Millisecond * delayInitial)
		for {
			select {
			case fa := <-out:
				fullArticles = append(fullArticles, *fa)
				pth := fetch.PathFromStringUrl(fa.Url)
				lg("    fetched   %v - %v ", fa.Mod.Format("15:04:05"), stringspb.Ellipsoider(pth, 50))
				cout = time.After(time.Millisecond * delayRefresh) // refresh timeout
			case <-cout:
				lg("timeout after %v articles", len(fullArticles))
				// we are using channel == nil - channel closed combinations
				// inspired by http://dave.cheney.net/2013/04/30/curious-channels
				out = nil // not close(out) => case above is now blocked
				close(fin)
				lg("fin closed; out nilled")
				stage3Wait.Done()
				return
			}
		}
	}()

	//
	// stage 2
	for i := 0; i < numWorkers; i++ {
		// fire up a dedicated fetcher routine, a worker
		// we are using channel == nil - channel closed combinations
		// inspired by http://dave.cheney.net/2013/04/30/curious-channels
		go func() {
			var a *FullArticle
			for {
				select {
				case a = <-inn:
					var err error
					var inf fetch.Info
					a.Body, inf, err = fetch.UrlGetter(r, fetch.Options{URL: a.Url})
					lg(err)
					if a.Mod.IsZero() {
						a.Mod = inf.Mod
					}

					select {
					case out <- a:
					case <-fin:
						lg("    worker spinning down; branch 1; abandoning %v", a.Url)
						return
					}
					a = new(FullArticle)

				case <-fin:
					if a != nil && a.Url != "" {
						u, _ := url.Parse(a.Url)
						lg("    abandoned %v", u.Path)
					} else {
						lg("    worker spinning down; branch 2")
					}
					return
				}
			}
		}()
	}

	//
	//
	//
	// loading stage 1
	uriPrefix := config.SearchPrefix
	found := 0
	uriPrefixExcl := "impossible"
	for i := 0; i < 15; i++ {
		lg("  searching for prefix %v - excl %q - %v of %v", uriPrefix, uriPrefixExcl, found, config.DesiredNumber)
		found += stuffStage1(w, r, config, inn, fin, dirTree,
			uriPrefixExcl, uriPrefix, config.DesiredNumber-found)

		if found >= config.DesiredNumber {
			break
		}

		if uriPrefix == "/" || uriPrefix == "." {
			lg("  root exhausted")
			break
		}

		newPrefix := path.Dir(uriPrefix)
		uriPrefixExcl = uriPrefix
		uriPrefix = newPrefix
	}
	lg("  found %v of %v", found, config.DesiredNumber)

	//
	lg("stage3Wait.Wait() before")
	stage3Wait.Wait()
	lg("stage3Wait.Wait() after")

	// workers spin down earlier -
	// but the log writer and response writer need some time
	// to record the spin-down messages
	time.Sleep(120 * time.Millisecond)

	// compile statistics over the output directories
	histoDir := map[string]int{}
	for _, a := range fullArticles {
		u, err := url.Parse(a.Url)
		lg(err)
		semanticUri := condenseTrailingDir(u.Path, config.CondenseTrailingDirs)
		dir := path.Dir(semanticUri)
		histoDir[dir]++
	}
	sr := sortmap.SortMapByCount(histoDir)
	_ = sr

	// create dirs
	for k := range histoDir {
		dir := path.Join(docRoot, k) // config.Host already contained in k
		err := fs.MkdirAll(dir, 0755)
		lg(err)
		err = fs.Chtimes(dir, time.Now(), time.Now())
		lg(err)
	}

	// saving as files
	for _, a := range fullArticles {
		if len(a.Body) == 0 {
			continue
		}
		u, err := url.Parse(a.Url)
		lg(err)
		u.Fragment = ""
		u.RawQuery = ""
		semanticUri := condenseTrailingDir(u.RequestURI(), config.CondenseTrailingDirs)
		p := path.Join(docRoot, semanticUri)
		err = fs.WriteFile(p, a.Body, 0644)
		lg(err)
		err = fs.Chtimes(p, a.Mod, a.Mod)
		lg(err)
	}

	{
		b, err := json.MarshalIndent(histoDir, "  ", "\t")
		lg(err)
		fnDigest := path.Join(docRoot, config.Host, "fetchDigest.json")
		err = fs.WriteFile(fnDigest, b, 0755)
		lg(err)
	}

	// fsm, ok := memfs.Unwrap(fs)
	// if ok {
	// 	fsm.Dump()
	// }
}
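// Standalone sketch of the "curious channels" pattern used by the collector
// above (illustrative addition, cf. http://dave.cheney.net/2013/04/30/curious-channels):
// a nil channel blocks its select case forever, while a closed channel makes
// receives return immediately - together they implement the quiet-period shutdown.
func collectUntilQuiet(out chan string, fin chan struct{}) []string {
	var results []string
	quiet := time.After(200 * time.Millisecond)
	for {
		select {
		case s := <-out:
			results = append(results, s)
			quiet = time.After(200 * time.Millisecond) // refresh the timeout on every arrival
		case <-quiet:
			out = nil  // disable this case; do not close(out) - senders may still write
			close(fin) // broadcast shutdown to all workers
			return results
		}
	}
}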