// loadDigest loads the DirTree digest from fnDigest,
// preferring a snappy-compressed variant if one exists on disk.
func loadDigest(w http.ResponseWriter, r *http.Request, lg loghttp.FuncBufUniv,
	fs fsi.FileSystem, fnDigest string, treeX *DirTree) {

	fnDigestSnappied := strings.Replace(fnDigest, ".json", ".json.snappy", -1)

	bts, err := fs.ReadFile(fnDigestSnappied)
	if err == nil {
		lg("encoded digest loaded, size %vkB", len(bts)/1024)
		btsDec, err := snappy.Decode(nil, bts)
		if err != nil {
			lg(err)
			return
		}
		lg("digest decoded from %vkB to %vkB", len(bts)/1024, len(btsDec)/1024)
		bts = btsDec
	} else {
		// fall back to the uncompressed digest
		bts, err = fs.ReadFile(fnDigest)
		lg(err)
	}

	if err == nil {
		err = json.Unmarshal(bts, &treeX)
		lg(err)
	}

	lg("DirTree %5.2vkB loaded for %v", len(bts)/1024, fnDigest)
}
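// A minimal caller sketch for loadDigest (hypothetical, for illustration only;
// the digest filename and the empty DirTree initialization are assumptions,
// not part of the original package):
func loadDigestExample(w http.ResponseWriter, r *http.Request,
	lg loghttp.FuncBufUniv, fs fsi.FileSystem) *DirTree {
	treeX := &DirTree{}
	// loadDigest transparently prefers digest.json.snappy when present.
	loadDigest(w, r, lg, fs, "digest.json", treeX)
	return treeX
}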
// CreateSys exercises the given filesystem: it creates a directory tree,
// stats the directories back, then writes files and reads them again.
// It returns a log buffer and an error summary; an empty summary
// means all checks passed.
func CreateSys(fs fsi.FileSystem) (*bytes.Buffer, string) {

	bb := new(bytes.Buffer)
	wpf(bb, "--------create-dirs---------\n")

	fc1 := func(p []string) {
		path := pth.Join(p...)
		err := fs.MkdirAll(relOpt+path, os.ModePerm)
		if err != nil {
			wpf(bb, "MkdirAll failed %v\n", err)
		}
	}
	fc1([]string{"ch1"})
	fc1([]string{"ch1", "ch2"})
	fc1([]string{"ch1", "ch2", "ch3"})
	fc1([]string{"ch1", "ch2", "ch3", "ch4"})
	fc1([]string{"ch1", "ch2a"})
	fc1([]string{"ch1A"})
	fc1([]string{"ch1B"})
	fc1([]string{"d1", "d2", "d3_secretdir", "neverwalked"})
	fc1([]string{"d1", "d2", "d3a", "willwalk"})

	wpf(bb, "\n--------retrieve-dirs---------\n")

	// retrieval
	gotByPath := 0
	wntByPath := 5
	fc2 := func(p []string) {
		path := pth.Join(p...)
		wpf(bb, "searching... %q\n", path)
		f, err := fs.Lstat(relOpt + path)
		if err != nil {
			wpf(bb, "   nothing retrieved - err %v\n", err)
		} else {
			wpf(bb, "   fnd %v \n", pth.Join(path, f.Name()))
			gotByPath++
		}
	}
	fc2([]string{"ch1"})
	fc2([]string{"ch1", "ch2"})
	fc2([]string{"ch1", "non-exist-dir"})
	fc2([]string{"ch1", "ch2", "ch3"})
	fc2([]string{"ch1A"})
	fc2([]string{rel})

	wpf(bb, "\nfnd %v of %v dirs \n", gotByPath, wntByPath)

	wpf(bb, "\n-------create and save some files----\n")

	fc4a := func(name, content string) {
		err := fs.WriteFile(relOpt+name, []byte(content), os.ModePerm)
		if err != nil {
			wpf(bb, "WriteFile %v failed %v\n", name, err)
		}
	}
	fc4b := func(name, content string) {
		f, err := fs.Create(relOpt + name)
		if err != nil {
			wpf(bb, "Create %v failed %v\n", name, err)
			return
		}
		_, err = f.WriteString(content)
		if err != nil {
			wpf(bb, "WriteString %v failed %v\n", name, err)
		}
		err = f.Close()
		if err != nil {
			wpf(bb, "Close %v failed %v\n", name, err)
		}
	}
	fc4a("ch1/ch2/file_1", "content 1")
	fc4b("ch1/ch2/file_2", "content 2")
	fc4a("ch1/ch2/ch3/file3", "another content")
	fc4b(relPsep+"file4", "chq content 2")

	// fsc, ok := memfs.Unwrap(fs)
	// if ok {
	// 	fsc.Dump()
	// }
	// return bb, ""

	wpf(bb, "\n-------retrieve files again----\n\n")

	gotNumFiles := 0
	wntNumFiles := 4
	gotSizeFiles := 0
	wntSizeFiles := 9 + 9 + 15 + 13 // lengths of the four contents written above

	fc5 := func(path string) {
		files, err := fs.ReadDir(relOpt + path)
		if err != nil {
			wpf(bb, "filesByPath %v failed %v\n", path, err)
		}
		wpf(bb, " srch %-20q yielded %v dirs+files\n", relOpt+path, len(files))
		for k, v := range files {
			if v.IsDir() {
				wpf(bb, "   skip dir %v \n", v.Name())
				continue
			}
			data, err := fs.ReadFile(pth.Join(path, v.Name()))
			if err != nil {
				wpf(bb, "could not get content of %v => %v\n", pth.Join(path, v.Name()), err)
			}
			wpf(bb, "   %v - %v %s\n", k, pth.Join(path, v.Name()), data)
			gotNumFiles++
			gotSizeFiles += len(data)
		}
	}
	fc5("ch1/ch2")
	fc5("ch1/ch2/ch3")
	fc5(rel)

	wpf(bb, "\n")
	wpf(bb, "fnd %2v of %2v fils \n", gotNumFiles, wntNumFiles)
	wpf(bb, "fnd %2v of %2v fsize \n", gotSizeFiles, wntSizeFiles)
	wpf(bb, "\n")

	testRes := ""
	if gotNumFiles != wntNumFiles {
		testRes += spf("Create files num : wnt %2v - got %v\n", wntNumFiles, gotNumFiles)
	}
	if gotSizeFiles != wntSizeFiles {
		testRes += spf("Create files size: wnt %2v - got %v\n", wntSizeFiles, gotSizeFiles)
	}
	return bb, testRes
}
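// CreateSys doubles as a smoke test: the returned string is empty when all
// counts match. A hedged harness sketch (the helper name and the use of the
// fmt package here are assumptions; any fsi.FileSystem implementation works):
func runCreateSysExample(fs fsi.FileSystem) error {
	bb, testRes := CreateSys(fs)
	fmt.Print(bb.String()) // the full create/retrieve log
	if testRes != "" {
		return fmt.Errorf("CreateSys self-check failed:\n%s", testRes)
	}
	return nil
}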
// Dedup puts it all together: it cleans each article body,
// textifies it, weeds out text fragments that recur across articles,
// and returns the deduplicated DOM of the first article.
func Dedup(oURL *url.URL, least3Files []repo.FullArticle, lg loghttp.FuncBufUniv, fs fsi.FileSystem) *html.Node {

	opts := domclean2.CleaningOptions{Proxify: true, Beautify: true}
	// opts.FNamer = fNamer
	opts.AddOutline = true
	// opts.RemoteHost = fetch.HostFromStringUrl(least3Files[0].Url)
	opts.RemoteHost = oURL.Host

	//
	// domclean
	for i := 0; i < len(least3Files); i++ {

		fNamer := domclean2.FileNamer(logDir, i)
		fNamer() // first call yields key

		lg("cleaning %4.1fkB from %v", float64(len(least3Files[i].Body))/1024,
			stringspb.ToLenR(least3Files[i].Url, 60))
		doc, err := domclean2.DomClean(least3Files[i].Body, opts)
		lg(err)

		fileDump(lg, fs, doc, fNamer, ".html")
	}

	if false {
		//
		// Textify with brute force
		for i := 0; i < len(least3Files); i++ {

			fNamer := domclean2.FileNamer(logDir, i)
			fNamer() // first call yields key

			bts, err := fs.ReadFile(fNamer() + ".html")
			lg(err)
			doc, err := html.Parse(bytes.NewReader(bts))
			lg(err)

			textifyBruteForce(doc)

			var buf bytes.Buffer
			err = html.Render(&buf, doc)
			lg(err)

			b := buf.Bytes()
			b = bytes.Replace(b, []byte("[br]"), []byte("\n"), -1)

			fileDump(lg, fs, b, fNamer, "_raw.txt")
		}
	}

	//
	// Textify with more fine-tuning.
	// Save the result to memory.
	textsByArticOutl := map[string][]*TextifiedTree{}
	for i := 0; i < len(least3Files); i++ {

		fNamer := domclean2.FileNamer(logDir, i)
		fnKey := fNamer() // first call yields key

		bts, err := fs.ReadFile(fNamer() + ".html")
		lg(err)
		doc, err := html.Parse(bytes.NewReader(bts))
		lg(err)

		fNamer() // one more

		//
		mp, bts := BubbledUpTextExtraction(doc, fnKey)
		fileDump(lg, fs, bts, fNamer, ".txt")

		mpSorted, dump := orderByOutline(mp)
		fileDump(lg, fs, dump, fNamer, ".txt")
		textsByArticOutl[fnKey] = mpSorted

		// for k, v := range mpSorted {
		// 	if k%33 != 0 {
		// 		continue
		// 	}
		// 	log.Printf("%3v: %v %14v %v\n", k, v.SourceID, v.Outline, v.Lvl)
		// }
	}

	//
	//
	// We progress from level 1 downwards.
	// Lower levels skip weeded-out higher levels,
	// to save expensive Levenshtein comparisons.
	var skipPrefixes = map[string]bool{}
	for weedStage := 1; weedStage <= stageMax; weedStage++ {

		fNamer := domclean2.FileNamer(logDir, 0)
		fnKey := fNamer() // first call yields key

		levelsToProcess = map[int]bool{weedStage: true}
		frags := similarTextifiedTrees(textsByArticOutl, skipPrefixes, map[string]bool{fnKey: true})

		similaritiesToFile(fs, logDir, frags, weedStage)

		for _, frag := range frags {
			if len(frag.Similars) >= numTotal-1 && frag.SumRelLevenshtein/(numTotal-1) < 0.2 {
				skipPrefixes[frag.Outline+"."] = true
			}
		}

		b := new(bytes.Buffer)
		for k := range skipPrefixes {
			b.WriteString(k)
			b.WriteByte(32)
		}
		// log.Printf("%v\n", b.String())
	}

	//
	// Apply dedup
	fNamer := domclean2.FileNamer(logDir, 0)
	fNamer() // first call yields key

	bts, err := fs.ReadFile(fNamer() + ".html")
	lg(err)
	doc, err := html.Parse(bytes.NewReader(bts))
	lg(err)

	dedupApply(doc, skipPrefixes)

	// A special after-dedup cleaning:
	// remove the ol and cfrm helper attributes.
	var fr func(*html.Node)
	fr = func(n *html.Node) {
		if n.Type == html.ElementNode {
			attr2 := make([]html.Attribute, 0, len(n.Attr))
			for _, attr := range n.Attr {
				if attr.Key != "ol" && attr.Key != "cfrm" {
					attr2 = append(attr2, attr)
				}
			}
			n.Attr = attr2
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			fr(c)
		}
	}
	fr(doc)

	if false { // does not add value
		var b7 bytes.Buffer
		err := html.Render(&b7, doc)
		lg(err)
		doc, err = domclean2.DomClean(b7.Bytes(), opts)
		lg(err)
	} else {
		domclean2.DomFormat(doc)
	}

	return doc
}
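// Dedup returns the cleaned DOM in memory; persisting it is left to the
// caller. A minimal sketch (the helper name and output filename are
// hypothetical assumptions, not part of the original package):
func saveDedupResult(fs fsi.FileSystem, doc *html.Node) error {
	var buf bytes.Buffer
	if err := html.Render(&buf, doc); err != nil {
		return err
	}
	// WriteFile mirrors the signature used in CreateSys above.
	return fs.WriteFile("deduplicated.html", buf.Bytes(), os.ModePerm)
}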