Пример #1
0
// loadDigest restores a *DirTree from fnDigest.
// It first tries a snappy-compressed sibling file (".json" replaced by
// ".json.snappy"); if that is absent it falls back to the plain JSON file.
// All errors are reported through lg; on a decode error the function
// returns early and treeX is left untouched.
func loadDigest(w http.ResponseWriter, r *http.Request, lg loghttp.FuncBufUniv, fs fsi.FileSystem, fnDigest string, treeX *DirTree) {

	fnDigestSnappied := strings.Replace(fnDigest, ".json", ".json.snappy", -1)
	bts, err := fs.ReadFile(fnDigestSnappied)
	if err == nil {
		lg("encoded digest loaded, size %vkB", len(bts)/1024)
		// err is deliberately shadowed here; the outer err is already
		// nil inside this branch, so the post-branch check still passes.
		btsDec, err := snappy.Decode(nil, bts)
		if err != nil {
			lg(err)
			return
		}
		lg("digest decoded from %vkB to %vkB", len(bts)/1024, len(btsDec)/1024)
		bts = btsDec
	} else {
		// no compressed variant - fall back to the uncompressed digest
		bts, err = fs.ReadFile(fnDigest)
		lg(err)
	}

	if err == nil {
		// &treeX is a **DirTree: Unmarshal may replace the pointee wholesale
		err = json.Unmarshal(bts, &treeX)
		lg(err)
	}

	lg("DirTree   %5.2vkB loaded for %v", len(bts)/1024, fnDigest)

}
Пример #2
0
// CreateSys populates fs with a small directory tree plus four files,
// then reads everything back and tallies what was found.
// It returns a buffer logging every operation and a test-result string;
// the result string is empty when the retrieved file count and total
// byte size match the expected values.
func CreateSys(fs fsi.FileSystem) (*bytes.Buffer, string) {

	bb := new(bytes.Buffer)
	wpf(bb, "--------create-dirs---------\n")

	// fc1 creates one directory path (all intermediate dirs included).
	fc1 := func(p []string) {
		path := pth.Join(p...)
		err := fs.MkdirAll(relOpt+path, os.ModePerm)
		if err != nil {
			wpf(bb, "MkdirAll failed %v\n", err)
		}
	}

	fc1([]string{"ch1"})
	fc1([]string{"ch1", "ch2"})
	fc1([]string{"ch1", "ch2", "ch3"})
	fc1([]string{"ch1", "ch2", "ch3", "ch4"})
	fc1([]string{"ch1", "ch2a"})
	fc1([]string{"ch1A"})
	fc1([]string{"ch1B"})
	fc1([]string{"d1", "d2", "d3_secretdir", "neverwalked"})
	fc1([]string{"d1", "d2", "d3a", "willwalk"})

	wpf(bb, "\n--------retrieve-dirs---------\n")

	// retrieval
	gotByPath := 0
	wntByPath := 5
	// fc2 looks a directory up again via Lstat and counts hits.
	fc2 := func(p []string) {
		path := pth.Join(p...)
		wpf(bb, "searching... %q\n", path)
		f, err := fs.Lstat(relOpt + path)
		if err != nil {
			wpf(bb, "   nothing retrieved - err %v\n", err)
		} else {
			wpf(bb, "   fnd %v \n", pth.Join(path, f.Name()))
			gotByPath++
		}
	}
	fc2([]string{"ch1"})
	fc2([]string{"ch1", "ch2"})
	fc2([]string{"ch1", "non-exist-dir"}) // deliberate miss
	fc2([]string{"ch1", "ch2", "ch3"})
	fc2([]string{"ch1A"})
	fc2([]string{rel})

	wpf(bb, "\nfnd %v of %v dirs \n", gotByPath, wntByPath)

	wpf(bb, "\n-------create and save some files----\n")

	// fc4a writes a file in one shot via WriteFile.
	fc4a := func(name, content string) {
		err := fs.WriteFile(relOpt+name, []byte(content), os.ModePerm)
		if err != nil {
			wpf(bb, "WriteFile %v failed %v\n", name, err)
		}
	}
	// fc4b writes a file via the Create/WriteString/Close sequence.
	fc4b := func(name, content string) {
		f, err := fs.Create(relOpt + name)
		if err != nil {
			wpf(bb, "Create %v failed %v\n", name, err)
			return
		}
		_, err = f.WriteString(content)
		if err != nil {
			wpf(bb, "WriteString %v failed %v\n", name, err)
		}
		err = f.Close()
		if err != nil {
			wpf(bb, "Close %v failed %v\n", name, err)
		}
	}

	fc4a("ch1/ch2/file_1", "content 1")
	fc4b("ch1/ch2/file_2", "content 2")
	fc4a("ch1/ch2/ch3/file3", "another content")
	fc4b(relPsep+"file4", "chq content 2")

	// fsc, ok := memfs.Unwrap(fs)
	// if ok {
	// 	fsc.Dump()
	// }
	// return bb, ""

	wpf(bb, "\n-------retrieve files again----\n\n")

	gotNumFiles := 0
	wntNumFiles := 4
	gotSizeFiles := 0
	wntSizeFiles := 9 + 9 + 15 + 13 // sizes of the four contents above

	// fc5 lists one directory and reads back every regular file in it.
	fc5 := func(path string) {
		files, err := fs.ReadDir(relOpt + path)
		if err != nil {
			wpf(bb, "filesByPath %v failed %v\n", path, err)
		}
		wpf(bb, " srch %-20q yielded %v dirs+files\n", relOpt+path, len(files))

		for k, v := range files {
			if v.IsDir() {
				wpf(bb, "   skip dir %v \n", v.Name())
				continue
			}
			data, err := fs.ReadFile(pth.Join(path, v.Name()))
			if err != nil {
				wpf(bb, "could not get content of %v =>  %v\n", pth.Join(path, v.Name()), err)
			}
			wpf(bb, "     %v  -  %v %s\n", k, pth.Join(path, v.Name()), data)
			gotNumFiles++
			gotSizeFiles += len(data)
		}
	}

	fc5("ch1/ch2")
	fc5("ch1/ch2/ch3")
	fc5(rel)

	wpf(bb, "\n")

	wpf(bb, "fnd %2v of %2v fils \n", gotNumFiles, wntNumFiles)
	wpf(bb, "fnd %2v of %2v fsize \n", gotSizeFiles, wntSizeFiles)
	wpf(bb, "\n")

	testRes := ""
	if gotNumFiles != wntNumFiles {
		testRes += spf("Create files num :   wnt %2v - got %v\n", wntNumFiles, gotNumFiles)
	}
	if gotSizeFiles != wntSizeFiles {
		testRes += spf("Create files size:   wnt %2v - got %v\n", wntSizeFiles, gotSizeFiles)
	}
	return bb, testRes
}
Пример #3
0
// Putting it all together
//
// Dedup runs the full duplicate-removal pipeline over least3Files:
// it DOM-cleans every article, textifies the cleaned trees, finds text
// fragments that recur across the articles (levenshtein similarity,
// level by level), and strips those shared fragments from the first
// article's DOM, which is returned.
func Dedup(oURL *url.URL,
	least3Files []repo.FullArticle, lg loghttp.FuncBufUniv, fs fsi.FileSystem) *html.Node {

	opts := domclean2.CleaningOptions{Proxify: true, Beautify: true}
	// opts.FNamer = fNamer
	opts.AddOutline = true
	// opts.RemoteHost = fetch.HostFromStringUrl(least3Files[0].Url)
	opts.RemoteHost = oURL.Host

	//
	// domclean
	for i := 0; i < len(least3Files); i++ {

		fNamer := domclean2.FileNamer(logDir, i)
		fNamer() // first call yields key

		lg("cleaning %4.1fkB from %v", float64(len(least3Files[i].Body))/1024,
			stringspb.ToLenR(least3Files[i].Url, 60))

		doc, err := domclean2.DomClean(least3Files[i].Body, opts)
		lg(err)

		fileDump(lg, fs, doc, fNamer, ".html")

	}

	if false {
		//
		// Textify with brute force
		for i := 0; i < len(least3Files); i++ {

			fNamer := domclean2.FileNamer(logDir, i)
			fNamer() // first call yields key

			bts, err := fs.ReadFile(fNamer() + ".html")
			lg(err)
			doc, err := html.Parse(bytes.NewReader(bts))
			lg(err)

			textifyBruteForce(doc)

			var buf bytes.Buffer
			err = html.Render(&buf, doc)
			lg(err)

			b := buf.Bytes()
			b = bytes.Replace(b, []byte("[br]"), []byte("\n"), -1)

			fileDump(lg, fs, b, fNamer, "_raw.txt")
		}
	}

	//
	// Textify with more finetuning.
	// Save result to memory.
	textsByArticOutl := map[string][]*TextifiedTree{}
	for i := 0; i < len(least3Files); i++ {

		fNamer := domclean2.FileNamer(logDir, i)
		fnKey := fNamer() // first call yields key

		bts, err := fs.ReadFile(fNamer() + ".html")
		lg(err) // report read errors too, not only parse errors

		doc, err := html.Parse(bytes.NewReader(bts))
		lg(err)

		fNamer() // one more

		//
		mp, bts := BubbledUpTextExtraction(doc, fnKey)
		fileDump(lg, fs, bts, fNamer, ".txt")

		mpSorted, dump := orderByOutline(mp)
		fileDump(lg, fs, dump, fNamer, ".txt")
		textsByArticOutl[fnKey] = mpSorted

		// for k, v := range mpSorted {
		// 	if k%33 != 0 {
		// 		continue
		// 	}
		// 	log.Printf("%3v: %v %14v  %v\n", k, v.SourceID, v.Outline, v.Lvl)
		// }

	}

	//
	//
	// We progress from level 1 downwards.
	// Lower levels skip weeded out higher levels,
	// to save expensive levenshtein comparisons
	var skipPrefixes = map[string]bool{}
	for weedStage := 1; weedStage <= stageMax; weedStage++ {

		fNamer := domclean2.FileNamer(logDir, 0)
		fnKey := fNamer() // first call yields key

		levelsToProcess = map[int]bool{weedStage: true}
		frags := similarTextifiedTrees(textsByArticOutl, skipPrefixes, map[string]bool{fnKey: true})

		similaritiesToFile(fs, logDir, frags, weedStage)

		// a fragment shared by (nearly) all articles with a low relative
		// levenshtein distance is considered boilerplate - skip its subtree
		for _, frag := range frags {
			if len(frag.Similars) >= numTotal-1 &&
				frag.SumRelLevenshtein/(numTotal-1) < 0.2 {
				skipPrefixes[frag.Outline+"."] = true
			}
		}
		// debug aid: collect current skip prefixes for optional logging
		b := new(bytes.Buffer)
		for k := range skipPrefixes {
			b.WriteString(k)
			b.WriteByte(32)
		}
		// log.Printf("%v\n", b.String())

	}

	//
	// Apply dedup
	fNamer := domclean2.FileNamer(logDir, 0)
	fNamer() // first call yields key

	bts, err := fs.ReadFile(fNamer() + ".html")
	lg(err)
	doc, err := html.Parse(bytes.NewReader(bts))
	lg(err)

	dedupApply(doc, skipPrefixes)

	// A special after dedup cleaning:
	// Remove ol and cfrm attributes
	var fr func(*html.Node)
	fr = func(n *html.Node) {
		if n.Type == html.ElementNode {
			attr2 := make([]html.Attribute, 0, len(n.Attr))
			for _, attr := range n.Attr {
				if attr.Key != "ol" && attr.Key != "cfrm" {
					attr2 = append(attr2, attr)
				}
			}
			n.Attr = attr2
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			fr(c)
		}
	}
	fr(doc)

	if false {
		// does not add value
		var b7 bytes.Buffer
		err := html.Render(&b7, doc)
		lg(err)

		doc, err = domclean2.DomClean(b7.Bytes(), opts)
		lg(err)

	} else {
		domclean2.DomFormat(doc)
	}

	return doc
}