Code Example #1
File: t2_dedup_test.go  Project: aarzilli/tools
func Test2(t *testing.T) {

	start := time.Now()

	lg, b := loghttp.BuffLoggerUniversal(nil, nil)
	_ = b
	// closureOverBuf := func(bUnused *bytes.Buffer) {
	// 	loghttp.Pf(nil, nil, b.String())
	// }
	// defer closureOverBuf(b) // the argument is ignored; the closure reads the captured b

	var c aetest.Context
	if false {
		var err error
		c, err = aetest.NewContext(nil)
		lg(err)
		if err != nil {
			return
		}
		defer c.Close()
	}
	fs := GetFS(c, 2)

	lg("took1 %4.2v secs", time.Now().Sub(start).Seconds())

	least3Files := FetchAndDecodeJSON(nil, URLs[0], "", lg, fs)

	lg("took2 %4.2v secs", time.Now().Sub(start).Seconds())

	doc := Dedup(least3Files, lg, fs)

	fNamer := domclean2.FileNamer(logDir, 0)
	fNamer() // first call yields key
	fsPerm := GetFS(c, 2)
	fileDump(lg, fsPerm, doc, fNamer, "_fin.html")

	pf("MapSimiliarCompares: %v SimpleCompares: %v LevenstheinComp: %v\n", breakMapsTooDistinct, appliedLevenshtein, appliedCompare)
	pf("Finish\n")

}
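
A note on the recurring fNamer idiom above: domclean2.FileNamer evidently returns a closure over a counter, whose first call yields a key and whose subsequent calls yield successive file name stems (hence the throwaway first call, and fNamer() + ".html" later). A minimal sketch of a namer honoring that assumed contract; the "art%02v" name pattern is hypothetical, not domclean2's actual scheme:

import "fmt"

// fileNamer mimics the assumed contract of domclean2.FileNamer:
// the first call returns a key for article idx, each later call
// returns the next numbered file name stem under that key.
func fileNamer(dir string, idx int) func() string {
	cnt := -1
	return func() string {
		cnt++
		if cnt == 0 {
			// first call yields the key (hypothetical naming scheme)
			return fmt.Sprintf("%v/art%02v", dir, idx)
		}
		// later calls yield numbered stems
		return fmt.Sprintf("%v/art%02v_%02v", dir, idx, cnt)
	}
}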
Code Example #2
File: 7_pipeline.go  Project: aarzilli/tools
// Putting it all together
func Dedup(oURL *url.URL,
	least3Files []repo.FullArticle, lg loghttp.FuncBufUniv, fs fsi.FileSystem) *html.Node {

	opts := domclean2.CleaningOptions{Proxify: true, Beautify: true}
	// opts.FNamer = fNamer
	opts.AddOutline = true
	// opts.RemoteHost = fetch.HostFromStringUrl(least3Files[0].Url)
	opts.RemoteHost = oURL.Host

	//
	// domclean
	for i := 0; i < len(least3Files); i++ {

		fNamer := domclean2.FileNamer(logDir, i)
		fNamer() // first call yields key

		lg("cleaning %4.1fkB from %v", float64(len(least3Files[i].Body))/1024,
			stringspb.ToLenR(least3Files[i].Url, 60))

		doc, err := domclean2.DomClean(least3Files[i].Body, opts)
		lg(err)

		fileDump(lg, fs, doc, fNamer, ".html")

	}

	if false {
		//
		// Textify with brute force
		for i := 0; i < len(least3Files); i++ {

			fNamer := domclean2.FileNamer(logDir, i)
			fNamer() // first call yields key

			bts, err := fs.ReadFile(fNamer() + ".html")
			lg(err)
			doc, err := html.Parse(bytes.NewReader(bts))
			lg(err)

			textifyBruteForce(doc)

			var buf bytes.Buffer
			err = html.Render(&buf, doc)
			lg(err)

			b := buf.Bytes()
			b = bytes.Replace(b, []byte("[br]"), []byte("\n"), -1)

			fileDump(lg, fs, b, fNamer, "_raw.txt")
		}
	}

	//
	// Textify with more finetuning.
	// Save result to memory.
	textsByArticOutl := map[string][]*TextifiedTree{}
	for i := 0; i < len(least3Files); i++ {

		fNamer := domclean2.FileNamer(logDir, i)
		fnKey := fNamer() // first call yields key

		bts, err := fs.ReadFile(fNamer() + ".html")
		lg(err)
		doc, err := html.Parse(bytes.NewReader(bts))
		lg(err)

		fNamer() // one more

		//
		mp, bts := BubbledUpTextExtraction(doc, fnKey)
		fileDump(lg, fs, bts, fNamer, ".txt")

		mpSorted, dump := orderByOutline(mp)
		fileDump(lg, fs, dump, fNamer, ".txt")
		textsByArticOutl[fnKey] = mpSorted

		// for k, v := range mpSorted {
		// 	if k%33 != 0 {
		// 		continue
		// 	}
		// 	log.Printf("%3v: %v %14v  %v\n", k, v.SourceID, v.Outline, v.Lvl)
		// }

	}

	//
	//
	// We progress from level 1 downwards.
	// Lower levels skip prefixes already weeded out at higher levels,
	// saving expensive Levenshtein comparisons.
	var skipPrefixes = map[string]bool{}
	for weedStage := 1; weedStage <= stageMax; weedStage++ {

		fNamer := domclean2.FileNamer(logDir, 0)
		fnKey := fNamer() // first call yields key

		levelsToProcess = map[int]bool{weedStage: true}
		frags := similarTextifiedTrees(textsByArticOutl, skipPrefixes, map[string]bool{fnKey: true})

		similaritiesToFile(fs, logDir, frags, weedStage)

		for _, frag := range frags {
			if len(frag.Similars) >= numTotal-1 &&
				frag.SumRelLevenshtein/(numTotal-1) < 0.2 {
				skipPrefixes[frag.Outline+"."] = true
			}
		}
		b := new(bytes.Buffer)
		for k := range skipPrefixes {
			b.WriteString(k)
			b.WriteByte(' ')
		}
		// log.Printf("%v\n", b.String())

	}

	//
	// Apply dedup
	fNamer := domclean2.FileNamer(logDir, 0)
	fNamer() // first call yields key

	bts, err := fs.ReadFile(fNamer() + ".html")
	lg(err)
	doc, err := html.Parse(bytes.NewReader(bts))
	lg(err)

	dedupApply(doc, skipPrefixes)

	// A special after dedup cleaning:
	// Remove ol and cfrm attributes
	var fr func(*html.Node)
	fr = func(n *html.Node) {
		if n.Type == html.ElementNode {
			attr2 := make([]html.Attribute, 0, len(n.Attr))
			for _, attr := range n.Attr {
				if attr.Key != "ol" && attr.Key != "cfrm" {
					attr2 = append(attr2, attr)
				}
			}
			n.Attr = attr2
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			fr(c)
		}
	}
	fr(doc)

	if false {
		// does not add value
		var b7 bytes.Buffer
		err := html.Render(&b7, doc)
		lg(err)

		doc, err = domclean2.DomClean(b7.Bytes(), opts)
		lg(err)

	} else {
		domclean2.DomFormat(doc)
	}

	return doc
}
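
dedupApply itself is not among these snippets. A plausible minimal sketch of its job, assuming each element carries its outline path in the "ol" attribute (consistent with the attribute-stripping pass right after the call) and that any subtree whose outline falls under a collected skip prefix gets detached. This is a reading of the surrounding code, not the project's actual implementation:

import (
	"strings"

	"golang.org/x/net/html"
)

// underSkipped reports whether the node's "ol" outline path (e.g. "0.2.1")
// falls under one of the skip prefixes (which end in ".", e.g. "0.2.").
func underSkipped(n *html.Node, skip map[string]bool) bool {
	for _, a := range n.Attr {
		if a.Key == "ol" {
			for pfx := range skip {
				if strings.HasPrefix(a.Val+".", pfx) {
					return true
				}
			}
		}
	}
	return false
}

// dedupRemove walks the tree and detaches every skipped element subtree.
func dedupRemove(n *html.Node, skip map[string]bool) {
	for c := n.FirstChild; c != nil; {
		next := c.NextSibling // save first: detaching c clears its sibling links
		if c.Type == html.ElementNode && underSkipped(c, skip) {
			n.RemoveChild(c)
		} else {
			dedupRemove(c, skip)
		}
		c = next
	}
}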
Code Example #3
File: register_handlers.go  Project: aarzilli/tools
// dedupHTTP wraps Dedup()
func dedupHTTP(w http.ResponseWriter, r *http.Request, m map[string]interface{}) {

	lg, b := loghttp.BuffLoggerUniversal(w, r)
	closureOverBuf := func(bUnused *bytes.Buffer) {
		loghttp.Pf(w, r, b.String())
	}
	defer closureOverBuf(b) // the argument is ignored; the closure reads the captured b

	r.Header.Set("X-Custom-Header-Counter", "nocounter")

	wpf(b, tplx.ExecTplHelper(tplx.Head, map[string]interface{}{"HtmlTitle": "Deduplicating redundant stuff"}))
	defer wpf(b, tplx.Foot)

	wpf(b, "<pre>")
	defer wpf(b, "</pre>")

	err := r.ParseForm()
	lg(err)

	surl := r.FormValue(routes.URLParamKey)
	ourl, err := fetch.URLFromString(surl)
	lg(err)
	if err != nil {
		return
	}
	if ourl.Host == "" {
		lg("host is empty (%v)", surl)
		return
	}

	knownProtocol := ""
	if r.FormValue("prot") != "" {
		knownProtocol = r.FormValue("prot")
	}

	lg("Host %q, Path %q", ourl.Host, ourl.Path)

	fs := GetFS(appengine.NewContext(r), 0)

	least3Files := FetchAndDecodeJSON(r, ourl.String(), knownProtocol, lg, fs)

	lg("Fetched and decoded; found %v", len(least3Files))
	if len(least3Files) > 0 {
		doc := Dedup(ourl, least3Files, lg, fs)

		fNamer := domclean2.FileNamer(logDir, 0)
		fNamer() // first call yields key
		fsPerm := GetFS(appengine.NewContext(r), 0)
		fileDump(lg, fsPerm, doc, fNamer, "_fin.html")

		lg("MapSimiliarCompares: %v SimpleCompares: %v LevenstheinComp: %v\n", breakMapsTooDistinct, appliedLevenshtein, appliedCompare)
		lg("Finish\n")

		var b2 bytes.Buffer
		err := html.Render(&b2, doc)
		lg(err)
		if err != nil {
			return
		}

		b = new(bytes.Buffer) // reset the captured buffer so the deferred closureOverBuf dumps nothing; the HTML goes to w directly
		// w.Write([]byte("aa"))
		w.Header().Set("Content-type", "text/html; charset=utf-8")
		w.Write(b2.Bytes())

	}

}
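
The lg, b := loghttp.BuffLoggerUniversal(w, r) pattern used throughout packs two logging modes into one func: lg(err) for errors (nil is silently ignored) and lg("format", args...) for printf-style messages, both appended to the buffer b. A rough self-contained sketch of that assumed behavior, without the HTTP plumbing of the real loghttp package:

import (
	"bytes"
	"fmt"
)

// buffLogger returns a variadic logger in the assumed style of
// loghttp.BuffLoggerUniversal: the first arg is either an error or a
// format string; everything is appended to the returned buffer.
func buffLogger() (func(args ...interface{}), *bytes.Buffer) {
	buf := new(bytes.Buffer)
	lg := func(args ...interface{}) {
		if len(args) == 0 {
			return
		}
		switch first := args[0].(type) {
		case error: // lg(err); a nil error matches no case and is skipped
			fmt.Fprintf(buf, "error: %v\n", first)
		case string: // lg("format", args...), printf-style
			fmt.Fprintf(buf, first+"\n", args[1:]...)
		}
	}
	return lg, buf
}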
Code Example #4
File: t1_dedup_test.go  Project: aarzilli/tools
func Test1(t *testing.T) {

	lg, b := loghttp.BuffLoggerUniversal(nil, nil)
	_ = b

	c, err := aetest.NewContext(nil)
	lg(err)
	if err != nil {
		return
	}
	defer c.Close()
	fs := GetFS(c, 2)

	remoteHostname := "www.welt.de"
	remoteHostname = "www.welt.de/politik/ausland" // narrow the test to one section of the site

	dirs1, _, msg, err := fileserver.GetDirContents(repo.RepoURL, remoteHostname)
	if err != nil {
		lg(err)
		lg("%s", msg)
	}

	lg("dirs1")
	for _, v := range dirs1 {
		lg("    %v", v)
	}

	least3URLs := []string{}
	for _, v1 := range dirs1 {

		p := path.Join(remoteHostname, v1)
		dirs2, fils2, msg, err := fileserver.GetDirContents(repo.RepoURL, p)
		_ = dirs2
		if err != nil {
			lg(err)
			lg("%s", msg)
		}
		// lg("  dirs2 %v", stringspb.IndentedDump(dirs2))
		// lg("  fils2 %v", stringspb.IndentedDump(fils2))

		for _, v2 := range fils2 {
			least3URLs = append(least3URLs, path.Join(remoteHostname, v1, v2))
		}
	}

	if len(least3URLs) < numTotal {
		lg("not enough files in rss fetcher cache")
		return
	}
	if len(least3URLs) > numTotal {
		least3URLs = least3URLs[:numTotal+1] // keep numTotal+1 URLs; guard against slicing past the end
	}

	lg("fils2")
	for _, v := range least3URLs {
		lg("    %v", v)
	}

	// domclean

	least3Files := make([]repo.FullArticle, 0, len(least3URLs))
	for i := 0; i < len(least3URLs); i++ {

		surl := spf("%v/%v", repo.RepoURL, least3URLs[i])

		fNamer := domclean2.FileNamer(logDir, i)
		fNamer() // first call yields key

		resBytes, inf, err := fetch.UrlGetter(nil, fetch.Options{URL: surl})
		if err != nil {
			lg(err)
			return
		}
		lg("fetched %4.1fkB from %v", float64(len(resBytes))/1024, stringspb.ToLenR(inf.URL.String(), 60))

		fa := repo.FullArticle{}
		fa.Url = inf.URL.String()
		fa.Mod = inf.Mod
		fa.Body = resBytes
		least3Files = append(least3Files, fa)

	}

	doc := Dedup(least3Files, lg, fs)

	fNamer := domclean2.FileNamer(logDir, 0)
	fNamer() // first call yields key
	fsPerm := GetFS(c, 2)
	fileDump(lg, fsPerm, doc, fNamer, "_fin.html")

	pf("MapSimiliarCompares: %v SimpleCompares: %v LevenstheinComp: %v\n", breakMapsTooDistinct, appliedLevenshtein, appliedCompare)
	pf("correct finish\n")

}
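
The weeding threshold in Dedup, frag.SumRelLevenshtein/(numTotal-1) < 0.2, reads as: averaged over the at-least-numTotal-1 similar fragments, the edit distance relative to fragment length stays below 20%. The project's own comparison code is not shown in these snippets; as one plausible reading, here is the standard two-row Levenshtein plus a length-normalized ratio:

// levenshtein computes the classic edit distance with two rolling rows.
func levenshtein(a, b string) int {
	ra, rb := []rune(a), []rune(b)
	prev := make([]int, len(rb)+1)
	curr := make([]int, len(rb)+1)
	for j := range prev {
		prev[j] = j
	}
	for i := 1; i <= len(ra); i++ {
		curr[0] = i
		for j := 1; j <= len(rb); j++ {
			cost := 1
			if ra[i-1] == rb[j-1] {
				cost = 0
			}
			curr[j] = min3(prev[j]+1, curr[j-1]+1, prev[j-1]+cost)
		}
		prev, curr = curr, prev
	}
	return prev[len(rb)]
}

func min3(a, b, c int) int {
	if b < a {
		a = b
	}
	if c < a {
		a = c
	}
	return a
}

// relLevenshtein normalizes by the longer input, yielding a 0..1 ratio;
// an average below 0.2 across peers marks a fragment as near-duplicate.
func relLevenshtein(a, b string) float64 {
	n := len([]rune(a))
	if m := len([]rune(b)); m > n {
		n = m
	}
	if n == 0 {
		return 0
	}
	return float64(levenshtein(a, b)) / float64(n)
}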