Exemplo n.º 1
0
// dedupHTTP is the HTTP handler around Dedup().
//
// It reads the source URL (form key routes.URLParamKey) and the optional
// "prot" form value, lets FetchAndDecodeJSON collect the files to compare,
// runs Dedup over them, dumps the resulting document via fileDump and finally
// writes the rendered document to w as text/html.
//
// While working, log output is collected in a buffer that is flushed by a
// deferred call. The parameter m is not used.
func dedupHTTP(w http.ResponseWriter, r *http.Request, m map[string]interface{}) {

	logf, buf := loghttp.BuffLoggerUniversal(w, r)
	// The closure looks at the variable buf when it *runs*, not when it is
	// deferred. Once buf is re-pointed to a fresh buffer (see below), this
	// flush emits that fresh buffer's content instead of the collected log.
	defer func() {
		loghttp.Pf(w, r, buf.String())
	}()

	r.Header.Set("X-Custom-Header-Counter", "nocounter")

	pageHead := tplx.ExecTplHelper(tplx.Head, map[string]interface{}{"HtmlTitle": "Deduplicating redundant stuff"})
	wpf(buf, pageHead)
	defer wpf(buf, tplx.Foot) // arguments are bound now: writes into the current buffer

	wpf(buf, "<pre>")
	defer wpf(buf, "</pre>")

	// A form parsing problem is logged only; processing goes on.
	logf(r.ParseForm())

	rawURL := r.FormValue(routes.URLParamKey)
	srcURL, err := fetch.URLFromString(rawURL)
	logf(err)
	switch {
	case err != nil:
		return
	case srcURL.Host == "":
		logf("host is empty (%v)", rawURL)
		return
	}

	// An absent "prot" value yields the empty string.
	protocol := r.FormValue("prot")

	logf("Host %q, Path %q", srcURL.Host, srcURL.Path)

	fs := GetFS(appengine.NewContext(r), 0)

	files := FetchAndDecodeJSON(r, srcURL.String(), protocol, logf, fs)

	logf("Fetched and decoded; found %v", len(files))
	if len(files) == 0 {
		// Nothing to deduplicate; the deferred calls emit the log page.
		return
	}

	doc := Dedup(srcURL, files, logf, fs)

	nextName := domclean2.FileNamer(logDir, 0)
	nextName() // first call yields key
	permFS := GetFS(appengine.NewContext(r), 0)
	fileDump(logf, permFS, doc, nextName, "_fin.html")

	logf("MapSimiliarCompares: %v SimpleCompares: %v LevenstheinComp: %v\n", breakMapsTooDistinct, appliedLevenshtein, appliedCompare)
	logf("Finish\n")

	rendered := new(bytes.Buffer)
	err = html.Render(rendered, doc)
	logf(err)
	if err != nil {
		return
	}

	// Detach the log buffer, so that the deferred flush does not append
	// the collected log page to the document written below.
	buf = new(bytes.Buffer)
	w.Header().Set("Content-type", "text/html; charset=utf-8")
	w.Write(rendered.Bytes())
}
Exemplo n.º 2
0
// FetchSimilar is an extended version of Fetch.
// It uses a DirTree of crawled *links*, not actual files.
// As it moves up the DOM, it crawls every document for additional links.
// It first moves up to find similar URLs on the same depth
//                        /\
//          /\           /  \
//    /\   /  \         /    \
// It then moves up the ladder again - to accept higher URLs
//                        /\
//          /\
//    /\
//
// Form values read from the request:
//   routes.URLParamKey  source URL; must parse and must contain a host
//   "cnt"               number of similar articles wanted (default 3)
//   "prot"              known protocol, handed on to the fetch workers
//   "numworkers"        if > 0, overrides the number of simultaneous fetches
//
// On success the response is a JSON object (map[string][]byte, so encoding/json
// emits the values base64 encoded) with the keys
//   msg, url_self, mod_self, bod_self, lensimilar
// and for every selected article NN (two digits, starting at 00)
//   url__NN, mod__NN, bod__NN
//
// If the handler returns early, the deferred calls emit the collected log as
// an HTML page instead. The parameter m is not used.
func FetchSimilar(w http.ResponseWriter, r *http.Request, m map[string]interface{}) {

	lg, b := loghttp.BuffLoggerUniversal(w, r)
	// The closure reads the variable b when it *runs*, not when it is deferred.
	// Re-pointing b at the very end of this func therefore changes what is flushed.
	closureOverBuf := func(bUnused *bytes.Buffer) {
		loghttp.Pf(w, r, b.String())
	}
	defer closureOverBuf(b) // the argument is ignored,

	r.Header.Set("X-Custom-Header-Counter", "nocounter")

	start := time.Now()

	wpf(b, tplx.ExecTplHelper(tplx.Head, map[string]interface{}{"HtmlTitle": "Find similar HTML URLs"}))
	defer wpf(b, tplx.Foot)

	wpf(b, "<pre>")
	defer wpf(b, "</pre>")

	fs1 := GetFS(appengine.NewContext(r))

	err := r.ParseForm()
	lg(err)

	// Unparsable numbers silently keep the default.
	countSimilar := 3
	sCountSimilar := r.FormValue("cnt")
	if sCountSimilar != "" {
		i, err := strconv.Atoi(strings.TrimSpace(sCountSimilar))
		if err == nil {
			countSimilar = i
		}
	}

	surl := r.FormValue(routes.URLParamKey)
	ourl, err := fetch.URLFromString(surl)
	lg(err)
	if err != nil {
		return
	}
	if ourl.Host == "" {
		lg("host is empty (%v)", surl)
		return
	}

	knownProtocol := r.FormValue("prot")

	numWorkers := 0
	sNumWorkers := r.FormValue("numworkers")
	if sNumWorkers != "" {
		i, err := strconv.Atoi(strings.TrimSpace(sNumWorkers))
		if err == nil {
			numWorkers = i
		}
	}

	srcDepth := strings.Count(ourl.Path, "/")

	cmd := FetchCommand{}
	cmd.Host = ourl.Host
	cmd.SearchPrefix = ourl.Path
	cmd = addDefaults(cmd)

	dirTree := &DirTree{Name: "/", Dirs: map[string]DirTree{}, EndPoint: true}
	fnDigest := path.Join(docRoot, cmd.Host, "digest2.json")
	loadDigest(w, r, lg, fs1, fnDigest, dirTree) // previous
	lg("dirtree 400 chars is %v end of dirtree\t\t", stringspb.ToLen(dirTree.String(), 400))

	//
	// Fetch the source document itself.
	m1 := new(MyWorker)
	m1.r = r
	m1.lg = lg
	m1.fs1 = fs1
	m1.SURL = path.Join(cmd.Host, ourl.Path)
	m1.Protocol = knownProtocol
	btsSrc, modSrc, usedExisting, err := fetchSave(m1)
	lg(err)
	if err != nil {
		return
	}
	// Only inspect the body after the fetch is known to have succeeded;
	// same order as for the parent documents in the loop below.
	if !usedExisting {
		addAnchors(lg, cmd.Host, btsSrc, dirTree)
	}

	lg("\t\t%4.2v secs so far 1", time.Since(start).Seconds())

	treePath := path.Dir(ourl.Path)

	opt := LevelWiseDeeperOptions{}
	opt.Rump = treePath
	opt.ExcludeDir = "/something-impossible" // i.e. exclude nothing
	opt.MinDepthDiff = 1
	opt.MaxDepthDiff = 1
	opt.CondenseTrailingDirs = cmd.CondenseTrailingDirs
	opt.MaxNumber = cmd.DesiredNumber + 40 // collect more, 'cause we filter out those too old later

	var subtree *DirTree
	links := []FullArticle{}

	alreadyCrawled := map[string]struct{}{}

	//
	// Collect candidate links.
	// j: how far above the source depth we accept results,
	// i: how far we have climbed from the source's directory.
MarkOuter:
	for j := 0; j < srcDepth; j++ {
		treePath = path.Dir(ourl.Path)
	MarkInner:
		for i := 1; i < (srcDepth + 5); i++ {

			subtree, treePath = DiveToDeepestMatch(dirTree, treePath)

			lg("Looking from height %v to level %v  - %v", srcDepth-i, srcDepth-j, treePath)

			if _, ok := alreadyCrawled[treePath]; ok {
				// lg("\t already digested %v", treePath)
				continue
			}

			m2 := new(MyWorker)
			m2.r = r
			m2.lg = lg
			m2.fs1 = fs1
			m2.SURL = path.Join(cmd.Host, treePath)
			m2.Protocol = knownProtocol

			btsPar, _, usedExisting, err := fetchSave(m2)
			lg(err)
			if err != nil {
				return
			}
			alreadyCrawled[treePath] = struct{}{}
			if !usedExisting {
				addAnchors(lg, cmd.Host, btsPar, dirTree)
			}

			if subtree == nil {
				lg("\n#%v treePath %q ; subtree is nil", i, treePath)
			} else {
				// lg("\n#%v treePath %q ; subtree exists", i, treePath)

				opt.Rump = treePath
				opt.MinDepthDiff = i - j
				opt.MaxDepthDiff = i - j
				lvlLinks := LevelWiseDeeper(nil, nil, subtree, opt)
				links = append(links, lvlLinks...)

				if len(links) >= opt.MaxNumber {
					lg("found enough links")
					break MarkOuter
				}

				pathPrev := treePath
				treePath = path.Dir(treePath)
				// lg("#%v  bef %v - aft %v", i, pathPrev, treePath)

				// path.Dir no longer changes the path: we reached the top.
				if pathPrev == "." && treePath == "." ||
					pathPrev == "/" && treePath == "/" ||
					pathPrev == "" && treePath == "." {
					lg("break to innner")
					break MarkInner
				}
			}

		}
	}

	//
	//
	//
	//
	lg("%v links after %4.2v secs", len(links), time.Since(start).Seconds())

	lg("============================")
	lg("Now reading/fetching actual similar files - not just the links")
	//
	tried := 0
	selecteds := []FullArticle{}

	nonExisting := []FullArticle{}
	nonExistFetched := []FullArticle{}

	for _, art := range links {

		if art.Url == ourl.Path {
			lg("skipping self\t%v", art.Url)
			continue
		}

		tried++

		useExisting := false

		semanticUri := condenseTrailingDir(art.Url, cmd.CondenseTrailingDirs)
		p := path.Join(docRoot, cmd.Host, semanticUri)

		f, err := fs1.Open(p)
		// lg(err) // its no error if file does not exist
		if err == nil {
			// lets put this into a func, so that f.close it called at the end of this func
			// otherwise defer f.close() spans the entire func and prevents
			// overwrites chmods further down
			readExisting := func() {
				defer f.Close()
				fi, err := f.Stat()
				lg(err)
				if err != nil {
					return
				}
				age := time.Since(fi.ModTime())
				if age.Hours() < 10 {
					lg("\t\tusing existing file with age %4.2v hrs", age.Hours())
					art.Mod = fi.ModTime()
					bts, err := ioutil.ReadAll(f)
					lg(err)
					art.Body = bts
					// A tiny body carrying the no-redirects message is not a usable article.
					if len(bts) < 200 {
						if bytes.Contains(bts, []byte(fetch.MsgNoRdirects)) {
							return
						}
					}
					selecteds = append(selecteds, art)
					useExisting = true
				}
			}
			readExisting()
		}

		if !useExisting {
			nonExisting = append(nonExisting, art)
		}

		if len(selecteds) >= countSimilar {
			break
		}

	}
	lg("============================")
	lg("tried %v links - yielding %v existing similars; not existing in datastore: %v, %v were requested.",
		tried, len(selecteds), len(nonExisting), countSimilar)

	if len(selecteds) < countSimilar {
		jobs := make([]distrib.Worker, 0, len(nonExisting))
		for _, art := range nonExisting {
			surl := path.Join(cmd.Host, art.Url)
			wrkr := MyWorker{SURL: surl}
			wrkr.Protocol = knownProtocol
			wrkr.r = r
			wrkr.lg = lg
			wrkr.fs1 = fs1
			job := distrib.Worker(&wrkr)
			jobs = append(jobs, job)
		}

		opt := distrib.NewDefaultOptions()
		opt.TimeOutDur = 3500 * time.Millisecond
		opt.Want = int32(countSimilar - len(selecteds) + 4) // get some more, in case we have "redirected" bodies
		opt.NumWorkers = int(opt.Want)                      // 5s query limit; => hurry; spawn as many as we want
		if numWorkers > 0 {
			opt.NumWorkers = numWorkers
		}
		lg("Preparing %v simultaneous, wanting %v fetches; at %4.2v secs.", opt.NumWorkers, opt.Want, time.Since(start).Seconds())
		opt.CollectRemainder = false // 5s query limit; => hurry; dont wait for stragglers

		ret, msg := distrib.Distrib(jobs, opt)
		lg("Distrib returned at %4.2v secs with %v results.", time.Since(start).Seconds(), len(ret))

		lg("\n" + msg.String())
		for _, v := range ret {
			v1, ok := v.Worker.(*MyWorker)
			if !ok || v1 == nil {
				// Not one of our workers; nothing to read from it.
				continue
			}
			if v1.FA != nil {
				age := time.Since(v1.FA.Mod)
				if age.Hours() < 10 {
					lg("\t\tusing fetched file with age %4.2v hrs", age.Hours())
					nonExistFetched = append(nonExistFetched, *v1.FA)
					if len(nonExistFetched) > (countSimilar - len(selecteds)) {
						break
					}
				}
			}
			if v1.err != nil {
				// Log the worker's own error, not a stale err of this func.
				lg(v1.err)
			}
		}

		lg("tried %v links - yielding %v fetched - jobs %v", len(nonExisting), len(nonExistFetched), len(jobs))
		selecteds = append(selecteds, nonExistFetched...)

		//
		//
		// Extract links
		for _, v := range nonExistFetched {
			// lg("links -> memory dirtree for %q", v.Url)
			addAnchors(lg, cmd.Host, v.Body, dirTree)
		}

	}

	// Persist the link tree only if links were found within the last 10 seconds.
	if time.Since(dirTree.LastFound).Seconds() < 10 {
		lg("saving accumulated (new) links to digest")
		saveDigest(lg, fs1, fnDigest, dirTree)
	}

	lg("\t\t%4.2v secs so far 3", time.Since(start).Seconds())

	mp := map[string][]byte{}
	mp["msg"] = b.Bytes()
	mp["url_self"] = []byte(condenseTrailingDir(ourl.Path, cmd.CondenseTrailingDirs))
	mp["mod_self"] = []byte(modSrc.Format(http.TimeFormat))
	mp["bod_self"] = btsSrc

	for i, v := range selecteds {
		idx := spf("%02v", i)
		mp["url__"+idx] = []byte(v.Url)
		mp["mod__"+idx] = []byte(v.Mod.Format(http.TimeFormat))
		mp["bod__"+idx] = v.Body
	}

	mp["lensimilar"] = []byte(spf("%02v", len(selecteds)))

	//
	smp, err := json.MarshalIndent(mp, "", "\t")
	if err != nil {
		lg(err)
		lg("marshalling mp to []byte failed\n")
		return
	}

	r.Header.Set("X-Custom-Header-Counter", "nocounter")
	w.Header().Set("Content-Type", "application/json")
	w.Write(smp)

	b.Reset()             // this keeps the  buf pointer intact; outgoing defers are still heeded
	b = new(bytes.Buffer) // creates a *new* buf pointer; outgoing defers write into the *old* buf

	lg("\t\t%4.2v secs so far 4 (json resp written as []byte)", time.Since(start).Seconds())

}
Exemplo n.º 3
0
// fetchSave delivers the body of m.SURL.
//
// If the file system m.fs1 holds a copy that is a regular file and at most
// 10 hours old, that copy is returned. Otherwise the URL is fetched and the
// body is written to m.fs1, with the file's times set to the modification
// time reported by the fetch.
//
// Link extraction and link addition to the dir tree are left to the caller.
//
// Return values: body, modification time, "used existing copy", error.
//   - An unparsable m.SURL is returned as error before anything is fetched.
//   - A fetch error is returned, unless the status is http.StatusNotFound:
//     then a placeholder body is stored and returned without error.
//   - Problems while storing the fetched body are only logged.
func fetchSave(m *MyWorker) ([]byte, time.Time, bool, error) {

	// Determine FileName
	ourl, err := fetch.URLFromString(m.SURL)
	if err != nil {
		// Without a parsed URL there is no host to derive the defaults from.
		m.lg(err)
		return []byte{}, time.Time{}, false, err
	}
	fc := FetchCommand{}
	fc.Host = ourl.Host
	fc = addDefaults(fc)
	semanticUri := condenseTrailingDir(m.SURL, fc.CondenseTrailingDirs)
	fn := path.Join(docRoot, semanticUri)

	m.lg("crawlin %q", m.SURL)

	// File already exists?
	// Open file for age check
	var bts []byte
	var mod time.Time
	// readStored fills bts and mod from the stored copy.
	// Any non-nil return value means: do not use the stored copy.
	readStored := func() error {
		file1, err := m.fs1.Open(fn)
		// m.lg(err) // file may simply not exist
		if err != nil {
			return err // file may simply not exist
		}
		defer file1.Close() // file close *fast* at the end of *this* anonymous func

		fi, err := file1.Stat()
		m.lg(err)
		if err != nil {
			return err
		}

		if fi.IsDir() {
			m.lg("\t\t file is a directory, skipping - %v", fn)
			return fmt.Errorf("is directory: %v", fn)
		}

		mod = fi.ModTime()
		age := time.Now().Sub(mod)
		if age.Hours() > 10 {
			m.lg("\t\t file %4.2v hours old, refetch ", age.Hours())
			return fmt.Errorf("too old: %v", fn)
		}

		m.lg("\t\t file only %4.2v hours old, take %4.2vkB from datastore", age.Hours(), fi.Size()/1024)
		bts, err = ioutil.ReadAll(file1)
		if err != nil {
			return err
		}
		return nil
	}

	if err = readStored(); err == nil {
		return bts, mod, true, nil
	}

	//
	// Fetch
	bts, inf, err := fetch.UrlGetter(m.r, fetch.Options{URL: m.SURL, KnownProtocol: m.Protocol, RedirectHandling: 1})
	m.lg(err)
	if err != nil {
		if inf.Status != http.StatusNotFound {
			m.lg("tried to fetch %v, %v", m.SURL, inf.URL)
			m.lg("msg %v", inf.Msg)
			return []byte{}, inf.Mod, false, err
		}
		// In our traversing upwards, we might encounter "directory links" that have no index.html.
		// For a *derived* URL, this is no error.
		bts = []byte(" ... not found ... ")
	}
	if inf.Mod.IsZero() {
		// No modification time reported: pretend the document is 75 minutes old.
		inf.Mod = time.Now().Add(-75 * time.Minute)
	}

	//
	//
	// main request still exists?
	// Disabled check; kept in place as it was in the original code.
	if false {
		var cx context.Context
		cx = util_appengine.SafelyExtractGaeContext(m.r)
		if cx == nil {
			m.lg("timed out - returning")
			return bts, inf.Mod, false, fmt.Errorf("req timed out")
		}
	}

	m.lg("retrivd+saved %q; %vkB ", inf.URL.Host+inf.URL.Path, len(bts)/1024)

	// Bodies of 1 MB and above are shrunk before being stored.
	if len(bts) > 1024*1024-1 {
		bts = removeScriptsAndComments(m.lg, bts)
		m.lg("size reduced_1 to %vkB ", len(bts)/1024)

		// if len(bts) > 1024*1024-1 {
		// 	bts = snappy.Encode(nil, bts)
		// 	fn = strings.Replace(fn, ".html", ".snap.html", -1)
		// 	m.lg("size reduced_2 to %vkB ", len(bts)/1024)
		// }
	}

	//
	// Store - best effort: failures are logged, the fetched body is returned anyway.
	dir := path.Dir(fn)
	err = m.fs1.MkdirAll(dir, 0755)
	m.lg(err)
	err = m.fs1.Chtimes(dir, time.Now(), time.Now())
	m.lg(err)
	err = m.fs1.WriteFile(fn, bts, 0644)
	m.lg(err)
	err = m.fs1.Chtimes(fn, inf.Mod, inf.Mod)
	m.lg(err)

	return bts, inf.Mod, false, nil

}