コード例 #1
0
ファイル: 8_fetch_similar.go プロジェクト: aarzilli/tools
// FetchSimilar is an extended version of Fetch.
// It uses a DirTree of crawled *links*, not actual files.
// As it moves up the DOM, it crawls every document for additional links.
// It first moves up to find similar URLs on the same depth
//                        /\
//          /\           /  \
//    /\   /  \         /    \
// It then moves up the ladder again - to accept higher URLs
//                        /\
//          /\
//    /\
//
// Form parameters read from r:
//   - routes.URLParamKey: the source URL; the handler returns early when it
//     cannot be parsed or has no host
//   - "cnt": number of similar documents wanted (default 3)
//   - "prot": protocol handed to the fetch workers (default empty)
//   - "numworkers": if > 0, overrides the number of parallel fetch workers
//
// On success a JSON object is written to w. It contains the log buffer
// ("msg"), the source document ("url_self", "mod_self", "bod_self"), one
// "url__NN"/"mod__NN"/"bod__NN" triple per selected similar document and
// "lensimilar". On early returns only the buffered HTML log is emitted by
// the deferred closure.
//
// The parameter m is not used by this function.
func FetchSimilar(w http.ResponseWriter, r *http.Request, m map[string]interface{}) {

	lg, b := loghttp.BuffLoggerUniversal(w, r)
	// The closure reads the variable b when it runs, not when it is deferred;
	// it therefore sees the fresh buffer assigned at the end of this function.
	closureOverBuf := func(bUnused *bytes.Buffer) {
		loghttp.Pf(w, r, b.String())
	}
	defer closureOverBuf(b) // the argument is ignored,

	r.Header.Set("X-Custom-Header-Counter", "nocounter")

	start := time.Now()

	wpf(b, tplx.ExecTplHelper(tplx.Head, map[string]interface{}{"HtmlTitle": "Find similar HTML URLs"}))
	defer wpf(b, tplx.Foot)

	wpf(b, "<pre>")
	defer wpf(b, "</pre>")

	fs1 := GetFS(appengine.NewContext(r))

	err := r.ParseForm()
	lg(err)

	//
	// Request parameters
	countSimilar := 3
	sCountSimilar := r.FormValue("cnt")
	if sCountSimilar != "" {
		i, err := strconv.Atoi(strings.TrimSpace(sCountSimilar))
		if err == nil {
			countSimilar = i
		}
	}

	surl := r.FormValue(routes.URLParamKey)
	ourl, err := fetch.URLFromString(surl)
	lg(err)
	if err != nil {
		return
	}
	if ourl.Host == "" {
		lg("host is empty (%v)", surl)
		return
	}

	knownProtocol := r.FormValue("prot")

	numWorkers := 0
	sNumWorkers := r.FormValue("numworkers")
	if sNumWorkers != "" {
		i, err := strconv.Atoi(strings.TrimSpace(sNumWorkers))
		if err == nil {
			numWorkers = i
		}
	}

	// Number of slashes in the source path; bounds both crawl loops below.
	srcDepth := strings.Count(ourl.Path, "/")

	cmd := FetchCommand{}
	cmd.Host = ourl.Host
	cmd.SearchPrefix = ourl.Path
	cmd = addDefaults(cmd)

	//
	// Load the previously saved link digest for this host
	dirTree := &DirTree{Name: "/", Dirs: map[string]DirTree{}, EndPoint: true}
	fnDigest := path.Join(docRoot, cmd.Host, "digest2.json")
	loadDigest(w, r, lg, fs1, fnDigest, dirTree) // previous
	lg("dirtree 400 chars is %v end of dirtree\t\t", stringspb.ToLen(dirTree.String(), 400))

	//
	// Fetch the source document itself
	m1 := new(MyWorker)
	m1.r = r
	m1.lg = lg
	m1.fs1 = fs1
	m1.SURL = path.Join(cmd.Host, ourl.Path)
	m1.Protocol = knownProtocol
	btsSrc, modSrc, usedExisting, err := fetchSave(m1)
	lg(err)
	if err != nil {
		// Check the error before touching btsSrc.
		return
	}
	if !usedExisting {
		addAnchors(lg, cmd.Host, btsSrc, dirTree)
	}

	lg("\t\t%4.2v secs so far 1", time.Since(start).Seconds())

	treePath := path.Dir(ourl.Path)

	opt := LevelWiseDeeperOptions{}
	opt.Rump = treePath
	opt.ExcludeDir = "/something-impossible" // presumably matches no real directory, i.e. nothing is excluded
	opt.MinDepthDiff = 1
	opt.MaxDepthDiff = 1
	opt.CondenseTrailingDirs = cmd.CondenseTrailingDirs
	opt.MaxNumber = cmd.DesiredNumber + 40 // collect more, 'cause we filter out those too old later

	var subtree *DirTree
	links := []FullArticle{}

	alreadyCrawled := map[string]struct{}{}

	//
	// Collect candidate links.
	// The outer loop restarts from the parent directory of the source path;
	// the inner loop climbs upwards from there, crawling each directory
	// document once and harvesting links from the matching subtree.
MarkOuter:
	for j := 0; j < srcDepth; j++ {
		treePath = path.Dir(ourl.Path)
	MarkInner:
		// Runs a few iterations beyond srcDepth; the margin of 5 is the
		// original author's choice.
		for i := 1; i < (srcDepth + 5); i++ {

			subtree, treePath = DiveToDeepestMatch(dirTree, treePath)

			lg("Looking from height %v to level %v  - %v", srcDepth-i, srcDepth-j, treePath)

			if _, ok := alreadyCrawled[treePath]; ok {
				// NOTE(review): treePath is not moved upwards here, so the
				// remaining iterations only progress if DiveToDeepestMatch
				// yields a different path - confirm this is intended.
				continue
			}

			m2 := new(MyWorker)
			m2.r = r
			m2.lg = lg
			m2.fs1 = fs1
			m2.SURL = path.Join(cmd.Host, treePath)
			m2.Protocol = knownProtocol

			btsPar, _, usedExisting, err := fetchSave(m2)
			lg(err)
			if err != nil {
				return
			}
			alreadyCrawled[treePath] = struct{}{}
			if !usedExisting {
				addAnchors(lg, cmd.Host, btsPar, dirTree)
			}

			if subtree == nil {
				lg("\n#%v treePath %q ; subtree is nil", i, treePath)
				continue
			}

			opt.Rump = treePath
			opt.MinDepthDiff = i - j
			opt.MaxDepthDiff = i - j
			lvlLinks := LevelWiseDeeper(nil, nil, subtree, opt)
			links = append(links, lvlLinks...)

			if len(links) >= opt.MaxNumber {
				lg("found enough links")
				break MarkOuter
			}

			pathPrev := treePath
			treePath = path.Dir(treePath)

			// path.Dir has reached its fixed point: nothing left to climb.
			if pathPrev == "." && treePath == "." ||
				pathPrev == "/" && treePath == "/" ||
				pathPrev == "" && treePath == "." {
				lg("break to innner")
				break MarkInner
			}

		}
	}

	lg("%v links after %4.2v secs", len(links), time.Since(start).Seconds())

	lg("============================")
	lg("Now reading/fetching actual similar files - not just the links")
	//
	tried := 0
	selecteds := []FullArticle{}

	nonExisting := []FullArticle{}
	nonExistFetched := []FullArticle{}

	//
	// First pass: use documents already stored in fs1, if younger than 10 hours
	for _, art := range links {

		if art.Url == ourl.Path {
			lg("skipping self\t%v", art.Url)
			continue
		}

		tried++

		useExisting := false

		semanticUri := condenseTrailingDir(art.Url, cmd.CondenseTrailingDirs)
		p := path.Join(docRoot, cmd.Host, semanticUri)

		f, err := fs1.Open(p)
		// its no error if file does not exist
		if err == nil {
			// lets put this into a func, so that f.close it called at the end of this func
			// otherwise defer f.close() spans the entire func and prevents
			// overwrites chmods further down
			func() {
				defer f.Close()
				fi, err := f.Stat()
				lg(err)
				if err != nil {
					return
				}
				age := time.Since(fi.ModTime())
				if age.Hours() >= 10 {
					return
				}
				lg("\t\tusing existing file with age %4.2v hrs", age.Hours())
				art.Mod = fi.ModTime()
				bts, err := ioutil.ReadAll(f)
				lg(err)
				art.Body = bts
				// A short body carrying the no-redirect marker is not a
				// usable document; leave it to be re-fetched.
				if len(bts) < 200 {
					if bytes.Contains(bts, []byte(fetch.MsgNoRdirects)) {
						return
					}
				}
				selecteds = append(selecteds, art)
				useExisting = true
			}()
		}

		if !useExisting {
			nonExisting = append(nonExisting, art)
		}

		if len(selecteds) >= countSimilar {
			break
		}

	}
	lg("============================")
	lg("tried %v links - yielding %v existing similars; not existing in datastore: %v, %v were requested.",
		tried, len(selecteds), len(nonExisting), countSimilar)

	//
	// Second pass: fetch the missing documents in parallel
	if len(selecteds) < countSimilar {
		jobs := make([]distrib.Worker, 0, len(nonExisting))
		for _, art := range nonExisting {
			surl := path.Join(cmd.Host, art.Url)
			wrkr := MyWorker{SURL: surl}
			wrkr.Protocol = knownProtocol
			wrkr.r = r
			wrkr.lg = lg
			wrkr.fs1 = fs1
			job := distrib.Worker(&wrkr)
			jobs = append(jobs, job)
		}

		distOpt := distrib.NewDefaultOptions()
		distOpt.TimeOutDur = 3500 * time.Millisecond
		distOpt.Want = int32(countSimilar - len(selecteds) + 4) // get some more, in case we have "redirected" bodies
		distOpt.NumWorkers = int(distOpt.Want)                  // 5s query limit; => hurry; spawn as many as we want
		if numWorkers > 0 {
			distOpt.NumWorkers = numWorkers
		}
		lg("Preparing %v simultaneous, wanting %v fetches; at %4.2v secs.", distOpt.NumWorkers, distOpt.Want, time.Since(start).Seconds())
		distOpt.CollectRemainder = false // 5s query limit; => hurry; dont wait for stragglers

		ret, msg := distrib.Distrib(jobs, distOpt)
		lg("Distrib returned at %4.2v secs with %v results.", time.Since(start).Seconds(), len(ret))

		lg("\n" + msg.String())
		for _, v := range ret {
			v1, ok := v.Worker.(*MyWorker)
			if !ok || v1 == nil {
				// Guard against a nil dereference on an unexpected worker.
				lg("unexpected worker type %T", v.Worker)
				continue
			}
			if v1.FA != nil {
				age := time.Since(v1.FA.Mod)
				if age.Hours() < 10 {
					lg("\t\tusing fetched file with age %4.2v hrs", age.Hours())
					nonExistFetched = append(nonExistFetched, *v1.FA)
					if len(nonExistFetched) > (countSimilar - len(selecteds)) {
						break
					}
				}
			}
			if v1.err != nil {
				// Log the worker's own error, not the enclosing err.
				lg(v1.err)
			}
		}

		lg("tried %v links - yielding %v fetched - jobs %v", len(nonExisting), len(nonExistFetched), len(jobs))
		selecteds = append(selecteds, nonExistFetched...)

		//
		// Extract links
		for _, v := range nonExistFetched {
			addAnchors(lg, cmd.Host, v.Body, dirTree)
		}

	}

	// Persist the digest only if links were found within the last 10 seconds.
	if time.Since(dirTree.LastFound).Seconds() < 10 {
		lg("saving accumulated (new) links to digest")
		saveDigest(lg, fs1, fnDigest, dirTree)
	}

	lg("\t\t%4.2v secs so far 3", time.Since(start).Seconds())

	//
	// Assemble the JSON response
	mp := map[string][]byte{}
	mp["msg"] = b.Bytes()
	mp["url_self"] = []byte(condenseTrailingDir(ourl.Path, cmd.CondenseTrailingDirs))
	mp["mod_self"] = []byte(modSrc.Format(http.TimeFormat))
	mp["bod_self"] = btsSrc

	for i, v := range selecteds {
		mp["url__"+spf("%02v", i)] = []byte(v.Url)
		mp["mod__"+spf("%02v", i)] = []byte(v.Mod.Format(http.TimeFormat))
		mp["bod__"+spf("%02v", i)] = v.Body
	}

	mp["lensimilar"] = []byte(spf("%02v", len(selecteds)))

	smp, err := json.MarshalIndent(mp, "", "\t")
	if err != nil {
		lg("marshalling mp to []byte failed: %v\n", err)
		return
	}

	r.Header.Set("X-Custom-Header-Counter", "nocounter")
	w.Header().Set("Content-Type", "application/json")
	if _, err := w.Write(smp); err != nil {
		lg(err)
	}

	// The JSON body is out; make sure the deferred closure does not append
	// the HTML log to it.
	b.Reset()             // this keeps the  buf pointer intact; outgoing defers are still heeded
	b = new(bytes.Buffer) // creates a *new* buf pointer; outgoing defers write into the *old* buf

	lg("\t\t%4.2v secs so far 4 (json resp written as []byte)", time.Since(start).Seconds())

}
コード例 #2
0
ファイル: t4_test.go プロジェクト: aarzilli/tools
// +build p4
// go test -tags=p4

package test

import (
	"fmt"
	"math/rand"
	"testing"
	"time"

	"github.com/pbberlin/tools/distrib"
)

// opt is the package-level distrib options value, created by
// distrib.NewDefaultOptions; Test4 sets CollectRemainder on it before calling do.
var opt = distrib.NewDefaultOptions()

// MyWorker is the job type used by this test: Work reads Inp and writes Res.
type MyWorker struct {
	Inp int // input value, read by Work
	Res int // result value, written by Work
}

// Work stores Inp+500 in Res and afterwards sleeps for a random
// duration between 0 and 19 milliseconds.
func (m *MyWorker) Work() {
	m.Res = m.Inp + 500
	pause := time.Duration(rand.Intn(20)) * time.Millisecond
	time.Sleep(pause)
}

// Test4 switches CollectRemainder on in the package-level opt and then
// delegates to the do helper with the argument 3.
func Test4(t *testing.T) {
	// Forwarded verbatim to do; its meaning is defined alongside do, which is
	// not part of this file.
	const doArg = 3

	opt.CollectRemainder = true
	do(t, doArg)
}