Example #1
// Print prints a visual representation
// of the slices of tokens and their distance matrix
func (m *Matrix) Print() {

	rows, cols := m.rows, m.cols
	mx := m.mx

	fp := fmt.Printf

	fmt2 := fmt.Sprintf("%s-%vd", "%", cl) // builds e.g. "%-3d": left-aligned cells of width cl (declared elsewhere in the package)

	fp(strings.Repeat(" ", 2*cl))
	for _, col := range cols {
		scol := fmt.Sprintf("%v", col)
		fp("%v ", stringspb.ToLen(scol, cl-1)) // at least one space right
	}
	fp("\n")

	fp(strings.Repeat(" ", cl))
	fp(fmt2, mx[0][0])
	for j := range cols {
		fp(fmt2, mx[0][j+1])
	}
	fp("\n")

	//
	for i, row := range rows {
		srow := fmt.Sprintf("%v", row)
		fp("%v ", stringspb.ToLen(srow, cl-1)) // at least one space right
		fp(fmt2, mx[i+1][0])
		for j := range cols {
			fp(fmt2, mx[i+1][j+1])
		}
		fp("\n")
	}
	// fp("\n")
}
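The dynamic format string above is easy to miss: fmt.Sprintf builds a verb such as "%-4d" at runtime. A minimal, self-contained sketch of the same technique (cl here is an assumed column width, not the package's actual constant):

package main

import (
	"fmt"
	"strings"
)

func main() {
	const cl = 4                              // assumed column width
	cellFmt := fmt.Sprintf("%s-%vd", "%", cl) // expands to "%-4d"
	fmt.Print(strings.Repeat(" ", cl))        // indent for a header column
	for _, v := range []int{1, 12, 123} {
		fmt.Printf(cellFmt, v) // each cell left-aligned, width cl
	}
	fmt.Println()
}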
Example #2
func Pf(w io.Writer, r *http.Request, f string, vs ...interface{}) {

	for idx, v := range vs {
		switch v := v.(type) {
		case []byte:
			if len(v) > 1024*5 {
				appdx := append([]byte(" ...omitted... "), v[len(v)-100:]...)
				vs[idx] = append(v[:1024*5], appdx...)
			}
		case string:
			if len(v) > 1024*5 {
				appdx := " ...omitted... " + v[len(v)-100:]
				vs[idx] = v[:1024*5] + appdx
			}
		}
	}
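	// Note: the loop above mutates vs in place, capping oversized []byte and
	// string arguments at 5 KB and keeping the last 100 bytes as a tail excerpt.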

	// Prepare the string
	var s string
	if len(vs) > 0 {
		s = fmt.Sprintf(f, vs...)
	} else {
		s = f
	}

	if s == "" {
		return
	}

	// Write it to http response or bytes.Buffer
	// unless prefixed with 'lo ' - log only.
	// Thread-safety could be introduced by syncing/locking w.
	if w != nil && !strings.HasPrefix(s, "lo ") {
		w.Write([]byte(s))
		w.Write([]byte{'\n'})
	}

	// Write to log/gae-log
	// Adding src code info

	line, file := runtimepb.LineFileXUp(1)
	// if the reported frame is this file itself, step one level further up
	if strings.HasSuffix(file, runtimepb.ThisFile()) {
		line, file = runtimepb.LineFileXUp(2)
	}

	if len(s) < 60 {
		s = stringspb.ToLen(s, 60)
	}
	s = fmt.Sprintf("%v - %v:%v", s, file, line)

	// Log it
	c, _ := util_appengine.SafelyExtractGaeCtxError(r)
	if c == nil {
		lnp.Printf("%s", s)
	} else {
		aelog.Infof(c, s)
	}

}
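The size-capping at the top of Pf can be isolated. A stand-alone sketch of the same head-plus-tail truncation, with the 5 KB / 100 byte limits mirroring the code above:

package main

import "fmt"

// truncate keeps the first max bytes of s, then a marker and the last tail bytes.
func truncate(s string, max, tail int) string {
	if len(s) <= max {
		return s
	}
	return s[:max] + " ...omitted... " + s[len(s)-tail:]
}

func main() {
	long := make([]byte, 6000)
	for i := range long {
		long[i] = 'x'
	}
	short := truncate(string(long), 1024*5, 100)
	fmt.Println(len(short)) // 5235 = 5120 + len(" ...omitted... ") + 100
}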
Example #3
// dirTreeStrRec recursively writes d's subdirectories to buf,
// indented by level, in sorted key order.
func dirTreeStrRec(buf *bytes.Buffer, d *DirTree, lvl int) {
	ind2 := strings.Repeat("    ", lvl+1)
	keys := make([]string, 0, len(d.Dirs))
	for k := range d.Dirs {
		keys = append(keys, k)
	}
	sort.Strings(keys)
	for _, key := range keys {
		buf.WriteString(ind2)
		indir := d.Dirs[key]
		buf.WriteString(stringspb.ToLen(indir.Name, 44-len(ind2)))
		if indir.EndPoint {
			buf.WriteString(" EP")
		}
		buf.WriteByte('\n')
		dirTreeStrRec(buf, &indir, lvl+1)
	}
}
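A hypothetical entry point for the recursion above, assuming the DirTree type from this package; the wrapper name dirTreeStr is an illustration, not the package's actual API:

// dirTreeStr renders a whole tree, starting with the root's own name.
func dirTreeStr(root *DirTree) string {
	var buf bytes.Buffer
	buf.WriteString(root.Name)
	buf.WriteByte('\n')
	dirTreeStrRec(&buf, root, 0)
	return buf.String()
}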
Example #4
// TokenSignin validates a Google ID token posted as form field "idtoken".
// https://developers.google.com/identity/choose-auth
// https://developers.google.com/identity/sign-in/web/backend-auth
func TokenSignin(w http.ResponseWriter, r *http.Request) {

	lg, _ := loghttp.BuffLoggerUniversal(w, r)

	// w.Header().Set("Access-Control-Allow-Origin", "http://localhost:1313")

	w.Header().Set("Access-Control-Allow-Origin", "http://"+routes.AppHostDev())

	// the wildcard below supersedes the dev-host value just set
	w.Header().Del("Access-Control-Allow-Origin")
	w.Header().Set("Access-Control-Allow-Origin", "*")

	// err := r.ParseMultipartForm(1024 * 1024 * 2)
	err := r.ParseForm()
	lg(err)

	myToken := r.Form.Get("idtoken")
	tokSize := fmt.Sprintf("Length of token was %v.\n", len(myToken))

	fc1 := func(token *jwt.Token) (interface{}, error) {
		// Don't forget to validate the alg is what you expect:

		log.Printf("algo header is %v\n", token.Header["alg"])
		if _, ok := token.Method.(*jwt.SigningMethodRSA); !ok {
			return nil, fmt.Errorf("unexpected signing method: %v", token.Header["alg"])
		}
		return token.Header["kid"], nil
	}
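	// Note: fc1 returns the "kid" header instead of a key; the jwt package
	// used here resolves kid -> PEM through the package-level
	// jwt.MappingToPEM seen further down.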

	token, err := jwt.Parse(myToken, fc1)

	// No direct error comparison is possible, since err is wrapped in another struct
	if err != nil && strings.Contains(err.Error(), jwt.ErrPEMMappingObsolete.Error()) {

		currentPEMsURL := "https://www.googleapis.com/oauth2/v1/certs"
		req, err := http.NewRequest("GET", currentPEMsURL, nil)
		if err != nil {
			lg("creation of pem request failed")
			return
		}
		req.Header.Set("Content-Type", "application/json")

		fo := fetch.Options{Req: req}
		fo.KnownProtocol = "https"
		fo.ForceHTTPSEvenOnDevelopmentServer = true
		bts, inf, err := fetch.UrlGetter(r, fo)
		lg(err)
		if err != nil {
			lg("tried to fetch %v, %v", currentPEMsURL, inf.URL)
			lg("msg %v", inf.Msg)
			return
		}
		if len(bts) > 200 {
			var data1 map[string]string
			err = json.Unmarshal(bts, &data1)
			lg(err)
			// lg(stringspb.IndentedDumpBytes(data1))
			// w.Write(stringspb.IndentedDumpBytes(data1))
			if len(data1) > 1 {
				lg("PEM mappings updated")
				jwt.MappingToPEM = data1
			} else {
				lg("PEM mapping response contained only %v records; bytes length %v", len(data1), len(bts))
			}
		}

	}

	token, err = jwt.Parse(myToken, fc1)

	if err != nil && strings.Contains(err.Error(), jwt.ErrInvalidKey.Error()) {
		w.Write([]byte("The submitted RSA Key was somehow unparseable. We still accept the token.\n"))
		/*
			https://developers.google.com/identity/sign-in/web/backend-auth
		*/
		err = nil
		token.Valid = true
	}

	if err != nil {
		w.Write([]byte("--- " + err.Error() + ".\n"))
	}

	if err == nil && token.Valid {

		tk := ""
		tk += fmt.Sprintf("     Algor:     %v\n", token.Method)
		tk += fmt.Sprintf("     Header:    %v\n", token.Header)
		for k, v := range token.Claims {
			tk += fmt.Sprintf("\t  %-8v %v\n", k, v)
		}
		lg(tk)

		w.Write([]byte("tokensignin; valid.   \n"))
		w.Write([]byte(tokSize))
		sb := "header-sub-not-present"
		if v, ok := token.Claims["sub"].(string); ok {
			sb = v
		}
		w.Write([]byte("ID from JWT is " + sb + "\n"))

		_, usr, msg1 := login.CheckForNormalUser(r)
		if usr != nil {
			w.Write([]byte("ID from SRV is " + usr.ID + "\n"))
		}
		w.Write([]byte(msg1 + "\n"))

	} else {
		w.Write([]byte("tokensignin; INVALID. \n"))
		w.Write([]byte(tokSize))
		w.Write([]byte(stringspb.ToLen(myToken, 30)))

		vrf := fmt.Sprintf("\nhttps://www.googleapis.com/oauth2/v3/tokeninfo?id_token=%v \n", myToken)
		w.Write([]byte(vrf))
	}

}
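For contrast, a minimal keyfunc sketch in the style of the stock jwt-go API, where the callback returns the verification key itself; keyByID is a hypothetical kid-to-public-key map, not part of the code above:

func keyFunc(token *jwt.Token) (interface{}, error) {
	if _, ok := token.Method.(*jwt.SigningMethodRSA); !ok {
		return nil, fmt.Errorf("unexpected signing method: %v", token.Header["alg"])
	}
	kid, _ := token.Header["kid"].(string)
	pub, ok := keyByID[kid] // hypothetical map[string]*rsa.PublicKey
	if !ok {
		return nil, fmt.Errorf("unknown key id %q", kid)
	}
	return pub, nil
}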
Example #5
// FetchSimilar is an extended version of Fetch.
// It uses a DirTree of crawled *links*, not actual files.
// As it moves up the DOM, it crawls every document for additional links.
// It first moves up to find similar URLs on the same depth
//                        /\
//          /\           /  \
//    /\   /  \         /    \
// It then moves up the ladder again - to accept higher URLs
//                        /\
//          /\
//    /\
func FetchSimilar(w http.ResponseWriter, r *http.Request, m map[string]interface{}) {

	lg, b := loghttp.BuffLoggerUniversal(w, r)
	closureOverBuf := func(bUnused *bytes.Buffer) {
		loghttp.Pf(w, r, b.String())
	}
	defer closureOverBuf(b) // the argument is ignored
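	// defer evaluates the argument right here, but the closure reads the
	// captured variable b only at return time; see the b.Reset() /
	// new(bytes.Buffer) dance at the end of this function.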

	r.Header.Set("X-Custom-Header-Counter", "nocounter")

	start := time.Now()

	wpf(b, tplx.ExecTplHelper(tplx.Head, map[string]interface{}{"HtmlTitle": "Find similar HTML URLs"}))
	defer wpf(b, tplx.Foot)

	wpf(b, "<pre>")
	defer wpf(b, "</pre>")

	fs1 := GetFS(appengine.NewContext(r))

	err := r.ParseForm()
	lg(err)

	countSimilar := 3
	sCountSimilar := r.FormValue("cnt")
	if sCountSimilar != "" {
		i, err := strconv.Atoi(strings.TrimSpace(sCountSimilar))
		if err == nil {
			countSimilar = i
		}
	}

	surl := r.FormValue(routes.URLParamKey)
	ourl, err := fetch.URLFromString(surl)
	lg(err)
	if err != nil {
		return
	}
	if ourl.Host == "" {
		lg("host is empty (%v)", surl)
		return
	}

	knownProtocol := ""
	if r.FormValue("prot") != "" {
		knownProtocol = r.FormValue("prot")
	}

	numWorkers := 0
	sNumWorkers := r.FormValue("numworkers")
	if sNumWorkers != "" {
		i, err := strconv.Atoi(strings.TrimSpace(sNumWorkers))
		if err == nil {
			numWorkers = i
		}
	}

	srcDepth := strings.Count(ourl.Path, "/")

	cmd := FetchCommand{}
	cmd.Host = ourl.Host
	cmd.SearchPrefix = ourl.Path
	cmd = addDefaults(cmd)

	dirTree := &DirTree{Name: "/", Dirs: map[string]DirTree{}, EndPoint: true}
	fnDigest := path.Join(docRoot, cmd.Host, "digest2.json")
	loadDigest(w, r, lg, fs1, fnDigest, dirTree) // load the digest from a previous run
	lg("dirtree, first 400 chars: %v (end of dirtree)\t\t", stringspb.ToLen(dirTree.String(), 400))

	m1 := new(MyWorker)
	m1.r = r
	m1.lg = lg
	m1.fs1 = fs1
	m1.SURL = path.Join(cmd.Host, ourl.Path)
	m1.Protocol = knownProtocol
	btsSrc, modSrc, usedExisting, err := fetchSave(m1)
	lg(err)
	if err != nil {
		return
	}
	if !usedExisting {
		addAnchors(lg, cmd.Host, btsSrc, dirTree)
	}

	lg("\t\t%4.2v secs so far 1", time.Since(start).Seconds())

	// earlier hard-coded test paths: "/blogs/freeexchange", "/news/europe"
	treePath := path.Dir(ourl.Path)

	opt := LevelWiseDeeperOptions{}
	opt.Rump = treePath
	// earlier exclude candidates: "/news/americas", "/blogs/buttonwood"
	opt.ExcludeDir = "/something-impossible"
	opt.MinDepthDiff = 1
	opt.MaxDepthDiff = 1
	opt.CondenseTrailingDirs = cmd.CondenseTrailingDirs
	// one extra would cover "self"; collect 40 more, since those too old are filtered out later
	opt.MaxNumber = cmd.DesiredNumber + 40

	var subtree *DirTree
	links := []FullArticle{}

	alreadyCrawled := map[string]struct{}{}

MarkOuter:
	for j := 0; j < srcDepth; j++ {
		treePath = path.Dir(ourl.Path)
	MarkInner:
		// for i := 1; i < srcDepth; i++ {
		for i := 1; i < (srcDepth + 5); i++ {

			subtree, treePath = DiveToDeepestMatch(dirTree, treePath)

			lg("Looking from height %v to level %v  - %v", srcDepth-i, srcDepth-j, treePath)

			if _, ok := alreadyCrawled[treePath]; ok {
				// lg("\t already digested %v", treePath)
				continue
			}

			m2 := new(MyWorker)
			m2.r = r
			m2.lg = lg
			m2.fs1 = fs1
			m2.SURL = path.Join(cmd.Host, treePath)
			m2.Protocol = knownProtocol

			btsPar, _, usedExisting, err := fetchSave(m2)
			lg(err)
			if err != nil {
				return
			}
			alreadyCrawled[treePath] = struct{}{}
			if !usedExisting {
				addAnchors(lg, cmd.Host, btsPar, dirTree)
			}

			if subtree == nil {
				lg("\n#%v treePath %q ; subtree is nil", i, treePath)
			} else {
				// lg("\n#%v treePath %q ; subtree exists", i, treePath)

				opt.Rump = treePath
				opt.MinDepthDiff = i - j
				opt.MaxDepthDiff = i - j
				lvlLinks := LevelWiseDeeper(nil, nil, subtree, opt)
				links = append(links, lvlLinks...)
				for _, art := range lvlLinks {
					_ = art
					// lg("#%v fnd    %v", i, stringspb.ToLen(art.Url, 100))
				}

				if len(links) >= opt.MaxNumber {
					lg("found enough links")
					break MarkOuter
				}

				pathPrev := treePath
				treePath = path.Dir(treePath)
				// lg("#%v  bef %v - aft %v", i, pathPrev, treePath)

				if pathPrev == "." && treePath == "." ||
					pathPrev == "/" && treePath == "/" ||
					pathPrev == "" && treePath == "." {
				lg("break to inner")
					break MarkInner
				}
			}

		}
	}
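	// The nested loops implement the two-phase ladder from the doc comment:
	// the inner loop climbs the path upwards (DiveToDeepestMatch, then
	// path.Dir), while the outer loop widens the accepted depth
	// difference (i - j).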

	//
	//
	//
	//
	lg("%v links after %4.2v secs", len(links), time.Since(start).Seconds())

	lg("============================")
	lg("Now reading/fetching actual similar files - not just the links")
	//
	tried := 0
	selecteds := []FullArticle{}

	nonExisting := []FullArticle{}
	nonExistFetched := []FullArticle{}

	for _, art := range links {

		if art.Url == ourl.Path {
			lg("skipping self\t%v", art.Url)
			continue
		}

		tried++

		useExisting := false

		semanticUri := condenseTrailingDir(art.Url, cmd.CondenseTrailingDirs)
		p := path.Join(docRoot, cmd.Host, semanticUri)

		f, err := fs1.Open(p)
		// err not logged: it's no error if the file does not exist
		if err == nil {
			// lg("reading %q", semanticUri)

			// wrap the read in a func, so that f.Close() runs at the end of
			// this block; a plain defer would span the entire function and
			// prevent overwrites/chmods further down
			readExisting := func() {
				defer f.Close()
				fi, err := f.Stat()
				lg(err)
				if err != nil {
					return
				}
				age := time.Since(fi.ModTime())
				if age.Hours() < 10 {
					lg("\t\tusing existing file with age %4.2v hrs", age.Hours())
					art.Mod = fi.ModTime()
					bts, err := ioutil.ReadAll(f)
					lg(err)
					art.Body = bts
					if len(bts) < 200 && bytes.Contains(bts, []byte(fetch.MsgNoRdirects)) {
						return
					}
					selecteds = append(selecteds, art)
					useExisting = true
				}
			}
			readExisting()

		}

		if !useExisting {
			nonExisting = append(nonExisting, art)
		}

		if len(selecteds) >= countSimilar {
			break
		}

	}
	lg("============================")
	lg("tried %v links - yielding %v existing similars; not existing in datastore: %v, %v were requested.",
		tried, len(selecteds), len(nonExisting), countSimilar)

	if len(selecteds) < countSimilar {
		jobs := make([]distrib.Worker, 0, len(nonExisting))
		for _, art := range nonExisting {
			surl := path.Join(cmd.Host, art.Url)
			wrkr := MyWorker{SURL: surl}
			wrkr.Protocol = knownProtocol
			wrkr.r = r
			wrkr.lg = lg
			wrkr.fs1 = fs1
			job := distrib.Worker(&wrkr)
			jobs = append(jobs, job)
		}

		opt := distrib.NewDefaultOptions()
		opt.TimeOutDur = 3500 * time.Millisecond
		opt.Want = int32(countSimilar - len(selecteds) + 4) // get some more, in case we have "redirected" bodies
		opt.NumWorkers = int(opt.Want)                      // 5s query limit; => hurry; spawn as many as we want
		if numWorkers > 0 {
			opt.NumWorkers = numWorkers
		}
		lg("Preparing %v simultaneous, wanting %v fetches; at %4.2v secs.", opt.NumWorkers, opt.Want, time.Since(start).Seconds())
		opt.CollectRemainder = false // 5s query limit; => hurry; don't wait for stragglers

		ret, msg := distrib.Distrib(jobs, opt)
		lg("Distrib returned at %4.2v secs with %v results.", time.Since(start).Seconds(), len(ret))

		lg("\n" + msg.String())
		for _, v := range ret {
			v1, ok := v.Worker.(*MyWorker)
			if !ok || v1 == nil {
				continue
			}
			if v1.FA != nil {
				age := time.Since(v1.FA.Mod)
				if age.Hours() < 10 {
					lg("\t\tusing fetched file with age %4.2v hrs", age.Hours())
					nonExistFetched = append(nonExistFetched, *v1.FA)
					if len(nonExistFetched) > (countSimilar - len(selecteds)) {
						break
					}
				}
			}
			if v1.err != nil {
				lg(v1.err)
			}
		}

		lg("tried %v links - yielding %v fetched - jobs %v", len(nonExisting), len(nonExistFetched), len(jobs))
		selecteds = append(selecteds, nonExistFetched...)

		//
		//
		// Extract links
		for _, v := range nonExistFetched {
			// lg("links -> memory dirtree for %q", v.Url)
			addAnchors(lg, cmd.Host, v.Body, dirTree)
		}

	}

	//
	if time.Since(dirTree.LastFound).Seconds() < 10 {
		lg("saving accumulated (new) links to digest")
		saveDigest(lg, fs1, fnDigest, dirTree)
	}

	lg("\t\t%4.2v secs so far 3", time.Since(start).Seconds())

	mp := map[string][]byte{}
	mp["msg"] = b.Bytes()
	mp["url_self"] = []byte(condenseTrailingDir(ourl.Path, cmd.CondenseTrailingDirs))
	mp["mod_self"] = []byte(modSrc.Format(http.TimeFormat))
	mp["bod_self"] = btsSrc

	for i, v := range selecteds {
		mp["url__"+spf("%02v", i)] = []byte(v.Url)
		mp["mod__"+spf("%02v", i)] = []byte(v.Mod.Format(http.TimeFormat))
		mp["bod__"+spf("%02v", i)] = v.Body
	}

	mp["lensimilar"] = []byte(spf("%02v", len(selecteds)))

	//
	smp, err := json.MarshalIndent(mp, "", "\t")
	if err != nil {
		lg("marshalling mp to []byte failed: %v", err)
		return
	}

	r.Header.Set("X-Custom-Header-Counter", "nocounter")
	w.Header().Set("Content-Type", "application/json")
	w.Write(smp)

	b.Reset()             // keeps the buf pointer intact; outgoing defers still see it
	b = new(bytes.Buffer) // rebinds b; the deferred wpf calls still write into the *old* buf

	lg("\t\t%4.2v secs so far 4 (json resp written as []byte)", time.Since(start).Seconds())

}
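The defer-a-flush pattern framing FetchSimilar can be sketched in isolation; this is a hedged miniature writing to stdout, not the loghttp API:

package main

import (
	"bytes"
	"fmt"
)

func handler() {
	b := new(bytes.Buffer)
	defer func() { fmt.Print(b.String()) }() // reads b at return time, like closureOverBuf
	fmt.Fprintln(b, "step 1")
	fmt.Fprintln(b, "step 2")
	b.Reset() // everything logged so far is dropped from the final flush
	fmt.Fprintln(b, "only this line reaches the writer")
}

func main() { handler() }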
Example #6
// similarTextifiedTrees2 compares src against every entry in mp,
// collecting sufficiently similar texts into src.Similars.
func similarTextifiedTrees2(src *TextifiedTree, mp map[string][]*TextifiedTree, skipPrefix map[string]bool) {

	// srcE := word.WrapAsEqualer(string(src.Text), true) // ssrc as Equaler
	srcE := wordb.WrapAsEqualer(src.Text, true)
	srcLen := float64(len(src.Text))

	for fnKey, tts := range mp {

		if fnKey == src.SourceID {
			pf("    to %v SKIP self\n", fnKey)
			continue
		}

		pf("    to %v\n", fnKey)

		cntr, br := 0, true
		for _, tt := range tts {
			// outl, text := tt.Outl, tt.Text

			if tt.Lvl > src.Lvl+levelsTolerance {
				break // tts is sorted by lvl, so nothing further can match
			}

			// proceed only for tt.Lvl in [src.Lvl, src.Lvl+levelsTolerance]
			if tt.Lvl < src.Lvl {
				continue
			}

			if src.NumTokens < 1 {
				continue
			}

			if src.NumTokens < 5 && tt.NumTokens > 7 {
				continue
			}

			if HistoBasedDistance(src, tt) > 0.51 {
				breakMapsTooDistinct++
				continue
			}

			relSize := srcLen / float64(util.Max(1, len(tt.Text)))
			if relSize < 0.33 || relSize > 3 {
				continue
			}

			absDist, relDist := 0, 0.0

			if tt.NumTokens == src.NumTokens &&
				len(tt.Text) == len(src.Text) &&
				bytes.Equal(tt.Text, src.Text) {
				absDist, relDist = 0, 0.0
				appliedCompare++
			} else {
				dstE := wordb.WrapAsEqualer(tt.Text, true) // destinations as Equaler
				m := levenshtein.New(srcE, dstE, opt)
				absDist, relDist = m.Distance()
				appliedLevenshtein++
			}

			//
			if relDist < 0.26 && absDist < 10 {
				if br {
					pf("\t")
				}

				sd := string(tt.Text[:util.Min(2*excerptLen, len(tt.Text)-1)])
				sd = stringspb.ToLen(sd, 2*excerptLen+1)
				pf("%12v %v %4v %5.2v   ", tt.Outline, sd, absDist, relDist)

				cntr++
				br = false

				sim := Similar{}
				sim.SourceID = fnKey
				sim.Lvl = tt.Lvl
				sim.Outline = tt.Outline
				sim.AbsLevenshtein = absDist
				sim.RelLevenshtein = relDist
				sim.Text = tt.Text
				src.Similars = append(src.Similars, sim)
				src.SumAbsLevenshtein += absDist
				src.SumRelLevenshtein += relDist

				if cntr%2 == 0 || cntr > 20 {
					pf("\n")
					br = true
				}
				if cntr > 20 {
					break
				}
			}

		}
		if !br {
			pf("\n")
		}
	}

}
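A reduced sketch of the distance gating above: compute absolute and relative edit distance and keep only close matches. The 0.26 / 10 thresholds mirror the snippet; the byte-level Levenshtein here is an illustrative stand-in for the token-based levenshtein package:

package main

import "fmt"

// lev is a plain two-row Levenshtein distance over bytes.
func lev(a, b string) int {
	prev := make([]int, len(b)+1)
	curr := make([]int, len(b)+1)
	for j := range prev {
		prev[j] = j
	}
	for i := 1; i <= len(a); i++ {
		curr[0] = i
		for j := 1; j <= len(b); j++ {
			cost := 1
			if a[i-1] == b[j-1] {
				cost = 0
			}
			curr[j] = imin(prev[j]+1, imin(curr[j-1]+1, prev[j-1]+cost))
		}
		prev, curr = curr, prev
	}
	return prev[len(b)]
}

func imin(a, b int) int {
	if a < b {
		return a
	}
	return b
}

func main() {
	src, dst := "hello world", "hallo world"
	abs := lev(src, dst)
	rel := float64(abs) / float64(len(src))
	if rel < 0.26 && abs < 10 { // thresholds as in similarTextifiedTrees2
		fmt.Printf("similar: abs=%v rel=%.2f\n", abs, rel)
	}
}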