// Print prints a visual representation // of the slices of tokens and their distance matrix func (m *Matrix) Print() { rows, cols := m.rows, m.cols mx := m.mx fp := fmt.Printf fmt2 := fmt.Sprintf("%s-%vd", "%", cl) fp(strings.Repeat(" ", 2*cl)) for _, col := range cols { scol := fmt.Sprintf("%v", col) fp("%v ", stringspb.ToLen(scol, cl-1)) // at least one space right } fp("\n") fp(strings.Repeat(" ", cl)) fp(fmt2, mx[0][0]) for j, _ := range cols { fp(fmt2, mx[0][j+1]) } fp("\n") // for i, row := range rows { srow := fmt.Sprintf("%v", row) fp("%v ", stringspb.ToLen(srow, cl-1)) // at least one space right fp(fmt2, mx[i+1][0]) for j, _ := range cols { fp(fmt2, mx[i+1][j+1]) } fp("\n") } // fp("\n") }
func Pf(w io.Writer, r *http.Request, f string, vs ...interface{}) { for idx, v := range vs { switch v := v.(type) { case []byte: if len(v) > 1024*5 { appdx := append([]byte(" ...omitted... "), v[len(v)-100:]...) vs[idx] = append(v[:1024*5], appdx...) } case string: if len(v) > 1024*5 { appdx := " ...omitted... " + v[len(v)-100:] vs[idx] = v[:1024*5] + appdx } } } // Prepare the string var s string if len(vs) > 0 { s = fmt.Sprintf(f, vs...) } else { s = f } if s == "" { return } // Write it to http response or bytes.Buffer // unless prefixed with 'lo ' - log only. // Thread-safety could be introduced by syncing/locking w. if w != nil && !strings.HasPrefix(s, "lo ") { w.Write([]byte(s)) w.Write([]byte{'\n'}) } // Write to log/gae-log // Adding src code info line, file := runtimepb.LineFileXUp(1) // if strings.HasSuffix(file, "log.go") if strings.HasSuffix(file, runtimepb.ThisFile()) { // change line, file = runtimepb.LineFileXUp(2) } if len(s) < 60 { s = stringspb.ToLen(s, 60) } s = fmt.Sprintf("%v - %v:%v", s, file, line) // Log it c, _ := util_appengine.SafelyExtractGaeCtxError(r) if c == nil { lnp.Printf(s) } else { aelog.Infof(c, s) } }
func dirTreeStrRec(buf *bytes.Buffer, d *DirTree, lvl int) { ind2 := strings.Repeat(" ", lvl+1) keys := make([]string, 0, len(d.Dirs)) for k, _ := range d.Dirs { keys = append(keys, k) } sort.Strings(keys) for _, key := range keys { buf.WriteString(ind2) indir := d.Dirs[key] buf.WriteString(stringspb.ToLen(indir.Name, 44-len(ind2))) if indir.EndPoint { buf.WriteString(fmt.Sprintf(" EP")) } buf.WriteByte(10) dirTreeStrRec(buf, &indir, lvl+1) } }
// TokenSignin validates a Google Identity id_token posted by the
// client as form field "idtoken" and reports the result to the
// http response and the buffered logger.
//
// If parsing fails because the cached key-ID->PEM mapping is obsolete,
// the current certs are fetched from Google, the mapping is refreshed,
// and the token is parsed a second time.
//
// https://developers.google.com/identity/choose-auth
// https://developers.google.com/identity/sign-in/web/backend-auth
func TokenSignin(w http.ResponseWriter, r *http.Request) {
	lg, _ := loghttp.BuffLoggerUniversal(w, r)

	// CORS: the dev-host value is immediately deleted again;
	// effectively everything is allowed via "*"
	// w.Header().Set("Access-Control-Allow-Origin", "http://localhost:1313")
	w.Header().Set("Access-Control-Allow-Origin", "http://"+routes.AppHostDev())
	w.Header().Del("Access-Control-Allow-Origin")
	w.Header().Set("Access-Control-Allow-Origin", "*")

	// err := r.ParseMultipartForm(1024 * 1024 * 2)
	err := r.ParseForm()
	lg(err)

	myToken := r.Form.Get("idtoken")
	tokSize := fmt.Sprintf("Len of Tok was %v. \n", len(myToken))

	// keyfunc for jwt.Parse; returns the "kid" header —
	// presumably the forked jwt lib resolves the PEM by key ID
	// (TODO confirm against the jwt package used here)
	fc1 := func(token *jwt.Token) (interface{}, error) {
		// Don't forget to validate the alg is what you expect:
		log.Printf("algo header is %v\n", token.Header["alg"])
		if _, ok := token.Method.(*jwt.SigningMethodRSA); !ok {
			return nil, fmt.Errorf("Unexpected signing method: %v", token.Header["alg"])
		}
		return token.Header["kid"], nil
	}

	token, err := jwt.Parse(myToken, fc1)

	// No direct error comparison possible; since err is wrapped in another struct
	if err != nil && strings.Contains(err.Error(), jwt.ErrPEMMappingObsolete.Error()) {
		// the kid->PEM mapping is stale; fetch the current Google certs
		currentPEMsURL := "https://www.googleapis.com/oauth2/v1/certs"
		req, err := http.NewRequest("GET", currentPEMsURL, nil)
		if err != nil {
			lg("creation of pem request failed")
			return
		}
		req.Header.Set("Content-Type", "application/json")
		fo := fetch.Options{Req: req}
		fo.KnownProtocol = "https"
		fo.ForceHTTPSEvenOnDevelopmentServer = true
		bts, inf, err := fetch.UrlGetter(r, fo)
		lg(err)
		if err != nil {
			lg("tried to fetch %v, %v", currentPEMsURL, inf.URL)
			lg("msg %v", inf.Msg)
			return
		}
		// sanity check: a real cert response is well over 200 bytes
		if len(bts) > 200 {
			var data1 map[string]string
			err = json.Unmarshal(bts, &data1)
			lg(err)
			// lg(stringspb.IndentedDumpBytes(data1))
			// w.Write(stringspb.IndentedDumpBytes(data1))
			if len(data1) > 1 {
				lg("PEM mappings updated")
				jwt.MappingToPEM = data1
			} else {
				lg("PEM mapping response contained only %v records; bytes length %v",
					len(data1), len(bts))
			}
		}
	}

	// second attempt - with possibly refreshed PEM mappings
	token, err = jwt.Parse(myToken, fc1)

	// unparseable RSA key is deliberately tolerated; the token is accepted anyway
	if err != nil && strings.Contains(err.Error(), jwt.ErrInvalidKey.Error()) {
		w.Write([]byte("The submitted RSA Key was somehow unparseable. We still accept the token.\n"))
		/* https://developers.google.com/identity/sign-in/web/backend-auth */
		err = nil
		token.Valid = true
	}
	if err != nil {
		w.Write([]byte("--- " + err.Error() + ".\n"))
	}

	if err == nil && token.Valid {
		// dump algorithm, header and claims to the buffered log
		tk := ""
		tk += fmt.Sprintf(" Algor: %v\n", token.Method)
		tk += fmt.Sprintf(" Header: %v\n", token.Header)
		for k, v := range token.Claims {
			tk += fmt.Sprintf("\t %-8v %v\n", k, v)
		}
		lg(tk)
		w.Write([]byte("tokensignin; valid. \n"))
		w.Write([]byte(tokSize))
		// "sub" is the stable Google account ID claim
		sb := "header-sub-not-present"
		if _, ok := token.Claims["sub"]; ok {
			sb = token.Claims["sub"].(string)
		}
		w.Write([]byte("ID from PWT is " + sb + "\n"))
		// compare against the server-side logged-in user, if any
		_, usr, msg1 := login.CheckForNormalUser(r)
		if usr != nil {
			w.Write([]byte("ID from SRV is " + usr.ID + "\n"))
		}
		w.Write([]byte(msg1 + "\n"))
	} else {
		w.Write([]byte("tokensignin; INVALID. \n"))
		w.Write([]byte(tokSize))
		w.Write([]byte(stringspb.ToLen(myToken, 30)))
		// link for manual verification of the token against Google's endpoint
		vrf := fmt.Sprintf("\nhttps://www.googleapis.com/oauth2/v3/tokeninfo?id_token=%v \n", myToken)
		w.Write([]byte(vrf))
	}
}
// FetchSimilar is an extended version of Fetch.
// It uses a DirTree of crawled *links*, not actual files.
// As it moves up the DOM, it crawls every document for additional links.
// It first moves up to find similar URLs on the same depth
//
//        /\
//       /  \
//      / /\ \
//
// It then moves up the ladder again - to accept higher URLs
//
//        /\
//       /\
//      /\
//
// Results (self + similar URLs with bodies and mod times) are written
// as an indented-JSON map to w.
func FetchSimilar(w http.ResponseWriter, r *http.Request, m map[string]interface{}) {
	lg, b := loghttp.BuffLoggerUniversal(w, r)
	closureOverBuf := func(bUnused *bytes.Buffer) {
		loghttp.Pf(w, r, b.String())
	}
	defer closureOverBuf(b) // the argument is ignored; the closure reads b at call time
	r.Header.Set("X-Custom-Header-Counter", "nocounter")
	start := time.Now()

	// HTML scaffolding goes into the buffer; it is replaced by a JSON
	// response at the end (see b.Reset()/reassignment below)
	wpf(b, tplx.ExecTplHelper(tplx.Head, map[string]interface{}{"HtmlTitle": "Find similar HTML URLs"}))
	defer wpf(b, tplx.Foot)
	wpf(b, "<pre>")
	defer wpf(b, "</pre>")

	fs1 := GetFS(appengine.NewContext(r))
	err := r.ParseForm()
	lg(err)

	// how many similar documents the caller wants (form field "cnt", default 3)
	countSimilar := 3
	sCountSimilar := r.FormValue("cnt")
	if sCountSimilar != "" {
		i, err := strconv.Atoi(strings.TrimSpace(sCountSimilar))
		if err == nil {
			countSimilar = i
		}
	}

	// the URL whose siblings we are looking for
	surl := r.FormValue(routes.URLParamKey)
	ourl, err := fetch.URLFromString(surl)
	lg(err)
	if err != nil {
		return
	}
	if ourl.Host == "" {
		lg("host is empty (%v)", surl)
		return
	}

	knownProtocol := ""
	if r.FormValue("prot") != "" {
		knownProtocol = r.FormValue("prot")
	}

	// optional override for the number of parallel fetch workers
	numWorkers := 0
	sNumWorkers := r.FormValue("numworkers")
	if sNumWorkers != "" {
		i, err := strconv.Atoi(strings.TrimSpace(sNumWorkers))
		if err == nil {
			numWorkers = i
		}
	}

	srcDepth := strings.Count(ourl.Path, "/") // depth of the source URL in the path hierarchy

	cmd := FetchCommand{}
	cmd.Host = ourl.Host
	cmd.SearchPrefix = ourl.Path
	cmd = addDefaults(cmd)

	// load the previously accumulated link digest for this host
	dirTree := &DirTree{Name: "/", Dirs: map[string]DirTree{}, EndPoint: true}
	fnDigest := path.Join(docRoot, cmd.Host, "digest2.json")
	loadDigest(w, r, lg, fs1, fnDigest, dirTree) // previous
	lg("dirtree 400 chars is %v end of dirtree\t\t", stringspb.ToLen(dirTree.String(), 400))

	// fetch the source document itself and harvest its links
	m1 := new(MyWorker)
	m1.r = r
	m1.lg = lg
	m1.fs1 = fs1
	m1.SURL = path.Join(cmd.Host, ourl.Path)
	m1.Protocol = knownProtocol
	btsSrc, modSrc, usedExisting, err := fetchSave(m1)
	if !usedExisting {
		addAnchors(lg, cmd.Host, btsSrc, dirTree)
	}
	lg(err)
	if err != nil {
		return
	}
	lg("\t\t%4.2v secs so far 1", time.Now().Sub(start).Seconds())

	// leftover experiment values, immediately overwritten by path.Dir
	var treePath string
	treePath = "/blogs/freeexchange"
	treePath = "/news/europe"
	treePath = path.Dir(ourl.Path)

	opt := LevelWiseDeeperOptions{}
	opt.Rump = treePath
	opt.ExcludeDir = "/news/americas"
	opt.ExcludeDir = "/blogs/buttonwood"
	opt.ExcludeDir = "/something-impossible" // last assignment wins; earlier values are dead
	opt.MinDepthDiff = 1
	opt.MaxDepthDiff = 1
	opt.CondenseTrailingDirs = cmd.CondenseTrailingDirs
	opt.MaxNumber = cmd.DesiredNumber + 1  // one more for "self"
	opt.MaxNumber = cmd.DesiredNumber + 40 // collect more, 'cause we filter out those too old later

	var subtree *DirTree
	links := []FullArticle{}
	alreadyCrawled := map[string]struct{}{}

	// Outer loop relaxes the accepted depth difference (j);
	// inner loop dives down/up the tree collecting link candidates.
MarkOuter:
	for j := 0; j < srcDepth; j++ {
		treePath = path.Dir(ourl.Path)
	MarkInner:
		// for i := 1; i < srcDepth; i++ {
		for i := 1; i < (srcDepth + 5); i++ {
			subtree, treePath = DiveToDeepestMatch(dirTree, treePath)
			lg("Looking from height %v to level %v - %v", srcDepth-i, srcDepth-j, treePath)
			if _, ok := alreadyCrawled[treePath]; ok {
				// lg("\t already digested %v", treePath)
				continue
			}
			// crawl the directory page itself for more links
			m2 := new(MyWorker)
			m2.r = r
			m2.lg = lg
			m2.fs1 = fs1
			m2.SURL = path.Join(cmd.Host, treePath)
			m2.Protocol = knownProtocol
			btsPar, _, usedExisting, err := fetchSave(m2)
			lg(err)
			if err != nil {
				return
			}
			alreadyCrawled[treePath] = struct{}{}
			if !usedExisting {
				addAnchors(lg, cmd.Host, btsPar, dirTree)
			}
			if subtree == nil {
				lg("\n#%v treePath %q ; subtree is nil", i, treePath)
			} else {
				// lg("\n#%v treePath %q ; subtree exists", i, treePath)
				opt.Rump = treePath
				opt.MinDepthDiff = i - j
				opt.MaxDepthDiff = i - j
				lvlLinks := LevelWiseDeeper(nil, nil, subtree, opt)
				links = append(links, lvlLinks...)
				for _, art := range lvlLinks {
					_ = art
					// lg("#%v fnd %v", i, stringspb.ToLen(art.Url, 100))
				}
				if len(links) >= opt.MaxNumber {
					lg("found enough links")
					break MarkOuter
				}
				// step one directory up; stop climbing at the root
				pathPrev := treePath
				treePath = path.Dir(treePath)
				// lg("#%v bef %v - aft %v", i, pathPrev, treePath)
				if pathPrev == "." && treePath == "." ||
					pathPrev == "/" && treePath == "/" ||
					pathPrev == "" && treePath == "." {
					lg("break to innner")
					break MarkInner
				}
			}
		}
	}

	lg("%v links after %4.2v secs", len(links), time.Now().Sub(start).Seconds())
	lg("============================")
	lg("Now reading/fetching actual similar files - not just the links")

	tried := 0
	selecteds := []FullArticle{}
	nonExisting := []FullArticle{}
	nonExistFetched := []FullArticle{}

	// First pass: try to satisfy countSimilar from files already in the
	// datastore that are younger than 10 hours.
	for _, art := range links {
		if art.Url == ourl.Path {
			lg("skipping self\t%v", art.Url)
			continue
		}
		tried++
		useExisting := false
		semanticUri := condenseTrailingDir(art.Url, cmd.CondenseTrailingDirs)
		p := path.Join(docRoot, cmd.Host, semanticUri)
		f, err := fs1.Open(p)
		// lg(err) // its no error if file does not exist
		if err != nil {
			// lg("!nstore %q", semanticUri)
		} else {
			// lg("reading %q", semanticUri)
			// lets put this into a func, so that f.close it called at the end of this func
			// otherwise defer f.close() spans the entire func and prevents
			// overwrites chmods further down
			f := func() {
				defer f.Close()
				fi, err := f.Stat()
				lg(err)
				if err != nil {
				} else {
					age := time.Now().Sub(fi.ModTime())
					if age.Hours() < 10 {
						lg("\t\tusing existing file with age %4.2v hrs", age.Hours())
						art.Mod = fi.ModTime()
						bts, err := ioutil.ReadAll(f)
						lg(err)
						art.Body = bts
						// small bodies that are only a "no redirects" marker are skipped
						if len(bts) < 200 {
							if bytes.Contains(bts, []byte(fetch.MsgNoRdirects)) {
								return
							}
						}
						selecteds = append(selecteds, art)
						useExisting = true
					}
				}
			}
			f()
		}
		if !useExisting {
			nonExisting = append(nonExisting, art)
		}
		if len(selecteds) >= countSimilar {
			break
		}
	}
	lg("============================")
	lg("tried %v links - yielding %v existing similars; not existing in datastore: %v, %v were requested.",
		tried, len(selecteds), len(nonExisting), countSimilar)

	// Second pass: fetch the shortfall concurrently via distrib workers.
	if len(selecteds) < countSimilar {
		jobs := make([]distrib.Worker, 0, len(nonExisting))
		for _, art := range nonExisting {
			surl := path.Join(cmd.Host, art.Url)
			wrkr := MyWorker{SURL: surl}
			wrkr.Protocol = knownProtocol
			wrkr.r = r
			wrkr.lg = lg
			wrkr.fs1 = fs1
			job := distrib.Worker(&wrkr)
			jobs = append(jobs, job)
		}

		opt := distrib.NewDefaultOptions()
		opt.TimeOutDur = 3500 * time.Millisecond
		opt.Want = int32(countSimilar - len(selecteds) + 4) // get some more, in case we have "redirected" bodies
		opt.NumWorkers = int(opt.Want)                      // 5s query limit; => hurry; spawn as many as we want
		if numWorkers > 0 {
			opt.NumWorkers = numWorkers
		}
		lg("Preparing %v simultaneous, wanting %v fetches; at %4.2v secs.", opt.NumWorkers, opt.Want, time.Now().Sub(start).Seconds())
		opt.CollectRemainder = false // 5s query limit; => hurry; dont wait for stragglers
		ret, msg := distrib.Distrib(jobs, opt)
		lg("Distrib returned at %4.2v secs with %v results.", time.Now().Sub(start).Seconds(), len(ret))
		lg("\n" + msg.String())

		for _, v := range ret {
			v1, _ := v.Worker.(*MyWorker)
			if v1.FA != nil {
				age := time.Now().Sub(v1.FA.Mod)
				if age.Hours() < 10 {
					lg("\t\tusing fetched file with age %4.2v hrs", age.Hours())
					nonExistFetched = append(nonExistFetched, *v1.FA)
					if len(nonExistFetched) > (countSimilar - len(selecteds)) {
						break
					}
				}
			}
			if v1.err != nil {
				lg(err) // NOTE(review): logs the outer err, not v1.err - looks unintended, verify
			}
		}
		lg("tried %v links - yielding %v fetched - jobs %v", len(nonExisting), len(nonExistFetched), len(jobs))
		selecteds = append(selecteds, nonExistFetched...)

		// Extract links from the freshly fetched bodies into the dirtree
		for _, v := range nonExistFetched {
			// lg("links -> memory dirtree for %q", v.Url)
			addAnchors(lg, cmd.Host, v.Body, dirTree)
		}
	}

	// persist the digest only if new links were found within the last 10 seconds
	if time.Now().Sub(dirTree.LastFound).Seconds() < 10 {
		lg("saving accumulated (new) links to digest")
		saveDigest(lg, fs1, fnDigest, dirTree)
	}
	lg("\t\t%4.2v secs so far 3", time.Now().Sub(start).Seconds())

	// Build the JSON response map: the log so far, self, and the similars.
	mp := map[string][]byte{}
	mp["msg"] = b.Bytes()
	mp["url_self"] = []byte(condenseTrailingDir(ourl.Path, cmd.CondenseTrailingDirs))
	mp["mod_self"] = []byte(modSrc.Format(http.TimeFormat))
	mp["bod_self"] = btsSrc
	for i, v := range selecteds {
		mp["url__"+spf("%02v", i)] = []byte(v.Url)
		mp["mod__"+spf("%02v", i)] = []byte(v.Mod.Format(http.TimeFormat))
		mp["bod__"+spf("%02v", i)] = v.Body
	}
	mp["lensimilar"] = []byte(spf("%02v", len(selecteds)))

	smp, err := json.MarshalIndent(mp, "", "\t")
	if err != nil {
		lg(b, "marshalling mp to []byte failed\n") // NOTE(review): b passed as format arg to lg - verify lg's signature tolerates this
		return
	}
	r.Header.Set("X-Custom-Header-Counter", "nocounter")
	w.Header().Set("Content-Type", "application/json")
	w.Write(smp)
	b.Reset()             // this keeps the buf pointer intact; outgoing defers are still heeded
	b = new(bytes.Buffer) // creates a *new* buf pointer; outgoing defers write into the *old* buf
	lg("\t\t%4.2v secs so far 4 (json resp written as []byte)", time.Now().Sub(start).Seconds())
	return
}
// similarTextifiedTrees2 scans all textified trees in mp for nodes
// similar to src, appending each hit to src.Similars and accumulating
// the absolute/relative Levenshtein sums on src.
//
// Candidates are pre-filtered by level, token counts, histogram
// distance and relative size before the (expensive) Levenshtein
// distance is computed. Progress is printed via pf.
// NOTE(review): parameter skipPrefix is currently unused.
func similarTextifiedTrees2(src *TextifiedTree, mp map[string][]*TextifiedTree, skipPrefix map[string]bool) {
	// srcE := word.WrapAsEqualer(string(src.Text), true) // ssrc as Equaler
	srcE := wordb.WrapAsEqualer(src.Text, true)
	srcLen := float64(len(src.Text))
	for fnKey, tts := range mp {
		// never compare a tree against itself
		if fnKey == src.SourceID {
			pf(" to %v SKIP self\n", fnKey)
			continue
		}
		pf(" to %v\n", fnKey)
		cntr, br := 0, true // cntr: hits printed; br: "at start of an output line"
		for _, tt := range tts {
			// outl, text := tt.Outl, tt.Text
			if tt.Lvl > src.Lvl+levelsTolerance {
				break // tts is sorted by lvl - everything beyond is too deep, safe to stop
			}
			// accept only same level or up to levelsTolerance levels deeper
			if tt.Lvl == src.Lvl || (tt.Lvl > src.Lvl && tt.Lvl <= src.Lvl+levelsTolerance) {
				// proceed
			} else {
				continue
			}
			if src.NumTokens < 1 {
				continue
			}
			// tiny source vs. much larger candidate - skip
			if src.NumTokens < 5 && tt.NumTokens > 7 {
				continue
			}
			// cheap histogram distance filter before Levenshtein
			if HistoBasedDistance(src, tt) > 0.51 {
				breakMapsTooDistinct++
				continue
			}
			// size ratio filter: reject candidates 3x smaller/larger
			relSize := srcLen / float64(util.Max(1, len(tt.Text)))
			if relSize < 0.33 || relSize > 3 {
				continue
			}
			absDist, relDist := 0, 0.0
			if tt.NumTokens == src.NumTokens && len(tt.Text) == len(src.Text) && bytes.Equal(tt.Text, src.Text) {
				// byte-identical - distance is zero, skip Levenshtein
				absDist, relDist = 0, 0.0
				appliedCompare++
			} else {
				dstE := wordb.WrapAsEqualer(tt.Text, true) // destinations as Equaler
				m := levenshtein.New(srcE, dstE, opt)
				absDist, relDist = m.Distance()
				appliedLevenshtein++
			}
			// if relDist < 0.26 && absDist < 10 {
			if br {
				pf("\t")
			}
			// print a fixed-width excerpt of the candidate text
			sd := ""
			sd = string(tt.Text[:util.Min(2*excerptLen, len(tt.Text)-1)])
			sd = stringspb.ToLen(sd, 2*excerptLen+1)
			pf("%12v %v %4v %5.2v ", tt.Outline, sd, absDist, relDist)
			cntr++
			br = false
			// record the hit on src
			sim := Similar{}
			sim.SourceID = fnKey
			sim.Lvl = tt.Lvl
			sim.Outline = tt.Outline
			sim.AbsLevenshtein = absDist
			sim.RelLevenshtein = relDist
			sim.Text = tt.Text
			src.Similars = append(src.Similars, sim)
			src.SumAbsLevenshtein += absDist
			src.SumRelLevenshtein += relDist
			// two hits per output line; hard stop after 20 hits
			if cntr%2 == 0 || cntr > 20 {
				pf("\n")
				br = true
			}
			if cntr > 20 {
				break
			}
			// NOTE(review): the brace below appears to pair with the
			// commented-out relDist filter above - the token stream as
			// received is unbalanced by one '}'; verify against the
			// original file.
			}
		}
		if !br {
			pf("\n")
		}
	}
}