// fetchDigest requests the digest via http - not from the filesystem.
// Currently unused.
func fetchDigest(hostWithPrefix, domain string) (*DirTree, error) {

    lg, lge := loghttp.Logger(nil, nil)
    _ = lg

    surl := path.Join(hostWithPrefix, domain, "digest2.json")
    bts, _, err := fetch.UrlGetter(nil, fetch.Options{URL: surl})
    lge(err)
    if err != nil {
        return nil, err
    }
    // lg("%s", bts)

    dirTree := &DirTree{Name: "/", Dirs: map[string]DirTree{}, EndPoint: true}

    err = json.Unmarshal(bts, dirTree)
    lge(err)
    if err != nil {
        return nil, err
    }

    lg("DirTree %5.2vkB loaded for %v", len(bts)/1024, surl)

    age := time.Since(dirTree.LastFound)
    lg("DirTree is %5.2v hours old (%v)", age.Hours(), dirTree.LastFound.Format(time.ANSIC))

    return dirTree, nil
}
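// Usage sketch for fetchDigest, if it were wired up (hedged; host prefix and
// domain are placeholders - a fileserver must expose digest2.json under them):
//
//  tree, err := fetchDigest("localhost:8085/mnt02", "www.welt.de")
//  if err != nil {
//      return
//  }
//  fmt.Printf("%v subdirs, last found %v\n", len(tree.Dirs), tree.LastFound)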
// GetDirContents fetches a directory listing from the fileserver - via http -
// parsing the received JSON into string slices of dirs and files.
func GetDirContents(hostWithPrefix, dir string) ([]string, []string, *bytes.Buffer, error) {

    lg, lge := loghttp.Logger(nil, nil)
    _ = lg

    var b = new(bytes.Buffer)

    dirs := []string{}
    fils := []string{}

    // build url
    urlSubDirs, err := url.Parse(path.Join(hostWithPrefix, dir))
    lge(err)
    if err != nil {
        return dirs, fils, b, err
    }
    sd := urlSubDirs.String()
    sd = common.Directorify(sd)
    wpf(b, "requ subdirs from %v", sd)

    // make req
    bsubdirs, effU, err := fetch.UrlGetter(nil, fetch.Options{URL: sd})
    lge(err)
    if err != nil {
        return dirs, fils, b, err
    }
    wpf(b, "got %s - %v", bsubdirs, effU)

    // parse json
    mpSubDir := []map[string]string{}
    err = json.Unmarshal(bsubdirs, &mpSubDir)
    lge(err)
    if err != nil {
        // lg("%s", bsubdirs)
        return dirs, fils, b, err
    }
    wpf(b, "json of subdir is %s", stringspb.IndentedDump(mpSubDir))

    for _, v := range mpSubDir {
        if dir, ok := v["path"]; ok {
            if strings.HasSuffix(dir, "/") {
                dirs = append(dirs, dir)
            } else {
                fils = append(fils, dir)
            }
        }
        if smod, ok := v["mod"]; ok {
            t, err := time.Parse(time.RFC1123Z, smod)
            lge(err)
            wpf(b, "age %-6.2v", time.Since(t).Hours())
        }
    }

    return dirs, fils, b, nil
}
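// The fileserver is expected to answer with a JSON array of objects carrying
// "path" and "mod" keys; entries whose path ends in "/" are treated as
// directories. A sketch of a response and a call (host and dir are placeholders):
//
//  [
//      {"path": "politik/",          "mod": "Mon, 02 Jan 2006 15:04:05 -0700"},
//      {"path": "article12345.html", "mod": "Mon, 02 Jan 2006 15:04:05 -0700"}
//  ]
//
//  dirs, fils, buf, err := GetDirContents("localhost:8085/mnt02", "www.welt.de")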
// formRedirector converts a posted form into a GET request against the URL
// given in the "redirect-to" form field, fetches it,
// and writes the cleaned content into the response.
func formRedirector(w http.ResponseWriter, r *http.Request, m map[string]interface{}) {

    lg, lge := loghttp.Logger(w, r)
    var msg, cntnt, rURL string

    w.Header().Set("Content-type", "text/html; charset=utf-8")
    // w.Header().Set("Content-type", "text/html; charset=latin-1")

    rURL = r.FormValue("redirect-to")
    lg("lo redirect to: %v", rURL)

    if len(r.PostForm) > 0 {
        // loghttp.Pf(w, r, "post unimplemented:<br> %#v <br>\n", r.PostForm)
        // return
        msg += fmt.Sprintf("post converted to get<br>")
    }

    rURL = fmt.Sprintf("%v?1=2&", rURL) // dummy first query param; real params can be appended uniformly with '&'
    for key, vals := range r.Form {
        if key == "redirect-to" {
            continue
        }
        val := vals[0]
        if util_appengine.IsLocalEnviron() {
            val = strings.Replace(val, " ", "%20", -1)
        }
        rURL = fmt.Sprintf("%v&%v=%v", rURL, key, val)
    }

    bts, inf, err := fetch.UrlGetter(r, fetch.Options{URL: rURL})
    lge(err)

    cntnt = string(bts)
    cntnt = insertNewlines.Replace(cntnt)
    cntnt = undouble.Replace(cntnt)
    cntnt = domclean1.ModifyHTML(r, inf.URL, cntnt)

    fmt.Fprintf(w, "%s \n\n", cntnt)
    fmt.Fprintf(w, "%s \n\n", msg)
}
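// Rewrite sketch: a form submitted to this handler with
// redirect-to=www.example.com/search and q=golang (placeholders)
// is turned into a GET against
//
//  www.example.com/search?1=2&&q=golang
//
// The "1=2" dummy pair exists only so that every real parameter
// can be appended with a leading '&'.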
// Post2Receiver takes commands and http-posts them to
// the command receiver.
func Post2Receiver(r *http.Request, commands []FetchCommand) (*bytes.Buffer, error) {

    b := new(bytes.Buffer)

    if len(commands) == 0 {
        return b, fmt.Errorf("slice of commands nil or empty: %v", commands)
    }

    ii := instance_mgt.Get(r)
    fullURL := fmt.Sprintf("https://%s%s", ii.PureHostname, uriFetchCommandReceiver)
    wpf(b, "sending to URL: %v\n", fullURL)

    bcommands, err := json.MarshalIndent(commands, "", "\t")
    if err != nil {
        wpf(b, "marshalling to []byte failed\n")
        return b, err
    }

    req, err := http.NewRequest("POST", fullURL, bytes.NewBuffer(bcommands))
    if err != nil {
        wpf(b, "creation of POST request failed\n")
        return b, err
    }
    req.Header.Set("X-Custom-Header-Counter", "nocounter")
    req.Header.Set("Content-Type", "application/json")

    bts, reqUrl, err := fetch.UrlGetter(r, fetch.Options{Req: req})
    if err != nil {
        wpf(b, "sending the POST request failed\n")
        return b, err
    }

    wpf(b, "effective req url: %v\n", reqUrl)
    wpf(b, "response body:\n")
    wpf(b, "%s\n", html.EscapeString(string(bts)))

    return b, nil
}
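// Usage sketch (hedged; FetchCommand fields as they are used elsewhere
// in this package, values are placeholders):
//
//  cmd := FetchCommand{Host: "www.economist.com", SearchPrefix: "/sections", DesiredNumber: 5}
//  buf, err := Post2Receiver(r, []FetchCommand{cmd})
//  if err != nil {
//      return
//  }
//  fmt.Printf("%s", buf.String()) // request/response log accumulated in buf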
// rssXMLFile fetches the RSS XML file from rssUrl
// and saves an indented dump of it into the given filesystem.
func rssXMLFile(w http.ResponseWriter, r *http.Request,
    fs fsi.FileSystem, rssUrl string) (RSS, *url.URL) {

    lg, lge := loghttp.Logger(w, r)

    bts, respInf, err := fetch.UrlGetter(r, fetch.Options{URL: rssUrl})
    lge(err)

    // hack: rename the namespaced tag <content:encoded>,
    // so that encoding/xml can match it without namespace handling
    bts = bytes.Replace(bts, []byte("content:encoded>"), []byte("content-encoded>S"), -1)

    rssDoc := RSS{}
    err = xml.Unmarshal(bts, &rssDoc)
    lge(err)

    // save it
    bdmp := stringspb.IndentedDumpBytes(rssDoc)
    err = fs.MkdirAll(path.Join(docRoot, respInf.URL.Host), 0755)
    lge(err)
    err = fs.WriteFile(path.Join(docRoot, respInf.URL.Host, "outp_rss.xml"), bdmp, 0755)
    lge(err)
    lg("RSS resp size %5.2vkB, saved to %v", len(bdmp)/1024, respInf.URL.Host+"/outp_rss.xml")

    return rssDoc, respInf.URL
}
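// A minimal sketch of the tag-rename hack above (the input is a placeholder;
// the injected "S" lands inside the element text and after the closing tag):
//
//  in  := []byte("<item><content:encoded>body</content:encoded></item>")
//  out := bytes.Replace(in, []byte("content:encoded>"), []byte("content-encoded>S"), -1)
//  // out: <item><content-encoded>Sbody</content-encoded>S</item>
//  // a struct field tagged xml:"content-encoded" can now capture the element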
// TokenSignin validates a Google ID token posted by the client.
//
// https://developers.google.com/identity/choose-auth
// https://developers.google.com/identity/sign-in/web/backend-auth
func TokenSignin(w http.ResponseWriter, r *http.Request) {

    lg, _ := loghttp.BuffLoggerUniversal(w, r)

    // w.Header().Set("Access-Control-Allow-Origin", "http://localhost:1313")
    w.Header().Set("Access-Control-Allow-Origin", "http://"+routes.AppHostDev())
    w.Header().Del("Access-Control-Allow-Origin")
    w.Header().Set("Access-Control-Allow-Origin", "*")

    // err := r.ParseMultipartForm(1024 * 1024 * 2)
    err := r.ParseForm()
    lg(err)

    myToken := r.Form.Get("idtoken")
    tokSize := fmt.Sprintf("Len of Tok was %v. \n", len(myToken))

    fc1 := func(token *jwt.Token) (interface{}, error) {
        // Don't forget to validate that the alg is what you expect:
        log.Printf("algo header is %v\n", token.Header["alg"])
        if _, ok := token.Method.(*jwt.SigningMethodRSA); !ok {
            return nil, fmt.Errorf("unexpected signing method: %v", token.Header["alg"])
        }
        return token.Header["kid"], nil
    }

    token, err := jwt.Parse(myToken, fc1)

    // No direct error comparison possible, since err is wrapped in another struct.
    if err != nil && strings.Contains(err.Error(), jwt.ErrPEMMappingObsolete.Error()) {

        currentPEMsURL := "https://www.googleapis.com/oauth2/v1/certs"
        req, err := http.NewRequest("GET", currentPEMsURL, nil)
        if err != nil {
            lg("creation of pem request failed")
            return
        }
        req.Header.Set("Content-Type", "application/json")

        fo := fetch.Options{Req: req}
        fo.KnownProtocol = "https"
        fo.ForceHTTPSEvenOnDevelopmentServer = true
        bts, inf, err := fetch.UrlGetter(r, fo)
        lg(err)
        if err != nil {
            lg("tried to fetch %v, %v", currentPEMsURL, inf.URL)
            lg("msg %v", inf.Msg)
            return
        }

        if len(bts) > 200 {
            var data1 map[string]string
            err = json.Unmarshal(bts, &data1)
            lg(err)
            // lg(stringspb.IndentedDumpBytes(data1))
            // w.Write(stringspb.IndentedDumpBytes(data1))
            if len(data1) > 1 {
                lg("PEM mappings updated")
                jwt.MappingToPEM = data1
            } else {
                lg("PEM mapping response contained only %v records; bytes length %v", len(data1), len(bts))
            }
        }
    }

    // re-parse, with possibly refreshed PEM mappings
    token, err = jwt.Parse(myToken, fc1)

    if err != nil && strings.Contains(err.Error(), jwt.ErrInvalidKey.Error()) {
        w.Write([]byte("The submitted RSA Key was somehow unparseable. We still accept the token.\n"))
        /* https://developers.google.com/identity/sign-in/web/backend-auth */
        err = nil
        token.Valid = true
    }

    if err != nil {
        w.Write([]byte("--- " + err.Error() + ".\n"))
    }

    if err == nil && token.Valid {
        tk := ""
        tk += fmt.Sprintf("  Algor:  %v\n", token.Method)
        tk += fmt.Sprintf("  Header: %v\n", token.Header)
        for k, v := range token.Claims {
            tk += fmt.Sprintf("\t %-8v %v\n", k, v)
        }
        lg(tk)

        w.Write([]byte("tokensignin; valid. \n"))
        w.Write([]byte(tokSize))

        sb := "header-sub-not-present"
        if _, ok := token.Claims["sub"]; ok {
            sb = token.Claims["sub"].(string)
        }
        w.Write([]byte("ID from PWT is " + sb + "\n"))

        _, usr, msg1 := login.CheckForNormalUser(r)
        if usr != nil {
            w.Write([]byte("ID from SRV is " + usr.ID + "\n"))
        }
        w.Write([]byte(msg1 + "\n"))

    } else {
        w.Write([]byte("tokensignin; INVALID. \n"))
        w.Write([]byte(tokSize))
        w.Write([]byte(stringspb.ToLen(myToken, 30)))
        vrf := fmt.Sprintf("\nhttps://www.googleapis.com/oauth2/v3/tokeninfo?id_token=%v \n", myToken)
        w.Write([]byte(vrf))
    }
}
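// Client-side sketch (hedged; app host and route are placeholders):
// the handler expects the Google ID token in a form field named "idtoken".
//
//  resp, err := http.PostForm("https://<apphost>/<tokenSigninRoute>",
//      url.Values{"idtoken": {idToken}})
//
// The response body reports validity, the "sub" claim, and - if a session
// exists - the server-side user ID.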
// FetchAndDecodeJSON fetches the JSON response of FetchSimilarURI
// and decodes it into a slice of FullArticles.
func FetchAndDecodeJSON(r *http.Request, surl, knownProtocol string, lg loghttp.FuncBufUniv, fs fsi.FileSystem) []repo.FullArticle {

    fullURL := fmt.Sprintf("%s%s?%s=%s&cnt=%v&prot=%v", routes.AppHost(), routes.FetchSimilarURI,
        routes.URLParamKey, surl, numTotal-1, knownProtocol)
    // fullURL = fmt.Sprintf("%s%s?%s=%s&cnt=%v", r.URL.Host, repo.routes.FetchSimilarURI,
    //  routes.URLParamKey, surl, numTotal-1)
    lg("lo fetching %v", fullURL)
    start := time.Now()

    fo := fetch.Options{}
    fo.URL = fullURL
    bJSON, inf, err := fetch.UrlGetter(r, fo)
    lg(err)
    if err != nil {
        lg("msg %v", inf.Msg)
        return nil
    }
    if len(bJSON) == 0 {
        lg("empty bJSON")
        return nil
    }
    lg("\t\tfetch resp complete after %4.2v secs; %vkB", time.Since(start).Seconds(), len(bJSON)/1024)

    var mp map[string][]byte
    err = json.Unmarshal(bJSON, &mp)
    lg(err)
    if err != nil {
        if _, ok := mp["msg"]; ok {
            lg("%s", mp["msg"])
        } else {
            lg("%s", bJSON)
        }
        return nil
    }

    smaxFound := string(mp["lensimilar"])
    maxFound := util.Stoi(smaxFound)
    if maxFound < numTotal-1 {
        lg("not enough files returned by FetchSimilar 1 - mp[lensimilar] too small: %s", mp["lensimilar"])
        return nil
    }
    least3Files := make([]repo.FullArticle, maxFound+1)

    _, ok1 := mp["url_self"]
    _, ok2 := mp["mod_self"]
    _, ok3 := mp["bod_self"]
    if ok1 && ok2 && ok3 {
        least3Files[0].Url = string(mp["url_self"])
        least3Files[0].Mod, err = time.Parse(http.TimeFormat, string(mp["mod_self"]))
        lg(err)
        least3Files[0].Body = mp["bod_self"]
        if len(least3Files[0].Body) < 200 {
            if !bytes.Contains(least3Files[0].Body, []byte(fetch.MsgNoRdirects)) {
                lg("found base but it's a redirect")
                return nil
            }
        }
    }
    lg("found base")

    for k, v := range mp {
        if k == "msg" {
            continue
        }
        if strings.HasSuffix(k, "self") {
            continue
        }
        if strings.HasPrefix(k, "url__") {
            sval := strings.TrimPrefix(k, "url__")
            val := util.Stoi(sval)
            // lg("%v %v %s", sval, val, v)
            least3Files[val+1].Url = string(v)
        }
        if strings.HasPrefix(k, "mod__") {
            sval := strings.TrimPrefix(k, "mod__")
            val := util.Stoi(sval)
            // lg("%v %v %s", sval, val, v)
            least3Files[val+1].Mod, err = time.Parse(http.TimeFormat, string(v))
            lg(err)
        }
        if strings.HasPrefix(k, "bod__") {
            sval := strings.TrimPrefix(k, "bod__")
            val := util.Stoi(sval)
            least3Files[val+1].Body = v // html.EscapeString(string(v))
        }
    }

    lg("found %v similar; decoding complete after %4.2v secs", maxFound, time.Since(start).Seconds())
    for _, v := range least3Files {
        lg("%v %v", v.Url, len(v.Body))
    }
    return least3Files
}
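// The decoded map is keyed by the convention read back above:
//
//  "lensimilar"                           number of similar articles found
//  "url_self" / "mod_self" / "bod_self"   URL, mod-time and body of the base article
//  "url__0", "mod__0", "bod__0", ...      the similar articles, zero-indexed
//  "msg"                                  optional server message
//
// so the base article lands in least3Files[0] and the server's
// i-th similar article lands in least3Files[i+1].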
// requestPay requests a bitcoin receive address forwarding to bitCoinAddress,
// registering confirmURL as the payment callback.
func requestPay(w http.ResponseWriter, r *http.Request, m map[string]interface{}) {

    lg, b := loghttp.BuffLoggerUniversal(w, r)
    closureOverBuf := func(bUnused *bytes.Buffer) {
        loghttp.Pf(w, r, b.String())
    }
    defer closureOverBuf(b) // the argument is ignored

    r.Header.Set("X-Custom-Header-Counter", "nocounter")

    protoc := "https://"
    if appengine.IsDevAppServer() {
        protoc = "http://"
    }

    host := appengine.DefaultVersionHostname(appengine.NewContext(r))
    if appengine.IsDevAppServer() {
        host = "not-loclhost"
    }

    confirmURL := fmt.Sprintf("%v%v%v", protoc, host, uriConfirmPayment)
    confirmURL = url.QueryEscape(confirmURL)

    addrURL := fmt.Sprintf("https://%v/api/receive?method=create&address=%v&callback=%v&customsecret=49&api_code=%v",
        blockChainHost, bitCoinAddress, confirmURL, apiKey)

    req, err := http.NewRequest("GET", addrURL, nil)
    lg(err)
    if err != nil {
        return
    }

    bts, inf, err := fetch.UrlGetter(r, fetch.Options{Req: req})
    bts = bytes.Replace(bts, []byte(`","`), []byte(`", "`), -1)
    if err != nil {
        lg(err)
        lg(inf.Msg)
        return
    }

    lg("response body 1:\n")
    lg("%s\n", string(bts))

    lg("response body 2:\n")
    var data1 map[string]interface{}
    err = json.Unmarshal(bts, &data1)
    lg(err)
    lg(stringspb.IndentedDumpBytes(data1))
    // lg("%#v", data1)

    inputAddress, ok := data1["input_address"].(string)
    if !ok {
        lg("input address could not be cast to string; is type %T", data1["input_address"])
        return
    }
    feePercent, ok := data1["fee_percent"].(float64)
    if !ok {
        lg("fee percent could not be cast to float64; is type %T", data1["fee_percent"])
        return
    }

    lg("Input address will be %q; fee percent will be %4.2v", inputAddress, feePercent)
}
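// Sketch of the receive-API response consumed above (hedged; only
// input_address and fee_percent are actually read - values are placeholders):
//
//  {"input_address": "1ABC...", "fee_percent": 1.5, ...}
//
// input_address is the address the payer sends to; the service forwards
// to bitCoinAddress and then calls back confirmURL.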
// fetchSimForm either renders a form for requesting a URL,
// or forwards the request to FetchSimilarURI and displays the result.
func fetchSimForm(w http.ResponseWriter, r *http.Request, m map[string]interface{}) {

    lg, b := loghttp.BuffLoggerUniversal(w, r)
    closureOverBuf := func(bUnused *bytes.Buffer) {
        loghttp.Pf(w, r, b.String())
    }
    defer closureOverBuf(b) // the argument is ignored

    r.Header.Set("X-Custom-Header-Counter", "nocounter")

    // on live server => always use https
    if r.URL.Scheme != "https" && !util_appengine.IsLocalEnviron() {
        r.URL.Scheme = "https"
        r.URL.Host = r.Host
        lg("lo - redirect %v", r.URL.String())
        http.Redirect(w, r, r.URL.String(), http.StatusFound)
    }

    err := r.ParseForm()
    lg(err)

    rURL := ""
    if r.FormValue(routes.URLParamKey) != "" {
        rURL = r.FormValue(routes.URLParamKey)
    }

    if len(rURL) == 0 {

        wpf(b, tplx.ExecTplHelper(tplx.Head, map[string]interface{}{"HtmlTitle": "Find similar HTML URLs"}))
        defer wpf(b, tplx.Foot)

        tm := map[string]string{
            "val":       "www.welt.de/politik/ausland/article146154432/Tuerkische-Bodentruppen-marschieren-im-Nordirak-ein.html",
            "fieldname": routes.URLParamKey,
        }
        tplForm := tt.Must(tt.New("tplName01").Parse(htmlForm))
        tplForm.Execute(b, tm)

    } else {

        fullURL := fmt.Sprintf("https://%s%s?%s=%s&cnt=%s&prot=%s", r.Host, routes.FetchSimilarURI,
            routes.URLParamKey, rURL, r.FormValue("cnt"), r.FormValue("prot"))
        lg("lo - sending to URL 1: %v", fullURL)

        fo := fetch.Options{}
        fo.URL = fullURL
        bts, inf, err := fetch.UrlGetter(r, fo)
        _ = inf
        lg(err)
        if err != nil {
            return
        }
        if len(bts) == 0 {
            lg("empty bts")
            return
        }

        var mp map[string][]byte
        err = json.Unmarshal(bts, &mp)
        lg(err)
        if err != nil {
            lg("%s", bts)
            return
        }

        w.Header().Set("Content-Type", "text/html; charset=utf-8")
        if _, ok := mp["msg"]; ok {
            w.Write(mp["msg"])
        }
        for k, v := range mp {
            if k != "msg" {
                wpf(w, "<br><br>%s:\n", k)
                if true {
                    wpf(w, "len %v", len(v))
                } else {
                    wpf(w, "%s", html.EscapeString(string(v)))
                }
            }
        }
    }
}
// FetchUsingRSS takes an RSS XML uri and fetches some of its documents.
// It uses a three-staged pipeline for parallel fetching.
// Results are stored into the given filesystem fs.
// Config points to the source of the RSS XML,
// and has some rules for conflating URI directories.
// uriPrefix and config.DesiredNumber tell the func
// which subdirs of the RSS dir should be fetched - and how many at max.
func FetchUsingRSS(w http.ResponseWriter, r *http.Request,
    fs fsi.FileSystem, config FetchCommand,
) {

    lg, b := loghttp.BuffLoggerUniversal(w, r)
    closureOverBuf := func(bUnused *bytes.Buffer) {
        loghttp.Pf(w, r, b.String())
    }
    defer closureOverBuf(b) // the argument is ignored

    if config.Host == "" {
        lg(" empty host; returning")
        return
    }

    config = addDefaults(config)

    // Fetching the rssXML takes time.
    // We do it before the timeouts of the pipeline stages are set off.
    lg(" ")
    lg(config.Host)
    if config.Host == "test.economist.com" {
        switchTData(w, r)
    }

    // lg(stringspb.IndentedDump(config))
    dirTree := &DirTree{Name: "/", Dirs: map[string]DirTree{}, EndPoint: true}

    fnDigest := path.Join(docRoot, config.Host, "digest2.json")
    loadDigest(w, r, lg, fs, fnDigest, dirTree) // previous

    age := time.Since(dirTree.LastFound)
    lg("DirTree is %5.2v hours old (%v)", age.Hours(), dirTree.LastFound.Format(time.ANSIC))
    if age.Hours() > 0.001 {

        rssUrl := matchingRSSURI(w, r, config)
        if rssUrl == "" {
            m := new(MyWorker)
            m.r = r
            m.lg = lg
            m.fs1 = fs
            m.SURL = path.Join(config.Host, config.SearchPrefix)
            _, _, _, err := fetchSave(m)
            lg(err)
            if err != nil {
                return
            }
        } else {
            rssUrl = path.Join(config.Host, rssUrl)
            rssDoc, rssUrlObj := rssXMLFile(w, r, fs, rssUrl)
            _ = rssUrlObj
            rssDoc2DirTree(w, r, dirTree, rssDoc, config.Host)
        }

        saveDigest(lg, fs, fnDigest, dirTree)
    }

    // lg(dirTree.String())

    //
    // setting up a 3-staged pipeline from the bottom up
    //
    var fullArticles []FullArticle

    var inn chan *FullArticle = make(chan *FullArticle) // jobs are stuffed in here
    var out chan *FullArticle = make(chan *FullArticle) // completed jobs are delivered here
    var fin chan struct{} = make(chan struct{})         // downstream signals end to upstream
    var stage3Wait sync.WaitGroup

    // stage 3
    // fire up the "collector", a fan-in
    stage3Wait.Add(1) // registered before the goroutine starts, so Wait() cannot pass early
    go func() {
        // 400 is a good value; critical point at 35
        // economist.com required 800 ms
        const delayInitial = 1200
        const delayRefresh = 800
        cout := time.After(time.Millisecond * delayInitial)
        for {
            select {

            case fa := <-out:
                fullArticles = append(fullArticles, *fa)
                pth := fetch.PathFromStringUrl(fa.Url)
                lg("    fetched %v - %v ", fa.Mod.Format("15:04:05"), stringspb.Ellipsoider(pth, 50))
                cout = time.After(time.Millisecond * delayRefresh) // refresh timeout

            case <-cout:
                lg("timeout after %v articles", len(fullArticles))
                // we are using channel == nil - channel closed combinations,
                // inspired by http://dave.cheney.net/2013/04/30/curious-channels
                out = nil // not close(out) => case above is now blocked
                close(fin)
                lg("fin closed; out nilled")
                stage3Wait.Done()
                return
            }
        }
    }()

    //
    // stage 2
    for i := 0; i < numWorkers; i++ {
        // fire up a dedicated fetcher routine, a worker
        // we are using channel == nil - channel closed combinations,
        // inspired by http://dave.cheney.net/2013/04/30/curious-channels
        go func() {
            var a *FullArticle
            for {
                select {

                case a = <-inn:
                    var err error
                    var inf fetch.Info
                    a.Body, inf, err = fetch.UrlGetter(r, fetch.Options{URL: a.Url})
                    lg(err)
                    if a.Mod.IsZero() {
                        a.Mod = inf.Mod
                    }
                    select {
                    case out <- a:
                    case <-fin:
                        lg("   worker spinning down; branch 1; abandoning %v", a.Url)
                        return
                    }
                    a = new(FullArticle)

                case <-fin:
                    if a != nil && a.Url != "" {
                        u, _ := url.Parse(a.Url)
                        lg("   abandoned %v", u.Path)
                    } else {
                        lg("   worker spinning down; branch 2")
                    }
                    return
                }
            }
        }()
    }

    //
    // loading stage 1
    uriPrefix := config.SearchPrefix
    found := 0
    uriPrefixExcl := "impossible"
    for i := 0; i < 15; i++ {
        lg("  searching for prefix %v - excl %q - %v of %v", uriPrefix, uriPrefixExcl, found, config.DesiredNumber)
        found += stuffStage1(w, r, config, inn, fin, dirTree,
            uriPrefixExcl, uriPrefix, config.DesiredNumber-found)

        if found >= config.DesiredNumber {
            break
        }

        if uriPrefix == "/" || uriPrefix == "." {
            lg("  root exhausted")
            break
        }

        newPrefix := path.Dir(uriPrefix)
        uriPrefixExcl = uriPrefix
        uriPrefix = newPrefix
    }
    lg("  found %v of %v", found, config.DesiredNumber)

    //
    // lg("stage3Wait.Wait() before")
    stage3Wait.Wait()
    lg("stage3Wait.Wait() after")

    // workers spin down earlier -
    // but the ae log writer and response writer need some time
    // to record the spin-down messages
    time.Sleep(120 * time.Millisecond)

    // compile directory statistics
    histoDir := map[string]int{}
    for _, a := range fullArticles {
        u, err := url.Parse(a.Url)
        lg(err)
        semanticUri := condenseTrailingDir(u.Path, config.CondenseTrailingDirs)
        dir := path.Dir(semanticUri)
        histoDir[dir]++
    }
    sr := sortmap.SortMapByCount(histoDir)
    _ = sr

    // create dirs
    for k := range histoDir {
        dir := path.Join(docRoot, k) // config.Host already contained in k
        err := fs.MkdirAll(dir, 0755)
        lg(err)
        err = fs.Chtimes(dir, time.Now(), time.Now())
        lg(err)
    }

    // saving as files
    for _, a := range fullArticles {
        if len(a.Body) == 0 {
            continue
        }
        u, err := url.Parse(a.Url)
        u.Fragment = ""
        u.RawQuery = ""
        lg(err)
        semanticUri := condenseTrailingDir(u.RequestURI(), config.CondenseTrailingDirs)
        p := path.Join(docRoot, semanticUri)
        err = fs.WriteFile(p, a.Body, 0644)
        lg(err)
        err = fs.Chtimes(p, a.Mod, a.Mod)
        lg(err)
    }

    {
        b, err := json.MarshalIndent(histoDir, " ", "\t")
        lg(err)
        fnDigest := path.Join(docRoot, config.Host, "fetchDigest.json")
        err = fs.WriteFile(fnDigest, b, 0755)
        lg(err)
    }

    // fsm, ok := memfs.Unwrap(fs)
    // if ok {
    //  fsm.Dump()
    // }
}
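// The collector's shutdown idiom above, reduced to a standalone sketch
// (hedged): once the timeout fires, "out" is set to nil so its case blocks
// forever, and closing "fin" broadcasts shutdown to all workers.
//
//  cout := time.After(delay)
//  for {
//      select {
//      case v := <-out:
//          results = append(results, v)
//          cout = time.After(delay) // refresh the timeout
//      case <-cout:
//          out = nil  // receive on a nil channel blocks => case disabled
//          close(fin) // a closed channel is always ready => workers unblock and return
//          return
//      }
//  }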
func Test1(t *testing.T) {

    lg, b := loghttp.BuffLoggerUniversal(nil, nil)
    _ = b

    c, err := aetest.NewContext(nil)
    lg(err)
    if err != nil {
        return
    }
    defer c.Close()

    fs := GetFS(c, 2)

    remoteHostname := "www.welt.de"
    remoteHostname = "www.welt.de/politik/ausland"

    dirs1, _, msg, err := fileserver.GetDirContents(repo.RepoURL, remoteHostname)
    if err != nil {
        lg(err)
        lg("%s", msg)
    }
    lg("dirs1")
    for _, v := range dirs1 {
        lg("    %v", v)
    }

    least3URLs := []string{}
    for _, v1 := range dirs1 {
        p := path.Join(remoteHostname, v1)
        dirs2, fils2, msg, err := fileserver.GetDirContents(repo.RepoURL, p)
        _ = dirs2
        if err != nil {
            lg(err)
            lg("%s", msg)
        }
        // lg("  dirs2 %v", stringspb.IndentedDump(dirs2))
        // lg("  fils2 %v", stringspb.IndentedDump(fils2))
        for _, v2 := range fils2 {
            least3URLs = append(least3URLs, path.Join(remoteHostname, v1, v2))
        }
    }

    if len(least3URLs) < numTotal+1 { // guard must cover the +1 slice below
        lg("not enough files in rss fetcher cache")
        return
    } else {
        least3URLs = least3URLs[:numTotal+1]
    }

    lg("fils2")
    for _, v := range least3URLs {
        lg("    %v", v)
    }

    // domclean
    least3Files := make([]repo.FullArticle, 0, len(least3URLs))
    for i := 0; i < len(least3URLs); i++ {

        surl := spf("%v/%v", repo.RepoURL, least3URLs[i])

        fNamer := domclean2.FileNamer(logDir, i)
        fNamer() // first call yields key

        resBytes, inf, err := fetch.UrlGetter(nil, fetch.Options{URL: surl})
        if err != nil {
            lg(err)
            return
        }
        lg("fetched %4.1fkB from %v", float64(len(resBytes))/1024, stringspb.ToLenR(inf.URL.String(), 60))

        fa := repo.FullArticle{}
        fa.Url = inf.URL.String()
        fa.Mod = inf.Mod
        fa.Body = resBytes
        least3Files = append(least3Files, fa)
    }

    doc := Dedup(least3Files, lg, fs)

    fNamer := domclean2.FileNamer(logDir, 0)
    fNamer() // first call yields key
    fsPerm := GetFS(c, 2)
    fileDump(lg, fsPerm, doc, fNamer, "_fin.html")

    pf("MapSimilarCompares: %v SimpleCompares: %v LevenshteinComp: %v\n", breakMapsTooDistinct, appliedLevenshtein, appliedCompare)
    pf("correct finish\n")
}
// fetchSave fetches the URL in m.SURL if the local file is outdated,
// and saves the fetched file.
//
// Link extraction and link addition to treeX are now accumulated one level higher.
//
// The bool return value is true if the existing local file was used.
func fetchSave(m *MyWorker) ([]byte, time.Time, bool, error) {

    // determine the file name
    ourl, err := fetch.URLFromString(m.SURL)
    fc := FetchCommand{}
    fc.Host = ourl.Host
    fc = addDefaults(fc)
    semanticUri := condenseTrailingDir(m.SURL, fc.CondenseTrailingDirs)
    fn := path.Join(docRoot, semanticUri)

    m.lg("crawling %q", m.SURL)

    // File already exists?
    // Open file for age check
    var bts []byte
    var mod time.Time
    f := func() error {
        file1, err := m.fs1.Open(fn)
        // m.lg(err) // file may simply not exist
        if err != nil {
            return err // file may simply not exist
        }
        defer file1.Close() // file close *fast* at the end of *this* anonymous func

        fi, err := file1.Stat()
        m.lg(err)
        if err != nil {
            return err
        }

        if fi.IsDir() {
            m.lg("\t\t file is a directory, skipping - %v", fn)
            return fmt.Errorf("is directory: %v", fn)
        }

        mod = fi.ModTime()
        age := time.Since(mod)
        if age.Hours() > 10 {
            m.lg("\t\t file %4.2v hours old, refetch ", age.Hours())
            return fmt.Errorf("too old: %v", fn)
        }

        m.lg("\t\t file only %4.2v hours old, take %4.2vkB from datastore", age.Hours(), fi.Size()/1024)
        bts, err = ioutil.ReadAll(file1)
        if err != nil {
            return err
        }
        return nil
    }

    err = f()
    if err == nil {
        return bts, mod, true, err
    }

    //
    // Fetch
    bts, inf, err := fetch.UrlGetter(m.r, fetch.Options{URL: m.SURL, KnownProtocol: m.Protocol, RedirectHandling: 1})
    m.lg(err)
    if err != nil {
        if inf.Status != http.StatusNotFound {
            m.lg("tried to fetch %v, %v", m.SURL, inf.URL)
            m.lg("msg %v", inf.Msg)
            return []byte{}, inf.Mod, false, err
        }
        // In our traversing upwards, we might encounter "directory links" that have no index.html.
        // For a *derived* URL, this is no error.
        bts = []byte(" ... not found ... ")
    }

    if inf.Mod.IsZero() {
        inf.Mod = time.Now().Add(-75 * time.Minute)
    }

    //
    // main request still exists?
    if false {
        var cx context.Context
        cx = util_appengine.SafelyExtractGaeContext(m.r)
        if cx == nil {
            m.lg("timed out - returning")
            return bts, inf.Mod, false, fmt.Errorf("req timed out")
        }
    }

    m.lg("retrieved+saved %q; %vkB ", inf.URL.Host+inf.URL.Path, len(bts)/1024)

    if len(bts) > 1024*1024-1 {
        bts = removeScriptsAndComments(m.lg, bts)
        m.lg("size reduced_1 to %vkB ", len(bts)/1024)
        // if len(bts) > 1024*1024-1 {
        //  bts = snappy.Encode(nil, bts)
        //  fn = strings.Replace(fn, ".html", ".snap.html", -1)
        //  m.lg("size reduced_2 to %vkB ", len(bts)/1024)
        // }
    }

    //
    dir := path.Dir(fn)
    err = m.fs1.MkdirAll(dir, 0755)
    m.lg(err)
    err = m.fs1.Chtimes(dir, time.Now(), time.Now())
    m.lg(err)
    err = m.fs1.WriteFile(fn, bts, 0644)
    m.lg(err)
    err = m.fs1.Chtimes(fn, inf.Mod, inf.Mod)
    m.lg(err)

    return bts, inf.Mod, false, nil
}
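// Usage sketch (hedged; mirrors the call in FetchUsingRSS - the worker
// fields are those set there, the path is a placeholder):
//
//  m := new(MyWorker)
//  m.r = r
//  m.lg = lg
//  m.fs1 = fs
//  m.SURL = path.Join("www.welt.de", "politik/ausland")
//  bts, mod, usedExisting, err := fetchSave(m)
//
// usedExisting reports whether the bytes came from the local file
// (younger than 10 hours) instead of a fresh fetch.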
func Test1(t *testing.T) {

    lg, lge := loghttp.Logger(nil, nil)

    // c := prepare(t)
    // defer c.Close()

    lg("waiting for webserver")
    time.Sleep(2 * time.Millisecond)

    remoteHostname := "www.welt.de"

    dirs1, _, msg, err := fileserver.GetDirContents(hostWithPref, remoteHostname)
    if err != nil {
        lge(err)
        lg("%s", msg)
    }
    lg("dirs1")
    for _, v := range dirs1 {
        lg("    %v", v)
    }

    least3Files := []string{}
    for _, v1 := range dirs1 {
        dirs2, fils2, msg, err := fileserver.GetDirContents(hostWithPref, path.Join(remoteHostname, v1))
        _ = dirs2
        if err != nil {
            lge(err)
            lg("%s", msg)
        }
        // lg("  dirs2 %v", stringspb.IndentedDump(dirs2))
        // lg("  fils2 %v", stringspb.IndentedDump(fils2))
        if len(fils2) > numTotal-1 {
            for i2, v2 := range fils2 {
                least3Files = append(least3Files, path.Join(remoteHostname, v1, v2))
                if i2 == numTotal-1 {
                    break
                }
            }
            break
        }
    }

    if len(least3Files) < numTotal {
        lg("not enough files in rss fetcher cache")
        return
    }

    lg("fils2")
    for _, v := range least3Files {
        lg("    %v", v)
    }

    logdir := prepareLogDir()

    iter := make([]int, numTotal)
    for i := range iter {

        surl := spf("%v/%v", hostWithPref, least3Files[i])

        fNamer := FileNamer(logdir, i)
        fnKey := fNamer() // first call yields key
        _ = fnKey

        resBytes, effUrl, err := fetch.UrlGetter(nil, fetch.Options{URL: surl})
        if err != nil {
            lge(err)
            return
        }
        lg("fetched %4.1fkB from %v", float64(len(resBytes))/1024, stringspb.ToLenR(effUrl.String(), 60))

        opts := CleaningOptions{Proxify: true}
        opts.FNamer = fNamer
        opts.RemoteHost = remoteHostname
        doc, err := DomClean(resBytes, opts)
        lge(err)
        _ = doc
    }

    // statistics on elements and attributes
    sorted1 := sortmap.SortMapByCount(attrDistinct)
    sorted1.Print(6)
    fmt.Println()
    sorted2 := sortmap.SortMapByCount(nodeDistinct)
    sorted2.Print(6)

    pf("correct finish\n")
}
// handleFetchURL either displays a form for requesting a url,
// or it returns the URL's contents.
func handleFetchURL(w http.ResponseWriter, r *http.Request, m map[string]interface{}) {

    lg, b := loghttp.BuffLoggerUniversal(w, r)
    _ = b

    // on live server => always use https
    if r.URL.Scheme != "https" && !util_appengine.IsLocalEnviron() {
        r.URL.Scheme = "https"
        r.URL.Host = r.Host
        lg("lo - redirect %v", r.URL.String())
        http.Redirect(w, r, r.URL.String(), http.StatusFound)
    }

    /*
        To distinguish between posted and getted values,
        we check the "post-only" slice of values first.
        If nothing is there, but FormValue *has* a value,
        then it was "getted", otherwise "posted".
    */
    rURL := ""
    urlAs := ""
    err := r.ParseForm()
    lg(err)
    if r.PostFormValue(routes.URLParamKey) != "" {
        urlAs += "url posted "
        rURL = r.PostFormValue(routes.URLParamKey)
    }
    if r.FormValue(routes.URLParamKey) != "" {
        if rURL == "" {
            urlAs += "url getted "
            rURL = r.FormValue(routes.URLParamKey)
        }
    }
    // lg("received %v: %q", urlAs, rURL)

    if len(rURL) == 0 {

        tplAdder, tplExec := tplx.FuncTplBuilder(w, r)
        tplAdder("n_html_title", "Fetch some http data", nil)
        m := map[string]string{
            "protocol": "https",
            "host":     r.Host, // not fetch.HostFromReq(r)
            "path":     routes.ProxifyURI,
            "name":     routes.URLParamKey,
            "val":      "google.com",
        }
        if util_appengine.IsLocalEnviron() {
            m["protocol"] = "http"
        }
        tplAdder("n_cont_0", c_formFetchUrl, m)
        tplExec(w, r)

    } else {

        r.Header.Set("X-Custom-Header-Counter", "nocounter")
        bts, inf, err := fetch.UrlGetter(r, fetch.Options{URL: rURL})
        lg(err)

        tp := mime.TypeByExtension(path.Ext(inf.URL.Path))
        if false {
            ext := path.Ext(rURL)
            ext = strings.ToLower(ext)
            tp = mime.TypeByExtension(ext)
        }
        w.Header().Set("Content-Type", tp)
        // w.Header().Set("Content-type", "text/html; charset=latin-1")

        if r.FormValue("dbg") != "" {
            w.Header().Set("Content-type", "text/html; charset=utf-8")
            fmt.Fprintf(w, "%s<br>\n %s<br>\n %v", inf.URL.Path, tp, inf.URL.String())
            return
        }

        opts := domclean2.CleaningOptions{Proxify: true}
        opts.Beautify = true // "<a> Linktext without trailing space"
        opts.RemoteHost = fetch.HostFromStringUrl(rURL)
        // opts.ProxyHost = routes.AppHost()
        opts.ProxyHost = fetch.HostFromReq(r)
        if !util_appengine.IsLocalEnviron() {
            opts.ProxyHost = fetch.HostFromReq(r)
        }

        doc, err := domclean2.DomClean(bts, opts)
        lg(err)

        var bufRend bytes.Buffer
        err = html.Render(&bufRend, doc)
        lg(err)

        w.Write(bufRend.Bytes())
    }
}
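// Content-type sketch: the MIME type is derived from the extension of the
// *effective* URL (after redirects), via the standard library's builtin table:
//
//  mime.TypeByExtension(".html") // "text/html; charset=utf-8"
//  mime.TypeByExtension(".css")  // "text/css; charset=utf-8"
//  mime.TypeByExtension("")      // "" - no extension yields an empty type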