// rssDoc2DirTree converts the parsed RSS document into a DirTree.
// Publication dates are parsed and stored as modification times.
func rssDoc2DirTree(w http.ResponseWriter, r *http.Request, treeX *DirTree, rssDoc RSS, domain string) {

	lg, lge := loghttp.Logger(w, r)
	_ = lg

	if treeX == nil {
		treeX = &DirTree{Name: "root1", Dirs: map[string]DirTree{}, LastFound: time.Now().Truncate(time.Minute)}
	}

	articleList := []FullArticle{}
	for _, lpItem := range rssDoc.Items.ItemList {
		t, err := time.Parse(time.RFC1123Z, lpItem.Published) // i.e. "Mon, 2 Jan 2006 15:04:05 -0700"
		lge(err)
		articleList = append(articleList, FullArticle{Url: lpItem.Link, Mod: t})
	}

	lg1, _ := loghttp.BuffLoggerUniversal(w, r)
	path2DirTree(lg1, treeX, articleList, domain, true)
}
// switchTData toggles an entry in the test data for test.economist.com,
// so that consecutive fetches see a changed document.
func switchTData(w http.ResponseWriter, r *http.Request) {

	lg, lge := loghttp.Logger(w, r)
	_ = lge

	b := fetch.TestData["test.economist.com"]

	sub1 := []byte(`<li><a href="/sections/newcontinent">xxx</a></li>`)
	sub2 := []byte(`<li><a href="/sections/asia">Asia</a></li>`)
	sub3 := []byte(`<li><a href="/sections/asia">Asia</a></li> <li><a href="/sections/newcontinent">xxx</a></li>`)

	if bytes.Contains(b, sub1) {
		b = bytes.Replace(b, sub1, []byte{}, -1) // remove the marker
	} else {
		b = bytes.Replace(b, sub2, sub3, -1) // add the marker
	}

	if bytes.Contains(b, sub1) {
		lg("now contains %s", sub1)
	} else {
		lg("does NOT contain %s", sub1)
	}

	fetch.TestData["test.economist.com"] = b
}
func Test2(t *testing.T) {

	lg, lge := loghttp.Logger(nil, nil)

	doc, err := html.Parse(strings.NewReader(testDocs[0]))
	if err != nil {
		lge(err)
		return
	}

	removeCommentsAndIntertagWhitespace(NdX{doc, 0})
	breakoutImagesFromAnchorTrees(doc)
	removeCommentsAndIntertagWhitespace(NdX{doc, 0})
	reIndent(doc, 0)

	var b bytes.Buffer
	err = html.Render(&b, doc)
	lge(err)

	if b.String() != testDocs[1] {
		t.Errorf("output unexpected")
	}

	osutilpb.Bytes2File("outp1_inp.html", []byte(testDocs[0]))
	osutilpb.Dom2File("outp2_got.html", doc)
	osutilpb.Bytes2File("outp3_want.html", []byte(testDocs[1]))

	lg("end")
}
// fetchDigest requests a digest2.json via http - not from the filesystem.
// Currently unused.
func fetchDigest(hostWithPrefix, domain string) (*DirTree, error) {

	lg, lge := loghttp.Logger(nil, nil)

	surl := path.Join(hostWithPrefix, domain, "digest2.json")
	bts, _, err := fetch.UrlGetter(nil, fetch.Options{URL: surl})
	lge(err)
	if err != nil {
		return nil, err
	}
	// lg("%s", bts)

	dirTree := &DirTree{Name: "/", Dirs: map[string]DirTree{}, EndPoint: true}
	err = json.Unmarshal(bts, dirTree)
	lge(err)
	if err != nil {
		return nil, err
	}

	lg("DirTree %5.2vkB loaded for %v", len(bts)/1024, surl)

	age := time.Now().Sub(dirTree.LastFound)
	lg("DirTree is %5.2v hours old (%v)", age.Hours(), dirTree.LastFound.Format(time.ANSIC))

	return dirTree, nil
}
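// exampleFetchDigest is a minimal usage sketch for fetchDigest above;
// the host-with-prefix and domain values are hypothetical placeholders,
// not real endpoints.
func exampleFetchDigest() {
	lg, lge := loghttp.Logger(nil, nil)
	tree, err := fetchDigest("localhost:8085/mnt02", "test.economist.com")
	if err != nil {
		lge(err)
		return
	}
	lg("root %q holds %v top-level dirs", tree.Name, len(tree.Dirs))
}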
// prepare spins up an aetest context and a local fileserver for testing.
func prepare(t *testing.T) aetest.Context {

	lg, lge := loghttp.Logger(nil, nil)
	_ = lg

	c, err := aetest.NewContext(nil)
	if err != nil {
		lge(err)
		t.Fatal(err)
	}

	serveFile := func(w http.ResponseWriter, r *http.Request, m map[string]interface{}) {
		fs1 := repo.GetFS(c)
		fileserver.FsiFileServer(w, r, fileserver.Options{FS: fs1, Prefix: repo.UriMountNameY})
	}

	http.HandleFunc(repo.UriMountNameY, loghttp.Adapter(serveFile))

	go func() {
		log.Fatal(http.ListenAndServe(cTestHostOwn, nil))
	}()

	return c
}
// PrepareLogDir removes the previous log directory and creates it anew.
func PrepareLogDir() string {

	lg, lge := loghttp.Logger(nil, nil)

	logdir := "outp"
	lg("logdir is %v", logdir)

	// sweep previous
	rmPath := fmt.Sprintf("./%v/", logdir)
	err := os.RemoveAll(rmPath)
	if err != nil {
		lge(err)
		os.Exit(1)
	}
	lg("removed %q", rmPath)

	// create anew
	err = os.Mkdir(logdir, 0755)
	if err != nil && !os.IsExist(err) {
		lge(err)
		os.Exit(1)
	}

	return logdir
}
// FetchHTML executes the fetch commands.
// It creates the configured filesystem and calls the fetcher.
func FetchHTML(w http.ResponseWriter, r *http.Request, fcs []FetchCommand) {

	lg, lge := loghttp.Logger(w, r)

	var err error

	fs := GetFS(appengine.NewContext(r))
	// fs = fsi.FileSystem(memMapFileSys)

	wpf(w, tplx.ExecTplHelper(tplx.Head, map[string]interface{}{"HtmlTitle": "Requesting files"}))
	defer wpf(w, tplx.Foot)

	wpf(w, "<pre>")
	defer wpf(w, "</pre>")

	err = fs.WriteFile(path.Join(docRoot, "msg.html"), msg, 0644)
	lge(err)

	// err = fs.WriteFile(path.Join(docRoot, "index.html"), []byte("content of index.html"), 0644)
	// lge(err)

	err = fs.MkdirAll(path.Join(docRoot, "testDirX/testDirY"), 0755)
	lge(err)

	for _, config := range fcs {
		FetchUsingRSS(w, r, fs, config)
	}

	lg("fetching complete")
}
// GetDirContents fetches a directory listing from the fileserver via http,
// parsing the received JSON into slices of subdirectories and files.
func GetDirContents(hostWithPrefix, dir string) ([]string, []string, *bytes.Buffer, error) {

	lg, lge := loghttp.Logger(nil, nil)
	_ = lg

	var b = new(bytes.Buffer)
	dirs := []string{}
	fils := []string{}

	// build url
	urlSubDirs, err := url.Parse(path.Join(hostWithPrefix, dir))
	lge(err)
	if err != nil {
		return dirs, fils, b, err
	}
	sd := urlSubDirs.String()
	sd = common.Directorify(sd)
	wpf(b, "requesting subdirs from %v", sd)

	// make request
	bsubdirs, effU, err := fetch.UrlGetter(nil, fetch.Options{URL: sd})
	lge(err)
	if err != nil {
		return dirs, fils, b, err
	}
	wpf(b, "got %s - %v", bsubdirs, effU)

	// parse json
	mpSubDir := []map[string]string{}
	err = json.Unmarshal(bsubdirs, &mpSubDir)
	lge(err)
	if err != nil {
		// lg("%s", bsubdirs)
		return dirs, fils, b, err
	}
	wpf(b, "json of subdir is %s", stringspb.IndentedDump(mpSubDir))

	for _, v := range mpSubDir {
		if pth, ok := v["path"]; ok { // renamed from dir, which shadowed the parameter
			if strings.HasSuffix(pth, "/") {
				dirs = append(dirs, pth)
			} else {
				fils = append(fils, pth)
			}
		}
		if smod, ok := v["mod"]; ok {
			t, err := time.Parse(time.RFC1123Z, smod)
			lge(err)
			wpf(b, "age %-6.2v", time.Now().Sub(t).Hours())
		}
	}

	return dirs, fils, b, nil
}
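// exampleGetDirContents is a minimal usage sketch for GetDirContents above;
// the host-with-prefix and directory values are hypothetical.
func exampleGetDirContents() {
	lg, lge := loghttp.Logger(nil, nil)
	dirs, fils, msg, err := GetDirContents("localhost:8085/mnt02", "test.economist.com")
	if err != nil {
		lge(err)
		lg("%s", msg) // the buffer carries the request trace
		return
	}
	lg("%v dirs, %v files", len(dirs), len(fils))
}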
// sendUpload renders the upload form.
func sendUpload(w http.ResponseWriter, r *http.Request, m map[string]interface{}) {

	lg, _ := loghttp.Logger(w, r)
	// c := appengine.NewContext(r)

	wpf(w, tplx.ExecTplHelper(tplx.Head, map[string]interface{}{"HtmlTitle": "Post an Upload"}))
	defer wpf(w, tplx.Foot)

	tData := map[string]string{"Url": UrlUploadReceive}
	err := tplBase.ExecuteTemplate(w, "tplName01", tData)
	if err != nil {
		lg("tpl did not compile: %v", err)
	}
}
// DeleteSubtree removes a path and its subtree from the current filesystem.
// GET renders the form; POST executes the removal and flushes memcache.
func DeleteSubtree(w http.ResponseWriter, r *http.Request, m map[string]interface{}) {

	lg, lge := loghttp.Logger(w, r)

	err := r.ParseForm()
	lge(err)

	wpf(w, tplx.ExecTplHelper(tplx.Head, map[string]interface{}{"HtmlTitle": "Delete Subtree for curr FS"}))
	defer wpf(w, tplx.Foot)

	if r.Method == "POST" {

		wpf(w, "<pre>\n")
		defer wpf(w, "\n</pre>")

		mountPoint := dsfs.MountPointLast()
		if len(r.FormValue("mountname")) > 0 {
			mountPoint = r.FormValue("mountname")
		}
		lg("mount point is %v", mountPoint)

		pathPrefix := "impossible-value"
		if len(r.FormValue("pathprefix")) > 0 {
			pathPrefix = r.FormValue("pathprefix")
		}
		lg("pathprefix is %v", pathPrefix)

		fs := getFS(appengine.NewContext(r), mountPoint)
		lg("created fs %v-%v", fs.Name(), fs.String())

		lg("removing %q - and its subtree ...", pathPrefix)
		err := fs.RemoveAll(pathPrefix)
		lge(err)

		errMc := memcache.Flush(appengine.NewContext(r))
		lge(errMc)

		if err == nil && errMc == nil {
			lg("success")
		}

	} else {
		tData := map[string]string{"Url": UriDeleteSubtree}
		err := tplBase.ExecuteTemplate(w, "tplName01", tData)
		lge(err)
	}
}
// stuffStage1 ranges over the RSS entries and filters out unwanted directories.
// Wanted urls are sent to the stage one channel.
func stuffStage1(w http.ResponseWriter, r *http.Request, config FetchCommand,
	inn chan *FullArticle, fin chan struct{}, dirTree *DirTree,
	uriPrefixExcl, uriPrefixIncl string, nWant int) (nFound int) {

	lg, lge := loghttp.Logger(w, r)
	_ = lge

	subtree, head := DiveToDeepestMatch(dirTree, uriPrefixIncl)
	if subtree == nil {
		lg(" does not exist in dirtree: %q", uriPrefixIncl)
		return
	}

	opt := LevelWiseDeeperOptions{}
	opt.Rump = head
	opt.ExcludeDir = uriPrefixExcl
	opt.MaxDepthDiff = config.DepthTolerance
	opt.CondenseTrailingDirs = config.CondenseTrailingDirs
	opt.MaxNumber = nWant
	articles := LevelWiseDeeper(w, r, subtree, opt)
	// lg(" levelwise deeper found %v articles", len(articles))

	for _, art := range articles {

		art := art // pin the loop variable; its address is sent downstream

		lg(" feed #%02v: %v - %v", nFound, art.Mod.Format("15:04:05"), stringspb.Ellipsoider(art.Url, 50))

		art.Url = config.Host + art.Url

		select {
		case inn <- &art: // stage 1 loading
		case <-fin:
			lg("downstream stage has shut down, stop stuffing stage1")
			return
		}

		nFound++
		if nFound > nWant-1 {
			return
		}
	}

	return
}
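// exampleStage1Harness is a minimal sketch of wiring stuffStage1 above to a
// consumer; the argument values are hypothetical. The fin channel would let
// the consumer signal an early shutdown to the producer.
func exampleStage1Harness(dirTree *DirTree, config FetchCommand) {
	lg, _ := loghttp.Logger(nil, nil)
	inn := make(chan *FullArticle)
	fin := make(chan struct{})
	go func() {
		n := stuffStage1(nil, nil, config, inn, fin, dirTree, "", config.SearchPrefix, 3)
		lg("stage one filled with %v articles", n)
		close(inn) // no more articles; release the consumer
	}()
	for art := range inn {
		lg("consuming %v", art.Url)
	}
}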
// fetchCommandReceiver takes http post requests, extracts the JSON commands
// and submits them to FetchHTML.
func fetchCommandReceiver(w http.ResponseWriter, r *http.Request, m map[string]interface{}) {

	lg, lge := loghttp.Logger(w, r)

	var fcs []FetchCommand

	// The type of resp.Body <io.Reader> lends itself to using Decode.
	// http://stackoverflow.com/ - ... using-json-unmarshal-vs-json-newdecoder-decode
	//
	// Nevertheless, we use Unmarshal here, because we want to inspect the bytes of body.
	var UnmarshalVersusDecode = true
	if UnmarshalVersusDecode {

		body, err := ioutil.ReadAll(r.Body) // no response write until here!
		lge(err)
		if len(body) == 0 {
			lg("empty body")
			return
		}

		err = json.Unmarshal(body, &fcs)
		if err != nil {
			lge(err)
			lg("body is %s", body)
			return
		}

	} else {

		dec := json.NewDecoder(r.Body)
		for {
			if err := dec.Decode(&fcs); err == io.EOF {
				break
			} else if err != nil {
				lge(err)
				return
			}
			lg("command loop is: %s", stringspb.IndentedDump(fcs))
		}

	}

	FetchHTML(w, r, fcs)
}
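// exampleCommandPayload is a minimal sketch of the JSON body the receiver
// above expects - a slice of FetchCommand; host and prefix are hypothetical.
func exampleCommandPayload() []byte {
	fcs := []FetchCommand{
		{Host: "test.economist.com", SearchPrefix: "/sections/asia"},
	}
	b, err := json.Marshal(fcs)
	if err != nil {
		return nil
	}
	return b // post this as the request body, for instance via Post2Receiver
}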
// staticFetchViaPosting2Receiver submits test commands by http-posting them.
func staticFetchViaPosting2Receiver(w http.ResponseWriter, r *http.Request, m map[string]interface{}) {

	lg, lge := loghttp.Logger(w, r)

	wpf(w, tplx.ExecTplHelper(tplx.Head, map[string]interface{}{"HtmlTitle": "JSON Post"}))
	defer wpf(w, tplx.Foot)

	wpf(w, "<pre>")
	defer wpf(w, "</pre>")

	b, err := Post2Receiver(r, testCommands)
	lge(err)

	lg("msg from Post2Receiver:")
	lg(b.String())
}
// formRedirector converts an incoming form post into a GET request
// against the target url and serves the cleaned response.
func formRedirector(w http.ResponseWriter, r *http.Request, m map[string]interface{}) {

	lg, lge := loghttp.Logger(w, r)

	var msg, cntnt, rURL string

	w.Header().Set("Content-type", "text/html; charset=utf-8")
	// w.Header().Set("Content-type", "text/html; charset=latin-1")

	rURL = r.FormValue("redirect-to")
	lg("redirect to: %v", rURL)

	if len(r.PostForm) > 0 {
		// loghttp.Pf(w, r, "post unimplemented:<br> %#v <br>\n", r.PostForm)
		// return
		msg += fmt.Sprintf("post converted to get<br>")
	}

	rURL = fmt.Sprintf("%v?1=2&", rURL)
	for key, vals := range r.Form {
		if key == "redirect-to" {
			continue
		}
		val := vals[0]
		if util_appengine.IsLocalEnviron() {
			val = strings.Replace(val, " ", "%20", -1)
		}
		rURL = fmt.Sprintf("%v&%v=%v", rURL, key, val)
	}

	bts, inf, err := fetch.UrlGetter(r, fetch.Options{URL: rURL})
	lge(err)

	cntnt = string(bts)
	cntnt = insertNewlines.Replace(cntnt)
	cntnt = undouble.Replace(cntnt)
	cntnt = domclean1.ModifyHTML(r, inf.URL, cntnt)

	fmt.Fprintf(w, "%s \n\n", cntnt)
	fmt.Fprintf(w, "%s \n\n", msg)
}
func Test1(t *testing.T) {

	lg, lge := loghttp.Logger(nil, nil)

	c, err := aetest.NewContext(nil)
	if err != nil {
		lge(err)
		t.Fatal(err)
	}
	defer c.Close()

	whichType = 2
	fs := GetFS(c)
	lg(fs.Name() + "-" + fs.String())

	for _, config := range testCommands {
		Fetch(nil, nil, fs, config)
	}
}
// Each domain might have *several* RSS URLs.
// matchingRSSURI returns the most fitting RSS URL
// for a given SearchPrefix, or an empty string.
// It walks the prefix upwards via path.Dir until a match is found.
func matchingRSSURI(w http.ResponseWriter, r *http.Request, c FetchCommand) (ret string) {

	lg, lge := loghttp.Logger(w, r)
	_, _ = lg, lge

	cntr := 0
	sp := c.SearchPrefix

MarkX:
	for {
		// lg("search pref %v", sp)
		if rss, ok := c.RssXMLURI[sp]; ok {
			ret = rss
			lg("found rss url %v for %v", ret, sp)
			break MarkX
		}
		spPrev := sp
		sp = path.Dir(sp)
		if spPrev == "." && sp == "." ||
			spPrev == "/" && sp == "/" ||
			spPrev == "" && sp == "." {
			lg("did not find an RSS URL for %v %q", c.SearchPrefix, ret)
			break
		}
		cntr++
		if cntr > 20 {
			lg("select RSS loop did not terminate. %v", c.SearchPrefix)
			break
		}
	}

	return
}
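// exampleMatchingRSSURI is a minimal sketch of the prefix walk above;
// the prefix and map entry are hypothetical. The walk shortens
// /sections/asia/politics to /sections/asia and stops at the first hit.
func exampleMatchingRSSURI() {
	lg, _ := loghttp.Logger(nil, nil)
	c := FetchCommand{
		SearchPrefix: "/sections/asia/politics",
		RssXMLURI:    map[string]string{"/sections/asia": "/sections/asia/rss.xml"},
	}
	lg("rss url: %v", matchingRSSURI(nil, nil, c))
}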
// rssXMLFile fetches the RSS XML file, parses it,
// and saves an indented dump to the filesystem.
func rssXMLFile(w http.ResponseWriter, r *http.Request, fs fsi.FileSystem, rssUrl string) (RSS, *url.URL) {

	lg, lge := loghttp.Logger(w, r)

	bts, respInf, err := fetch.UrlGetter(r, fetch.Options{URL: rssUrl})
	lge(err)

	bts = bytes.Replace(bts, []byte("content:encoded>"), []byte("content-encoded>S"), -1) // hack

	rssDoc := RSS{}
	err = xml.Unmarshal(bts, &rssDoc)
	lge(err)

	// save it
	bdmp := stringspb.IndentedDumpBytes(rssDoc)
	err = fs.MkdirAll(path.Join(docRoot, respInf.URL.Host), 0755)
	lge(err)
	err = fs.WriteFile(path.Join(docRoot, respInf.URL.Host, "outp_rss.xml"), bdmp, 0755)
	lge(err)
	lg("RSS resp size %5.2vkB, saved to %v", len(bdmp)/1024, respInf.URL.Host+"/outp_rss.xml")

	return rssDoc, respInf.URL
}
// DomClean cleanses the parsed DOM in several passes:
// comment and whitespace removal, condensation of nested divs,
// optional proxification, beautification and outline numbering.
// Intermediate stages are dumped to files if opt.FNamer is set.
func DomClean(b []byte, opt CleaningOptions) (*html.Node, error) {

	lg, lge := loghttp.Logger(nil, nil)
	_ = lg

	b = globFixes(b)

	doc, err := html.Parse(bytes.NewReader(b))
	if err != nil {
		lge(err)
		return nil, err
	}

	if opt.FNamer != nil {
		osutilpb.Dom2File(opt.FNamer()+".html", doc)
	}

	//
	cleanseDom(doc, 0)
	removeCommentsAndIntertagWhitespace(NdX{doc, 0})
	fileDump(doc, opt.FNamer)

	//
	condenseTopDown(doc, 0, 0)
	removeEmptyNodes(doc, 0)
	fileDump(doc, opt.FNamer)

	//
	removeCommentsAndIntertagWhitespace(NdX{doc, 0}) // prevent spacey textnodes around single child images
	breakoutImagesFromAnchorTrees(doc)
	recurseImg2Link(doc)
	fileDump(doc, opt.FNamer)

	//
	condenseBottomUpV3(doc, 0, 7, map[string]bool{"div": true})
	condenseBottomUpV3(doc, 0, 6, map[string]bool{"div": true})
	condenseBottomUpV3(doc, 0, 5, map[string]bool{"div": true})
	condenseBottomUpV3(doc, 0, 4, map[string]bool{"div": true})
	condenseTopDown(doc, 0, 0)
	removeEmptyNodes(doc, 0)
	removeEmptyNodes(doc, 0)
	fileDump(doc, opt.FNamer)

	//
	if opt.Proxify {
		if opt.ProxyHost == "" {
			opt.ProxyHost = routes.AppHost()
		}
		proxify(doc, opt.ProxyHost, &url.URL{Scheme: "http", Host: opt.RemoteHost})
		fileDump(doc, opt.FNamer)
	}

	if opt.Beautify {
		removeCommentsAndIntertagWhitespace(NdX{doc, 0})
		reIndent(doc, 0)
	}

	//
	if opt.AddOutline {
		addOutlineAttr(doc, 0, []int{0})
	}
	if opt.AddID {
		addIdAttr(doc, 0, 1)
	}
	if opt.AddOutline || opt.AddID {
		fileDump(doc, opt.FNamer)
	}

	computeXPathStack(doc, 0)
	if opt.FNamer != nil {
		osutilpb.Bytes2File(opt.FNamer()+".txt", xPathDump)
	}

	return doc, nil
}
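// exampleDomClean is a minimal usage sketch for DomClean above;
// the input fragment is hypothetical. Without FNamer, no intermediate
// stages are dumped to disk.
func exampleDomClean() {
	lg, lge := loghttp.Logger(nil, nil)
	raw := []byte(`<html><body><div><div><a href="/x"><img src="/x.png"></a></div></div></body></html>`)
	doc, err := DomClean(raw, CleaningOptions{Beautify: true})
	if err != nil {
		lge(err)
		return
	}
	var b bytes.Buffer
	lge(html.Render(&b, doc))
	lg("cleaned: %s", b.Bytes())
}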
func Test1(t *testing.T) {

	lg, lge := loghttp.Logger(nil, nil)

	// c := prepare(t)
	// defer c.Close()

	lg("waiting for webserver")
	time.Sleep(2 * time.Millisecond)

	remoteHostname := "www.welt.de"

	dirs1, _, msg, err := fileserver.GetDirContents(hostWithPref, remoteHostname)
	if err != nil {
		lge(err)
		lg("%s", msg)
	}

	lg("dirs1")
	for _, v := range dirs1 {
		lg(" %v", v)
	}

	least3Files := []string{}
	for _, v1 := range dirs1 {

		dirs2, fils2, msg, err := fileserver.GetDirContents(hostWithPref, path.Join(remoteHostname, v1))
		_ = dirs2
		if err != nil {
			lge(err)
			lg("%s", msg)
		}
		// lg(" dirs2 %v", stringspb.IndentedDump(dirs2))
		// lg(" fils2 %v", stringspb.IndentedDump(fils2))

		if len(fils2) > numTotal-1 {
			for i2, v2 := range fils2 {
				least3Files = append(least3Files, path.Join(remoteHostname, v1, v2))
				if i2 == numTotal-1 {
					break
				}
			}
			break
		}
	}

	if len(least3Files) < numTotal {
		lg("not enough files in rss fetcher cache")
		return
	}

	lg("fils2")
	for _, v := range least3Files {
		lg(" %v", v)
	}

	logdir := prepareLogDir()

	iter := make([]int, numTotal)
	for i := range iter {

		surl := spf("%v/%v", hostWithPref, least3Files[i])

		fNamer := FileNamer(logdir, i)
		fnKey := fNamer() // first call yields key
		_ = fnKey

		resBytes, effUrl, err := fetch.UrlGetter(nil, fetch.Options{URL: surl})
		if err != nil {
			lge(err)
			return
		}
		lg("fetched %4.1fkB from %v", float64(len(resBytes))/1024, stringspb.ToLenR(effUrl.String(), 60))

		opts := CleaningOptions{Proxify: true}
		opts.FNamer = fNamer
		opts.RemoteHost = remoteHostname
		doc, err := DomClean(resBytes, opts)
		lge(err)
		_ = doc
	}

	// statistics on elements and attributes
	sorted1 := sortmap.SortMapByCount(attrDistinct)
	sorted1.Print(6)
	fmt.Println()
	sorted2 := sortmap.SortMapByCount(nodeDistinct)
	sorted2.Print(6)

	pf("correct finish\n")
}
// receiveUpload processes a multipart upload, storing either a single file
// or the expanded contents of a zip archive into the datastore filesystem.
func receiveUpload(w http.ResponseWriter, r *http.Request, m map[string]interface{}) {

	lg, _ := loghttp.Logger(w, r)
	c := appengine.NewContext(r)

	// parsing multipart before anything else
	err := r.ParseMultipartForm(1024 * 1024 * 2)
	if err != nil {
		lg("Multipart parsing failed: %v", err)
		return
	}

	wpf(w, tplx.ExecTplHelper(tplx.Head, map[string]interface{}{"HtmlTitle": "Receive an Upload"}))
	defer wpf(w, tplx.Foot)

	wpf(w, "<pre>")
	defer wpf(w, "</pre>")

	fields := []string{"getparam1", "mountname", "description"}
	for _, v := range fields {
		lg("%12v => %q", v, r.FormValue(v))
	}

	mountPoint := dsfs.MountPointLast()
	if len(r.FormValue("mountname")) > 0 {
		mountPoint = r.FormValue("mountname")
	}
	lg("mount point is %v", mountPoint)

	fs1 := dsfs.New(
		dsfs.MountName(mountPoint),
		dsfs.AeContext(c),
	)

	// As closure, since we cannot define dsfs.dsFileSys as parameter
	funcSave := func(argName string, data []byte) (error, *bytes.Buffer) {
		b1 := new(bytes.Buffer)
		fs1 := dsfs.New(
			dsfs.MountName(mountPoint),
			dsfs.AeContext(c),
		)
		dir, bname := fs1.SplitX(argName)
		err := fs1.MkdirAll(dir, 0777)
		wpf(b1, "mkdir %v - %v\n", dir, err)
		if err != nil {
			return err, b1
		}
		err = fs1.WriteFile(path.Join(dir, bname), data, 0777)
		wpf(b1, "saved file content to %v - %v\n", argName, err)
		return err, b1
	}

	ff := "filefield"
	file, handler, err := r.FormFile(ff)
	if err != nil {
		lg("error calling FormFile from %q => %v", ff, err)
		return
	}
	if handler == nil {
		lg("no multipart file %q", ff)
		return
	}

	lg("extracted file %v", handler.Filename)

	data, err := ioutil.ReadAll(file)
	if err != nil {
		lg("ReadAll on uploaded file failed: %v", err)
		return
	}
	defer file.Close()
	lg("extracted file content; %v bytes", len(data))

	newFilename := docRootDataStore + handler.Filename
	ext := path.Ext(newFilename)

	if ext == ".zip" {

		lg("found zip - treat as dir-tree %q", newFilename)
		zr, err := zip.NewReader(file, int64(len(data))) // renamed from r, which shadowed the request
		if err != nil {
			lg("open as zip failed: %v", err)
			return
		}

		for _, f := range zr.File {

			newFilename = docRootDataStore + f.Name
			dir, bname := fs1.SplitX(newFilename)

			if f.FileInfo().IsDir() {
				lg("\t dir  %s", newFilename)
				err := fs1.MkdirAll(path.Join(dir, bname), 0777)
				if err != nil {
					lg("MkdirAll %v failed: %v", newFilename, err)
					return
				}
			} else {
				lg("\t file %s", newFilename)
				rc, err := f.Open()
				if err != nil {
					lg("could not open zipped file %v: %v", newFilename, err)
					return
				}
				defer func(rc io.ReadCloser) {
					if err := rc.Close(); err != nil {
						panic(err)
					}
				}(rc)

				bts := new(bytes.Buffer)
				size, err := io.Copy(bts, rc)
				if err != nil {
					lg("could not copy from zipped file %v: %v", newFilename, err)
					return
				}

				err = common.WriteFile(fsi.FileSystem(fs1), path.Join(dir, bname), bts.Bytes())
				// err = fs1.WriteFile(path.Join(dir, bname), bts.Bytes(), 0777)
				if err != nil {
					lg("WriteFile of zipped file %v failed: %v", newFilename, err)
					return
				}
				lg("\t saved %v - %v Bytes", newFilename, size)
			}
		}

	} else {
		err, b2 := funcSave(newFilename, data)
		lg("%s", b2)
		if err != nil {
			return
		}
	}

	errMc := memcache.Flush(appengine.NewContext(r))
	if errMc != nil {
		lg("error flushing memcache: %v", errMc)
		return
	}

	lg("--------------------\n")
}
// LevelWiseDeeper recursively walks the DirTree below opt.Rump,
// collecting endpoints within the configured depth window as articles.
func LevelWiseDeeper(w http.ResponseWriter, r *http.Request, dtree *DirTree, opt LevelWiseDeeperOptions) []FullArticle {

	lg, lge := loghttp.Logger(w, r)
	_ = lge

	depthRump := strings.Count(opt.Rump, "/")

	arts := []FullArticle{}

	var fc func(string, *DirTree, int)
	fc = func(rmp1 string, dr1 *DirTree, lvl int) {

		// lg("    lvl %2v %v", lvl, dr1.Name)

		keys := make([]string, 0, len(dr1.Dirs))
		for k := range dr1.Dirs {
			keys = append(keys, k)
		}
		// We could sort by LastFound,
		// but we rather seek most current
		// files *later*.
		sort.Strings(keys) // for debugging clarity

		for _, key := range keys {

			dr2 := dr1.Dirs[key]
			rmp2 := rmp1 + dr2.Name
			// lg("      %v", rmp2)

			// is rmp2 a candidate?
			if len(arts) > opt.MaxNumber-1 {
				return
			}
			if !dr2.EndPoint {
				continue
			}

			semanticUri := condenseTrailingDir(rmp2, opt.CondenseTrailingDirs)
			depthUri := strings.Count(semanticUri, "/")
			if depthUri-depthRump > opt.MaxDepthDiff ||
				depthUri-depthRump < opt.MinDepthDiff {
				continue // we could also "break"
			}

			if opt.ExcludeDir == rmp2 {
				lg("    exclude dir %v", opt.ExcludeDir)
				continue
			}

			// lg("    including %v", semanticUri)
			art := FullArticle{Url: rmp2}
			if dr2.SrcRSS {
				art.Mod = dr2.LastFound
			}
			arts = append(arts, art)
		}

		// recurse horizontally
		for _, key := range keys {
			dr2 := dr1.Dirs[key]
			rmp2 := rmp1 + dr2.Name
			if len(dr2.Dirs) == 0 {
				// lg("  LevelWiseDeeper - no children")
				continue
			}
			fc(rmp2, &dr2, lvl+1)
		}
	}

	fc(opt.Rump, dtree, 0)

	return arts
}
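// exampleLevelWiseDeeper is a minimal sketch of driving the traversal above;
// the rump path and depth window are hypothetical.
func exampleLevelWiseDeeper(tree *DirTree) {
	lg, _ := loghttp.Logger(nil, nil)
	opt := LevelWiseDeeperOptions{
		Rump:         "/sections/asia",
		MinDepthDiff: 1,
		MaxDepthDiff: 2,
		MaxNumber:    5,
	}
	subtree, head := DiveToDeepestMatch(tree, opt.Rump)
	if subtree == nil {
		return
	}
	opt.Rump = head
	for _, art := range LevelWiseDeeper(nil, nil, subtree, opt) {
		lg("found %v (%v)", art.Url, art.Mod.Format("15:04:05"))
	}
}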