// FetchAndDecodeJSON asks the FetchSimilar endpoint for documents similar to surl,
// decodes the JSON reply and returns the base document followed by the similar ones.
// On any error it logs via lg and returns nil.
func FetchAndDecodeJSON(r *http.Request, surl, knownProtocol string, lg loghttp.FuncBufUniv, fs fsi.FileSystem) []repo.FullArticle {

	fullURL := fmt.Sprintf("%s%s?%s=%s&cnt=%v&prot=%v", routes.AppHost(), routes.FetchSimilarURI,
		routes.URLParamKey, surl, numTotal-1, knownProtocol)
	// fullURL = fmt.Sprintf("%s%s?%s=%s&cnt=%v", r.URL.Host, repo.routes.FetchSimilarURI,
	// 	routes.URLParamKey, surl, numTotal-1)

	lg("lo fetching %v", fullURL)
	start := time.Now()

	fo := fetch.Options{}
	fo.URL = fullURL
	bJSON, inf, err := fetch.UrlGetter(r, fo)
	_ = inf
	lg(err)
	if err != nil {
		lg("msg %v", inf.Msg)
		return nil
	}
	if len(bJSON) == 0 {
		lg("empty bJSON")
		return nil
	}

	lg("\t\tfetch resp complete after %4.2v secs; %vkB", time.Since(start).Seconds(), len(bJSON)/1024)

	var mp map[string][]byte
	err = json.Unmarshal(bJSON, &mp)
	lg(err)
	if err != nil {
		if _, ok := mp["msg"]; ok {
			lg("%s", mp["msg"])
		} else {
			lg("%s", bJSON)
		}
		return nil
	}

	smaxFound := string(mp["lensimilar"])
	maxFound := util.Stoi(smaxFound)
	if maxFound < numTotal-1 {
		lg("not enough files returned by FetchSimilar 1 - mp[lensimilar] too small: %s", mp["lensimilar"])
		return nil
	}
	least3Files := make([]repo.FullArticle, maxFound+1)

	// Index 0 holds the base document, if the response contains it.
	_, ok1 := mp["url_self"]
	_, ok2 := mp["mod_self"]
	_, ok3 := mp["bod_self"]
	if ok1 && ok2 && ok3 {
		least3Files[0].Url = string(mp["url_self"])
		least3Files[0].Mod, err = time.Parse(http.TimeFormat, string(mp["mod_self"]))
		lg(err)
		least3Files[0].Body = mp["bod_self"]

		if len(least3Files[0].Body) < 200 {
			if !bytes.Contains(least3Files[0].Body, []byte(fetch.MsgNoRdirects)) {
				lg("found base but it's a redirect")
				return nil
			}
		}
	}
	lg("found base")

	// The similar documents arrive as url__N / mod__N / bod__N triples.
	for k, v := range mp {
		if k == "msg" {
			continue
		}
		if strings.HasSuffix(k, "self") {
			continue
		}
		if strings.HasPrefix(k, "url__") {
			sval := strings.TrimPrefix(k, "url__")
			val := util.Stoi(sval)
			// lg("%v %v %s", sval, val, v)
			least3Files[val+1].Url = string(v)
		}
		if strings.HasPrefix(k, "mod__") {
			sval := strings.TrimPrefix(k, "mod__")
			val := util.Stoi(sval)
			// lg("%v %v %s", sval, val, v)
			least3Files[val+1].Mod, err = time.Parse(http.TimeFormat, string(v))
			lg(err)
		}
		if strings.HasPrefix(k, "bod__") {
			sval := strings.TrimPrefix(k, "bod__")
			val := util.Stoi(sval)
			least3Files[val+1].Body = v // html.EscapeString(string(v))
		}
	}

	lg("found %v similar; decoding complete after %4.2v secs", maxFound, time.Since(start).Seconds())

	for _, v := range least3Files {
		lg("%v %v", v.Url, len(v.Body))
	}

	return least3Files
}
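// Usage sketch, not part of the original sources: the wrapper below and its name
// are assumptions, added only to show how the returned repo.FullArticle slice
// might be consumed. lg and fs are assumed to be supplied by the surrounding handler.
func showSimilarSketch(r *http.Request, lg loghttp.FuncBufUniv, fs fsi.FileSystem) {
	arts := FetchAndDecodeJSON(r, "www.example.com/politik/some-article", "http", lg, fs)
	if arts == nil {
		lg("no similar articles found")
		return
	}
	for _, a := range arts {
		// index 0 is the base document, the remaining entries are the similar ones
		lg("%v  %v bytes  modified %v", a.Url, len(a.Body), a.Mod.Format(http.TimeFormat))
	}
}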
// DomClean parses b into an HTML DOM and applies the cleaning stages
// requested in opt: cleansing, condensing, image/link normalization,
// optional proxifying, beautifying, outline and id attributes.
func DomClean(b []byte, opt CleaningOptions) (*html.Node, error) {

	lg, lge := loghttp.Logger(nil, nil)
	_ = lg

	b = globFixes(b)

	doc, err := html.Parse(bytes.NewReader(b))
	if err != nil {
		lge(err)
		return nil, err
	}

	if opt.FNamer != nil {
		osutilpb.Dom2File(opt.FNamer()+".html", doc)
	}

	//
	//
	cleanseDom(doc, 0)
	removeCommentsAndIntertagWhitespace(NdX{doc, 0})
	fileDump(doc, opt.FNamer)

	//
	//
	condenseTopDown(doc, 0, 0)
	removeEmptyNodes(doc, 0)
	fileDump(doc, opt.FNamer)

	//
	//
	removeCommentsAndIntertagWhitespace(NdX{doc, 0}) // prevent spacey textnodes around single child images
	breakoutImagesFromAnchorTrees(doc)
	recurseImg2Link(doc)
	fileDump(doc, opt.FNamer)

	//
	//
	condenseBottomUpV3(doc, 0, 7, map[string]bool{"div": true})
	condenseBottomUpV3(doc, 0, 6, map[string]bool{"div": true})
	condenseBottomUpV3(doc, 0, 5, map[string]bool{"div": true})
	condenseBottomUpV3(doc, 0, 4, map[string]bool{"div": true})
	condenseTopDown(doc, 0, 0)
	removeEmptyNodes(doc, 0)
	removeEmptyNodes(doc, 0)
	fileDump(doc, opt.FNamer)

	//
	//
	if opt.Proxify {
		if opt.ProxyHost == "" {
			opt.ProxyHost = routes.AppHost()
		}
		proxify(doc, opt.ProxyHost, &url.URL{Scheme: "http", Host: opt.RemoteHost})
		fileDump(doc, opt.FNamer)
	}

	if opt.Beautify {
		removeCommentsAndIntertagWhitespace(NdX{doc, 0})
		reIndent(doc, 0)
	}

	//
	//
	if opt.AddOutline {
		addOutlineAttr(doc, 0, []int{0})
	}
	if opt.AddID {
		addIdAttr(doc, 0, 1)
	}
	if opt.AddOutline || opt.AddID {
		fileDump(doc, opt.FNamer)
	}

	//
	computeXPathStack(doc, 0)
	if opt.FNamer != nil {
		osutilpb.Bytes2File(opt.FNamer()+".txt", xPathDump)
	}

	return doc, nil
}
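// Usage sketch, not part of the original sources: the function name, the sample
// option values and the assumption that CleaningOptions uses plain bool/string
// fields are mine. Only fields that DomClean above actually reads are set.
func domCleanSketch(b []byte) (*html.Node, error) {
	opt := CleaningOptions{
		Proxify:    true,
		RemoteHost: "www.handelsblatt.com", // host the page was originally fetched from
		Beautify:   true,
		AddOutline: true,
		AddID:      true,
		// FNamer left nil; intermediate file dumps are then skipped (assumption)
	}
	return DomClean(b, opt)
}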
"github.com/pbberlin/tools/net/http/fetch" "github.com/pbberlin/tools/net/http/fileserver" "github.com/pbberlin/tools/net/http/loghttp" "github.com/pbberlin/tools/net/http/repo" "github.com/pbberlin/tools/net/http/routes" "github.com/pbberlin/tools/sort/sortmap" "github.com/pbberlin/tools/stringspb" ) const numTotal = 3 // comparable html docs const stageMax = 3 // weedstages const cTestHostOwn = "localhost:63222" var hostWithPref = routes.AppHost() + repo.UriMountNameY func prepare(t *testing.T) aetest.Context { lg, lge := loghttp.Logger(nil, nil) _ = lg c, err := aetest.NewContext(nil) if err != nil { lge(err) t.Fatal(err) } serveFile := func(w http.ResponseWriter, r *http.Request, m map[string]interface{}) { fs1 := repo.GetFS(c) fileserver.FsiFileServer(w, r, fileserver.Options{FS: fs1, Prefix: repo.UriMountNameY})
var docRoot = "" // no relative path, 'cause working dir too flippant var whichType = 0 // which type of filesystem, default is dsfs var memMapFileSys = memfs.New(memfs.DirSort("byDateDesc")) // package variable required as "persistence" var httpFSys = &httpfs.HttpFs{SourceFs: fsi.FileSystem(memMapFileSys)} // memMap is always ready var fileserver1 = http.FileServer(httpFSys.Dir(docRoot)) const mountName = "mntftch" const uriSetType = "/fetch/set-fs-type" const UriMountNameY = "/" + mountName + "/serve-file/" const uriFetchCommandReceiver = "/fetch/command-receive" const uriFetchCommandSender = "/fetch/command-send" var RepoURL = routes.AppHost() + UriMountNameY var msg = []byte(`<p>This is an embedded static http server.</p> <p> It serves previously downloaded pages<br> i.e. from handelsblatt or economist. </p>`) var testCommands = []FetchCommand{ FetchCommand{ Host: "www.handelsblatt.com", SearchPrefix: "/politik/deutschland/aa/bb", }, FetchCommand{ Host: "www.handelsblatt.com", SearchPrefix: "/politik/international/aa/bb",