Ejemplo n.º 1
0
// FetchAndDecodeJSON requests the "similar articles" JSON for surl from the
// application's own FetchSimilar endpoint and decodes it into a slice of
// repo.FullArticle.
//
// The response must be a JSON object whose values decode into []byte
// (encoding/json reads those from base64 strings). Recognised keys:
//
//	"lensimilar"                          number of similar articles found
//	"url_self", "mod_self", "bod_self"    the base article, stored at index 0
//	"url__N", "mod__N", "bod__N"          similar article N, stored at index N+1
//	"msg"                                 optional message, logged on decode failure
//
// Parameters: r is handed to fetch.UrlGetter; surl and knownProtocol are
// inserted into the query string as-is; lg receives all log output; fs is
// currently unused.
//
// It returns nil when the fetch fails, the body is empty, the JSON cannot be
// decoded, fewer than numTotal-1 similar articles are reported, or the base
// article fails the short-body check below. Otherwise it returns a slice of
// length lensimilar+1.
func FetchAndDecodeJSON(r *http.Request, surl, knownProtocol string, lg loghttp.FuncBufUniv, fs fsi.FileSystem) []repo.FullArticle {

	// NOTE(review): surl and knownProtocol are not query-escaped; presumably
	// callers pass values that are safe inside a query string - confirm.
	fullURL := fmt.Sprintf("%s%s?%s=%s&cnt=%v&prot=%v", routes.AppHost(), routes.FetchSimilarURI,
		routes.URLParamKey, surl, numTotal-1, knownProtocol)

	lg("lo fetching %v", fullURL)
	start := time.Now()

	fo := fetch.Options{URL: fullURL}
	bJSON, inf, err := fetch.UrlGetter(r, fo)
	lg(err)
	if err != nil {
		lg("msg %v", inf.Msg)
		return nil
	}
	if len(bJSON) == 0 {
		lg("empty bJSON")
		return nil
	}

	lg("\t\tfetch resp complete after %4.2v secs; %vkB", time.Since(start).Seconds(), len(bJSON)/1024)

	var mp map[string][]byte
	err = json.Unmarshal(bJSON, &mp)
	lg(err)
	if err != nil {
		// Unmarshal may have filled mp partially; prefer the server's own
		// message over dumping the raw body.
		if msg, ok := mp["msg"]; ok {
			lg("%s", msg)
		} else {
			lg("%s", bJSON)
		}
		return nil
	}

	maxFound := util.Stoi(string(mp["lensimilar"]))
	if maxFound < numTotal-1 {
		lg("not enough files returned by FetchSimilar 1 - mp[lensimilar] too small: %s", mp["lensimilar"])
		return nil
	}
	// Index 0 is reserved for the base article, hence the +1.
	least3Files := make([]repo.FullArticle, maxFound+1)

	urlSelf, ok1 := mp["url_self"]
	modSelf, ok2 := mp["mod_self"]
	bodSelf, ok3 := mp["bod_self"]
	if ok1 && ok2 && ok3 {
		base := &least3Files[0]
		base.Url = string(urlSelf)
		base.Mod, err = time.Parse(http.TimeFormat, string(modSelf))
		lg(err)
		base.Body = bodSelf
		// NOTE(review): a short body that does NOT contain
		// fetch.MsgNoRdirects is rejected as a redirect; the condition
		// looks inverted relative to the log message - confirm intent.
		if len(base.Body) < 200 && !bytes.Contains(base.Body, []byte(fetch.MsgNoRdirects)) {
			lg("found base but its a redirect")
			return nil
		}
		lg("found base")
	} else {
		// Previously "found base" was logged here as well, which was misleading.
		// The result is still returned, with an empty entry at index 0.
		lg("base article incomplete: url_self %v, mod_self %v, bod_self %v", ok1, ok2, ok3)
	}

	for k, v := range mp {
		if k == "msg" || strings.HasSuffix(k, "self") {
			continue
		}

		var prefix string
		switch {
		case strings.HasPrefix(k, "url__"):
			prefix = "url__"
		case strings.HasPrefix(k, "mod__"):
			prefix = "mod__"
		case strings.HasPrefix(k, "bod__"):
			prefix = "bod__"
		default:
			// e.g. "lensimilar" - carries no per-article data
			continue
		}

		// The key suffix comes from the remote side; never index the slice
		// with it unchecked.
		idx := util.Stoi(strings.TrimPrefix(k, prefix)) + 1
		if idx < 1 || idx >= len(least3Files) {
			lg("key %v points outside of the %v expected similar articles; skipping", k, maxFound)
			continue
		}

		switch prefix {
		case "url__":
			least3Files[idx].Url = string(v)
		case "mod__":
			least3Files[idx].Mod, err = time.Parse(http.TimeFormat, string(v))
			lg(err)
		case "bod__":
			least3Files[idx].Body = v
		}
	}

	lg("found %v similar; decoding complete after %4.2v secs", maxFound, time.Since(start).Seconds())

	for i := range least3Files {
		lg("%v %v", least3Files[i].Url, len(least3Files[i].Body))
	}

	return least3Files

}
Ejemplo n.º 2
0
// DomClean parses the HTML document in b and runs the package's cleaning
// passes over the resulting node tree, in a fixed order.
//
// Options read from opt:
//   - FNamer: when non-nil, the freshly parsed tree is written to
//     FNamer()+".html" and xPathDump to FNamer()+".txt"; it is also handed
//     to fileDump after every stage.
//   - Proxify, ProxyHost, RemoteHost: when Proxify is set, proxify is run;
//     an empty ProxyHost falls back to routes.AppHost().
//   - Beautify: strips comments/inter-tag whitespace once more and re-indents.
//   - AddOutline, AddID: add outline and/or id attributes.
//
// It returns the cleaned tree, or nil and the parse error if html.Parse fails.
func DomClean(b []byte, opt CleaningOptions) (*html.Node, error) {

	_, lge := loghttp.Logger(nil, nil)

	doc, err := html.Parse(bytes.NewReader(globFixes(b)))
	if err != nil {
		lge(err)
		return nil, err
	}

	// snapshot hands the current tree to fileDump after a stage.
	snapshot := func() { fileDump(doc, opt.FNamer) }

	if opt.FNamer != nil {
		osutilpb.Dom2File(opt.FNamer()+".html", doc)
	}

	// Stage: cleanse, then strip comments and inter-tag whitespace.
	cleanseDom(doc, 0)
	removeCommentsAndIntertagWhitespace(NdX{doc, 0})
	snapshot()

	// Stage: condense top-down, drop empty nodes.
	condenseTopDown(doc, 0, 0)
	removeEmptyNodes(doc, 0)
	snapshot()

	// Stage: image handling. The whitespace pass runs first to prevent
	// spacey text nodes around single-child images.
	removeCommentsAndIntertagWhitespace(NdX{doc, 0})
	breakoutImagesFromAnchorTrees(doc)
	recurseImg2Link(doc)
	snapshot()

	// Stage: condense bottom-up with the third argument stepping 7, 6, 5, 4;
	// each call gets its own fresh tag set.
	condenseBottomUpV3(doc, 0, 7, map[string]bool{"div": true})
	condenseBottomUpV3(doc, 0, 6, map[string]bool{"div": true})
	condenseBottomUpV3(doc, 0, 5, map[string]bool{"div": true})
	condenseBottomUpV3(doc, 0, 4, map[string]bool{"div": true})
	condenseTopDown(doc, 0, 0)

	// Two consecutive passes, as before.
	removeEmptyNodes(doc, 0)
	removeEmptyNodes(doc, 0)

	snapshot()

	// Optional stage: proxify.
	if opt.Proxify {
		if opt.ProxyHost == "" {
			opt.ProxyHost = routes.AppHost()
		}

		remote := &url.URL{Scheme: "http", Host: opt.RemoteHost}
		proxify(doc, opt.ProxyHost, remote)
		snapshot()
	}

	// Optional stage: re-indent.
	if opt.Beautify {
		removeCommentsAndIntertagWhitespace(NdX{doc, 0})
		reIndent(doc, 0)
	}

	// Optional stage: outline and id attributes; one snapshot if either ran.
	if opt.AddOutline {
		addOutlineAttr(doc, 0, []int{0})
	}
	if opt.AddID {
		addIdAttr(doc, 0, 1)
	}
	if opt.AddOutline || opt.AddID {
		snapshot()
	}

	// Final stage: compute the xpath stack; write xPathDump when file output is enabled.
	computeXPathStack(doc, 0)
	if opt.FNamer != nil {
		osutilpb.Bytes2File(opt.FNamer()+".txt", xPathDump)
	}

	return doc, nil

}
Ejemplo n.º 3
0
	"github.com/pbberlin/tools/net/http/fetch"
	"github.com/pbberlin/tools/net/http/fileserver"
	"github.com/pbberlin/tools/net/http/loghttp"
	"github.com/pbberlin/tools/net/http/repo"
	"github.com/pbberlin/tools/net/http/routes"
	"github.com/pbberlin/tools/sort/sortmap"
	"github.com/pbberlin/tools/stringspb"
)

const (
	numTotal = 3 // comparable html docs
	stageMax = 3 // weed stages

	cTestHostOwn = "localhost:63222"
)

// hostWithPref is the application host followed by the repo mount URI.
var hostWithPref = routes.AppHost() + repo.UriMountNameY

func prepare(t *testing.T) aetest.Context {

	lg, lge := loghttp.Logger(nil, nil)
	_ = lg

	c, err := aetest.NewContext(nil)
	if err != nil {
		lge(err)
		t.Fatal(err)
	}

	serveFile := func(w http.ResponseWriter, r *http.Request, m map[string]interface{}) {
		fs1 := repo.GetFS(c)
		fileserver.FsiFileServer(w, r, fileserver.Options{FS: fs1, Prefix: repo.UriMountNameY})
Ejemplo n.º 4
0
var (
	docRoot   = "" // no relative path, because the working dir is too flippant
	whichType = 0  // which type of filesystem; default is dsfs
)

// memMapFileSys must be a package variable, since it acts as "persistence".
// httpFSys wraps it (the mem map is always ready) and fileserver1 serves
// httpFSys from docRoot.
var (
	memMapFileSys = memfs.New(memfs.DirSort("byDateDesc"))
	httpFSys      = &httpfs.HttpFs{SourceFs: fsi.FileSystem(memMapFileSys)}
	fileserver1   = http.FileServer(httpFSys.Dir(docRoot))
)

// URI constants; UriMountNameY is built from mountName.
const (
	mountName = "mntftch"

	uriSetType    = "/fetch/set-fs-type"
	UriMountNameY = "/" + mountName + "/serve-file/"

	uriFetchCommandReceiver = "/fetch/command-receive"
	uriFetchCommandSender   = "/fetch/command-send"
)

// RepoURL is the application host followed by UriMountNameY.
var RepoURL = routes.AppHost() + UriMountNameY

// msg is a static HTML snippet describing the embedded server.
var msg = []byte(`<p>This is an embedded static http server.</p>
<p>
It serves previously downloaded pages<br>
 i.e. from handelsblatt or economist.
</p>`)

var testCommands = []FetchCommand{
	FetchCommand{
		Host:         "www.handelsblatt.com",
		SearchPrefix: "/politik/deutschland/aa/bb",
	},
	FetchCommand{
		Host:         "www.handelsblatt.com",
		SearchPrefix: "/politik/international/aa/bb",