Ejemplo n.º 1
// requesting via http; not from filesystem
// unused
func fetchDigest(hostWithPrefix, domain string) (*DirTree, error) {

	lg, lge := loghttp.Logger(nil, nil)
	_ = lg

	surl := path.Join(hostWithPrefix, domain, "digest2.json")
	bts, _, err := fetch.UrlGetter(nil, fetch.Options{URL: surl})
	if err != nil {
		return nil, err

	// lg("%s", bts)
	dirTree := &DirTree{Name: "/", Dirs: map[string]DirTree{}, EndPoint: true}

	if err == nil {
		err = json.Unmarshal(bts, dirTree)
		if err != nil {
			return nil, err

	lg("DirTree   %5.2vkB loaded for %v", len(bts)/1024, surl)

	age := time.Now().Sub(dirTree.LastFound)
	lg("DirTree is %5.2v hours old (%v)", age.Hours(), dirTree.LastFound.Format(time.ANSIC))

	return dirTree, nil

Ejemplo n.º 2
// GetDirContents fetches from fileserver - via http
// Parsing the received JSON into string slices
func GetDirContents(hostWithPrefix, dir string) ([]string, []string, *bytes.Buffer, error) {

	lg, lge := loghttp.Logger(nil, nil)
	_ = lg

	var b = new(bytes.Buffer)

	dirs := []string{}
	fils := []string{}

	// build url
	urlSubDirs, err := url.Parse(path.Join(hostWithPrefix, dir))
	if err != nil {
		return dirs, fils, b, err
	sd := urlSubDirs.String()
	sd = common.Directorify(sd)
	wpf(b, "requ subdirs from  %v", sd)

	// make req
	bsubdirs, effU, err := fetch.UrlGetter(nil, fetch.Options{URL: sd})
	if err != nil {
		return dirs, fils, b, err
	wpf(b, "got %s - %v", bsubdirs, effU)

	// parse json
	mpSubDir := []map[string]string{}
	err = json.Unmarshal(bsubdirs, &mpSubDir)
	if err != nil {
		// lg("%s", bsubdirs)
		return dirs, fils, b, err
	wpf(b, "json of subdir is %s", stringspb.IndentedDump(mpSubDir))

	for _, v := range mpSubDir {

		if dir, ok := v["path"]; ok {
			if strings.HasSuffix(dir, "/") {
				dirs = append(dirs, dir)
			} else {
				fils = append(fils, dir)

		if smod, ok := v["mod"]; ok {
			t, err := time.Parse(time.RFC1123Z, smod)
			wpf(b, "age %-6.2v", time.Now().Sub(t).Hours())

	return dirs, fils, b, nil

Ejemplo n.º 3
func formRedirector(w http.ResponseWriter, r *http.Request, m map[string]interface{}) {

	lg, lge := loghttp.Logger(w, r)

	var msg, cntnt, rURL string

	w.Header().Set("Content-type", "text/html; charset=utf-8")
	// w.Header().Set("Content-type", "text/html; charset=latin-1")

	rURL = r.FormValue("redirect-to")
	lg("lo redirect to: %v", rURL)

	if len(r.PostForm) > 0 {
		// loghttp.Pf(w, r, "post unimplemented:<br> %#v <br>\n", r.PostForm)
		// return
		msg += fmt.Sprintf("post converted to get<br>")

	rURL = fmt.Sprintf("%v?1=2&", rURL)
	for key, vals := range r.Form {
		if key == "redirect-to" {
		val := vals[0]
		if util_appengine.IsLocalEnviron() {
			val = strings.Replace(val, " ", "%20", -1)
		rURL = fmt.Sprintf("%v&%v=%v", rURL, key, val)

	bts, inf, err := fetch.UrlGetter(r, fetch.Options{URL: rURL})

	cntnt = string(bts)

	cntnt = insertNewlines.Replace(cntnt)
	cntnt = undouble.Replace(cntnt)

	cntnt = domclean1.ModifyHTML(r, inf.URL, cntnt)

	fmt.Fprintf(w, "%s \n\n", cntnt)
	fmt.Fprintf(w, "%s \n\n", msg)

Ejemplo n.º 4
// Post2Receiver takes commands and http posts them to
// the command receiver
func Post2Receiver(r *http.Request, commands []FetchCommand) (*bytes.Buffer, error) {

	b := new(bytes.Buffer)

	if commands == nil || len(commands) == 0 {
		return b, fmt.Errorf("Slice of commands nil or empty %v", commands)

	ii := instance_mgt.Get(r)
	fullURL := fmt.Sprintf("https://%s%s", ii.PureHostname, uriFetchCommandReceiver)
	wpf(b, "sending to URL:    %v\n", fullURL)

	bcommands, err := json.MarshalIndent(commands, "", "\t")
	if err != nil {
		wpf(b, "marshalling to []byte failed\n")
		return b, err

	req, err := http.NewRequest("POST", fullURL, bytes.NewBuffer(bcommands))
	if err != nil {
		wpf(b, "creation of POST request failed\n")
		return b, err
	req.Header.Set("X-Custom-Header-Counter", "nocounter")
	req.Header.Set("Content-Type", "application/json")

	bts, reqUrl, err := fetch.UrlGetter(r, fetch.Options{Req: req})
	_, _ = bts, reqUrl
	if err != nil {
		wpf(b, "Sending the POST request failed\n")
		return b, err

	wpf(b, "effective req url: %v\n", reqUrl)
	wpf(b, "response body:\n")
	wpf(b, "%s\n", html.EscapeString(string(bts)))

	return b, nil
Ejemplo n.º 5
// Fetches the RSS.xml file.
func rssXMLFile(w http.ResponseWriter, r *http.Request, fs fsi.FileSystem, rssUrl string) (RSS, *url.URL) {

	lg, lge := loghttp.Logger(w, r)

	bts, respInf, err := fetch.UrlGetter(r, fetch.Options{URL: rssUrl})

	bts = bytes.Replace(bts, []byte("content:encoded>"), []byte("content-encoded>S"), -1) // hack

	rssDoc := RSS{}
	err = xml.Unmarshal(bts, &rssDoc)

	// save it
	bdmp := stringspb.IndentedDumpBytes(rssDoc)
	err = fs.MkdirAll(path.Join(docRoot, respInf.URL.Host), 0755)
	err = fs.WriteFile(path.Join(docRoot, respInf.URL.Host, "outp_rss.xml"), bdmp, 0755)
	lg("RSS resp size %5.2vkB, saved to %v", len(bdmp)/1024, respInf.URL.Host+"/outp_rss.xml")

	return rssDoc, respInf.URL
Ejemplo n.º 6
// TokenSignin reads the form value "idtoken", parses it as a JWT and
// writes a plain-text verdict about the token to w.
//
// If parsing fails with an error whose text contains
// jwt.ErrPEMMappingObsolete, the certificates at
// https://www.googleapis.com/oauth2/v1/certs are fetched and, if the
// response decodes to more than one entry, stored in jwt.MappingToPEM.
// The token is parsed a second time afterwards.
//
// NOTE(review): this excerpt has lost its closing braces and possibly
// other short lines; block ends have to be inferred from indentation.
//
// https://developers.google.com/identity/choose-auth
// https://developers.google.com/identity/sign-in/web/backend-auth
func TokenSignin(w http.ResponseWriter, r *http.Request) {

	lg, _ := loghttp.BuffLoggerUniversal(w, r)

	// w.Header().Set("Access-Control-Allow-Origin", "http://localhost:1313")

	// NOTE(review): Header.Set replaces existing values, so the "*" set
	// below supersedes this one.
	w.Header().Set("Access-Control-Allow-Origin", "http://"+routes.AppHostDev())

	w.Header().Set("Access-Control-Allow-Origin", "*")

	// err := r.ParseMultipartForm(1024 * 1024 * 2)
	// NOTE(review): this err is not inspected in the visible code before
	// jwt.Parse overwrites it.
	err := r.ParseForm()

	myToken := r.Form.Get("idtoken")
	// NOTE(review): tokSize is not used anywhere in this excerpt.
	tokSize := fmt.Sprintf("Len of Tok was %v. \n", len(myToken))

	// fc1 is the key lookup callback handed to jwt.Parse: it rejects any
	// signing method other than RSA and returns the token's "kid" header.
	fc1 := func(token *jwt.Token) (interface{}, error) {
		// Don't forget to validate the alg is what you expect:

		log.Printf("algo header is %v\n", token.Header["alg"])
		if _, ok := token.Method.(*jwt.SigningMethodRSA); !ok {
			return nil, fmt.Errorf("Unexpected signing method: %v", token.Header["alg"])
		return token.Header["kid"], nil

	token, err := jwt.Parse(myToken, fc1)

	// No direct error comparison possible; since err is wrapped in another struct
	if err != nil && strings.Contains(err.Error(), jwt.ErrPEMMappingObsolete.Error()) {

		currentPEMsURL := "https://www.googleapis.com/oauth2/v1/certs"
		req, err := http.NewRequest("GET", currentPEMsURL, nil)
		// NOTE(review): no return is visible after this log; if creation
		// failed, req.Header.Set below would dereference a nil request.
		if err != nil {
			lg("creation of pem request failed")
		req.Header.Set("Content-Type", "application/json")

		fo := fetch.Options{Req: req}
		fo.KnownProtocol = "https"
		fo.ForceHTTPSEvenOnDevelopmentServer = true
		bts, inf, err := fetch.UrlGetter(r, fo)
		if err != nil {
			lg("tried to fetch %v, %v", currentPEMsURL, inf.URL)
			lg("msg %v", inf.Msg)
		// Only responses longer than 200 bytes are decoded.
		if len(bts) > 200 {
			var data1 map[string]string
			// NOTE(review): the Unmarshal error is not checked here; a
			// failed decode is only noticed through len(data1).
			err = json.Unmarshal(bts, &data1)
			// lg(stringspb.IndentedDumpBytes(data1))
			// w.Write(stringspb.IndentedDumpBytes(data1))
			if len(data1) > 1 {
				lg("PEM mappings updated")
				jwt.MappingToPEM = data1
			} else {
				lg("PEM mapping response contained only %v records; bytes length %v", len(data1), len(bts))


	// Second parse - going by indentation it runs whether or not the
	// mappings were refreshed.
	token, err = jwt.Parse(myToken, fc1)

	// NOTE(review): security-sensitive - a token whose key could not be
	// parsed is declared valid here. token is also dereferenced without a
	// nil check; whether jwt.Parse can return nil with this error is not
	// visible from here.
	if err != nil && strings.Contains(err.Error(), jwt.ErrInvalidKey.Error()) {
		w.Write([]byte("The submitted RSA Key was somehow unparseable. We still accept the token.\n"))
		err = nil
		token.Valid = true

	if err != nil {
		w.Write([]byte("--- " + err.Error() + ".\n"))

	if err == nil && token.Valid {

		// tk collects algorithm, header and claims as text.
		// NOTE(review): tk is not written anywhere in this excerpt.
		tk := ""
		tk += fmt.Sprintf("     Algor:     %v\n", token.Method)
		tk += fmt.Sprintf("     Header:    %v\n", token.Header)
		for k, v := range token.Claims {
			tk += fmt.Sprintf("\t  %-8v %v\n", k, v)

		w.Write([]byte("tokensignin; valid.   \n"))
		sb := "header-sub-not-present"
		// NOTE(review): the type assertion is unchecked and panics if the
		// "sub" claim is not a string.
		if _, ok := token.Claims["sub"]; ok {
			sb = token.Claims["sub"].(string)
		w.Write([]byte("ID from PWT is " + sb + "\n"))

		_, usr, msg1 := login.CheckForNormalUser(r)
		if usr != nil {
			w.Write([]byte("ID from SRV is " + usr.ID + "\n"))
		w.Write([]byte(msg1 + "\n"))

	} else {
		w.Write([]byte("tokensignin; INVALID. \n"))
		w.Write([]byte(stringspb.ToLen(myToken, 30)))

		// NOTE(review): vrf is built but not used in this excerpt.
		vrf := fmt.Sprintf("\nhttps://www.googleapis.com/oauth2/v3/tokeninfo?id_token=%v \n", myToken)

Ejemplo n.º 7
func FetchAndDecodeJSON(r *http.Request, surl, knownProtocol string, lg loghttp.FuncBufUniv, fs fsi.FileSystem) []repo.FullArticle {

	fullURL := fmt.Sprintf("%s%s?%s=%s&cnt=%v&prot=%v", routes.AppHost(), routes.FetchSimilarURI,
		routes.URLParamKey, surl, numTotal-1, knownProtocol)

	// fullURL = fmt.Sprintf("%s%s?%s=%s&cnt=%v", r.URL.Host, repo.routes.FetchSimilarURI,
	// 	routes.URLParamKey, surl, numTotal-1)

	lg("lo fetching %v", fullURL)
	start := time.Now()

	fo := fetch.Options{}
	fo.URL = fullURL
	bJSON, inf, err := fetch.UrlGetter(r, fo)
	_ = inf
	if err != nil {
		lg("msg %v", inf.Msg)
		return nil
	if len(bJSON) == 0 {
		lg("empty bJSON")
		return nil

	lg("\t\tfetch resp complete after %4.2v secs; %vkB", time.Now().Sub(start).Seconds(), len(bJSON)/1024)

	var mp map[string][]byte
	err = json.Unmarshal(bJSON, &mp)
	if err != nil {
		if _, ok := mp["msg"]; ok {
			lg("%s", mp["msg"])
		} else {
			lg("%s", bJSON)
		return nil

	smaxFound := string(mp["lensimilar"])
	maxFound := util.Stoi(smaxFound)
	if maxFound < numTotal-1 {
		lg("not enough files returned by FetchSimilar 1 - mp[lensimilar] too small: %s", mp["lensimilar"])
		return nil
	least3Files := make([]repo.FullArticle, maxFound+1)

	_, ok1 := mp["url_self"]
	_, ok2 := mp["mod_self"]
	_, ok3 := mp["bod_self"]
	if ok1 && ok2 && ok3 {
		least3Files[0].Url = string(mp["url_self"])
		least3Files[0].Mod, err = time.Parse(http.TimeFormat, string(mp["mod_self"]))
		least3Files[0].Body = mp["bod_self"]
		if len(least3Files[0].Body) < 200 {
			if !bytes.Contains(least3Files[0].Body, []byte(fetch.MsgNoRdirects)) {
				lg("found base but its a redirect")
				return nil
	lg("found base")

	for k, v := range mp {
		if k == "msg" {
		if strings.HasSuffix(k, "self") {

		if strings.HasPrefix(k, "url__") {
			sval := strings.TrimPrefix(k, "url__")
			val := util.Stoi(sval)
			// lg("%v %v %s", sval, val, v)
			least3Files[val+1].Url = string(v)
		if strings.HasPrefix(k, "mod__") {
			sval := strings.TrimPrefix(k, "mod__")
			val := util.Stoi(sval)
			// lg("%v %v %s", sval, val, v)
			least3Files[val+1].Mod, err = time.Parse(http.TimeFormat, string(v))

		if strings.HasPrefix(k, "bod__") {
			sval := strings.TrimPrefix(k, "bod__")
			val := util.Stoi(sval)
			least3Files[val+1].Body = v //html.EscapeString(string(v)


	lg("found %v similar; decoding complete after %4.2v secs", maxFound, time.Now().Sub(start).Seconds())

	for _, v := range least3Files {
		lg("%v %v", v.Url, len(v.Body))

	return least3Files

Ejemplo n.º 8
func requestPay(w http.ResponseWriter, r *http.Request, m map[string]interface{}) {

	lg, b := loghttp.BuffLoggerUniversal(w, r)
	closureOverBuf := func(bUnused *bytes.Buffer) {
		loghttp.Pf(w, r, b.String())
	defer closureOverBuf(b) // the argument is ignored,
	r.Header.Set("X-Custom-Header-Counter", "nocounter")

	protoc := "https://"
	if appengine.IsDevAppServer() {
		protoc = "http://"

	host := appengine.DefaultVersionHostname(appengine.NewContext(r))
	if appengine.IsDevAppServer() {
		host = "not-loclhost"

	confirmURL := fmt.Sprintf("%v%v%v", protoc, host, uriConfirmPayment)
	confirmURL = url.QueryEscape(confirmURL)

	addrURL := fmt.Sprintf("https://%v/api/receive?method=create&address=%v&callback=%v&customsecret=49&api_code=%v",
		blockChainHost, bitCoinAddress, confirmURL, apiKey)

	req, err := http.NewRequest("GET", addrURL, nil)
	if err != nil {
	bts, inf, err := fetch.UrlGetter(r, fetch.Options{Req: req})
	bts = bytes.Replace(bts, []byte(`","`), []byte(`", "`), -1)

	if err != nil {

	lg("response body 1:\n")
	lg("%s\n", string(bts))

	lg("response body 2:\n")
	var data1 map[string]interface{}
	err = json.Unmarshal(bts, &data1)
	// lg("%#v", data1)

	inputAddress, ok := data1["input_address"].(string)
	if !ok {
		lg("input address could not be casted to string; is type %T", data1["input_address"])
	feePercent, ok := data1["fee_percent"].(float64)
	if !ok {
		lg("fee percent could not be casted to float64; is type %T", data1["fee_percent"])

	lg("Input Adress will be %q; fee percent will be %4.2v", inputAddress, feePercent)

Ejemplo n.º 9
// fetchSimForm either renders a form asking for a URL, or - if the form
// value routes.URLParamKey is set - requests routes.FetchSimilarURI on
// this host for that URL and writes a summary of the JSON response to w.
//
// Log output is buffered in b and written out through loghttp.Pf by the
// deferred closure. The parameter m is not used.
//
// NOTE(review): this excerpt has lost its closing braces and several
// short lines (probably returns and error logging); block ends have to
// be inferred from indentation.
func fetchSimForm(w http.ResponseWriter, r *http.Request, m map[string]interface{}) {

	lg, b := loghttp.BuffLoggerUniversal(w, r)
	// NOTE(review): the buffer content is passed to Pf in the position
	// where other calls pass a format string - percent signs in the
	// logged data may get mangled; confirm against loghttp.Pf.
	closureOverBuf := func(bUnused *bytes.Buffer) {
		loghttp.Pf(w, r, b.String())
	defer closureOverBuf(b) // the argument is ignored,

	r.Header.Set("X-Custom-Header-Counter", "nocounter")

	// on live server => always use https
	// NOTE(review): no return is visible after the redirect.
	if r.URL.Scheme != "https" && !util_appengine.IsLocalEnviron() {
		r.URL.Scheme = "https"
		r.URL.Host = r.Host
		lg("lo - redirect %v", r.URL.String())
		http.Redirect(w, r, r.URL.String(), http.StatusFound)

	// NOTE(review): err is not inspected in the visible code.
	err := r.ParseForm()

	rURL := ""
	if r.FormValue(routes.URLParamKey) != "" {
		rURL = r.FormValue(routes.URLParamKey)
	if len(rURL) == 0 {

		// No URL submitted: render head, form and - deferred - foot into b.
		wpf(b, tplx.ExecTplHelper(tplx.Head, map[string]interface{}{"HtmlTitle": "Find similar HTML URLs"}))
		defer wpf(b, tplx.Foot)

		tm := map[string]string{
			"val":       "www.welt.de/politik/ausland/article146154432/Tuerkische-Bodentruppen-marschieren-im-Nordirak-ein.html",
			"fieldname": routes.URLParamKey,
		tplForm := tt.Must(tt.New("tplName01").Parse(htmlForm))
		tplForm.Execute(b, tm)

	} else {

		// NOTE(review): the form values are inserted into the query
		// string without escaping.
		fullURL := fmt.Sprintf("https://%s%s?%s=%s&cnt=%s&prot=%s", r.Host, routes.FetchSimilarURI,
			routes.URLParamKey, rURL, r.FormValue("cnt"), r.FormValue("prot"))
		lg("lo - sending to URL 1: %v", fullURL)

		fo := fetch.Options{}
		fo.URL = fullURL
		bts, inf, err := fetch.UrlGetter(r, fo)
		_ = inf
		// NOTE(review): the body of this error branch is missing from
		// the excerpt.
		if err != nil {

		if len(bts) == 0 {
			lg("empty bts")

		// The response is decoded as a JSON object of byte values.
		var mp map[string][]byte
		err = json.Unmarshal(bts, &mp)
		if err != nil {
			lg("%s", bts)

		w.Header().Set("Content-Type", "text/html; charset=utf-8")
		// NOTE(review): the body of this branch is missing from the
		// excerpt; presumably the message was written out - confirm.
		if _, ok := mp["msg"]; ok {

		// For every key except "msg" only the length of the value is
		// written; the else branch below is unreachable (if true).
		for k, v := range mp {
			if k != "msg" {
				wpf(w, "<br><br>%s:\n", k)
				if true {
					wpf(w, "len %v", len(v))
				} else {
					wpf(w, "%s", html.EscapeString(string(v)))


Ejemplo n.º 10
// FetchUsingRSS takes a RSS XML uri and fetches some of its documents.
// It uses a three staged pipeline for parallel fetching.
// Results are stored into the given filesystem fs.
// Config points to the source of RSS XML,
// and has some rules for conflating URI directories.
// config.SearchPrefix and config.DesiredNumber tell the func
// which subdirs of the RSS dir should be fetched - and how many at max.
//
// Log output is buffered in b and written out through loghttp.Pf by the
// deferred closure.
//
// NOTE(review): this excerpt has lost its closing braces and a number of
// short lines; block ends have to be inferred from indentation. In
// particular no statement closing fin and no call to stage3Wait.Add,
// Done or Wait is visible, although the log messages refer to them.
func FetchUsingRSS(w http.ResponseWriter, r *http.Request,
	fs fsi.FileSystem, config FetchCommand,
) {

	lg, b := loghttp.BuffLoggerUniversal(w, r)
	closureOverBuf := func(bUnused *bytes.Buffer) {
		loghttp.Pf(w, r, b.String())
	defer closureOverBuf(b) // the argument is ignored,

	// NOTE(review): the message says "returning", but no return is
	// visible in this excerpt.
	if config.Host == "" {
		lg(" empty host; returning")

	config = addDefaults(config)

	// Fetching the rssXML takes time.
	// We do it before the timeouts of the pipeline stages are set off.
	lg(" ")
	if config.Host == "test.economist.com" {
		switchTData(w, r)

	// lg(stringspb.IndentedDump(config))
	dirTree := &DirTree{Name: "/", Dirs: map[string]DirTree{}, EndPoint: true}

	fnDigest := path.Join(docRoot, config.Host, "digest2.json")
	loadDigest(w, r, lg, fs, fnDigest, dirTree) // previous

	// The digest is refreshed if it is older than 0.001 hours (3.6 seconds).
	age := time.Now().Sub(dirTree.LastFound)
	lg("DirTree is %5.2v hours old (%v)", age.Hours(), dirTree.LastFound.Format(time.ANSIC))
	if age.Hours() > 0.001 {

		// Without a matching RSS URI the search prefix itself is fetched
		// via fetchSave; otherwise the RSS file is fetched and merged
		// into dirTree.
		rssUrl := matchingRSSURI(w, r, config)
		if rssUrl == "" {
			m := new(MyWorker)
			m.r = r
			m.lg = lg
			m.fs1 = fs
			m.SURL = path.Join(config.Host, config.SearchPrefix)
			_, _, _, err := fetchSave(m)
			// NOTE(review): the body of this error branch is missing
			// from the excerpt.
			if err != nil {
		} else {
			rssUrl = path.Join(config.Host, rssUrl)
			rssDoc, rssUrlObj := rssXMLFile(w, r, fs, rssUrl)
			_ = rssUrlObj
			rssDoc2DirTree(w, r, dirTree, rssDoc, config.Host)

		saveDigest(lg, fs, fnDigest, dirTree)

	// lg(dirTree.String())
	// setting up a 3 staged pipeline from bottom up
	var fullArticles []FullArticle

	var inn chan *FullArticle = make(chan *FullArticle) // jobs are stuffed in here
	var out chan *FullArticle = make(chan *FullArticle) // completed jobs are delivered here
	var fin chan struct{} = make(chan struct{})         // downstream signals end to upstream
	var stage3Wait sync.WaitGroup

	// stage 3
	// fire up the "collector", a fan-in
	// It appends every article received on out to fullArticles and
	// restarts the timeout; when the timeout fires, out is set to nil.
	go func() {
		// 400 good value; critical point at 35
		// economist.com required 800 ms
		const delayInitial = 1200
		const delayRefresh = 800
		cout := time.After(time.Millisecond * delayInitial)
		for {
			select {

			case fa := <-out:
				fullArticles = append(fullArticles, *fa)
				pth := fetch.PathFromStringUrl(fa.Url)
				lg("    fetched   %v - %v ", fa.Mod.Format("15:04:05"), stringspb.Ellipsoider(pth, 50))
				cout = time.After(time.Millisecond * delayRefresh) // refresh timeout
			case <-cout:
				lg("timeout after %v articles", len(fullArticles))
				// we are using channel == nil - channel closed combinations
				// inspired by http://dave.cheney.net/2013/04/30/curious-channels
				out = nil // not close(out) => case above is now blocked
				// NOTE(review): the message below reports fin as closed,
				// but the closing statement is not visible in this excerpt.
				lg("fin closed; out nilled")

	// stage 2
	for i := 0; i < numWorkers; i++ {
		// fire up a dedicated fetcher routine, a worker
		// we are using channel == nil - channel closed combinations
		// inspired by http://dave.cheney.net/2013/04/30/curious-channels
		// Each worker takes an article from inn, fetches its URL and
		// offers the result on out - unless fin becomes ready first.
		go func() {
			var a *FullArticle
			for {
				select {
				case a = <-inn:
					var err error
					var inf fetch.Info
					// NOTE(review): err is not inspected in the visible code.
					a.Body, inf, err = fetch.UrlGetter(r, fetch.Options{URL: a.Url})
					// The modification time from the fetch is only used
					// if the article does not carry one already.
					if a.Mod.IsZero() {
						a.Mod = inf.Mod
					select {
					case out <- a:
					case <-fin:
						lg("    worker spinning down; branch 1; abandoning %v", a.Url)
					a = new(FullArticle)
				case <-fin:
					if a != nil && a.Url != "" {
						u, _ := url.Parse(a.Url)
						lg("    abandoned %v", u.Path)
					} else {
						lg("    worker spinning down; branch 2")

	// loading stage 1
	// Up to 15 rounds: each round stuffs jobs for uriPrefix into the
	// pipeline, then moves one directory up, excluding the prefix that
	// was just searched.
	uriPrefix := config.SearchPrefix
	found := 0
	uriPrefixExcl := "impossible"
	for i := 0; i < 15; i++ {
		lg("  searching for prefix   %v    - excl %q    - %v of %v", uriPrefix, uriPrefixExcl, found, config.DesiredNumber)
		found += stuffStage1(w, r, config, inn, fin, dirTree,
			uriPrefixExcl, uriPrefix, config.DesiredNumber-found)

		// NOTE(review): the bodies of the next two conditions are
		// incomplete in this excerpt; presumably both left the loop.
		if found >= config.DesiredNumber {

		if uriPrefix == "/" || uriPrefix == "." {
			lg("  root exhausted")

		newPrefix := path.Dir(uriPrefix)
		uriPrefixExcl = uriPrefix
		uriPrefix = newPrefix
	lg("  found %v of %v", found, config.DesiredNumber)

	// NOTE(review): the wait itself is not visible between the two
	// messages.
	lg("stage3Wait.Wait() before")
	lg("stage3Wait.Wait() after")

	// workers spin down earlier -
	// but ae log writer and response writer need some time
	// to record the spin-down messages
	time.Sleep(120 * time.Millisecond)

	// compile out directory statistics
	// NOTE(review): nothing visible writes to histoDir, and err and dir
	// are not used; presumably a counting statement was lost.
	histoDir := map[string]int{}
	for _, a := range fullArticles {
		u, err := url.Parse(a.Url)
		semanticUri := condenseTrailingDir(u.Path, config.CondenseTrailingDirs)
		dir := path.Dir(semanticUri)
	sr := sortmap.SortMapByCount(histoDir)
	_ = sr

	// Create dirs
	for k, _ := range histoDir {
		dir := path.Join(docRoot, k) // config.Host already contained in k
		err := fs.MkdirAll(dir, 0755)
		err = fs.Chtimes(dir, time.Now(), time.Now())

	// Saving as files
	// Query and fragment are stripped from the URL before it is turned
	// into a file name; the file gets the article's modification time.
	for _, a := range fullArticles {
		// NOTE(review): the body of this condition is missing;
		// presumably empty articles were skipped.
		if len(a.Body) == 0 {
		u, err := url.Parse(a.Url)
		u.Fragment = ""
		u.RawQuery = ""
		semanticUri := condenseTrailingDir(u.RequestURI(), config.CondenseTrailingDirs)
		p := path.Join(docRoot, semanticUri)
		err = fs.WriteFile(p, a.Body, 0644)
		err = fs.Chtimes(p, a.Mod, a.Mod)

		// The directory histogram is saved as fetchDigest.json.
		// NOTE(review): b and fnDigest shadow the outer variables of the
		// same name; the enclosing block of these lines cannot be
		// determined from this excerpt.
		b, err := json.MarshalIndent(histoDir, "  ", "\t")
		fnDigest := path.Join(docRoot, config.Host, "fetchDigest.json")
		err = fs.WriteFile(fnDigest, b, 0755)

	// fsm, ok := memfs.Unwrap(fs)
	// if ok {
	// 	fsm.Dump()
	// }

Ejemplo n.º 11
// Test1 collects file URLs below www.welt.de/politik/ausland from the
// file server at repo.RepoURL, fetches them, runs Dedup over the fetched
// articles and dumps the result with the suffix "_fin.html".
//
// NOTE(review): this excerpt has lost its closing braces and several
// short lines (probably returns and error logging); block ends have to
// be inferred from indentation.
func Test1(t *testing.T) {

	lg, b := loghttp.BuffLoggerUniversal(nil, nil)
	_ = b

	c, err := aetest.NewContext(nil)
	// NOTE(review): the body of this error branch is missing from the
	// excerpt.
	if err != nil {
	defer c.Close()
	fs := GetFS(c, 2)

	// The second assignment supersedes the first.
	remoteHostname := "www.welt.de"
	remoteHostname = "www.welt.de/politik/ausland"

	dirs1, _, msg, err := fileserver.GetDirContents(repo.RepoURL, remoteHostname)
	if err != nil {
		lg("%s", msg)

	for _, v := range dirs1 {
		lg("    %v", v)

	// Collect the files of every first-level subdirectory.
	least3URLs := []string{}
	for _, v1 := range dirs1 {

		p := path.Join(remoteHostname, v1)
		dirs2, fils2, msg, err := fileserver.GetDirContents(repo.RepoURL, p)
		_ = dirs2
		if err != nil {
			lg("%s", msg)
		// lg("  dirs2 %v", stringspb.IndentedDump(dirs2))
		// lg("  fils2 %v", stringspb.IndentedDump(fils2))

		for _, v2 := range fils2 {
			least3URLs = append(least3URLs, path.Join(remoteHostname, v1, v2))

	// NOTE(review): the else branch is taken for len >= numTotal but
	// slices to numTotal+1; with exactly numTotal elements this reaches
	// beyond the length of the slice - confirm the intended bound.
	if len(least3URLs) < numTotal {
		lg("not enough files in rss fetcher cache")
	} else {
		least3URLs = least3URLs[:numTotal+1]

	for _, v := range least3URLs {
		lg("    %v", v)

	// domclean

	// Fetch every collected URL and keep URL, modification time and
	// body as an article.
	least3Files := make([]repo.FullArticle, 0, len(least3URLs))
	for i := 0; i < len(least3URLs); i++ {

		surl := spf("%v/%v", repo.RepoURL, least3URLs[i])

		fNamer := domclean2.FileNamer(logDir, i)
		fNamer() // first call yields key

		resBytes, inf, err := fetch.UrlGetter(nil, fetch.Options{URL: surl})
		// NOTE(review): the body of this error branch is missing from
		// the excerpt; inf.URL is dereferenced right after it.
		if err != nil {
		lg("fetched %4.1fkB from %v", float64(len(resBytes))/1024, stringspb.ToLenR(inf.URL.String(), 60))

		fa := repo.FullArticle{}
		fa.Url = inf.URL.String()
		fa.Mod = inf.Mod
		fa.Body = resBytes
		least3Files = append(least3Files, fa)


	doc := Dedup(least3Files, lg, fs)

	fNamer := domclean2.FileNamer(logDir, 0)
	fNamer() // first call yields key
	fsPerm := GetFS(c, 2)
	fileDump(lg, fsPerm, doc, fNamer, "_fin.html")

	pf("MapSimiliarCompares: %v SimpleCompares: %v LevenstheinComp: %v\n", breakMapsTooDistinct, appliedLevenshtein, appliedCompare)
	pf("correct finish\n")

Ejemplo n.º 12
// Fetches URL if local file is outdated.
// saves fetched file
// link extraction, link addition to treeX now accumulated one level higher
// bool return value: use existing => true
func fetchSave(m *MyWorker) ([]byte, time.Time, bool, error) {

	// w http.ResponseWriter,
	// r *http.Request,

	// Determine FileName
	ourl, err := fetch.URLFromString(m.SURL)
	fc := FetchCommand{}
	fc.Host = ourl.Host
	fc = addDefaults(fc)
	semanticUri := condenseTrailingDir(m.SURL, fc.CondenseTrailingDirs)
	fn := path.Join(docRoot, semanticUri)

	m.lg("crawlin %q", m.SURL)

	// File already exists?
	// Open file for age check
	var bts []byte
	var mod time.Time
	f := func() error {
		file1, err := m.fs1.Open(fn)
		// m.lg(err) // file may simply not exist
		if err != nil {
			return err // file may simply not exist
		defer file1.Close() // file close *fast* at the end of *this* anonymous func

		fi, err := file1.Stat()
		if err != nil {
			return err

		if fi.IsDir() {
			m.lg("\t\t file is a directory, skipping - %v", fn)
			return fmt.Errorf("is directory: %v", fn)

		mod = fi.ModTime()
		age := time.Now().Sub(mod)
		if age.Hours() > 10 {
			m.lg("\t\t file %4.2v hours old, refetch ", age.Hours())
			return fmt.Errorf("too old: %v", fn)

		m.lg("\t\t file only %4.2v hours old, take %4.2vkB from datastore", age.Hours(), fi.Size()/1024)
		bts, err = ioutil.ReadAll(file1)
		if err != nil {
			return err
		return nil

	err = f()
	if err == nil {
		return bts, mod, true, err

	// Fetch
	bts, inf, err := fetch.UrlGetter(m.r, fetch.Options{URL: m.SURL, KnownProtocol: m.Protocol, RedirectHandling: 1})
	if err != nil {
		if inf.Status != http.StatusNotFound {
			m.lg("tried to fetch %v, %v", m.SURL, inf.URL)
			m.lg("msg %v", inf.Msg)
			return []byte{}, inf.Mod, false, err
		// In our traversing upwards, we might encounter "directory links" that have no index.html.
		// For a *derived* URL, this is no error.
		bts = []byte(" ... not found ... ")
	if inf.Mod.IsZero() {
		inf.Mod = time.Now().Add(-75 * time.Minute)

	// main request still exists?
	if false {
		var cx context.Context
		cx = util_appengine.SafelyExtractGaeContext(m.r)
		if cx == nil {
			m.lg("timed out - returning")
			return bts, inf.Mod, false, fmt.Errorf("req timed out")

	m.lg("retrivd+saved %q; %vkB ", inf.URL.Host+inf.URL.Path, len(bts)/1024)

	if len(bts) > 1024*1024-1 {
		bts = removeScriptsAndComments(m.lg, bts)
		m.lg("size reduced_1 to %vkB ", len(bts)/1024)

		// if len(bts) > 1024*1024-1 {
		// 	bts = snappy.Encode(nil, bts)
		// 	fn = strings.Replace(fn, ".html", ".snap.html", -1)
		// 	m.lg("size reduced_2 to %vkB ", len(bts)/1024)
		// }

	dir := path.Dir(fn)
	err = m.fs1.MkdirAll(dir, 0755)
	err = m.fs1.Chtimes(dir, time.Now(), time.Now())
	err = m.fs1.WriteFile(fn, bts, 0644)
	err = m.fs1.Chtimes(fn, inf.Mod, inf.Mod)

	return bts, inf.Mod, false, nil

Ejemplo n.º 13
// Test1 collects file URLs below www.welt.de from the file server at
// hostWithPref, fetches numTotal of them and runs DomClean on each
// response.
//
// NOTE(review): this excerpt has lost its closing braces and several
// short lines (probably returns, breaks and error logging); block ends
// have to be inferred from indentation. lge, sorted1 and sorted2 are not
// used in the visible code.
func Test1(t *testing.T) {

	lg, lge := loghttp.Logger(nil, nil)

	// c := prepare(t)
	// defer c.Close()

	lg("waiting for webserver")
	time.Sleep(2 * time.Millisecond)

	remoteHostname := "www.welt.de"

	dirs1, _, msg, err := fileserver.GetDirContents(hostWithPref, remoteHostname)
	if err != nil {
		lg("%s", msg)

	for _, v := range dirs1 {
		lg("    %v", v)

	least3Files := []string{}
	for _, v1 := range dirs1 {

		dirs2, fils2, msg, err := fileserver.GetDirContents(hostWithPref, path.Join(remoteHostname, v1))
		_ = dirs2
		if err != nil {
			lg("%s", msg)
		// lg("  dirs2 %v", stringspb.IndentedDump(dirs2))
		// lg("  fils2 %v", stringspb.IndentedDump(fils2))

		// Only subdirectories holding at least numTotal files contribute.
		// NOTE(review): the body of the innermost condition is missing;
		// presumably it ended the collection after numTotal files.
		if len(fils2) > numTotal-1 {
			for i2, v2 := range fils2 {
				least3Files = append(least3Files, path.Join(remoteHostname, v1, v2))
				if i2 == numTotal-1 {

	// NOTE(review): no return is visible after this message, while the
	// loop below indexes least3Files up to numTotal-1.
	if len(least3Files) < numTotal {
		lg("not enough files in rss fetcher cache")

	for _, v := range least3Files {
		lg("    %v", v)

	logdir := prepareLogDir()

	// iter only serves to run the loop numTotal times.
	iter := make([]int, numTotal)

	for i, _ := range iter {

		surl := spf("%v/%v", hostWithPref, least3Files[i])

		fNamer := FileNamer(logdir, i)
		fnKey := fNamer() // first call yields key
		_ = fnKey

		resBytes, effUrl, err := fetch.UrlGetter(nil, fetch.Options{URL: surl})
		// NOTE(review): the body of this error branch is missing from
		// the excerpt; effUrl is used right after it.
		if err != nil {
		lg("fetched %4.1fkB from %v", float64(len(resBytes))/1024, stringspb.ToLenR(effUrl.String(), 60))
		opts := CleaningOptions{Proxify: true}
		opts.FNamer = fNamer
		opts.RemoteHost = remoteHostname
		// NOTE(review): the error from DomClean is not inspected in the
		// visible code.
		doc, err := DomClean(resBytes, opts)
		_ = doc


	// statistics on elements and attributes
	sorted1 := sortmap.SortMapByCount(attrDistinct)
	sorted2 := sortmap.SortMapByCount(nodeDistinct)

	pf("correct finish\n")

Ejemplo n.º 14
// handleFetchURL either displays a form for requesting an url
// or it returns the URL´s contents.
func handleFetchURL(w http.ResponseWriter, r *http.Request, m map[string]interface{}) {

	lg, b := loghttp.BuffLoggerUniversal(w, r)
	_ = b

	// on live server => always use https
	if r.URL.Scheme != "https" && !util_appengine.IsLocalEnviron() {
		r.URL.Scheme = "https"
		r.URL.Host = r.Host
		lg("lo - redirect %v", r.URL.String())
		http.Redirect(w, r, r.URL.String(), http.StatusFound)

	// To distinguish between a posted and a "getted" value,
	// we check the "post-only" slice of values first.
	// If nothing is there, but FormValue *has* a value,
	// then it was "getted", otherwise "posted".
	rURL := ""
	urlAs := ""
	err := r.ParseForm()
	if r.PostFormValue(routes.URLParamKey) != "" {
		urlAs += "url posted "
		rURL = r.PostFormValue(routes.URLParamKey)

	if r.FormValue(routes.URLParamKey) != "" {
		if rURL == "" {
			urlAs += "url getted "
			rURL = r.FormValue(routes.URLParamKey)
	// lg("received %v:  %q", urlAs, rURL)

	if len(rURL) == 0 {

		tplAdder, tplExec := tplx.FuncTplBuilder(w, r)
		tplAdder("n_html_title", "Fetch some http data", nil)

		m := map[string]string{
			"protocol": "https",
			"host":     r.Host, // not  fetch.HostFromReq(r)
			"path":     routes.ProxifyURI,
			"name":     routes.URLParamKey,
			"val":      "google.com",
		if util_appengine.IsLocalEnviron() {
			m["protocol"] = "http"
		tplAdder("n_cont_0", c_formFetchUrl, m)
		tplExec(w, r)

	} else {

		r.Header.Set("X-Custom-Header-Counter", "nocounter")

		bts, inf, err := fetch.UrlGetter(r, fetch.Options{URL: rURL})

		tp := mime.TypeByExtension(path.Ext(inf.URL.Path))
		if false {
			ext := path.Ext(rURL)
			ext = strings.ToLower(ext)
			tp = mime.TypeByExtension(ext)
		w.Header().Set("Content-Type", tp)
		// w.Header().Set("Content-type", "text/html; charset=latin-1")

		if r.FormValue("dbg") != "" {
			w.Header().Set("Content-type", "text/html; charset=utf-8")
			fmt.Fprintf(w, "%s<br>\n  %s<br>\n %v", inf.URL.Path, tp, inf.URL.String())

		opts := domclean2.CleaningOptions{Proxify: true}
		opts.Beautify = true // "<a> Linktext without trailing space"
		opts.RemoteHost = fetch.HostFromStringUrl(rURL)

		// opts.ProxyHost = routes.AppHost()
		opts.ProxyHost = fetch.HostFromReq(r)
		if !util_appengine.IsLocalEnviron() {
			opts.ProxyHost = fetch.HostFromReq(r)

		doc, err := domclean2.DomClean(bts, opts)

		var bufRend bytes.Buffer
		err = html.Render(&bufRend, doc)

