Esempio n. 1
0
// UrlGetter is a universal http getter for app engine and standalone go programs.
//
// gaeReq is the incoming app engine request; when it is nil a plain
// http.Client with a five second timeout is used, otherwise the client is
// built on the old app engine urlfetch transport with a 20 second deadline.
//
// options supplies either a ready-made request (options.Req) or a URL string
// (options.URL) from which a GET request is built. Note that a request passed
// in options.Req is modified in place: its URL scheme (and, on a 400/404
// retry, an empty path) may be rewritten.
//
// Behaviour worth knowing:
//   - options.KnownProtocol ("http"/"https", optional trailing ":") overrides the scheme.
//   - With options.RedirectHandling == 1 only a redirect that merely appends
//     a trailing slash is followed; any other redirect is reported as the
//     response body (the error text) with a nil error.
//   - A GET that fails with a certificate/TLS record error is retried over http.
//   - A 400 or 404 response is retried once, with the path set to "/" if it was empty.
//   - Hosts/paths found in TestData are answered from memory without any request.
//
// It returns the response body, an Info carrying the effective URL,
// diagnostic messages and the Last-Modified time, and an error.
// Previously response was returned. Forgot why. Dropped it.
func UrlGetter(gaeReq *http.Request, options Options) (
	[]byte, Info, error,
) {

	// NOTE(review): this unconditionally overrides the caller's LogLevel, so
	// the diagnostic messages below are always collected. It looks like a
	// debugging leftover, but it is kept because callers may read inf.Msg.
	options.LogLevel = 2

	inf := Info{}

	if options.LogLevel > 0 {
		if options.Req != nil {
			inf.Msg += fmt.Sprintf("orig req url: %#v\n", options.Req.URL.String())
		} else {
			inf.Msg += fmt.Sprintf("orig str url: %#v\n", options.URL)
		}
	}

	//
	// Either take provided request
	// Or build one from options.URL
	if options.Req == nil {
		ourl, errURL := URLFromString(options.URL) // Normalize
		if errURL != nil {
			return nil, inf, errURL
		}
		options.URL = ourl.String()
		req, errReq := http.NewRequest("GET", options.URL, nil)
		if errReq != nil {
			return nil, inf, errReq
		}
		options.Req = req
	} else if options.Req.URL.Scheme == "" {
		options.Req.URL.Scheme = "https"
	}
	r := options.Req

	if len(options.KnownProtocol) > 1 {
		// Accept "https:" as well as "https".
		options.KnownProtocol = strings.TrimSuffix(options.KnownProtocol, ":")
		if options.KnownProtocol == "http" || options.KnownProtocol == "https" {
			r.URL.Scheme = options.KnownProtocol
			inf.Msg += fmt.Sprintf("Using known protocol %q\n", options.KnownProtocol)
		}
	}

	//
	// Unify appengine and plain http.Client
	client := &http.Client{}
	if gaeReq == nil {
		client.Timeout = 5 * time.Second // GAE does not allow
	} else {
		c := util_appengine.SafelyExtractGaeContext(gaeReq)
		if c == nil {
			return nil, inf, ErrNoContext
		}

		ctxOld := oldAE.NewContext(gaeReq)
		client = oldFetch.Client(ctxOld)

		// AllowInvalidServerCertificate: true does not prevent
		// urlfetch: SSL_CERTIFICATE_ERROR; it merely leads to
		// err = "DEADLINE_EXCEEDED". Thus it stays false.
		tr := oldFetch.Transport{Context: ctxOld, AllowInvalidServerCertificate: false}
		tr.Deadline = 20 * time.Second // only possible on aeOld
		client.Transport = &tr
		// client.Timeout = 20 * time.Second // also not in google.golang.org/appengine/urlfetch

		// appengine dev server => always fallback to http
		if appengine.IsDevAppServer() && !options.ForceHTTPSEvenOnDevelopmentServer {
			r.URL.Scheme = "http"
		}
	}

	inf.URL = r.URL

	if options.RedirectHandling == 1 {
		client.CheckRedirect = func(req *http.Request, via []*http.Request) error {

			if len(via) == 1 && req.URL.Path == via[0].URL.Path+"/" {
				// allow redirect from /gesundheit to /gesundheit/
				return nil
			}

			// Any other redirect is refused; the error lists the path chain.
			spath := "\n"
			for _, v := range via {
				spath += v.URL.Path + "\n"
			}
			spath += req.URL.Path + "\n"
			return fmt.Errorf("%v %v", MsgNoRdirects, spath)
		}
	}

	// isRefusedRedirect reports whether e stems from the CheckRedirect
	// handler installed above.
	isRefusedRedirect := func(e error) bool {
		return options.RedirectHandling == 1 && strings.Contains(e.Error(), MsgNoRdirects)
	}

	if options.LogLevel > 0 {
		inf.Msg += fmt.Sprintf("url standardized to %q  %q %q \n", r.URL.Scheme, r.URL.Host, r.URL.RequestURI())
	}

	//
	//
	// Respond to test.economist.com directly from memory
	if canned, ok := TestData[r.URL.Host+r.URL.Path]; ok {
		return canned, inf, nil
	}

	// The actual call
	// =============================

	resp, err := client.Do(r)

	// Swallow redirect errors: the error text becomes the body.
	if err != nil && isRefusedRedirect(err) {
		inf.Mod = time.Now().Add(-10 * time.Minute)
		return []byte(err.Error()), inf, nil
	}

	isHTTPSProblem := false
	if err != nil {
		serr := err.Error()
		isHTTPSProblem = strings.Contains(serr, "SSL_CERTIFICATE_ERROR") ||
			strings.Contains(serr, "tls: oversized record received with length")
	}

	// Under narrow conditions => fallback to http
	if err != nil && isHTTPSProblem && r.URL.Scheme == "https" && r.Method == "GET" {
		r.URL.Scheme = "http"
		var err2nd error
		resp, err2nd = client.Do(r)
		// while protocol http may go through
		// next obstacle might be - again - a redirect error:
		if err2nd != nil {
			if isRefusedRedirect(err2nd) {
				inf.Mod = time.Now().Add(-10 * time.Minute)
				addFallBackSuccessInfo(options, &inf, r, err)
				return []byte(err2nd.Error()), inf, nil
			}
			return nil, inf, fmt.Errorf("GET fallback to http failed with %v", err2nd)
		}
		addFallBackSuccessInfo(options, &inf, r, err)
		err = nil // CLEAR error
	}

	//
	// Final error handler
	//
	if err != nil {
		if isHTTPSProblem && r.URL.Scheme == "https" {
			// Not GET but POST:
			// We cannot do a fallback for a post request - the r.Body.Reader is consumed
			return nil, inf, fmt.Errorf("Cannot do https requests. Possible reason: Dev server: %v", err)
		}
		hintAE := ""
		if strings.Contains(
			err.Error(),
			"net/http: Client Transport of type init.failingTransport doesn't support CancelRequest; Timeout not supported",
		) {
			hintAE = "\nDid you forget to submit the AE Request?\n"
		}
		return nil, inf, fmt.Errorf("request failed: %v - %v", err, hintAE)
	}

	//
	// We got response, but
	// explicit bad response from server
	if resp.StatusCode != http.StatusOK {

		if resp.StatusCode != http.StatusBadRequest && // 400
			resp.StatusCode != http.StatusNotFound { // 404
			resp.Body.Close() // do not leak the connection on the error path
			return nil, inf, fmt.Errorf("bad http resp code: %v - %v", resp.StatusCode, r.URL.String())
		}

		dmp := stringspb.IndentedDump(r.URL)

		body, errRd := ioutil.ReadAll(resp.Body)
		resp.Body.Close() // close before any return and before resp is reused
		if errRd != nil {
			return nil, inf, fmt.Errorf("cannot read resp body: %v", errRd)
		}

		// Keep the error message bounded: first 2 kB, a marker, last 100 bytes.
		const headLen, tailLen = 2 * 1024, 100
		const marker = " ...omitted... "
		if len(body) > headLen {
			trimmed := make([]byte, 0, headLen+len(marker)+tailLen)
			trimmed = append(trimmed, body[:headLen]...)
			trimmed = append(trimmed, marker...)
			trimmed = append(trimmed, body[len(body)-tailLen:]...)
			body = trimmed
		}

		// Description of the first failure, attached to any follow-up error.
		err2 := fmt.Errorf("resp %v: %v \n%v \n<pre>%s</pre>", resp.StatusCode, r.URL.String(), dmp, body)

		// Retry once; an empty path is replaced by "/".
		if r.URL.Path == "" {
			r.URL.Path = "/"
		}
		var err2nd error
		resp, err2nd = client.Do(r)
		if err2nd != nil {
			return nil, inf, fmt.Errorf("again error %v \n%v", err2nd, err2)
		}
		if resp.StatusCode != http.StatusOK {
			inf.Status = resp.StatusCode
			resp.Body.Close()
			return nil, inf, fmt.Errorf("again Status NotOK %v \n%v", resp.StatusCode, err2)
		}
		log.Printf("successful retry with '/' to %v after %v\n", r.URL.String(), err2)
	}

	// Deferred before reading, so the body is closed even if reading fails.
	defer resp.Body.Close()
	bts, errRd := ioutil.ReadAll(resp.Body)
	if errRd != nil {
		return nil, inf, fmt.Errorf("cannot read resp body: %v", errRd)
	}

	// time stamp
	var tlm time.Time // time last modified; stays zero if absent or unparsable
	if lm := resp.Header.Get("Last-Modified"); lm != "" {
		// Last-Modified: Sat, 29 Aug 2015 21:15:39 GMT
		if t, errParse := time.Parse(time.RFC1123, lm); errParse == nil {
			tlm = t
		} else if t, errParse := time.Parse(time.RFC1123Z, lm); errParse == nil {
			tlm = t // with numeric time zone
		}
	}
	inf.Mod = tlm
	// log.Printf("    hdr  %v %v\n", lm, tlm.Format(time.ANSIC))

	return bts, inf, nil

}
Esempio n. 2
0
// Get extracts the app engine context from request r via
// util_appengine.SafelyExtractGaeContext and returns whatever
// GetByContext yields for that context.
func Get(r *http.Request) *Instance {
	return GetByContext(util_appengine.SafelyExtractGaeContext(r))
}
Esempio n. 3
0
// fetchSave returns the content for m.SURL, taking it from the local file
// system m.fs1 when a sufficiently fresh copy exists and fetching (and then
// saving) it otherwise.
//
// A stored file is reused when it is a regular file not older than 10 hours.
// Otherwise the URL is fetched with fetch.UrlGetter; a failed fetch whose
// inf.Status is http.StatusNotFound is not treated as an error, a
// placeholder body is stored instead. Bodies of 1 MiB or more are passed
// through removeScriptsAndComments before being written.
//
// Link extraction, link addition to treeX now accumulated one level higher.
//
// Return values: the content, its modification time, a bool that is true
// when the existing file was used, and an error. Errors from creating
// directories or writing the file are only logged via m.lg, not returned.
func fetchSave(m *MyWorker) ([]byte, time.Time, bool, error) {

	// w http.ResponseWriter,
	// r *http.Request,

	// Determine FileName
	ourl, err := fetch.URLFromString(m.SURL)
	if err != nil {
		// Without a parsed URL there is no host to derive defaults from;
		// previously this error was ignored and ourl was used regardless.
		return []byte{}, time.Time{}, false, err
	}
	fc := FetchCommand{}
	fc.Host = ourl.Host
	fc = addDefaults(fc)
	semanticUri := condenseTrailingDir(m.SURL, fc.CondenseTrailingDirs)
	fn := path.Join(docRoot, semanticUri)

	m.lg("crawlin %q", m.SURL)

	// File already exists?
	// Open file for age check
	var bts []byte
	var mod time.Time
	readFresh := func() error {
		file1, err := m.fs1.Open(fn)
		if err != nil {
			return err // file may simply not exist
		}
		defer file1.Close() // file close *fast* at the end of *this* anonymous func

		fi, err := file1.Stat()
		m.lg(err)
		if err != nil {
			return err
		}

		if fi.IsDir() {
			m.lg("\t\t file is a directory, skipping - %v", fn)
			return fmt.Errorf("is directory: %v", fn)
		}

		mod = fi.ModTime()
		age := time.Since(mod)
		if age.Hours() > 10 {
			m.lg("\t\t file %4.2v hours old, refetch ", age.Hours())
			return fmt.Errorf("too old: %v", fn)
		}

		m.lg("\t\t file only %4.2v hours old, take %4.2vkB from datastore", age.Hours(), fi.Size()/1024)
		bts, err = ioutil.ReadAll(file1)
		return err
	}

	// Any error here (missing, directory, too old, unreadable) means: refetch.
	if err = readFresh(); err == nil {
		return bts, mod, true, nil
	}

	//
	// Fetch
	bts, inf, err := fetch.UrlGetter(m.r, fetch.Options{URL: m.SURL, KnownProtocol: m.Protocol, RedirectHandling: 1})
	m.lg(err)
	if err != nil {
		if inf.Status != http.StatusNotFound {
			m.lg("tried to fetch %v, %v", m.SURL, inf.URL)
			m.lg("msg %v", inf.Msg)
			return []byte{}, inf.Mod, false, err
		}
		// In our traversing upwards, we might encounter "directory links" that have no index.html.
		// For a *derived* URL, this is no error.
		bts = []byte(" ... not found ... ")
	}
	if inf.Mod.IsZero() {
		// No usable modification time from the fetch: pretend 75 minutes ago.
		inf.Mod = time.Now().Add(-75 * time.Minute)
	}

	//
	//
	// main request still exists?
	// NOTE(review): disabled check, left in place because it may be the only
	// use of the context import in this file - confirm before deleting.
	if false {
		var cx context.Context
		cx = util_appengine.SafelyExtractGaeContext(m.r)
		if cx == nil {
			m.lg("timed out - returning")
			return bts, inf.Mod, false, fmt.Errorf("req timed out")
		}
	}

	m.lg("retrivd+saved %q; %vkB ", inf.URL.Host+inf.URL.Path, len(bts)/1024)

	if len(bts) > 1024*1024-1 {
		bts = removeScriptsAndComments(m.lg, bts)
		m.lg("size reduced_1 to %vkB ", len(bts)/1024)

		// if len(bts) > 1024*1024-1 {
		// 	bts = snappy.Encode(nil, bts)
		// 	fn = strings.Replace(fn, ".html", ".snap.html", -1)
		// 	m.lg("size reduced_2 to %vkB ", len(bts)/1024)
		// }
	}

	//
	// Save. Each step is best effort: failures are logged, not returned.
	dir := path.Dir(fn)
	err = m.fs1.MkdirAll(dir, 0755)
	m.lg(err)
	now := time.Now()
	err = m.fs1.Chtimes(dir, now, now)
	m.lg(err)
	err = m.fs1.WriteFile(fn, bts, 0644)
	m.lg(err)
	err = m.fs1.Chtimes(fn, inf.Mod, inf.Mod)
	m.lg(err)

	return bts, inf.Mod, false, nil

}