Example 1
// Init initializes the request structure: it stores the logger, creates the
// HTTP client, and installs the redirect policy.
func (r *request) Init(logger zap.Logger) {
	r.client = new(http.Client)
	r.logger = logger
	r.client.CheckRedirect = func(req *http.Request, via []*http.Request) error {
		// Give up after ten redirects to avoid redirect loops.
		if len(via) >= 10 {
			return errors.New("stopped after 10 redirects")
		}
		// Nothing to record when no redirect has happened yet.
		if len(via) == 0 {
			return nil
		}

		// The previous URL is a redirect source: mark it as a duplicate
		// and record the redirect status before switching to the target.
		r.meta.SetState(database.StateDublicate)
		r.meta.SetStatusCode(301)

		// Resolve the redirect target's host and open a new meta record
		// for it, linked to the previous one.
		hostID, _ := r.hostMng.CheckURL(req.URL)
		copyURL := *req.URL
		r.meta = proxy.NewMeta(hostID, NormalizeURL(&copyURL), r.meta)
		if !hostID.Valid {
			// Unknown host: the target lies outside the crawled set.
			r.meta.SetState(database.StateExternal)
		}

		// Carry over headers from the original request that the default
		// client drops when it builds the redirected request.
		for attr, val := range via[0].Header {
			if _, ok := req.Header[attr]; !ok {
				req.Header[attr] = val
			}
		}

		return nil
	}
}
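
For reference, a self-contained sketch of the same CheckRedirect technique outside the project; the client, the redirect limit, and the target URL below are illustrative, not taken from the code above:

package main

import (
	"errors"
	"fmt"
	"net/http"
)

func main() {
	// Client that refuses to follow more than 3 redirects
	// (the Init above allows 10) and re-applies the original headers.
	client := &http.Client{
		CheckRedirect: func(req *http.Request, via []*http.Request) error {
			if len(via) >= 3 {
				return errors.New("stopped after 3 redirects")
			}
			// Carry over headers from the first request that the
			// default client drops on redirect.
			for attr, val := range via[0].Header {
				if _, ok := req.Header[attr]; !ok {
					req.Header[attr] = val
				}
			}
			return nil
		},
	}

	resp, err := client.Get("https://example.com/")
	if err != nil {
		fmt.Println("request failed:", err)
		return
	}
	defer resp.Body.Close()
	fmt.Println("final status:", resp.Status)
}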
Example 2
func (r *request) get(u *url.URL) (int64, error) {
	urlStr := u.String()
	// Reset the set of URLs discovered by this request.
	r.urls = make(map[string]sql.NullInt64)
	// Resolve the host and check whether robots.txt allows the URL.
	hostID, robotOk := r.hostMng.CheckURL(u)
	r.meta = proxy.NewMeta(hostID, urlStr, nil)

	if !hostID.Valid {
		r.meta.SetState(database.StateExternal)
	}
	if !robotOk {
		r.meta.SetState(database.StateDisabledByRobotsTxt)
		log.Printf("INFO: URL %s blocked by robots.txt", urlStr)
		return 0, nil
	}

	startTime := time.Now()
	// Build the GET request by hand so the crawler controls every header.
	request := &http.Request{
		Method:     "GET",
		URL:        u,
		Proto:      "HTTP/1.1",
		ProtoMajor: 1,
		ProtoMinor: 1,
		Header: map[string][]string{
			"User-Agent":      {"Mozilla/5.0 (compatible; GoWebSearch/0.1)"},
			"Accept":          {"text/html;q=0.9,*/*;q=0.1"},
			"Accept-Encoding": {"gzip;q=0.9,identity;q=0.5,*;q=0.1"},
			"Accept-Language": {"ru-RU,ru;q=0.9,en-US;q=0.2,en;q=0.1"},
			"Accept-Charset":  {"utf-8;q=0.9,windows-1251;q=0.8,koi8-r;q=0.7,*;q=0.1"},
		},
		Body: nil,
		Host: u.Host,
	}

	response, err := r.client.Do(request)
	if err != nil {
		r.meta.SetState(database.StateConnectError)
		return 0, err
	}
	// Release the body even on the early return below.
	defer response.Body.Close()

	if r.meta.GetState() != database.StateSuccess {
		// TODO: log non-success states here or where they are set.
		return 0, nil
	}

	loggerURL := r.logger.With(zap.String("url", r.meta.GetURL()))

	requestDurationMs := int64(time.Since(startTime) / time.Millisecond)
	loggerURL.Debug(DbgRequestDuration, zap.Int64("duration", requestDurationMs))

	parser := newResponseParser(loggerURL, r.hostMng, r.meta)
	if err := parser.Run(response); err != nil {
		// Parse errors are logged but not propagated; the caller only
		// needs the body-processing duration.
		werrors.LogError(loggerURL, err)
	} else {
		r.urls = parser.URLs
	}

	return parser.BodyDurationMs, nil
}
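
As a standalone illustration of the same approach, the sketch below builds an http.Request by hand with an explicit header set and times the round trip; the URL and the trimmed header list are placeholders, not the project's actual configuration:

package main

import (
	"fmt"
	"net/http"
	"net/url"
	"time"
)

func main() {
	u, err := url.Parse("https://example.com/")
	if err != nil {
		panic(err)
	}

	// Hand-built GET request, as in get above: every header is set
	// explicitly so the client controls content negotiation.
	req := &http.Request{
		Method:     "GET",
		URL:        u,
		Proto:      "HTTP/1.1",
		ProtoMajor: 1,
		ProtoMinor: 1,
		Header: http.Header{
			"User-Agent": {"Mozilla/5.0 (compatible; GoWebSearch/0.1)"},
			"Accept":     {"text/html;q=0.9,*/*;q=0.1"},
		},
		Host: u.Host,
	}

	start := time.Now()
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		fmt.Println("connect error:", err)
		return
	}
	defer resp.Body.Close()

	// Round-trip time in milliseconds, like requestDurationMs above.
	fmt.Printf("status %d in %d ms\n", resp.StatusCode, int64(time.Since(start)/time.Millisecond))
}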