// Init - init request structure func (r *request) Init(logger zap.Logger) { r.client = new(http.Client) r.logger = logger r.client.CheckRedirect = func(req *http.Request, via []*http.Request) error { if len(via) >= 10 { return errors.New("stopped after 10 redirects") } if len(via) == 0 { return nil } r.meta.SetState(database.StateDublicate) r.meta.SetStatusCode(301) hostID, _ := r.hostMng.CheckURL(req.URL) copyURL := *req.URL r.meta = proxy.NewMeta(hostID, NormalizeURL(©URL), r.meta) if !hostID.Valid { r.meta.SetState(database.StateExternal) } for attr, val := range via[0].Header { if _, ok := req.Header[attr]; !ok { req.Header[attr] = val } } return nil } }
func (r *request) get(u *url.URL) (int64, error) { urlStr := u.String() r.urls = make(map[string]sql.NullInt64) hostID, robotOk := r.hostMng.CheckURL(u) r.meta = proxy.NewMeta(hostID, urlStr, nil) if !hostID.Valid { r.meta.SetState(database.StateExternal) } if !robotOk { r.meta.SetState(database.StateDisabledByRobotsTxt) log.Printf("INFO: URL %s blocked by robot.txt", urlStr) return 0, nil } startTime := time.Now() request := &http.Request{ Method: "GET", URL: u, Proto: "HTTP/1.1", ProtoMajor: 1, ProtoMinor: 1, Header: map[string][]string{ "User-Agent": {"Mozilla/5.0 (compatible; GoWebSearch/0.1)"}, "Accept": {"text/html;q=0.9,*/*;q=0.1"}, "Accept-Encoding": {"gzip;q=0.9,identity;q=0.5,*;q=0.1"}, "Accept-Language": {"ru-RU,ru;q=0.9,en-US;q=0.2,en;q=0.1"}, "Accept-Charset": {"utf-8;q=0.9,windows-1251;q=0.8,koi8-r;q=0.7,*;q=0.1"}, }, Body: nil, Host: u.Host, } response, err := r.client.Do(request) if err != nil { r.meta.SetState(database.StateConnectError) return 0, err } if r.meta.GetState() != database.StateSuccess { // here or early - logging!!! return 0, nil } // hostID, robotOk := r.hostMng.CheckURL(u) // if !robotOk { // r.meta.SetState(database.StateDisabledByRobotsTxt) // return fmt.Errorf("INFO: URL %s blocked by robot.txt", NormalizeURL(©URL)) // } loggerURL := r.logger.With(zap.String("url", r.meta.GetURL())) RequestDurationMs := int64(time.Since(startTime) / time.Millisecond) loggerURL.Debug(DbgRequestDuration, zap.Int64("duration", RequestDurationMs)) parser := newResponseParser(loggerURL, r.hostMng, r.meta) err = parser.Run(response) if err == nil { r.urls = parser.URLs } else { werrors.LogError(loggerURL, err) err = nil } return parser.BodyDurationMs, err }