// AskRobots fetches and parses robots.txt for the URL's host and reports
// whether this worker's User-Agent is allowed to fetch the URL.
func (w *Worker) AskRobots(url *url.URL) (bool, *heroshi.FetchResult) {
	robots_url_str := fmt.Sprintf("%s://%s/robots.txt", url.Scheme, url.Host)
	robots_url, err := url.Parse(robots_url_str)
	if err != nil {
		return false, heroshi.ErrorResult(url, err.Error())
	}

	fetch_result := w.Fetch(robots_url)
	if !fetch_result.Success {
		fetch_result.Status = "Robots download error: " + fetch_result.Status
		return false, fetch_result
	}

	var robots *robotstxt.RobotsData
	robots, err = robotstxt.FromStatusAndBytes(fetch_result.StatusCode, fetch_result.Body)
	if err != nil {
		fetch_result.Status = "Robots parse error: " + err.Error()
		return false, fetch_result
	}

	allow := robots.TestAgent(url.Path, w.UserAgent)
	if !allow {
		return allow, heroshi.ErrorResult(url, "Robots disallow")
	}
	return allow, nil
}
// Download fetches url once and returns the result, whatever it is.
// This function WILL NOT follow redirects.
func (w *Worker) Download(url *url.URL) (result *heroshi.FetchResult) {
	w.hostLimits.Acquire(url.Host, w.HostConcurrency)
	defer w.hostLimits.Release(url.Host)

	req, err := http.NewRequest("GET", url.String(), nil)
	if err != nil {
		return heroshi.ErrorResult(url, err.Error())
	}
	req.Header.Set("User-Agent", w.UserAgent)

	options := &heroshi.RequestOptions{
		ConnectTimeout:   w.ConnectTimeout,
		ReadTimeout:      w.IOTimeout,
		WriteTimeout:     w.IOTimeout,
		ReadLimit:        w.ReadLimit,
		KeepaliveTimeout: w.KeepaliveTimeout,
		Stat:             new(heroshi.RequestStat),
	}
	result = heroshi.Fetch(w.transport, req, options, w.FetchTimeout)
	if w.SkipBody {
		result.Body = nil
	}
	result.Stat = options.Stat
	w.transport.CloseIdleConnections(false)
	return result
}
// Fetch checks robots.txt (unless skipped), downloads the URL and follows
// up to w.FollowRedirects redirects. Total time spent is recorded in the result.
func (w *Worker) Fetch(url *url.URL) (result *heroshi.FetchResult) {
	original_url := url
	started := time.Now()
	defer func() {
		if result != nil {
			ended := time.Now()
			result.TotalTime = uint((ended.Sub(started)) / time.Millisecond)
		}
	}()

	for redirect := uint(0); redirect <= w.FollowRedirects; redirect++ {
		if url.Scheme == "" || url.Host == "" {
			return heroshi.ErrorResult(url, "Incorrect URL: "+url.String())
		}

		// /robots.txt itself is always allowed; check everything else.
		if !w.SkipRobots && url.Path != "/robots.txt" {
			var allow bool
			allow, result = w.AskRobots(url)
			if !allow {
				return result
			}
		}

		//result = w.CacheOrDownload(url)
		result = w.Download(url)

		if ShouldRedirect(result.StatusCode) {
			location := result.Headers.Get("Location")
			var err error
			url, err = url.Parse(location)
			if err != nil {
				return heroshi.ErrorResult(original_url, err.Error())
			}
			continue
		}

		// No redirect required.
		return result
	}
	return result
}
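// ShouldRedirect is not defined in this file. A plausible sketch, assuming it
// only treats the usual redirect status codes as redirects; the real helper
// may differ.
func ShouldRedirect(statusCode int) bool {
	switch statusCode {
	case http.StatusMovedPermanently, http.StatusFound,
		http.StatusSeeOther, http.StatusTemporaryRedirect:
		return true
	}
	return false
}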
// stdinReader reads one URL per line from stdin. Parsed URLs go to the urls
// channel; lines that fail to parse are reported as errors immediately.
func stdinReader(stop chan bool) {
	defer func() {
		stop <- true
	}()

	var line string
	var u *url.URL
	var err error
	stdinReader := bufio.NewReader(os.Stdin)
	for {
		lineBytes, readErr := stdinReader.ReadBytes('\n')
		if readErr != nil && readErr != io.EOF {
			panic("stdin read error: " + readErr.Error())
		}

		lineBytes = bytes.TrimSpace(lineBytes)
		if len(lineBytes) == 0 {
			goto Next
		}

		line = string(lineBytes)
		u, err = url.Parse(line)
		if err != nil {
			u = &url.URL{Host: line}
			result := heroshi.ErrorResult(u, err.Error())
			reportJson, _ := encodeResult(line, result)
			reports <- reportJson
		} else {
			urls <- u
		}

	Next:
		if readErr == io.EOF {
			return
		}
	}
}
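// Hypothetical glue, not part of this file: one way stdinReader's output could
// be consumed by a fetching goroutine. The package-level channels are assumed
// to be declared elsewhere, with types inferred from the sends above
// (urls <- u, reports <- reportJson); the worker construction and channel
// sizes are likewise assumptions:
//
//	var urls = make(chan *url.URL, 1000)
//	var reports = make(chan []byte, 1000)
func fetchLoop(w *Worker) {
	for u := range urls {
		result := w.Fetch(u)
		// encodeResult is the same helper used in stdinReader above.
		if reportJson, err := encodeResult(u.String(), result); err == nil {
			reports <- reportJson
		}
	}
}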