Example #1
func (w *Worker) AskRobots(url *url.URL) (bool, *heroshi.FetchResult) {
	robots_url_str := fmt.Sprintf("%s://%s/robots.txt", url.Scheme, url.Host)
	robots_url, err := url.Parse(robots_url_str)
	if err != nil {
		return false, heroshi.ErrorResult(url, err.Error())
	}

	fetch_result := w.Fetch(robots_url)

	if !fetch_result.Success {
		fetch_result.Status = "Robots download error: " + fetch_result.Status
		return false, fetch_result
	}

	var robots *robotstxt.RobotsData
	robots, err = robotstxt.FromStatusAndBytes(fetch_result.StatusCode, fetch_result.Body)
	if err != nil {
		fetch_result.Status = "Robots parse error: " + err.Error()
		return false, fetch_result
	}

	allow := robots.TestAgent(url.Path, w.UserAgent)
	if !allow {
		return allow, heroshi.ErrorResult(url, "Robots disallow")
	}

	return allow, nil
}
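AskRobots above delegates the actual robots.txt logic to the robotstxt package. For reference, here is a minimal self-contained sketch of the same parse-and-check pattern outside of Worker; the import path, robots body and agent string below are illustrative assumptions, not taken from heroshi:

package main

import (
	"fmt"

	"github.com/temoto/robotstxt"
)

func main() {
	body := []byte("User-agent: *\nDisallow: /private/\n")

	// FromStatusAndBytes picks default rules from the HTTP status code for
	// non-2xx responses and parses the body otherwise, as AskRobots does.
	robots, err := robotstxt.FromStatusAndBytes(200, body)
	if err != nil {
		panic(err)
	}

	fmt.Println(robots.TestAgent("/private/page", "my-crawler")) // false
	fmt.Println(robots.TestAgent("/public/page", "my-crawler"))  // true
}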
Example #2
// Download fetches url and returns whatever the result was.
// It WILL NOT follow redirects.
func (w *Worker) Download(url *url.URL) (result *heroshi.FetchResult) {
	w.hostLimits.Acquire(url.Host, w.HostConcurrency)
	defer w.hostLimits.Release(url.Host)

	req, err := http.NewRequest("GET", url.String(), nil)
	if err != nil {
		return heroshi.ErrorResult(url, err.Error())
	}
	req.Header.Set("User-Agent", w.UserAgent)

	options := &heroshi.RequestOptions{
		ConnectTimeout:   w.ConnectTimeout,
		ReadTimeout:      w.IOTimeout,
		WriteTimeout:     w.IOTimeout,
		ReadLimit:        w.ReadLimit,
		KeepaliveTimeout: w.KeepaliveTimeout,
		Stat:             new(heroshi.RequestStat),
	}
	result = heroshi.Fetch(w.transport, req, options, w.FetchTimeout)
	if w.SkipBody {
		result.Body = nil
	}
	result.Stat = options.Stat
	w.transport.CloseIdleConnections(false)

	return result
}
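RequestOptions and heroshi.Fetch above are project-specific. As a rough standard-library analogue of what Download does for a single GET (no redirects, custom User-Agent, bounded read), meant as an illustration rather than heroshi's implementation, and with placeholder URL, agent and limits:

package main

import (
	"fmt"
	"io"
	"net/http"
	"time"
)

func download(rawurl, userAgent string, readLimit int64) ([]byte, int, error) {
	client := &http.Client{
		Timeout: 30 * time.Second, // rough stand-in for FetchTimeout
		// Do not follow redirects, like Worker.Download.
		CheckRedirect: func(req *http.Request, via []*http.Request) error {
			return http.ErrUseLastResponse
		},
	}

	req, err := http.NewRequest("GET", rawurl, nil)
	if err != nil {
		return nil, 0, err
	}
	req.Header.Set("User-Agent", userAgent)

	resp, err := client.Do(req)
	if err != nil {
		return nil, 0, err
	}
	defer resp.Body.Close()

	// Rough stand-in for ReadLimit: never read more than readLimit bytes.
	body, err := io.ReadAll(io.LimitReader(resp.Body, readLimit))
	return body, resp.StatusCode, err
}

func main() {
	body, status, err := download("https://example.com/", "my-crawler/0.1", 1<<20)
	fmt.Println(status, len(body), err)
}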
Example #3
func (w *Worker) Fetch(url *url.URL) (result *heroshi.FetchResult) {
	original_url := url
	started := time.Now()
	defer func() {
		if result != nil {
			ended := time.Now()
			result.TotalTime = uint((ended.Sub(started)) / time.Millisecond)
		}
	}()

	for redirect := uint(0); redirect <= w.FollowRedirects; redirect++ {
		if url.Scheme == "" || url.Host == "" {
			return heroshi.ErrorResult(url, "Incorrect URL: "+url.String())
		}

		// /robots.txt itself is always allowed; check robots rules for everything else.
		if !w.SkipRobots && url.Path != "/robots.txt" {
			var allow bool
			allow, result = w.AskRobots(url)
			if !allow {
				return result
			}
		}

		//result = w.CacheOrDownload(url)
		result = w.Download(url)
		if ShouldRedirect(result.StatusCode) {
			location := result.Headers.Get("Location")
			var err error
			url, err = url.Parse(location)
			if err != nil {
				return heroshi.ErrorResult(original_url, err.Error())
			}
			continue
		}

		// no redirects required
		return result
	}
	return result
}
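ShouldRedirect is used above but is not part of this listing. A plausible sketch of such a helper, assuming it mirrors the status codes net/http treats as GET redirects (a guess, not heroshi's verified implementation):

// ShouldRedirect reports whether the response status asks the client to retry
// the request at the URL from the Location header (301, 302, 303, 307).
func ShouldRedirect(statusCode int) bool {
	switch statusCode {
	case 301, 302, 303, 307:
		return true
	}
	return false
}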
func stdinReader(stop chan bool) {
	defer func() { stop <- true }()

	var line string
	var u *url.URL
	var err error
	stdinReader := bufio.NewReader(os.Stdin)
	for {
		lineBytes, readErr := stdinReader.ReadBytes('\n')
		if readErr != nil && readErr != io.EOF {
			panic("At ReadBytes")
			return
		}

		lineBytes = bytes.TrimSpace(lineBytes)
		if len(lineBytes) == 0 {
			goto Next
		}
		line = string(lineBytes)

		u, err = url.Parse(line)
		if err != nil {
			u = &url.URL{
				Host: line,
			}
			result := heroshi.ErrorResult(u, err.Error())
			reportJson, _ := encodeResult(line, result)
			reports <- reportJson
		} else {
			urls <- u
		}

	Next:
		if readErr == io.EOF {
			return
		}
	}
}
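stdinReader writes to the package-level urls and reports channels, which are declared elsewhere. A minimal sketch of how it could be wired up, with the channel element types inferred from the usage above (both declarations are assumptions):

// Assumed package-level declarations, inferred from how stdinReader uses them.
var (
	urls    = make(chan *url.URL, 1000)
	reports = make(chan []byte, 1000)
)

func main() {
	stop := make(chan bool)
	go stdinReader(stop)

	// Other goroutines would consume urls (feeding Worker.Fetch) and print
	// each report from reports to stdout.

	<-stop // stdinReader sends true once stdin is exhausted
}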