Exemple #1
0
// SeedFromRobots adds to the queue one URL for each path returned by
// GetAllPaths on the robots.txt data of every URL in scope.
//
// The robots.txt data is fetched with robots.GetRobotsForURL using
// clientFactory. If the fetch fails for a scope URL, a warning is logged and
// that URL is skipped; the remaining scope URLs are still processed.
func (q *WorkQueue) SeedFromRobots(scope []*url.URL, clientFactory client.ClientFactory) {
	for _, base := range scope {
		data, err := robots.GetRobotsForURL(base, clientFactory)
		if err != nil {
			logging.Logf(logging.LogWarning, "Unable to get robots.txt data: %s", err)
			continue
		}
		for _, p := range data.GetAllPaths() {
			// Work on a copy of the scope URL so that only the path is replaced.
			target := new(url.URL)
			*target = *base
			target.Path = p
			// No scope check here; the filter will handle out-of-scope URLs.
			q.AddURLs(base.ResolveReference(target))
		}
	}
}
Exemple #2
0
// AddRobotsFilter registers with the filter one URL for each entry returned
// by GetForUserAgent (called with f.settings.UserAgent) on the robots.txt
// data of every URL in scope.
//
// The robots.txt data is fetched with robots.GetRobotsForURL using
// clientFactory. If the fetch fails for a scope URL, a warning is logged and
// that URL is skipped; the remaining scope URLs are still processed.
func (f *WorkFilter) AddRobotsFilter(scope []*url.URL, clientFactory client.ClientFactory) {
	for _, base := range scope {
		logging.Logf(logging.LogDebug, "Getting robots.txt exclusions for %s", base)
		data, err := robots.GetRobotsForURL(base, clientFactory)
		if err != nil {
			logging.Logf(logging.LogWarning, "Unable to get robots.txt data: %s", err)
			continue
		}
		for _, entry := range data.GetForUserAgent(f.settings.UserAgent) {
			// Work on a copy of the scope URL so that only the path is replaced.
			blocked := new(url.URL)
			*blocked = *base
			blocked.Path = entry
			logging.Logf(logging.LogDebug, "Disallowing URL by robots: %s", blocked)
			f.FilterURL(blocked)
		}
	}
}