// SeedFromRobots seeds the work queue with every path listed in the
// robots.txt of each scope URL.  Out-of-scope paths are left for the
// filter to reject.
func (q *WorkQueue) SeedFromRobots(scope []*url.URL, clientFactory client.ClientFactory) {
	for _, scopeURL := range scope {
		robotsData, err := robots.GetRobotsForURL(scopeURL, clientFactory)
		if err != nil {
			logging.Logf(logging.LogWarning, "Unable to get robots.txt data: %s", err)
			continue
		}
		for _, path := range robotsData.GetAllPaths() {
			pathURL := *scopeURL
			pathURL.Path = path
			// Filter will handle if this is out of scope
			q.AddURLs(scopeURL.ResolveReference(&pathURL))
		}
	}
}
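A note on the copy-then-resolve pattern above: pathURL is a by-value copy of scopeURL, so it already carries the scheme and host, and url.URL.ResolveReference returns an absolute reference with only its path normalized. A minimal, stdlib-only illustration of that pattern (the URLs here are arbitrary examples, not from the source):

package main

import (
	"fmt"
	"net/url"
)

func main() {
	// An arbitrary example scope URL.
	base, err := url.Parse("https://example.com/app/")
	if err != nil {
		panic(err)
	}

	// Copy the URL by value and swap in a path from robots.txt.
	pathURL := *base
	pathURL.Path = "/admin/../secret/"

	// Because pathURL is absolute, ResolveReference keeps its scheme and
	// host and just normalizes the path (dot segments are removed).
	fmt.Println(base.ResolveReference(&pathURL))
	// Output: https://example.com/secret/
}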
// AddRobotsFilter excludes the robots.txt Disallow entries for the
// configured User-Agent on each scope URL from future work.
func (f *WorkFilter) AddRobotsFilter(scope []*url.URL, clientFactory client.ClientFactory) {
	for _, scopeURL := range scope {
		logging.Logf(logging.LogDebug, "Getting robots.txt exclusions for %s", scopeURL)
		robotsData, err := robots.GetRobotsForURL(scopeURL, clientFactory)
		if err != nil {
			logging.Logf(logging.LogWarning, "Unable to get robots.txt data: %s", err)
			continue
		}
		for _, disallowed := range robotsData.GetForUserAgent(f.settings.UserAgent) {
			disallowedURL := *scopeURL
			disallowedURL.Path = disallowed
			logging.Logf(logging.LogDebug, "Disallowing URL by robots: %s", &disallowedURL)
			f.FilterURL(&disallowedURL)
		}
	}
}
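For context, GetForUserAgent presumably yields the Disallow paths from robots.txt groups matching the configured User-Agent. The project's robots package is not shown here, so the following is only a simplified, stdlib-only sketch of that kind of parsing; parseDisallows is a hypothetical helper, and real-world handling (wildcards, Allow lines, multi-agent groups) is more involved:

package main

import (
	"bufio"
	"fmt"
	"strings"
)

// parseDisallows is a hypothetical, simplified parser: it collects the
// Disallow paths from groups whose User-agent is "*" or matches agent.
func parseDisallows(robotsTxt, agent string) []string {
	var paths []string
	matching := false
	scanner := bufio.NewScanner(strings.NewReader(robotsTxt))
	for scanner.Scan() {
		line := strings.TrimSpace(scanner.Text())
		lower := strings.ToLower(line)
		switch {
		case strings.HasPrefix(lower, "user-agent:"):
			ua := strings.TrimSpace(line[len("user-agent:"):])
			matching = ua == "*" || strings.EqualFold(ua, agent)
		case matching && strings.HasPrefix(lower, "disallow:"):
			if p := strings.TrimSpace(line[len("disallow:"):]); p != "" {
				paths = append(paths, p)
			}
		}
	}
	return paths
}

func main() {
	data := "User-agent: *\nDisallow: /admin/\nDisallow: /private/\n"
	fmt.Println(parseDisallows(data, "mybot")) // [/admin/ /private/]
}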