func ExampleCrawl() {
	// Set custom options
	opts := gocrawl.NewOptions(new(ExampleExtender))

	// You should always set your robot name so that it looks for the most
	// specific rules possible in robots.txt.
	opts.RobotUserAgent = "Example"
	// Reflect that name in the user-agent string used to make requests,
	// ideally with a link so site owners can contact you if there's an issue.
	opts.UserAgent = "Mozilla/5.0 (compatible; Example/1.0; +http://example.com)"

	opts.CrawlDelay = 1 * time.Second
	opts.LogFlags = gocrawl.LogAll

	// Play nice with ddgo when running the test!
	opts.MaxVisits = 2

	// Create crawler and start at root of duckduckgo
	c := gocrawl.NewCrawlerWithOptions(opts)
	c.Run("https://duckduckgo.com/")

	// Remove "x" before Output: to activate the example (will run on go test)

	// xOutput: voluntarily fail to see log output
}
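// ExampleCrawl above assumes an ExampleExtender type; a minimal sketch,
// following the extender pattern from gocrawl's documentation (imports of
// net/http, regexp and github.com/PuerkitoBio/goquery are assumed, and the
// rxOk pattern is illustrative):

// Only enqueue the root and paths beginning with an "a".
var rxOk = regexp.MustCompile(`https://duckduckgo\.com(/a.*)?$`)

// ExampleExtender embeds gocrawl.DefaultExtender so that only the methods we
// care about need to be overridden.
type ExampleExtender struct {
	gocrawl.DefaultExtender
}

// Visit is called for each fetched page; returning (nil, true) lets gocrawl
// find and enqueue the page's links itself.
func (x *ExampleExtender) Visit(ctx *gocrawl.URLContext, res *http.Response, doc *goquery.Document) (interface{}, bool) {
	// Use the goquery document or res.Body to extract data here.
	return nil, true
}

// Filter decides whether a URL is enqueued: skip already-visited URLs and
// anything that doesn't match rxOk.
func (x *ExampleExtender) Filter(ctx *gocrawl.URLContext, isVisited bool) bool {
	return !isVisited && rxOk.MatchString(ctx.NormalizedURL().String())
}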
func CustomCrawl() {
	opts := gocrawl.NewOptions(new(CustomExtender))
	opts.CrawlDelay = 3 * time.Second

	c := gocrawl.NewCrawlerWithOptions(opts)
	c.Run("http://www.douban.com/photos/album/75978669/?start=0")
}
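// CustomExtender is not shown with this snippet; a plausible sketch, assuming
// the crawl scrapes image sources from the douban album pages (the selector
// and the printing are hypothetical):
type CustomExtender struct {
	gocrawl.DefaultExtender
}

func (e *CustomExtender) Visit(ctx *gocrawl.URLContext, res *http.Response, doc *goquery.Document) (interface{}, bool) {
	// Print every image source found on the page.
	doc.Find("img").Each(func(_ int, s *goquery.Selection) {
		if src, ok := s.Attr("src"); ok {
			fmt.Println(src)
		}
	})
	return nil, true
}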
func main() {
	opts := gocrawl.NewOptions(new(ExampleExtender))
	opts.CrawlDelay = 0
	opts.LogFlags = gocrawl.LogNone
	opts.SameHostOnly = false
	// opts.MaxVisits = 4

	c := gocrawl.NewCrawlerWithOptions(opts)
	// c.Run(gocrawl.S{"https://duckduckgo.com/": DEPTH})
	// c.Run(gocrawl.S{"http://cellipede.com:4235/": DEPTH})
	c.Run(gocrawl.S{"http://cellipede.com/": DEPTH})
}
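// gocrawl.S is gocrawl's seed type: a map from seed URL to an arbitrary state
// value, and DEPTH is a constant defined elsewhere in this program. The state
// travels with the URL and can be read back inside the extender; a minimal
// sketch of that, assuming DEPTH is an int (the original extender's Visit is
// not shown):
func (x *ExampleExtender) Visit(ctx *gocrawl.URLContext, res *http.Response, doc *goquery.Document) (interface{}, bool) {
	if depth, ok := ctx.State().(int); ok {
		fmt.Printf("visiting %s with depth state %d\n", ctx.URL(), depth)
	}
	return nil, true
}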
func main() {
	ext := &Ext{&gocrawl.DefaultExtender{}}

	// Set custom options
	opts := gocrawl.NewOptions(ext)
	opts.CrawlDelay = 1 * time.Second
	opts.LogFlags = gocrawl.LogError
	opts.SameHostOnly = false
	opts.MaxVisits = 100

	c := gocrawl.NewCrawlerWithOptions(opts)
	c.Run("http://0value.com")
}
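// The literal &Ext{&gocrawl.DefaultExtender{}} implies Ext embeds a
// *gocrawl.DefaultExtender; a sketch consistent with that (the Visit body is
// hypothetical):
type Ext struct {
	*gocrawl.DefaultExtender
}

func (e *Ext) Visit(ctx *gocrawl.URLContext, res *http.Response, doc *goquery.Document) (interface{}, bool) {
	fmt.Printf("visited: %s\n", ctx.URL())
	return nil, true
}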
func ExampleCrawl() {
	// Set custom options
	opts := gocrawl.NewOptions(new(ExampleExtender))
	opts.CrawlDelay = 1 * time.Second
	opts.LogFlags = gocrawl.LogAll

	// Play nice with ddgo when running the test!
	opts.MaxVisits = 2

	// Create crawler and start at root of duckduckgo
	c := gocrawl.NewCrawlerWithOptions(opts)
	c.Run("https://duckduckgo.com/")
}
func crawlSite(siteConfig SiteConfig) <-chan string {
	files := make(chan string, 10)

	crawler := new(CrawlerExtender)
	crawler.files = files
	crawler.Section = siteConfig.Section
	crawler.outDir = outDir
	crawler.skips = siteConfig.Skip
	crawler.isSectionLinks = siteConfig.IsSectionLinks

	opts := gocrawl.NewOptions(crawler)
	opts.CrawlDelay = 1 * time.Second
	opts.MaxVisits = siteConfig.Depth

	c := gocrawl.NewCrawlerWithOptions(opts)

	go func() {
		defer close(files)
		c.Run(siteConfig.Url)
	}()

	return files
}
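// CrawlerExtender, SiteConfig, and the package-level outDir are defined
// elsewhere in this program; a hedged sketch of shapes consistent with the
// fields used above (all field types are assumptions). Presumably the
// extender's Visit writes pages under outDir and sends each written path on
// files:
type SiteConfig struct {
	Section        string
	Url            string
	Depth          int
	Skip           []string
	IsSectionLinks bool
}

type CrawlerExtender struct {
	gocrawl.DefaultExtender
	files          chan string // paths of files written during the crawl
	Section        string
	outDir         string
	skips          []string
	isSectionLinks bool
}

// Typical consumption of the channel returned by crawlSite:
//
//	for f := range crawlSite(cfg) {
//		fmt.Println("saved:", f)
//	}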