Ejemplo n.º 1
0
// ExampleCrawl runs a short crawl starting at the root of duckduckgo.com with
// ExampleExtender and the LogAll log flags. The trailing output marker is
// deliberately disabled with an "x" prefix; see the note at the end of the
// function.
func ExampleCrawl() {
	options := gocrawl.NewOptions(new(ExampleExtender))

	// Keep the run small and slow so the test stays friendly to duckduckgo:
	// at most two visits, with a one second crawl delay.
	options.MaxVisits = 2
	options.CrawlDelay = time.Second
	options.LogFlags = gocrawl.LogAll

	// Always give the robot a name so that the most specific rules possible
	// are looked up in robots.txt ...
	options.RobotUserAgent = "Example"
	// ... and mirror that name in the user-agent string sent with requests,
	// ideally with a link site owners can use to reach you about a problem.
	options.UserAgent = "Mozilla/5.0 (compatible; Example/1.0; +http://example.com)"

	crawler := gocrawl.NewCrawlerWithOptions(options)
	crawler.Run("https://duckduckgo.com/")

	// To activate this example (it then runs on go test), drop the "x" in
	// front of Output: below.

	// xOutput: voluntarily fail to see log output
}
Ejemplo n.º 2
0
// CustomCrawl crawls a douban.com photo album, starting from its first page,
// using CustomExtender and a CrawlDelay of three seconds.
func CustomCrawl() {
	const albumURL = "http://www.douban.com/photos/album/75978669/?start=0"

	options := gocrawl.NewOptions(new(CustomExtender))
	options.CrawlDelay = 3 * time.Second

	gocrawl.NewCrawlerWithOptions(options).Run(albumURL)
}
// main crawls cellipede.com with ExampleExtender. The crawl runs with no
// crawl delay, the LogNone log flags, and SameHostOnly switched off. The seed
// URL is paired with the DEPTH value, which is defined elsewhere.
func main() {
	options := gocrawl.NewOptions(new(ExampleExtender))
	options.SameHostOnly = false
	options.LogFlags = gocrawl.LogNone
	options.CrawlDelay = 0
	// Visit cap left disabled:
	// options.MaxVisits = 4

	crawler := gocrawl.NewCrawlerWithOptions(options)

	// Alternative seeds kept for reference:
	//   gocrawl.S{"https://duckduckgo.com/": DEPTH}
	//   gocrawl.S{"http://cellipede.com:4235/": DEPTH}
	seeds := gocrawl.S{"http://cellipede.com/": DEPTH}
	crawler.Run(seeds)
}
Ejemplo n.º 4
0
// main crawls starting at 0value.com using an Ext that wraps gocrawl's
// DefaultExtender. The crawl uses a one second crawl delay, the LogError log
// flags, SameHostOnly switched off, and MaxVisits set to 100.
func main() {
	extender := &Ext{new(gocrawl.DefaultExtender)}

	// Custom crawl settings.
	options := gocrawl.NewOptions(extender)
	options.MaxVisits = 100
	options.SameHostOnly = false
	options.LogFlags = gocrawl.LogError
	options.CrawlDelay = time.Second

	crawler := gocrawl.NewCrawlerWithOptions(options)
	crawler.Run("http://0value.com")
}
Ejemplo n.º 5
0
// ExampleCrawl runs a short crawl starting at the root of duckduckgo.com with
// ExampleExtender and the LogAll log flags.
func ExampleCrawl() {
	options := gocrawl.NewOptions(new(ExampleExtender))

	// Be gentle with ddgo when the test runs: two visits at most, with a
	// one second crawl delay.
	options.MaxVisits = 2
	options.CrawlDelay = time.Second
	options.LogFlags = gocrawl.LogAll

	// Build the crawler and seed it with the duckduckgo root page.
	crawler := gocrawl.NewCrawlerWithOptions(options)
	crawler.Run("https://duckduckgo.com/")
}
Ejemplo n.º 6
0
// crawlSite configures a CrawlerExtender from siteConfig and the outDir
// value, then starts a crawl of siteConfig.Url in a background goroutine.
//
// It returns immediately with a receive-only view of a channel buffered for
// ten strings. That channel is handed to the extender through its files
// field and is closed once the crawler's Run call returns.
// NOTE(review): presumably the extender sends paths on this channel as it
// processes pages — confirm against CrawlerExtender.
func crawlSite(siteConfig SiteConfig) <-chan string {
	out := make(chan string, 10)

	ext := new(CrawlerExtender)
	ext.Section = siteConfig.Section
	ext.isSectionLinks = siteConfig.IsSectionLinks
	ext.skips = siteConfig.Skip
	ext.outDir = outDir
	ext.files = out

	options := gocrawl.NewOptions(ext)
	// The configured Depth value is used as the cap on the number of visits.
	options.MaxVisits = siteConfig.Depth
	options.CrawlDelay = time.Second

	runner := gocrawl.NewCrawlerWithOptions(options)
	seed := siteConfig.Url
	go func() {
		// Deferred so the channel is closed however Run exits.
		defer close(out)
		runner.Run(seed)
	}()

	return out
}