Пример #1
0
// crawlManager seeds a crawl with a fixed start URL and then submits a new
// "crawl" task for every not-yet-seen value received from the "found_urls"
// channel.
//
// nodes is forwarded unchanged to both disqchan.NewChan and tasque.NewClient.
// The function blocks in the receive loop and returns only once the receive
// channel is closed.
func crawlManager(nodes []string) {

	// seen is the set of values already handed out as crawl tasks. Keys use
	// the element type delivered by the receive channel.
	// NOTE(review): the seed URL below is not recorded here, so it will be
	// dispatched once more if it is later received — confirm this is intended.
	seen := map[interface{}]struct{}{}

	ch := disqchan.NewChan("found_urls", true, nodes...)
	// NOTE(review): 5*time.Second is presumably a client timeout — confirm
	// against tasque.NewClient.
	client := tasque.NewClient(5*time.Second, nodes...)

	// Kick off the crawl with the initial URL.
	tasque.NewTask("crawl").Set("url", "http://news.google.com").Do(client)

	rc := ch.RecvChan()
	i := 0
	for href := range rc {
		i++
		// Progress report every 100 received values: total received and the
		// number of distinct values seen so far. Printf (not Println) is
		// required for the %d verbs to be interpreted.
		if i%100 == 0 {
			fmt.Printf("%d (%d) urls seen\n", i, len(seen))
		}

		// Skip values that were already dispatched.
		if _, found := seen[href]; found {
			continue
		}
		seen[href] = struct{}{}

		//fmt.Println(href)
		tasque.NewTask("crawl").Set("url", href).Do(client)
	}

}
Пример #2
0
// NewCrawlHandler builds a CrawlHandler that starts with an empty seenUrls
// set, keeps persistDir as its dir, and holds a "found_urls" disqchan channel
// created from addrs.
func NewCrawlHandler(persistDir string, addrs ...string) *CrawlHandler {
	handler := new(CrawlHandler)

	handler.seenUrls = map[string]struct{}{}
	handler.lock = sync.RWMutex{}
	handler.dir = persistDir
	handler.ch = disqchan.NewChan("found_urls", false, addrs...)

	return handler
}