func crawlManager(nodes []string) { seen := map[interface{}]struct{}{} ch := disqchan.NewChan("found_urls", true, nodes...) client := tasque.NewClient(5*time.Second, nodes...) tasque.NewTask("crawl").Set("url", "http://news.google.com").Do(client) rc := ch.RecvChan() i := 0 for href := range rc { i++ if i%100 == 0 { fmt.Println("%d (%d) urls seen", i, len(seen)) } _, found := seen[href] if found { continue } seen[href] = struct{}{} //fmt.Println(href) tasque.NewTask("crawl").Set("url", href).Do(client) } }
// NewCrawlHandler builds a CrawlHandler that persists state under persistDir
// and receives discovered URLs on the "found_urls" distributed channel across
// the given node addresses.
func NewCrawlHandler(persistDir string, addrs ...string) *CrawlHandler {
	handler := &CrawlHandler{
		seenUrls: make(map[string]struct{}),
		lock:     sync.RWMutex{},
		dir:      persistDir,
		ch:       disqchan.NewChan("found_urls", false, addrs...),
	}
	return handler
}