func newDiskHeap(t *testing.T, name string, bufsize int) *diskheap.DiskHeap { db, err := bolt.Open(name, 0644, nil) if err != nil { t.Fatal(err) } return diskheap.New(db, nil, bufsize) }
func main() { defer profile.Start(profile.CPUProfile, profile.ProfilePath(".")).Stop() logger := log15.Root() logger.SetHandler(log15.MultiHandler( log15.LvlFilterHandler( log15.LvlInfo, log15.Must.FileHandler(filepath.Join(dir, "crawler.log"), log15.LogfmtFormat()), ), log15.LvlFilterHandler(log15.LvlError, log15.StdoutHandler), )) csv, err := os.Open(seedfile) if err != nil { log.Fatal(err) } var urls []string scanner := bufio.NewScanner(csv) for scanner.Scan() { url := scanner.Text() if !strings.HasPrefix(url, "#") { urls = append(urls, url) } } if err := scanner.Err(); err != nil { log.Fatal(err) } csv.Close() pattern := &extract.Pattern{ File: []string{ "", "*.?htm?", `/[^\.]*/`, `/.*\.(jpg|JPG|png|PNG|jpeg|JPEG|gif|GIF)/`, `/.*\.(php|jsp|aspx|asp|cgi|do)/`, "*.css", "*.js", }, // ExcludeFile: []string{ // "*.doc?", "*.xls?", "*.ppt?", // "*.pdf", "*.rar", "*.zip", // "*.ico", "*.apk", "*.exe", // "*.mp4", "*.mkv", // }, } ctrl = &Controller{ extractor: &extract.Extractor{ Matcher: extract.MustCompile(pattern), MaxDepth: 4, Pos: []struct{ Tag, Attr string }{ {"a", "href"}, {"img", "src"}, {"link", "href"}, {"script", "src"}, }, SniffFlags: extract.SniffWindowLocation, Redirect: true, SpanHosts: true, SubDomain: true, ResolveIP: true, }, downloader: &download.Downloader{ Dir: dir, }, trie: urltrie.NewHosts(threshold), count: count.NewHosts(), fingerprint: fingerprint.NewStore(0, 4, 4096), limiter: ratelimit.New(rate), logger: logger.New("worker", "controller"), } ctrl.complete.hosts = make(map[string]bool) store, err := boltstore.New(filepath.Join(dir, "bolt.db"), nil, nil) if err != nil { log.Fatal(err) } // queue, err := diskqueue.NewDefault(store.DB) queue := ratelimitq.NewWaitQueue(&ratelimitq.Option{ Limit: ctrl.Interval, Secondary: diskheap.New(store.DB, []byte("HEAP"), 16), }) go func() { http.Handle("/count/", http.HandlerFunc(handleCount)) log.Fatal(http.ListenAndServe("localhost:7869", nil)) }() cw := crawler.New(&crawler.Config{ Controller: ctrl, Logger: logger, Store: store, Queue: queue, }) if err := cw.Crawl(urls[offset-1 : offset-1+nseed]...); err != nil { log.Fatal(err) } cw.Wait() }