func initGet() { init := func(st crawler.Store) { for i := 0; i < bench_get_size; i++ { now := time.Now().UTC() st.PutNX(&crawler.URL{ URL: *mustParse(fmt.Sprintf("http://example.com/foo/bar/%d", i)), Last: now, }) } } testMem.once.Do(func() { testMem.store = crawler.NewMemStore() init(testMem.store) }) testBolt.once.Do(func() { os.Remove(tmpbolt) testBolt.store, _ = boltstore.New(tmpbolt, nil, nil) init(testBolt.store) }) testLevel.once.Do(func() { os.RemoveAll(tmplevel) testLevel.store, _ = levelstore.New(tmplevel, nil, nil) init(testLevel.store) }) }
func TestBolt(t *testing.T) { f, err := ioutil.TempFile("", "test_bolt") if err != nil { t.Fatal(err) } tmpfile := f.Name() f.Close() defer os.Remove(tmpfile) bs, err := boltstore.New(tmpfile, nil, nil) if err != nil { t.Fatal(err) } StoreTest(t, bs) }
func BenchmarkBoltPut(b *testing.B) { f, err := ioutil.TempFile("", "test_bolt") if err != nil { b.Fatal(err) } tmpfile := f.Name() f.Close() defer os.Remove(tmpfile) bs, err := boltstore.New(tmpfile, nil, nil) if err != nil { b.Fatal(err) } // b.N = 2000 benchPut(b, bs, "BoltStore") }
// main wires together and runs the crawler: it enables CPU profiling for the
// whole run, configures structured logging, reads seed URLs from seedfile,
// builds the Controller (extractor, downloader, rate limiter, counters),
// opens the bolt-backed store and a rate-limited disk-heap queue on top of
// it, starts a local HTTP endpoint for counters, and crawls a window of the
// seed list.
func main() {
	// Write a CPU profile into the current directory for the entire run.
	defer profile.Start(profile.CPUProfile, profile.ProfilePath(".")).Stop()

	logger := log15.Root()
	// Info and above go to dir/crawler.log; errors are additionally
	// duplicated to stdout.
	logger.SetHandler(log15.MultiHandler(
		log15.LvlFilterHandler(
			log15.LvlInfo,
			log15.Must.FileHandler(filepath.Join(dir, "crawler.log"), log15.LogfmtFormat()),
		),
		log15.LvlFilterHandler(log15.LvlError, log15.StdoutHandler),
	))

	// Read seed URLs, one per line; lines starting with '#' are comments.
	csv, err := os.Open(seedfile)
	if err != nil {
		log.Fatal(err)
	}
	var urls []string
	scanner := bufio.NewScanner(csv)
	for scanner.Scan() {
		url := scanner.Text()
		if !strings.HasPrefix(url, "#") {
			urls = append(urls, url)
		}
	}
	if err := scanner.Err(); err != nil {
		log.Fatal(err)
	}
	csv.Close()

	// File-name/path patterns the extractor is allowed to follow.
	pattern := &extract.Pattern{
		File: []string{
			"", "*.?htm?", `/[^\.]*/`,
			`/.*\.(jpg|JPG|png|PNG|jpeg|JPEG|gif|GIF)/`,
			`/.*\.(php|jsp|aspx|asp|cgi|do)/`,
			"*.css", "*.js",
		},
		// ExcludeFile: []string{
		// 	"*.doc?", "*.xls?", "*.ppt?",
		// 	"*.pdf", "*.rar", "*.zip",
		// 	"*.ico", "*.apk", "*.exe",
		// 	"*.mp4", "*.mkv",
		// },
	}

	// ctrl is assigned to a package-level variable so that handleCount (the
	// HTTP handler registered below) can reach it.
	ctrl = &Controller{
		extractor: &extract.Extractor{
			Matcher:  extract.MustCompile(pattern),
			MaxDepth: 4,
			// Tag/attribute pairs scanned for outgoing links.
			Pos: []struct{ Tag, Attr string }{
				{"a", "href"}, {"img", "src"},
				{"link", "href"}, {"script", "src"},
			},
			SniffFlags: extract.SniffWindowLocation,
			Redirect:   true,
			SpanHosts:  true,
			SubDomain:  true,
			ResolveIP:  true,
		},
		downloader: &download.Downloader{
			Dir: dir,
		},
		trie:        urltrie.NewHosts(threshold),
		count:       count.NewHosts(),
		fingerprint: fingerprint.NewStore(0, 4, 4096),
		limiter:     ratelimit.New(rate),
		logger:      logger.New("worker", "controller"),
	}
	ctrl.complete.hosts = make(map[string]bool)

	store, err := boltstore.New(filepath.Join(dir, "bolt.db"), nil, nil)
	if err != nil {
		log.Fatal(err)
	}
	// queue, err := diskqueue.NewDefault(store.DB)
	// Rate-limited queue whose secondary storage is an on-disk heap kept in
	// the same bolt database as the store.
	queue := ratelimitq.NewWaitQueue(&ratelimitq.Option{
		Limit:     ctrl.Interval,
		Secondary: diskheap.New(store.DB, []byte("HEAP"), 16),
	})

	// Serve counters at http://localhost:7869/count/ for the lifetime of
	// the process.
	go func() {
		http.Handle("/count/", http.HandlerFunc(handleCount))
		log.Fatal(http.ListenAndServe("localhost:7869", nil))
	}()

	cw := crawler.New(&crawler.Config{
		Controller: ctrl,
		Logger:     logger,
		Store:      store,
		Queue:      queue,
	})
	// Crawl the [offset-1, offset-1+nseed) window of the seed list.
	// NOTE(review): assumes 1 <= offset and offset-1+nseed <= len(urls);
	// out-of-range values panic here — confirm flag validation upstream.
	if err := cw.Crawl(urls[offset-1 : offset-1+nseed]...); err != nil {
		log.Fatal(err)
	}
	cw.Wait()
}