Example #1
0
// initGet prepares the stores used by the Get benchmarks: an in-memory
// store, a Bolt-backed store at tmpbolt and a LevelDB-backed store at
// tmplevel. Each store is created and filled inside its own once.Do call,
// so calling initGet repeatedly does not repeat the setup.
//
// A failure to open one of the on-disk stores panics right away; continuing
// with an unusable store would only surface later as an unrelated crash
// inside the benchmark itself.
func initGet() {
	// populate inserts bench_get_size URLs, each with a distinct path, into st.
	populate := func(st crawler.Store) {
		for i := 0; i < bench_get_size; i++ {
			now := time.Now().UTC()
			// NOTE(review): whatever PutNX returns is ignored here, exactly
			// as before — confirm that is acceptable for benchmark setup.
			st.PutNX(&crawler.URL{
				URL:  *mustParse(fmt.Sprintf("http://example.com/foo/bar/%d", i)),
				Last: now,
			})
		}
	}
	testMem.once.Do(func() {
		testMem.store = crawler.NewMemStore()
		populate(testMem.store)
	})
	testBolt.once.Do(func() {
		// Best effort: the file may simply not exist yet, so the error from
		// Remove is deliberately ignored.
		os.Remove(tmpbolt)
		var err error
		if testBolt.store, err = boltstore.New(tmpbolt, nil, nil); err != nil {
			panic("initGet: opening bolt store: " + err.Error())
		}
		populate(testBolt.store)
	})
	testLevel.once.Do(func() {
		// Best effort, same reasoning as for the bolt file above.
		os.RemoveAll(tmplevel)
		var err error
		if testLevel.store, err = levelstore.New(tmplevel, nil, nil); err != nil {
			panic("initGet: opening level store: " + err.Error())
		}
		populate(testLevel.store)
	})
}
Example #2
0
// TestBolt runs the shared StoreTest suite against a store created by
// boltstore.New on a freshly created temporary file. The file is removed
// when the test function returns.
func TestBolt(t *testing.T) {
	tmp, err := ioutil.TempFile("", "test_bolt")
	if err != nil {
		t.Fatal(err)
	}
	// Only the name is needed: the handle is closed immediately and the
	// path is handed to boltstore.New below.
	dbPath := tmp.Name()
	tmp.Close()
	defer os.Remove(dbPath)

	store, openErr := boltstore.New(dbPath, nil, nil)
	if openErr != nil {
		t.Fatal(openErr)
	}
	StoreTest(t, store)
}
Example #3
0
// BenchmarkBoltPut runs benchPut, labelled "BoltStore", against a store
// created by boltstore.New on a freshly created temporary file. The file is
// removed when the benchmark function returns.
func BenchmarkBoltPut(b *testing.B) {
	tmp, err := ioutil.TempFile("", "test_bolt")
	if err != nil {
		b.Fatal(err)
	}
	// Only the name is needed: the handle is closed immediately and the
	// path is handed to boltstore.New below.
	dbPath := tmp.Name()
	tmp.Close()
	defer os.Remove(dbPath)

	store, openErr := boltstore.New(dbPath, nil, nil)
	if openErr != nil {
		b.Fatal(openErr)
	}
	// A fixed iteration count (b.N = 2000) was tried here and left disabled.
	benchPut(b, store, "BoltStore")
}
Example #4
0
// main configures and runs the crawler:
//
//  1. starts CPU profiling, writing the profile to the current directory;
//  2. installs log handlers (Info and above to crawler.log under dir,
//     Error and above to stdout);
//  3. reads seed URLs from seedfile, one per line, skipping lines that
//     start with "#";
//  4. builds the package-level Controller (ctrl), a Bolt-backed store and
//     a rate-limited wait queue;
//  5. serves handleCount on localhost:7869 under /count/;
//  6. crawls the nseed seeds starting at the 1-based position offset and
//     waits for the crawl to finish.
//
// Any setup failure terminates the process through log.Fatal.
func main() {
	// NOTE: log.Fatal/log.Fatalf exit via os.Exit, which skips deferred
	// calls, so the profile is only stopped when main returns normally.
	defer profile.Start(profile.CPUProfile, profile.ProfilePath(".")).Stop()

	logger := log15.Root()
	logger.SetHandler(log15.MultiHandler(
		log15.LvlFilterHandler(
			log15.LvlInfo,
			log15.Must.FileHandler(filepath.Join(dir, "crawler.log"), log15.LogfmtFormat()),
		),
		log15.LvlFilterHandler(log15.LvlError, log15.StdoutHandler),
	))

	// Load the seed list. Lines beginning with "#" are treated as comments;
	// every other line is kept verbatim.
	seeds, err := os.Open(seedfile)
	if err != nil {
		log.Fatal(err)
	}
	var urls []string
	scanner := bufio.NewScanner(seeds)
	for scanner.Scan() {
		line := scanner.Text()
		if !strings.HasPrefix(line, "#") {
			urls = append(urls, line)
		}
	}
	if err := scanner.Err(); err != nil {
		log.Fatal(err)
	}
	seeds.Close()

	// Validate the requested seed window before any further state is
	// created. The check is against len(urls), not cap(urls): a slice
	// expression only panics beyond the capacity, so an unchecked upper
	// bound between len and cap would silently feed empty strings to the
	// crawler as seeds.
	first, last := offset-1, offset-1+nseed
	if first < 0 || last < first || last > len(urls) {
		log.Fatalf("seed selection out of range: offset=%v nseed=%v, but %v contains %d URLs",
			offset, nseed, seedfile, len(urls))
	}

	pattern := &extract.Pattern{
		File: []string{
			"", "*.?htm?", `/[^\.]*/`,
			`/.*\.(jpg|JPG|png|PNG|jpeg|JPEG|gif|GIF)/`,
			`/.*\.(php|jsp|aspx|asp|cgi|do)/`,
			"*.css", "*.js",
		},
		// ExcludeFile: []string{
		// 	"*.doc?", "*.xls?", "*.ppt?",
		// 	"*.pdf", "*.rar", "*.zip",
		// 	"*.ico", "*.apk", "*.exe",
		// 	"*.mp4", "*.mkv",
		// },
	}
	ctrl = &Controller{
		extractor: &extract.Extractor{
			Matcher:  extract.MustCompile(pattern),
			MaxDepth: 4,
			// Tag/attribute pairs from which links are extracted.
			Pos: []struct{ Tag, Attr string }{
				{"a", "href"},
				{"img", "src"},
				{"link", "href"},
				{"script", "src"},
			},
			SniffFlags: extract.SniffWindowLocation,
			Redirect:   true,
			SpanHosts:  true,
			SubDomain:  true,
			ResolveIP:  true,
		},
		downloader: &download.Downloader{
			Dir: dir,
		},
		trie:        urltrie.NewHosts(threshold),
		count:       count.NewHosts(),
		fingerprint: fingerprint.NewStore(0, 4, 4096),
		limiter:     ratelimit.New(rate),
		logger:      logger.New("worker", "controller"),
	}
	// The map must be allocated before anything writes to it.
	ctrl.complete.hosts = make(map[string]bool)

	store, err := boltstore.New(filepath.Join(dir, "bolt.db"), nil, nil)
	if err != nil {
		log.Fatal(err)
	}
	// queue, err := diskqueue.NewDefault(store.DB)
	// The wait queue uses the controller's Interval as its limit and a disk
	// heap on the store's database as its secondary storage.
	queue := ratelimitq.NewWaitQueue(&ratelimitq.Option{
		Limit:     ctrl.Interval,
		Secondary: diskheap.New(store.DB, []byte("HEAP"), 16),
	})

	// Serve handleCount in the background; if the listener fails, the
	// whole process exits.
	go func() {
		http.Handle("/count/", http.HandlerFunc(handleCount))
		log.Fatal(http.ListenAndServe("localhost:7869", nil))
	}()

	cw := crawler.New(&crawler.Config{
		Controller: ctrl,
		Logger:     logger,
		Store:      store,
		Queue:      queue,
	})
	if err := cw.Crawl(urls[first:last]...); err != nil {
		log.Fatal(err)
	}
	cw.Wait()
}