Example #1

A test fixture that builds a Semanticizer over an in-memory database, seeded with a single bigram linking "Hello world" to the title "dmr".
// makeSemanticizer builds a Semanticizer over an in-memory database,
// seeded with a single bigram ("Hello world") pointing at the title "dmr".
func makeSemanticizer() Semanticizer {
	cm, err := countmin.New(10, 4)
	if err != nil {
		panic(err)
	}
	db, err := storage.MakeDB(":memory:", true, &storage.Settings{MaxNGram: 2})
	if err != nil {
		panic(err)
	}
	allq, err := prepareAllQuery(db)
	if err != nil {
		panic(err)
	}
	sem := Semanticizer{db: db, ngramcount: cm, maxNGram: 2, allQuery: allq}

	// Insert link statistics for each n-gram hash; with two tokens and
	// min = max = 2, this loop runs exactly once.
	for _, h := range hash.NGrams([]string{"Hello", "world"}, 2, 2) {
		_, err := db.Exec(`insert into linkstats values (?, 0, 1)`, h)
		if err == nil {
			_, err = db.Exec(`insert into titles values (0, "dmr")`)
		}
		if err != nil {
			panic(err)
		}
	}
	return sem
}
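
A minimal sketch of how this fixture might be used in a same-package test. The test below is an assumption, not part of the original example: it presumes storage.MakeDB returns a *sql.DB (consistent with the db.Exec calls above) and imports the standard testing package.

func TestMakeSemanticizer(t *testing.T) {
	sem := makeSemanticizer()

	// The fixture inserts exactly one row: hash.NGrams over two tokens
	// with min = max = 2 produces a single bigram.
	var n int
	if err := sem.db.QueryRow(`select count(*) from linkstats`).Scan(&n); err != nil {
		t.Fatal(err)
	}
	if n != 1 {
		t.Errorf("got %d rows in linkstats, want 1", n)
	}
}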
Example #2

The command-line pipeline: it downloads or opens a Wikipedia dump, processes pages with parallel workers, counts n-grams in a count-min sketch, and stores links and redirects in the database.
func realMain(dbpath, dumppath, download string, nrows, ncols, maxNGram int,
	logger *log.Logger) {

	var err error
	// check panics on the most recent error, keeping the happy path linear.
	check := func() {
		if err != nil {
			panic(err)
		}
	}

	if download != "" {
		dumppath, err = wikidump.Download(download, dumppath, true)
		check()
	} else if dumppath == "" {
		panic("no --download and no dumppath specified (try --help)")
	}

	f, err := open(dumppath)
	check()
	defer f.Close()

	logger.Printf("Creating database at %s", dbpath)
	db, err := storage.MakeDB(dbpath, true,
		&storage.Settings{Dumpname: dumppath, MaxNGram: uint(maxNGram)})
	check()

	// The numbers here are completely arbitrary.
	nworkers := runtime.GOMAXPROCS(0)
	articles := make(chan *wikidump.Page, 10*nworkers)
	linkch := make(chan *processedLink, 10*nworkers)
	redirch := make(chan *wikidump.Redirect, 10*nworkers)

	// Clean up and tokenize articles, extract links, count n-grams.
	counters := make(chan *countmin.Sketch, nworkers)
	counterTotal, err := countmin.New(nrows, ncols)
	check()

	// GetPages closes articles and redirch once the dump is exhausted;
	// the workers below rely on that to terminate.
	go wikidump.GetPages(f, articles, redirch)

	logger.Printf("processing dump with %d workers", nworkers)
	var narticles uint32
	for i := 0; i < nworkers; i++ {
		// These signal completion by sending on counters.
		go func() {
			counters <- processPages(articles, linkch, &narticles,
				nrows, ncols, maxNGram)
		}()
	}

	var wg sync.WaitGroup

	wg.Add(1)
	go func() {
		for i := 0; i < nworkers; i++ {
			counterTotal.Sum(<-counters)
		}
		close(counters) // Force panic for programmer error.
		close(linkch)   // We know the workers are done now.
		wg.Done()
	}()

	// Collect redirects. We store these in nworkers slices to avoid having
	// to copy them into a single structure.
	// The allRedirects channel MUST be buffered.
	wg.Add(nworkers)
	allRedirects := make(chan []wikidump.Redirect, nworkers)
	var nredirs uint32
	for i := 0; i < nworkers; i++ {
		go func() {
			slice := collectRedirects(redirch)
			atomic.AddUint32(&nredirs, uint32(len(slice)))
			allRedirects <- slice
			wg.Done()
		}()
	}

	go pageProgress(&narticles, logger, &wg)

	// storeLinks drains linkch on this goroutine; it returns once the
	// counter-summing goroutine above closes the channel.
	err = storeLinks(db, linkch)

	wg.Wait()
	close(allRedirects)
	// Check error from storeLinks now, after goroutines have stopped.
	check()

	logger.Printf("Processing redirects")
	bar := pb.StartNew(int(nredirs))
	for slice := range allRedirects {
		err = storage.StoreRedirects(db, slice, bar)
		check()
	}
	bar.Finish()

	err = storage.StoreCM(db, counterTotal)
	check()

	logger.Println("Finalizing database")
	err = storage.Finalize(db)
	check()
	err = db.Close()
	check()
}
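
For context, a hypothetical entry point showing how realMain might be wired up. The original example does not include main, so the flag names and defaults below are illustrative assumptions (the panic message above suggests --download is a real flag and the dump path is positional); only the standard flag, log, and os packages are used.

func main() {
	var (
		dbpath   = flag.String("db", "semanticizer.db", "output database path (assumed flag name)")
		download = flag.String("download", "", "language code of a Wikipedia dump to fetch")
		nrows    = flag.Int("nrows", 16, "count-min sketch rows (assumed default)")
		ncols    = flag.Int("ncols", 65536, "count-min sketch columns (assumed default)")
		maxNGram = flag.Int("maxngram", 7, "maximum n-gram length (assumed default)")
	)
	flag.Parse()

	// Per the panic message in realMain, the dump path is positional.
	dumppath := flag.Arg(0)

	logger := log.New(os.Stderr, "", log.LstdFlags)
	realMain(*dbpath, dumppath, *download, *nrows, *ncols, *maxNGram, logger)
}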