// makeSemanticizer constructs an in-memory Semanticizer for testing,
// seeded with the bigram hashes of "Hello world" pointing at a single
// title ("dmr") with one link.
func makeSemanticizer() Semanticizer {
	cm, _ := countmin.New(10, 4)
	db, _ := storage.MakeDB(":memory:", true, &storage.Settings{MaxNGram: 2})
	allq, _ := prepareAllQuery(db)
	sem := Semanticizer{db: db, ngramcount: cm, maxNGram: 2, allQuery: allq}

	for _, h := range hash.NGrams([]string{"Hello", "world"}, 2, 2) {
		_, err := db.Exec(`insert into linkstats values (?, 0, 1)`, h)
		if err == nil {
			_, err = db.Exec(`insert into titles values (0, "dmr")`)
		}
		if err != nil {
			panic(err)
		}
	}
	return sem
}
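
// realMain runs the dump parser end to end: it downloads and/or opens a
// Wikipedia dump, creates the output database at dbpath, processes pages
// concurrently (tokenizing articles, extracting links and counting n-grams
// in an nrows×ncols count-min sketch), then stores the links, redirects and
// the sketch before finalizing and closing the database.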
func realMain(dbpath, dumppath, download string, nrows, ncols, maxNGram int, logger *log.Logger) {
	var err error
	check := func() {
		if err != nil {
			panic(err)
		}
	}

	if download != "" {
		dumppath, err = wikidump.Download(download, dumppath, true)
		check()
	} else if dumppath == "" {
		panic("no --download and no dumppath specified (try --help)")
	}

	f, err := open(dumppath)
	check()
	defer f.Close()

	logger.Printf("Creating database at %s", dbpath)
	db, err := storage.MakeDB(dbpath, true,
		&storage.Settings{Dumpname: dumppath, MaxNGram: uint(maxNGram)})
	check()

	// The numbers here are completely arbitrary.
	nworkers := runtime.GOMAXPROCS(0)
	articles := make(chan *wikidump.Page, 10*nworkers)
	linkch := make(chan *processedLink, 10*nworkers)
	redirch := make(chan *wikidump.Redirect, 10*nworkers)

	// Clean up and tokenize articles, extract links, count n-grams.
	counters := make(chan *countmin.Sketch, nworkers)
	counterTotal, err := countmin.New(nrows, ncols)
	check()

	go wikidump.GetPages(f, articles, redirch)

	logger.Printf("processing dump with %d workers", nworkers)
	var narticles uint32
	for i := 0; i < nworkers; i++ {
		// These signal completion by sending on counters.
		go func() {
			counters <- processPages(articles, linkch, &narticles, nrows, ncols, maxNGram)
		}()
	}

	var wg sync.WaitGroup
	wg.Add(1)
	go func() {
		for i := 0; i < nworkers; i++ {
			counterTotal.Sum(<-counters)
		}
		close(counters) // Force panic for programmer error.
		close(linkch)   // We know the workers are done now.
		wg.Done()
	}()

	// Collect redirects. We store these in nworkers slices to avoid having
	// to copy them into a single structure.
	// The allRedirects channel MUST be buffered.
	wg.Add(nworkers)
	allRedirects := make(chan []wikidump.Redirect, nworkers)
	var nredirs uint32
	for i := 0; i < nworkers; i++ {
		go func() {
			slice := collectRedirects(redirch)
			atomic.AddUint32(&nredirs, uint32(len(slice)))
			allRedirects <- slice
			wg.Done()
		}()
	}

	go pageProgress(&narticles, logger, &wg)

	err = storeLinks(db, linkch)
	wg.Wait()
	close(allRedirects)
	// Check error from storeLinks now, after goroutines have stopped.
	check()

	logger.Printf("Processing redirects")
	bar := pb.StartNew(int(nredirs))
	for slice := range allRedirects {
		err = storage.StoreRedirects(db, slice, bar)
		check()
	}
	bar.Finish()

	err = storage.StoreCM(db, counterTotal)
	check()

	logger.Println("Finalizing database")
	err = storage.Finalize(db)
	check()
	err = db.Close()
	check()
}
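
// collectRedirects and the other helpers used above (open, processPages,
// storeLinks, pageProgress) are defined elsewhere in this package. As a
// point of reference, a minimal sketch of collectRedirects that is
// consistent with how realMain uses it (drain redirch into a slice) could
// look like the following; the actual implementation may differ.
func collectRedirects(redirch <-chan *wikidump.Redirect) []wikidump.Redirect {
	var redirects []wikidump.Redirect
	for r := range redirch {
		redirects = append(redirects, *r)
	}
	return redirects
}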