// TestCM checks that a count-min sketch survives a round trip through
// StoreCM and LoadCM.
func TestCM(t *testing.T) {
	var err error
	check := func() {
		if err != nil {
			t.Fatal(err)
		}
	}

	cm, _ := countmin.New(5, 16)
	db, err := MakeDB(":memory:", true, &Settings{"foowiki.xml.bz2", 8})
	check()

	for _, i := range []uint32{1, 6, 13, 7, 8, 20, 44} {
		cm.Add(i, i+5)
	}
	err = StoreCM(db, cm)
	check()

	got, err := LoadCM(db)
	check()
	if !reflect.DeepEqual(cm.Counts(), got.Counts()) {
		t.Errorf("expected %v, got %v", cm.Counts(), got.Counts())
	}
}
// processPages reads pages from articles, sends the links it finds to
// linkch and counts n-gram hashes (up to length maxN) in an nrows×ncols
// count-min sketch, which it returns. It atomically increments *narticles
// for every page it processes.
func processPages(articles <-chan *wikidump.Page, linkch chan<- *processedLink,
	narticles *uint32, nrows, ncols, maxN int) *countmin.Sketch {

	ngramcount, err := countmin.New(nrows, ncols)
	if err != nil {
		// Shouldn't happen; we already constructed a count-min sketch
		// with the exact same size in main.
		panic(err)
	}

	for a := range articles {
		text := wikidump.Cleanup(a.Text)

		links := wikidump.ExtractLinks(text)
		for link, freq := range links {
			linkch <- processLink(&link, freq, maxN)
		}

		tokens := nlp.Tokenize(text)
		for _, h := range hash.NGrams(tokens, 1, maxN) {
			ngramcount.Add1(h)
		}
		atomic.AddUint32(narticles, 1)
	}
	return ngramcount
}
// makeSemanticizer constructs a Semanticizer backed by an in-memory
// database, pre-populated with a single bigram entry for "Hello world"
// and a titles row (0, 'dmr').
func makeSemanticizer() Semanticizer {
	cm, _ := countmin.New(10, 4)
	db, _ := storage.MakeDB(":memory:", true, &storage.Settings{MaxNGram: 2})
	allq, _ := prepareAllQuery(db)
	sem := Semanticizer{db: db, ngramcount: cm, maxNGram: 2, allQuery: allq}

	for _, h := range hash.NGrams([]string{"Hello", "world"}, 2, 2) {
		_, err := db.Exec(`insert into linkstats values (?, 0, 1)`, h)
		if err == nil {
			_, err = db.Exec(`insert into titles values (0, 'dmr')`)
		}
		if err != nil {
			panic(err)
		}
	}
	return sem
}
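// A minimal sanity check for the fixture above; a sketch, not part of the
// original test suite. The test name is arbitrary, and it assumes that
// sem.db behaves like a *sql.DB (it already supports Exec above) and that
// the titles table has exactly the two columns implied by the insert in
// makeSemanticizer.
func TestMakeSemanticizer(t *testing.T) {
	sem := makeSemanticizer()

	var id int
	var title string
	if err := sem.db.QueryRow(`select * from titles`).Scan(&id, &title); err != nil {
		t.Fatal(err)
	}
	if id != 0 || title != "dmr" {
		t.Errorf("expected (0, dmr), got (%d, %q)", id, title)
	}
}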
func main() {
	kingpin.Parse()
	log.SetPrefix("dumpparser ")

	var err error
	check := func() {
		if err != nil {
			log.Fatal(err)
		}
	}

	if *download != "" {
		*dumppath, err = wikidump.Download(*download, *dumppath, true)
		check()
	} else if *dumppath == "" {
		log.Fatal("no --download and no dumppath specified (try --help)")
	}

	f, err := open(*dumppath)
	check()
	defer f.Close()

	log.Printf("Creating database at %s", *dbpath)
	db, err := storage.MakeDB(*dbpath, true,
		&storage.Settings{*dumppath, uint(*maxNGram)})
	check()

	// The numbers here are completely arbitrary.
	nworkers := runtime.GOMAXPROCS(0)
	articles := make(chan *wikidump.Page, 10*nworkers)
	links := make(chan map[wikidump.Link]int, 10*nworkers)
	redirects := make(chan *wikidump.Redirect, 100)

	var wg sync.WaitGroup

	// Collect redirects.
	wg.Add(1)
	redirmap := make(map[string]string)
	go func() {
		for r := range redirects {
			redirmap[r.Title] = r.Target
		}
		wg.Done()
	}()

	// Clean up and tokenize articles, extract links, count n-grams.
	maxN := int(*maxNGram)
	counters := make([]*countmin.Sketch, nworkers)

	var worker sync.WaitGroup
	worker.Add(nworkers)
	log.Printf("%d workers", nworkers)
	for i := 0; i < nworkers; i++ {
		counters[i], err = countmin.New(int(*nrows), int(*ncols))
		check()

		go func(ngramcount *countmin.Sketch) {
			for a := range articles {
				text := wikidump.Cleanup(a.Text)
				links <- wikidump.ExtractLinks(text)

				tokens := nlp.Tokenize(text)
				for _, h := range hash.NGrams(tokens, 1, maxN) {
					ngramcount.Add1(h)
				}
			}
			worker.Done()
		}(counters[i])
	}

	wg.Add(1)
	go func() {
		worker.Wait()
		close(links)

		for i := 1; i < nworkers; i++ {
			counters[0].Sum(counters[i])
		}
		counters = counters[:1]
		wg.Done()
	}()

	// Collect links and store them in the database.
	wg.Add(1)
	done := make(chan struct{})
	go func() {
		if slerr := storeLinks(db, links, maxN); slerr != nil {
			panic(slerr)
		}
		wg.Done()
	}()

	go wikidump.GetPages(f, articles, redirects)

	wg.Wait()
	close(done)

	log.Printf("Processing %d redirects", len(redirmap))
	storage.ProcessRedirects(db, redirmap, true)

	err = storage.StoreCM(db, counters[0])
	check()

	log.Println("Finalizing database")
	err = storage.Finalize(db)
	check()
	err = db.Close()
	check()
}
// realMain contains the dump parsing pipeline. It panics on errors instead
// of calling log.Fatal, so callers can recover.
func realMain(dbpath, dumppath, download string, nrows, ncols, maxNGram int,
	logger *log.Logger) {

	var err error
	check := func() {
		if err != nil {
			panic(err)
		}
	}

	if download != "" {
		dumppath, err = wikidump.Download(download, dumppath, true)
		check()
	} else if dumppath == "" {
		panic("no --download and no dumppath specified (try --help)")
	}

	f, err := open(dumppath)
	check()
	defer f.Close()

	logger.Printf("Creating database at %s", dbpath)
	db, err := storage.MakeDB(dbpath, true,
		&storage.Settings{Dumpname: dumppath, MaxNGram: uint(maxNGram)})
	check()

	// The numbers here are completely arbitrary.
	nworkers := runtime.GOMAXPROCS(0)
	articles := make(chan *wikidump.Page, 10*nworkers)
	linkch := make(chan *processedLink, 10*nworkers)
	redirch := make(chan *wikidump.Redirect, 10*nworkers)

	// Clean up and tokenize articles, extract links, count n-grams.
	counters := make(chan *countmin.Sketch, nworkers)
	counterTotal, err := countmin.New(nrows, ncols)
	check()

	go wikidump.GetPages(f, articles, redirch)

	logger.Printf("processing dump with %d workers", nworkers)
	var narticles uint32
	for i := 0; i < nworkers; i++ {
		// These signal completion by sending on counters.
		go func() {
			counters <- processPages(articles, linkch, &narticles,
				nrows, ncols, maxNGram)
		}()
	}

	var wg sync.WaitGroup
	wg.Add(1)
	go func() {
		for i := 0; i < nworkers; i++ {
			counterTotal.Sum(<-counters)
		}
		close(counters) // Force panic for programmer error.
		close(linkch)   // We know the workers are done now.
		wg.Done()
	}()

	// Collect redirects. We store these in nworkers slices to avoid having
	// to copy them into a single structure.
	// The allRedirects channel MUST be buffered.
	wg.Add(nworkers)
	allRedirects := make(chan []wikidump.Redirect, nworkers)
	var nredirs uint32
	for i := 0; i < nworkers; i++ {
		go func() {
			slice := collectRedirects(redirch)
			atomic.AddUint32(&nredirs, uint32(len(slice)))
			allRedirects <- slice
			wg.Done()
		}()
	}

	go pageProgress(&narticles, logger, &wg)

	err = storeLinks(db, linkch)
	wg.Wait()
	close(allRedirects)
	// Check error from storeLinks now, after goroutines have stopped.
	check()

	logger.Printf("Processing redirects")
	bar := pb.StartNew(int(nredirs))
	for slice := range allRedirects {
		err = storage.StoreRedirects(db, slice, bar)
		check()
	}
	bar.Finish()

	err = storage.StoreCM(db, counterTotal)
	check()

	logger.Println("Finalizing database")
	err = storage.Finalize(db)
	check()
	err = db.Close()
	check()
}
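// A sketch of how a thin main could drive realMain once the work has been
// moved out of main (unlike the full main shown earlier). It assumes the
// same package-level kingpin flag variables as that main (download,
// dumppath, dbpath, nrows, ncols, maxNGram); the os.Stderr logger and the
// recover wrapper are assumptions, not part of the original code.
func main() {
	kingpin.Parse()
	logger := log.New(os.Stderr, "dumpparser ", log.LstdFlags)

	defer func() {
		// realMain panics instead of calling log.Fatal; turn a panic back
		// into a fatal log message for command-line use.
		if r := recover(); r != nil {
			logger.Fatal(r)
		}
	}()
	realMain(*dbpath, *dumppath, *download,
		int(*nrows), int(*ncols), int(*maxNGram), logger)
}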