func TestStoreLinks(t *testing.T) {
	db, err := storage.MakeDB(":memory:", true, &storage.Settings{"bla", 3})
	if err != nil {
		t.Fatal(err)
	}

	// Feed two batches of link counts; the counts for the same
	// (anchor, target) pair should be summed in the database.
	links := make(chan map[wikidump.Link]int)
	go func() {
		links <- map[wikidump.Link]int{
			wikidump.Link{Anchor: "semanticizest", Target: "Entity_linking"}: 2,
			wikidump.Link{Anchor: "NER", Target: "Named_entity_recognition"}: 3,
		}
		links <- map[wikidump.Link]int{
			wikidump.Link{Anchor: "semanticizest", Target: "Entity_linking"}: 1,
		}
		close(links)
	}()

	if err := storeLinks(db, links, 3); err != nil {
		t.Error(err)
	}

	var count float64
	q := `select count from linkstats
	      where targetid = (select id from titles where title="Entity_linking")`
	err = db.QueryRow(q).Scan(&count)
	if err != nil {
		t.Fatal(err)
	} else if count != 3 {
		t.Errorf("expected count=3.0, got %f\n", count)
	}
}
func main() {
	kingpin.Parse()
	log.SetPrefix("dumpparser ")

	var err error
	check := func() {
		if err != nil {
			log.Fatal(err)
		}
	}

	if *download != "" {
		*dumppath, err = wikidump.Download(*download, *dumppath, true)
		check()
	} else if *dumppath == "" {
		log.Fatal("no --download and no dumppath specified (try --help)")
	}

	f, err := open(*dumppath)
	check()
	defer f.Close()

	log.Printf("Creating database at %s", *dbpath)
	db, err := storage.MakeDB(*dbpath, true,
		&storage.Settings{*dumppath, uint(*maxNGram)})
	check()

	// The numbers here are completely arbitrary.
	nworkers := runtime.GOMAXPROCS(0)
	articles := make(chan *wikidump.Page, 10*nworkers)
	links := make(chan map[wikidump.Link]int, 10*nworkers)
	redirects := make(chan *wikidump.Redirect, 100)

	var wg sync.WaitGroup

	// Collect redirects.
	wg.Add(1)
	redirmap := make(map[string]string)
	go func() {
		for r := range redirects {
			redirmap[r.Title] = r.Target
		}
		wg.Done()
	}()

	// Clean up and tokenize articles, extract links, count n-grams.
	maxN := int(*maxNGram)
	counters := make([]*countmin.Sketch, nworkers)

	var worker sync.WaitGroup
	worker.Add(nworkers)
	log.Printf("%d workers", nworkers)
	for i := 0; i < nworkers; i++ {
		counters[i], err = countmin.New(int(*nrows), int(*ncols))
		check()

		go func(ngramcount *countmin.Sketch) {
			for a := range articles {
				text := wikidump.Cleanup(a.Text)
				links <- wikidump.ExtractLinks(text)

				tokens := nlp.Tokenize(text)
				for _, h := range hash.NGrams(tokens, 1, maxN) {
					ngramcount.Add1(h)
				}
			}
			worker.Done()
		}(counters[i])
	}

	wg.Add(1)
	go func() {
		worker.Wait()
		close(links)

		// Merge the per-worker sketches into counters[0].
		for i := 1; i < nworkers; i++ {
			counters[0].Sum(counters[i])
		}
		counters = counters[:1]

		wg.Done()
	}()

	// Collect links and store them in the database.
	wg.Add(1)
	done := make(chan struct{})
	go func() {
		if slerr := storeLinks(db, links, maxN); slerr != nil {
			panic(slerr)
		}
		wg.Done()
	}()

	go wikidump.GetPages(f, articles, redirects)

	wg.Wait()
	close(done)

	log.Printf("Processing %d redirects", len(redirmap))
	storage.ProcessRedirects(db, redirmap, true)

	err = storage.StoreCM(db, counters[0])
	check()

	log.Println("Finalizing database")
	err = storage.Finalize(db)
	check()
	err = db.Close()
	check()
}