Beispiel #1
0
func TestStoreLinks(t *testing.T) {
	db, _ := storage.MakeDB(":memory:", true, &storage.Settings{"bla", 3})

	links := make(chan map[wikidump.Link]int)
	go func() {
		links <- map[wikidump.Link]int{
			wikidump.Link{Anchor: "semanticizest", Target: "Entity_linking"}: 2,
			wikidump.Link{Anchor: "NER", Target: "Named_entity_recognition"}: 3,
		}
		links <- map[wikidump.Link]int{
			wikidump.Link{Anchor: "semanticizest", Target: "Entity_linking"}: 1,
		}
		close(links)
	}()

	if err := storeLinks(db, links, 3); err != nil {
		t.Error(err)
	}

	var count float64
	q := `select count from linkstats
	      where targetid = (select id from titles where title="Entity_linking")`
	err := db.QueryRow(q).Scan(&count)
	if err != nil {
		t.Fatal(err)
	} else if count != 3 {
		t.Errorf("expected count=3.0, got %f\n", count)
	}
}
Beispiel #2
0
func main() {
	kingpin.Parse()

	log.SetPrefix("dumpparser ")

	var err error
	check := func() {
		if err != nil {
			log.Fatal(err)
		}
	}

	if *download != "" {
		*dumppath, err = wikidump.Download(*download, *dumppath, true)
		check()
	} else if *dumppath == "" {
		log.Fatal("no --download and no dumppath specified (try --help)")
	}

	f, err := open(*dumppath)
	check()
	defer f.Close()

	log.Printf("Creating database at %s", *dbpath)
	db, err := storage.MakeDB(*dbpath, true,
		&storage.Settings{*dumppath, uint(*maxNGram)})
	check()

	// The numbers here are completely arbitrary.
	nworkers := runtime.GOMAXPROCS(0)
	articles := make(chan *wikidump.Page, 10*nworkers)
	links := make(chan map[wikidump.Link]int, 10*nworkers)
	redirects := make(chan *wikidump.Redirect, 100)

	var wg sync.WaitGroup

	// Collect redirects.
	wg.Add(1)
	redirmap := make(map[string]string)
	go func() {
		for r := range redirects {
			redirmap[r.Title] = r.Target
		}
		wg.Done()
	}()

	// Clean up and tokenize articles, extract links, count n-grams.
	maxN := int(*maxNGram)
	counters := make([]*countmin.Sketch, nworkers)

	var worker sync.WaitGroup
	worker.Add(nworkers)
	log.Printf("%d workers", nworkers)
	for i := 0; i < nworkers; i++ {
		counters[i], err = countmin.New(int(*nrows), int(*ncols))
		check()

		go func(ngramcount *countmin.Sketch) {
			for a := range articles {
				text := wikidump.Cleanup(a.Text)
				links <- wikidump.ExtractLinks(text)

				tokens := nlp.Tokenize(text)
				for _, h := range hash.NGrams(tokens, 1, maxN) {
					ngramcount.Add1(h)
				}
			}
			worker.Done()
		}(counters[i])
	}

	wg.Add(1)
	go func() {
		worker.Wait()
		close(links)

		for i := 1; i < nworkers; i++ {
			counters[0].Sum(counters[i])
		}
		counters = counters[:1]

		wg.Done()
	}()

	// Collect links and store them in the database.
	wg.Add(1)
	done := make(chan struct{})
	go func() {
		if slerr := storeLinks(db, links, maxN); slerr != nil {
			panic(slerr)
		}
		wg.Done()
	}()

	go wikidump.GetPages(f, articles, redirects)

	wg.Wait()
	close(done)

	log.Printf("Processing %d redirects", len(redirmap))
	storage.ProcessRedirects(db, redirmap, true)

	err = storage.StoreCM(db, counters[0])
	check()

	log.Println("Finalizing database")
	err = storage.Finalize(db)
	check()
	err = db.Close()
	check()
}