Пример #1
0
// TestCM checks that a count-min sketch survives a round trip through the
// database: the counts of the sketch returned by LoadCM must be deeply equal
// to the counts of the sketch that was handed to StoreCM.
func TestCM(t *testing.T) {
	var err error
	// check aborts the test if the most recent call left an error in err.
	check := func() {
		if err != nil {
			t.Fatal(err)
		}
	}

	cm, err := countmin.New(5, 16)
	check()
	db, err := MakeDB(":memory:", true, &Settings{"foowiki.xml.bz2", 8})
	check()

	for _, i := range []uint32{1, 6, 13, 7, 8, 20, 44} {
		cm.Add(i, i+5)
	}

	err = StoreCM(db, cm)
	check()

	got, err := LoadCM(db)
	check()

	// Compare and report the raw counts on both sides, so that the failure
	// message shows two values of the same kind.
	want, have := cm.Counts(), got.Counts()
	if !reflect.DeepEqual(want, have) {
		t.Errorf("expected %v, got %v", want, have)
	}
}
Пример #2
0
// processPages consumes pages from articles until that channel is closed.
//
// For every page the text is cleaned up, each extracted link (with its
// frequency) is run through processLink and sent on linkch, and every hash
// returned by hash.NGrams(tokens, 1, maxN) is added to a fresh count-min
// sketch of nrows by ncols. The counter behind narticles is incremented
// atomically once per page. The sketch is returned when articles is drained.
func processPages(articles <-chan *wikidump.Page,
	linkch chan<- *processedLink, narticles *uint32,
	nrows, ncols, maxN int) *countmin.Sketch {

	sketch, err := countmin.New(nrows, ncols)
	if err != nil {
		// Not expected: a count-min sketch of exactly this size has
		// already been constructed in main.
		panic(err)
	}

	for page := range articles {
		cleaned := wikidump.Cleanup(page.Text)

		for link, freq := range wikidump.ExtractLinks(cleaned) {
			linkch <- processLink(&link, freq, maxN)
		}

		for _, h := range hash.NGrams(nlp.Tokenize(cleaned), 1, maxN) {
			sketch.Add1(h)
		}

		atomic.AddUint32(narticles, 1)
	}

	return sketch
}
Пример #3
0
// makeSemanticizer builds a Semanticizer fixture on top of an in-memory
// database with a maximum n-gram length of 2.
//
// For every hash returned by hash.NGrams for the tokens "Hello", "world"
// (with both bounds set to 2) it inserts one linkstats row and one titles
// row. Any failure during setup or insertion panics, so that a broken
// fixture is reported at its source rather than as a later nil dereference.
func makeSemanticizer() Semanticizer {
	cm, err := countmin.New(10, 4)
	if err != nil {
		panic(err)
	}
	db, err := storage.MakeDB(":memory:", true, &storage.Settings{MaxNGram: 2})
	if err != nil {
		panic(err)
	}
	allq, err := prepareAllQuery(db)
	if err != nil {
		panic(err)
	}
	sem := Semanticizer{db: db, ngramcount: cm, maxNGram: 2, allQuery: allq}

	for _, h := range hash.NGrams([]string{"Hello", "world"}, 2, 2) {
		if _, err := db.Exec(`insert into linkstats values (?, 0, 1)`, h); err != nil {
			panic(err)
		}
		if _, err := db.Exec(`insert into titles values (0, "dmr")`); err != nil {
			panic(err)
		}
	}
	return sem
}
Пример #4
0
// main parses the command line, optionally downloads a Wikipedia dump, and
// turns the dump into a database at *dbpath.
//
// Pipeline, as wired up below:
//   - wikidump.GetPages reads the dump and feeds the articles and redirects
//     channels (it is presumably also the one that closes them; confirm
//     against the wikidump package).
//   - nworkers goroutines clean up each article, send its extracted links on
//     the links channel, and count n-gram hashes in a per-worker sketch.
//   - One goroutine collects redirects into redirmap.
//   - One goroutine waits for the workers, closes links, and sums all
//     per-worker sketches into counters[0].
//   - One goroutine runs storeLinks over the links channel.
//
// After all of these finish, redirects are processed, the merged sketch is
// stored, and the database is finalized and closed. Errors on the main
// goroutine terminate the program via log.Fatal; an error from storeLinks
// panics in its goroutine.
func main() {
	kingpin.Parse()

	log.SetPrefix("dumpparser ")

	var err error
	// check terminates the program if the most recent call left an error
	// in err.
	check := func() {
		if err != nil {
			log.Fatal(err)
		}
	}

	if *download != "" {
		*dumppath, err = wikidump.Download(*download, *dumppath, true)
		check()
	} else if *dumppath == "" {
		log.Fatal("no --download and no dumppath specified (try --help)")
	}

	f, err := open(*dumppath)
	check()
	defer f.Close()

	log.Printf("Creating database at %s", *dbpath)
	db, err := storage.MakeDB(*dbpath, true,
		&storage.Settings{*dumppath, uint(*maxNGram)})
	check()

	// The numbers here are completely arbitrary.
	nworkers := runtime.GOMAXPROCS(0)
	articles := make(chan *wikidump.Page, 10*nworkers)
	links := make(chan map[wikidump.Link]int, 10*nworkers)
	redirects := make(chan *wikidump.Redirect, 100)

	// wg tracks the redirect collector, the sketch merger and the link
	// storer; the article workers are tracked separately below.
	var wg sync.WaitGroup

	// Collect redirects.
	wg.Add(1)
	redirmap := make(map[string]string)
	go func() {
		for r := range redirects {
			redirmap[r.Title] = r.Target
		}
		wg.Done()
	}()

	// Clean up and tokenize articles, extract links, count n-grams.
	maxN := int(*maxNGram)
	counters := make([]*countmin.Sketch, nworkers)

	var worker sync.WaitGroup
	worker.Add(nworkers)
	log.Printf("%d workers", nworkers)
	for i := 0; i < nworkers; i++ {
		counters[i], err = countmin.New(int(*nrows), int(*ncols))
		check()

		// Each worker gets its own sketch, passed as an argument so the
		// goroutine does not depend on the loop variable.
		go func(ngramcount *countmin.Sketch) {
			for a := range articles {
				text := wikidump.Cleanup(a.Text)
				links <- wikidump.ExtractLinks(text)

				tokens := nlp.Tokenize(text)
				for _, h := range hash.NGrams(tokens, 1, maxN) {
					ngramcount.Add1(h)
				}
			}
			worker.Done()
		}(counters[i])
	}

	// Once every worker is done nobody sends on links any more, so it can
	// be closed, and the per-worker sketches can be merged into the first.
	wg.Add(1)
	go func() {
		worker.Wait()
		close(links)

		for i := 1; i < nworkers; i++ {
			counters[0].Sum(counters[i])
		}
		counters = counters[:1]

		wg.Done()
	}()

	// Collect links and store them in the database.
	wg.Add(1)
	go func() {
		if slerr := storeLinks(db, links, maxN); slerr != nil {
			panic(slerr)
		}
		wg.Done()
	}()

	go wikidump.GetPages(f, articles, redirects)

	// After this point redirmap and counters[0] are no longer touched by
	// any of the goroutines tracked by wg.
	wg.Wait()

	log.Printf("Processing %d redirects", len(redirmap))
	storage.ProcessRedirects(db, redirmap, true)

	err = storage.StoreCM(db, counters[0])
	check()

	log.Println("Finalizing database")
	err = storage.Finalize(db)
	check()
	err = db.Close()
	check()
}
Пример #5
0
// realMain turns a Wikipedia dump into a database at dbpath.
//
// If download is non-empty the dump is fetched with wikidump.Download first
// and dumppath is replaced by the path it returns; otherwise dumppath must be
// non-empty. nrows and ncols are the dimensions of the count-min sketches,
// maxNGram is stored in the database settings and passed to the page workers,
// and progress messages are written to logger.
//
// Any error causes a panic.
//
// Pipeline, as wired up below:
//   - wikidump.GetPages reads the dump and feeds articles and redirch (it is
//     presumably also the one that closes them; confirm against wikidump).
//   - nworkers goroutines run processPages, sending links on linkch and
//     handing their sketch back on the counters channel when done.
//   - One goroutine sums the nworkers sketches into counterTotal and then
//     closes linkch.
//   - nworkers goroutines run collectRedirects on redirch.
//   - The calling goroutine runs storeLinks on linkch.
func realMain(dbpath, dumppath, download string, nrows, ncols, maxNGram int,
	logger *log.Logger) {

	var err error
	// check panics if the most recent call left an error in err.
	check := func() {
		if err != nil {
			panic(err)
		}
	}

	if download != "" {
		dumppath, err = wikidump.Download(download, dumppath, true)
		check()
	} else if dumppath == "" {
		panic("no --download and no dumppath specified (try --help)")
	}

	f, err := open(dumppath)
	check()
	defer f.Close()

	logger.Printf("Creating database at %s", dbpath)
	db, err := storage.MakeDB(dbpath, true,
		&storage.Settings{Dumpname: dumppath, MaxNGram: uint(maxNGram)})
	check()

	// The numbers here are completely arbitrary.
	nworkers := runtime.GOMAXPROCS(0)
	articles := make(chan *wikidump.Page, 10*nworkers)
	linkch := make(chan *processedLink, 10*nworkers)
	redirch := make(chan *wikidump.Redirect, 10*nworkers)

	// Clean up and tokenize articles, extract links, count n-grams.
	counters := make(chan *countmin.Sketch, nworkers)
	counterTotal, err := countmin.New(nrows, ncols)
	check()

	go wikidump.GetPages(f, articles, redirch)

	logger.Printf("processing dump with %d workers", nworkers)
	var narticles uint32
	for i := 0; i < nworkers; i++ {
		// These signal completion by sending on counters.
		go func() {
			counters <- processPages(articles, linkch, &narticles,
				nrows, ncols, maxNGram)
		}()
	}

	var wg sync.WaitGroup

	// Merge the per-worker sketches. Receiving nworkers values from
	// counters doubles as waiting for all workers to finish.
	wg.Add(1)
	go func() {
		for i := 0; i < nworkers; i++ {
			counterTotal.Sum(<-counters)
		}
		close(counters) // Force panic for programmer error.
		close(linkch)   // We know the workers are done now.
		wg.Done()
	}()

	// Collect redirects. We store these in nworkers slices to avoid having
	// to copy them into a single structure.
	// The allRedirects channel MUST be buffered.
	wg.Add(nworkers)
	allRedirects := make(chan []wikidump.Redirect, nworkers)
	var nredirs uint32
	for i := 0; i < nworkers; i++ {
		go func() {
			slice := collectRedirects(redirch)
			atomic.AddUint32(&nredirs, uint32(len(slice)))
			allRedirects <- slice
			wg.Done()
		}()
	}

	go pageProgress(&narticles, logger, &wg)

	err = storeLinks(db, linkch)
	if err != nil {
		// If storeLinks gave up before linkch was closed, the workers
		// would block forever on their next send and wg.Wait below would
		// never return. Discard the remaining links so that the pipeline
		// can shut down and the error can be reported. This is a no-op
		// when linkch has already been consumed up to its close.
		for range linkch {
		}
	}

	wg.Wait()
	close(allRedirects)
	// Check error from storeLinks now, after goroutines have stopped.
	check()

	logger.Printf("Processing redirects")
	bar := pb.StartNew(int(nredirs))
	for slice := range allRedirects {
		err = storage.StoreRedirects(db, slice, bar)
		check()
	}
	bar.Finish()

	err = storage.StoreCM(db, counterTotal)
	check()

	logger.Println("Finalizing database")
	err = storage.Finalize(db)
	check()
	err = db.Close()
	check()
}