Beispiel #1
0
// processPages consumes pages from articles until that channel is closed.
//
// For every page it cleans up the text, sends one processed link per
// extracted link to linkch, adds the hash of every n-gram of length 1
// through maxN to a count-min sketch and atomically increments *narticles.
// The sketch, built with countmin.New(nrows, ncols), is returned once
// articles has been drained.
//
// processPages panics if the sketch cannot be constructed.
func processPages(articles <-chan *wikidump.Page,
	linkch chan<- *processedLink, narticles *uint32,
	nrows, ncols, maxN int) *countmin.Sketch {

	sketch, err := countmin.New(nrows, ncols)
	if err != nil {
		// Not expected: a count-min sketch of the exact same size
		// was already constructed in main.
		panic(err)
	}

	for page := range articles {
		cleaned := wikidump.Cleanup(page.Text)

		// Links go out first, then the n-gram statistics are updated.
		for l, n := range wikidump.ExtractLinks(cleaned) {
			linkch <- processLink(&l, n, maxN)
		}

		hashes := hash.NGrams(nlp.Tokenize(cleaned), 1, maxN)
		for i := range hashes {
			sketch.Add1(hashes[i])
		}

		atomic.AddUint32(narticles, 1)
	}
	return sketch
}
Beispiel #2
0
// processLink converts a link and its occurrence count freq into a
// processedLink carrying the link target, the anchor's n-gram hashes and a
// per-hash count.
//
// The anchor text is tokenized and hashed as n-grams of exactly
// n = min(maxN, len(tokens)) tokens. When that yields more than one hash
// (presumably because the anchor has more than maxN tokens), the occurrence
// count is split evenly over the hashes so that the per-hash counts still
// sum to freq.
func processLink(link *wikidump.Link, freq, maxN int) *processedLink {
	tokens := nlp.Tokenize(link.Anchor)
	n := min(maxN, len(tokens))
	hashes := hash.NGrams(tokens, n, n)

	// Divide freq rather than 1: using 1/len(hashes) would discard the
	// number of occurrences for every anchor that yields several hashes.
	count := float64(freq)
	if len(hashes) > 1 {
		count /= float64(len(hashes))
	}
	return &processedLink{link.Target, hashes, count}
}
Beispiel #3
0
// allCandidates returns the candidates for every n-gram of s, for n-gram
// lengths 1 through sem.maxNGram.
//
// The string is tokenized, each n-gram hash is looked up with
// sem.candidates and the results are concatenated in lookup order. If any
// lookup fails, allCandidates stops and returns nil together with that
// error.
func (sem semanticizer) allCandidates(s string) (cands []candidate, err error) {
	tokens := nlp.Tokenize(s)
	for _, h := range hash.NGrams(tokens, 1, int(sem.maxNGram)) {
		// Return the error explicitly: this err is local to the loop body
		// and shadows the named result, so a break followed by a naked
		// return would report success.
		add, err := sem.candidates(h)
		if err != nil {
			return nil, err
		}
		cands = append(cands, add...)
	}
	return cands, nil
}
Beispiel #4
0
// storeLinks reads link-frequency maps from links until the channel is
// closed and records them in the titles and linkstats tables of db.
//
// For each link the anchor text is tokenized and hashed as n-grams of
// exactly n = min(maxN, len(tokens)) tokens. The link target is inserted
// into titles if missing; then, for every hash, a zero-count linkstats row
// is inserted if missing and its count is incremented. When an anchor yields
// more than one hash, the link's frequency is split evenly over the hashes.
//
// storeLinks returns the first error from preparing or executing a
// statement. It returns immediately in that case, without draining links.
// The prepared statements are closed before storeLinks returns.
func storeLinks(db *sql.DB, links <-chan map[wikidump.Link]int,
	maxN int) (err error) {

	insTitle, err := db.Prepare(`insert or ignore into titles values (NULL, ?)`)
	if err != nil {
		return err
	}
	defer insTitle.Close()

	insLink, err := db.Prepare(
		`insert or ignore into linkstats values
		 (?, (select id from titles where title = ?), 0)`)
	if err != nil {
		return err
	}
	defer insLink.Close()

	update, err := db.Prepare(
		`update linkstats set count = count + ?
		 where ngramhash = ?
		 and targetid = (select id from titles where title =?)`)
	if err != nil {
		return err
	}
	defer update.Close()

	for linkFreq := range links {
		for link, freq := range linkFreq {
			tokens := nlp.Tokenize(link.Anchor)
			n := min(maxN, len(tokens))
			hashes := hash.NGrams(tokens, n, n)
			if len(hashes) == 0 {
				// Nothing to record; no statement would run for this link.
				continue
			}

			// Divide freq rather than 1, so that the number of
			// occurrences is not lost when there are several hashes.
			count := float64(freq)
			if len(hashes) > 1 {
				count /= float64(len(hashes))
			}

			// The title does not depend on the hash, so one
			// "insert or ignore" per link is enough.
			if _, err = insTitle.Exec(link.Target); err != nil {
				return err
			}
			for _, h := range hashes {
				if _, err = insLink.Exec(h, link.Target); err != nil {
					return err
				}
				if _, err = update.Exec(count, h, link.Target); err != nil {
					return err
				}
			}
		}
	}
	return nil
}
Beispiel #5
0
// main parses a Wikipedia dump and stores the results in a database.
//
// It optionally downloads the dump, creates the database, and then runs a
// pipeline of goroutines: wikidump.GetPages feeds the articles and redirects
// channels, a pool of workers extracts links and counts n-grams into
// per-worker count-min sketches, one goroutine stores the links and another
// collects redirects. When the pipeline has finished, the redirects and the
// merged sketch are written to the database, which is then finalized and
// closed. Any error reported through check terminates the program via
// log.Fatal.
func main() {
	kingpin.Parse()

	log.SetPrefix("dumpparser ")

	// check aborts the program if the most recent assignment to err
	// recorded a failure. It reads err through the closure, so it must be
	// called right after each assignment.
	var err error
	check := func() {
		if err != nil {
			log.Fatal(err)
		}
	}

	if *download != "" {
		// Download overwrites *dumppath with the path it returns.
		*dumppath, err = wikidump.Download(*download, *dumppath, true)
		check()
	} else if *dumppath == "" {
		log.Fatal("no --download and no dumppath specified (try --help)")
	}

	f, err := open(*dumppath)
	check()
	// NOTE(review): log.Fatal exits without running deferred calls, so f is
	// only closed here on a normal return from main.
	defer f.Close()

	log.Printf("Creating database at %s", *dbpath)
	db, err := storage.MakeDB(*dbpath, true,
		&storage.Settings{*dumppath, uint(*maxNGram)})
	check()

	// The numbers here are completely arbitrary.
	nworkers := runtime.GOMAXPROCS(0)
	articles := make(chan *wikidump.Page, 10*nworkers)
	links := make(chan map[wikidump.Link]int, 10*nworkers)
	redirects := make(chan *wikidump.Redirect, 100)

	// wg tracks the three long-lived goroutines below: the redirect
	// collector, the goroutine that merges the sketches, and the link
	// storer.
	var wg sync.WaitGroup

	// Collect redirects.
	wg.Add(1)
	redirmap := make(map[string]string)
	go func() {
		// Ends when redirects is closed; presumably GetPages does that
		// once the dump is exhausted (TODO confirm).
		for r := range redirects {
			redirmap[r.Title] = r.Target
		}
		wg.Done()
	}()

	// Clean up and tokenize articles, extract links, count n-grams.
	maxN := int(*maxNGram)
	counters := make([]*countmin.Sketch, nworkers)

	// worker tracks only the article-processing goroutines, so that links
	// can be closed as soon as the last of them is done sending.
	var worker sync.WaitGroup
	worker.Add(nworkers)
	log.Printf("%d workers", nworkers)
	for i := 0; i < nworkers; i++ {
		// Each worker gets its own sketch and receives it as an argument.
		counters[i], err = countmin.New(int(*nrows), int(*ncols))
		check()

		go func(ngramcount *countmin.Sketch) {
			// Ends when articles is closed; presumably by GetPages
			// (TODO confirm).
			for a := range articles {
				text := wikidump.Cleanup(a.Text)
				links <- wikidump.ExtractLinks(text)

				tokens := nlp.Tokenize(text)
				for _, h := range hash.NGrams(tokens, 1, maxN) {
					ngramcount.Add1(h)
				}
			}
			worker.Done()
		}(counters[i])
	}

	wg.Add(1)
	go func() {
		// Once every worker has finished, no more links will be sent:
		// closing links lets storeLinks terminate.
		worker.Wait()
		close(links)

		// Fold all per-worker sketches into counters[0].
		for i := 1; i < nworkers; i++ {
			counters[0].Sum(counters[i])
		}
		counters = counters[:1]

		wg.Done()
	}()

	// Collect links and store them in the database.
	wg.Add(1)
	// NOTE(review): done is created here and closed after wg.Wait, but
	// nothing in this function receives from it.
	done := make(chan struct{})
	go func() {
		// A storage failure panics in this goroutine, which brings the
		// whole program down.
		if slerr := storeLinks(db, links, maxN); slerr != nil {
			panic(slerr)
		}
		wg.Done()
	}()

	// Start the producer only after every consumer is running.
	go wikidump.GetPages(f, articles, redirects)

	// After Wait returns, redirmap and counters[0] are no longer written by
	// any of the goroutines above and can be read safely here.
	wg.Wait()
	close(done)

	log.Printf("Processing %d redirects", len(redirmap))
	// NOTE(review): any result returned by ProcessRedirects is not
	// inspected here — confirm whether it reports an error.
	storage.ProcessRedirects(db, redirmap, true)

	err = storage.StoreCM(db, counters[0])
	check()

	log.Println("Finalizing database")
	err = storage.Finalize(db)
	check()
	err = db.Close()
	check()
}
Beispiel #6
0
// ExactMatch returns all candidate entity mentions for the string s.
//
// A candidate entity's anchor text must be exactly s: s is tokenized and
// looked up as a single n-gram spanning all of its tokens. If hashing
// produces no n-gram (for instance because s yields no tokens), nothing can
// match and ExactMatch returns (nil, nil).
func (sem Semanticizer) ExactMatch(s string) (cands []Entity, err error) {
	tokens := nlp.Tokenize(s)
	hashes := hash.NGrams(tokens, len(tokens), len(tokens))
	if len(hashes) == 0 {
		// Indexing hashes[0] on an empty result would panic.
		return nil, nil
	}
	return sem.candidates(hashes[0], 0, len(tokens))
}