func main() {
	// Create a corpus object from the test corpus (word tokens, lower-cased).
	corpus := corpustools.CorpusFromFile("../data/brown.txt", true, false)
	fmt.Println(corpus.Info())

	// Collect every ngram of order 1..3 occurring at least 20 times; these
	// form the candidate set for the nearest-neighbor comparison below.
	t1 := time.Now()
	seqs := make([][]int, 0)
	for order := 1; order <= 3; order++ {
		for _, ngram := range corpus.Ngrams(order) {
			if corpus.Frequency(ngram) >= 20 {
				seqs = append(seqs, ngram)
			}
		}
	}
	fmt.Printf("%d sequences in nearest neighbor set (took %v).\n", len(seqs), time.Since(t1))

	// Compute and report the nearest neighbors of the first 100 sequences.
	// Both loop bounds are clamped so a small corpus (fewer than 100
	// candidates, or fewer than 10 neighbors returned) cannot panic with an
	// index out of range, as the original hard-coded bounds could.
	t1 = time.Now()
	numQueries := 100
	if len(seqs) < numQueries {
		numQueries = len(seqs)
	}
	for i := 0; i < numQueries; i++ {
		results := corpus.NearestNeighbors(seqs[i], seqs)
		fmt.Printf("Top 10 nearest neighbors of '%v' are...\n", corpus.ToString(seqs[i]))
		topK := 10
		if len(results) < topK {
			topK = len(results)
		}
		for j := 0; j < topK; j++ {
			fmt.Printf("'%v' score=%v\n", corpus.ToString(results[j].Seq), results[j].Val)
		}
		fmt.Println()
	}
	fmt.Printf("Took %v.\n", time.Since(t1))
}
Example n. 2
0
func main() {
	// Locate the test corpus relative to the working directory: strip the
	// final path element, then descend into data/brown.txt (../data/brown.txt).
	path, err := os.Getwd()
	if err != nil {
		// Without a working directory we cannot build the corpus path.
		fmt.Println(err)
		return
	}
	pathParts := strings.Split(path, "/")
	pathParts = pathParts[:len(pathParts)-1]
	// append is variadic: add both trailing elements in one call rather
	// than looping over a two-element slice.
	pathParts = append(pathParts, "data", "brown.txt")
	corpusfile := strings.Join(pathParts, "/")

	// Create a corpus object from the test corpus.
	lowerCase, returnChars := true, false
	corpus := corpustools.CorpusFromFile(corpusfile, lowerCase, returnChars)
	fmt.Println(corpus.Info())
}
func main() {
	// Load the corpus as a lower-case sequence of characters.
	corpus := corpustools.CorpusFromFile("../data/brown.txt", true, true)
	fmt.Println(corpus.Info())

	// Enumerate the candidate ngrams: lengths 3..10 with frequency >= 600.
	seqs := make([][]int, 0)
	for length := 3; length <= 10; length++ {
		for _, ngram := range corpus.Ngrams(length) {
			if corpus.Frequency(ngram) >= 600 {
				seqs = append(seqs, ngram)
			}
		}
	}
	fmt.Printf("%d ngrams to be explored.\n", len(seqs))

	// Initialize the MDLSegmenter.
	mdlseg := corpustools.NewMDLSegmenter(corpus)

	// Initialize the output file. All writes go through write() below so a
	// write failure aborts the run instead of silently truncating results
	// (the original ignored every of.Write error).
	of, err := os.Create("mdl_segmentation_results.csv")
	if err != nil {
		log.Fatal(err)
	}
	defer of.Close()
	write := func(s string) {
		if _, werr := of.WriteString(s); werr != nil {
			log.Fatal(werr)
		}
	}
	write("sequence,sequence_as_text,freq,dl_model,dl_data,dl_total\n")

	// Compute the baseline description length (model with no ngrams added).
	dlModelBaseline, dlDataBaseline := mdlseg.DescriptionLength()
	write(fmt.Sprintf("%v,%v,%v,%v,%v,%v\n", "null", "null", 0, dlModelBaseline, dlDataBaseline, dlModelBaseline+dlDataBaseline))

	// Score each candidate independently: add it to the model, measure the
	// description length, then remove it again before the next candidate.
	dlds := make(corpustools.Results, 0)
	for i, seq := range seqs {
		t := time.Now()
		mdlseg.AddNgram(seq)
		dlModel, dlData := mdlseg.DescriptionLength()
		mdlseg.RemoveNgram(seq)
		dlds = append(dlds, corpustools.Result{Seq: seq, Val: dlModel + dlData})
		// Report on performance.
		fmt.Printf("  Sequence %d/%d: %10v (%10v) --> %10.2f, %10.2f (took %v)\n", i+1, len(seqs), corpus.ToString(seq), seq, dlModel, dlData, time.Since(t))
		// Write result to CSV file.
		write(fmt.Sprintf("%v,%v,%v,%v,%v,%v\n", seq, corpus.ToString(seq), corpus.Frequency(seq), dlModel, dlData, dlModel+dlData))
	}
	// NOTE(review): dlds is accumulated but never consulted afterwards —
	// presumably intended for later sorting/reporting; confirm before removing.
}
Example n. 4
0
func main() {
	// Build a word-level, lower-cased corpus from the Brown test file.
	corpus := corpustools.CorpusFromFile("../data/brown.txt", true, false)
	fmt.Println(corpus.Info())

	// Time ngram generation for each order from 1 through 10.
	for order := 1; order <= 10; order++ {
		start := time.Now()
		grams := corpus.Ngrams(order)
		stop := time.Now()
		fmt.Printf("%d %dgrams found in %v.\n", len(grams), order, stop.Sub(start))
	}

	// Print the frequency of every ngram at each order.
	for order := 1; order <= 10; order++ {
		for idx, gram := range corpus.Ngrams(order) {
			fmt.Printf("%dgram %d = %v (%v) has frequency of %d.\n", order, idx, corpus.ToString(gram), gram, corpus.Frequency(gram))
		}
	}
}
Example n. 5
0
func main() {
	// Locate the test corpus relative to the working directory: strip the
	// final path element, then descend into data/brown.txt (../data/brown.txt).
	path, err := os.Getwd()
	if err != nil {
		// Without a working directory we cannot build the corpus path.
		fmt.Println(err)
		return
	}
	pathParts := strings.Split(path, "/")
	pathParts = pathParts[:len(pathParts)-1]
	// append is variadic: add both trailing elements in one call rather
	// than looping over a two-element slice.
	pathParts = append(pathParts, "data", "brown.txt")
	corpusfile := strings.Join(pathParts, "/")

	// Create a corpus object from the test corpus.
	lowerCase, returnChars := true, false
	corpus := corpustools.CorpusFromFile(corpusfile, lowerCase, returnChars)
	fmt.Println(corpus.Info())

	// Calculate the mean cross-entropy of the corpus trained on itself,
	// for predictor contexts of length 0 through 5.
	corpusSequence := corpus.Corpus()
	for predictorLength := 0; predictorLength <= 5; predictorLength++ {
		probs := corpus.ProbabilityTransitions(corpusSequence, predictorLength)
		_, meanCrossEntropy := corpustools.SummarizeProbabilities(probs)
		fmt.Printf("The mean cross-entropy of the corpus with itself using length %d predictors is %.2f bits.\n", predictorLength, meanCrossEntropy)
	}
}
Example n. 6
0
func main() {
	// Create a corpus from a text file (word tokens, lower-cased).
	corpus := corpustools.CorpusFromFile("../data/brown.txt", true, false)
	fmt.Println(corpus.Info())

	// For ngram lengths 2..4, compute the mutual information of every ngram
	// occurring at least minFreq times and report them in descending order.
	minFreq := 2
	for n := 2; n <= 4; n++ {
		results := make(corpustools.Results, 0)
		for _, ngram := range corpus.Ngrams(n) {
			if corpus.Frequency(ngram) >= minFreq {
				I := corpus.MutualInformation(ngram)
				// Keyed fields for vet-cleanliness and consistency with the
				// Result literals used elsewhere in this codebase.
				results = append(results, corpustools.Result{Seq: ngram, Val: I})
			}
		}
		// ResultsReverseSort presumably wraps Results to sort descending by
		// Val — field name not visible here, so the literal stays unkeyed.
		sort.Sort(corpustools.ResultsReverseSort{results})
		fmt.Printf("%dgrams with the highest mutual information:\n", n)
		for i, result := range results {
			fmt.Printf("%d: %v (%v) --> %v\n", i+1, corpus.ToString(result.Seq), result.Seq, result.Val)
		}
		fmt.Printf("\n")
	}
}