func main() { // Create a corpus object from the test corpus. corpus := corpustools.CorpusFromFile("../data/brown.txt", true, false) fmt.Println(corpus.Info()) // Get the list of comparison terms. t1 := time.Now() seqs := make([][]int, 0) for order := 1; order <= 3; order++ { for _, ngram := range corpus.Ngrams(order) { if corpus.Frequency(ngram) >= 20 { seqs = append(seqs, ngram) } } } t2 := time.Now() fmt.Printf("%d sequences in nearest neighbor set (took %v).\n", len(seqs), t2.Sub(t1)) // Compute and report the nearest neighbors. t1 = time.Now() for i := 0; i < 100; i++ { results := corpus.NearestNeighbors(seqs[i], seqs) fmt.Printf("Top 10 nearest neighbors of '%v' are...\n", corpus.ToString(seqs[i])) for j := 0; j < 10; j++ { fmt.Printf("'%v' score=%v\n", corpus.ToString(results[j].Seq), results[j].Val) } fmt.Println() } t2 = time.Now() fmt.Printf("Took %v.\n", t2.Sub(t1)) }
func main() { // Get the path for the test corpus. var path, _ = os.Getwd() path_parts := strings.Split(path, "/") path_parts = path_parts[:len(path_parts)-1] for _, part := range []string{"data", "brown.txt"} { path_parts = append(path_parts, part) } corpusfile := strings.Join(path_parts, "/") // Create a corpus object from the test corpus. lowerCase, returnChars := true, false corpus := corpustools.CorpusFromFile(corpusfile, lowerCase, returnChars) fmt.Println(corpus.Info()) }
func main() { // Load the corpus as a lower-case sequence of characters. corpus := corpustools.CorpusFromFile("../data/brown.txt", true, true) fmt.Println(corpus.Info()) // Enumerate all the ngrams we want to explore. seqs := make([][]int, 0) for length := 3; length <= 10; length++ { ngrams := corpus.Ngrams(length) for _, ngram := range ngrams { if corpus.Frequency(ngram) >= 600 { seqs = append(seqs, ngram) } } } fmt.Printf("%d ngrams to be explored.\n", len(seqs)) // Initialize the MDLSegmenter. mdlseg := corpustools.NewMDLSegmenter(corpus) // Initialize the output file. of, err := os.Create("mdl_segmentation_results.csv") if err != nil { log.Fatal(err) } defer of.Close() of.Write([]byte("sequence,sequence_as_text,freq,dl_model,dl_data,dl_total\n")) // Compute baseline description length. dl_model_baseline, dl_data_baseline := mdlseg.DescriptionLength() of.Write([]byte(fmt.Sprintf("%v,%v,%v,%v,%v,%v\n", "null", "null", 0, dl_model_baseline, dl_data_baseline, dl_model_baseline+dl_data_baseline))) // Identify the corpus subsequences which minimize the description length of the corpus. dlds := make(corpustools.Results, 0) for i, seq := range seqs { t := time.Now() mdlseg.AddNgram(seq) dl_model, dl_data := mdlseg.DescriptionLength() mdlseg.RemoveNgram(seq) dlds = append(dlds, corpustools.Result{Seq: seq, Val: dl_model + dl_data}) // Report on performance. fmt.Printf(" Sequence %d/%d: %10v (%10v) --> %10.2f, %10.2f (took %v)\n", i+1, len(seqs), corpus.ToString(seq), seq, dl_model, dl_data, time.Now().Sub(t)) // Write result to CSV file. of.Write([]byte(fmt.Sprintf("%v,%v,%v,%v,%v,%v\n", seq, corpus.ToString(seq), corpus.Frequency(seq), dl_model, dl_data, dl_model+dl_data))) } }
func main() { // Create a corpus from a text file. corpus := corpustools.CorpusFromFile("../data/brown.txt", true, false) fmt.Println(corpus.Info()) // Iterate over various orders and generate the ngrams of this length. for n := 1; n <= 10; n++ { t1 := time.Now() ngrams := corpus.Ngrams(n) t2 := time.Now() fmt.Printf("%d %dgrams found in %v.\n", len(ngrams), n, t2.Sub(t1)) } // Report the frequencies of ngrams. for n := 1; n <= 10; n++ { ngrams := corpus.Ngrams(n) for j, ngram := range ngrams { fmt.Printf("%dgram %d = %v (%v) has frequency of %d.\n", n, j, corpus.ToString(ngram), ngram, corpus.Frequency(ngram)) } } }
func main() { // Get the path for the test corpus. var path, _ = os.Getwd() path_parts := strings.Split(path, "/") path_parts = path_parts[:len(path_parts)-1] for _, part := range []string{"data", "brown.txt"} { path_parts = append(path_parts, part) } corpusfile := strings.Join(path_parts, "/") // Create a corpus object from the test corpus. lowerCase, returnChars := true, false corpus := corpustools.CorpusFromFile(corpusfile, lowerCase, returnChars) fmt.Println(corpus.Info()) // Calculate the mean cross-entropy of the corpus trained on itself corpus_sequence := corpus.Corpus() for predictor_length := 0; predictor_length <= 5; predictor_length++ { probs := corpus.ProbabilityTransitions(corpus_sequence, predictor_length) _, L_mn := corpustools.SummarizeProbabilities(probs) fmt.Printf("The mean cross-entropy of the corpus with itself using length %d predictors is %.2f bits.\n", predictor_length, L_mn) } }
func main() { // Create a corpus from a text file. corpus := corpustools.CorpusFromFile("../data/brown.txt", true, false) fmt.Println(corpus.Info()) // Compute the mutual information associated with ngrams of varying length. min_freq := 2 for n := 2; n <= 4; n++ { results := make(corpustools.Results, 0) for _, ngram := range corpus.Ngrams(n) { if corpus.Frequency(ngram) >= min_freq { I := corpus.MutualInformation(ngram) results = append(results, corpustools.Result{ngram, I}) } } sort.Sort(corpustools.ResultsReverseSort{results}) fmt.Printf("%dgrams with the highest mutual information:\n", n) for i, result := range results { fmt.Printf("%d: %v (%v) --> %v\n", i+1, corpus.ToString(result.Seq), result.Seq, result.Val) } fmt.Printf("\n") } }