// Function turns file into slice with feature symbols // - feature == 0 -> position in paragraph // - feature == 1 -> sentence length // now for else func CreateObservationSequence(filename string, states int) [][]int { output := make([][]int, 0, 0) s_number := 0 reader_full := GetReader(filename) for bpar, e := reader_full.ReadBytes('\n'); e == nil; bpar, e = reader_full.ReadBytes('\n') { var sentences []string par := nlptk.Paragraph{0, string(bpar[:len(bpar)-1])} if sentences = par.GetParts(); len(par.Text) <= 1 || len(sentences) == 0 { continue } for i, s := range sentences { sentence := nlptk.Sentence{s_number, s[:len(s)-1]} output = append(output, []int{i + 1, len(sentence.GetParts())}) // summary state } if s_number++; 2*s_number >= states { break } } return output }
// Extracts, trims from special signs and counts "bare" words in learning set. // Result dictionary is sent to channel. func WordCount(filename string, dict chan map[string]int) { word_counter := make(map[string]int) file, err := os.Open(SETDIR + filename) if err != nil { fmt.Println("Error reading file", filename) os.Exit(1) } reader := bufio.NewReader(file) for bpar, e := reader.ReadBytes('\n'); e == nil; bpar, e = reader.ReadBytes('\n') { paragraph := nlptk.Paragraph{0, string(bpar)} sentences := paragraph.GetParts() for _, sentence := range sentences { s := nlptk.Sentence{0, sentence} words := s.GetParts() if len(words) == 0 { continue } for _, word := range words { word_counter[word]++ word_counter["TOTAL"]++ } } } dict <- word_counter }
// Analyze full text and summarization to prepare observations: // - vectors of features // - binary table of sentence in summarization presence func ObserveFile(filename, full_dir, summ_dir string, states int) [][]int { reader_full := GetReader(full_dir + filename) reader_summ := GetReader(summ_dir + filename) s_counter := make([][]int, 0, 0) s_number, p_number := 0, 0 spar, _ := reader_summ.ReadBytes('\n') summarization := nlptk.Paragraph{p_number, string(spar)} sum_sentences := summarization.Text for bpar, e := reader_full.ReadBytes('\n'); e == nil; bpar, e = reader_full.ReadBytes('\n') { var sentences []string paragraph := nlptk.Paragraph{p_number, string(bpar[:len(bpar)-1])} if sentences = paragraph.GetParts(); len(paragraph.Text) <= 1 || len(sentences) == 0 { continue } for i, s := range sentences { sentence := nlptk.Sentence{s_number, s[:len(s)-1]} if strings.Contains(sum_sentences, s[:len(s)-1]) { // summary s_counter = append(s_counter, []int{0, 0}) s_counter = append(s_counter, []int{i + 1, len(sentence.GetParts())}) } else { // non-summary s_counter = append(s_counter, []int{i + 1, len(sentence.GetParts())}) s_counter = append(s_counter, []int{0, 0}) } if s_number++; 2*s_number >= states { return s_counter } } p_number++ } fmt.Println("Sequence", filename, s_counter) return s_counter }
// Prints sequence of states (appear, not appear) given by slice func PrintSequence(filename string, sequence []int) string { output := make([]string, 0, 0) s_number := 0 reader_full := GetReader(filename) writer := GetWriter(filename) for bpar, e := reader_full.ReadBytes('\n'); e == nil; bpar, e = reader_full.ReadBytes('\n') { paragraph := nlptk.Paragraph{0, string(bpar[:len(bpar)-1])} var sentences []string // check if paragraph is empty if len(paragraph.Text) <= 1 { continue } // just in case avoid kinky sentences if sentences = paragraph.GetParts(); len(sentences) == 0 { continue } for _, s := range sentences { for _, v := range sequence { if v == 2*s_number+1 { output = append(output, s) writer.Write([]byte(s)) } } if s_number++; 2*s_number+1 >= len(sequence) { writer.Flush() return strings.Join(output, ". ") } } } writer.Flush() return strings.Join(output, ". ") }