Exemple #1
0
// CreateObservationSequence reads the file paragraph by paragraph (every line
// obtained from the reader is treated as one paragraph) and returns one
// feature vector per sentence:
//   - element 0: 1-based position of the sentence within its paragraph
//   - element 1: number of parts nlptk.Sentence.GetParts reports for the
//     sentence with its last byte removed (presumably the sentence length in
//     words -- confirm against nlptk)
//
// Paragraphs whose text is at most one byte long, or that yield no sentences,
// are skipped and do not count towards the limit. Reading stops once
// 2*(paragraphs processed) >= states, or when the reader is exhausted. A final
// paragraph that is not terminated by a newline is processed as well.
func CreateObservationSequence(filename string, states int) [][]int {
	output := make([][]int, 0)

	// NOTE(review): this counter advances once per paragraph, yet it is also
	// stored as the number of every sentence in that paragraph. ObserveFile
	// advances its counter once per sentence -- confirm which is intended.
	count := 0

	reader := GetReader(filename)

	for done := false; !done; {
		line, err := reader.ReadBytes('\n')
		// A read error (normally end of input) ends the loop after this
		// pass, but bytes returned alongside it still form a last,
		// unterminated paragraph and must not be dropped.
		done = err != nil
		if len(line) == 0 {
			continue
		}
		// Strip the delimiter only when it is really there.
		if last := len(line) - 1; line[last] == '\n' {
			line = line[:last]
		}

		par := nlptk.Paragraph{0, string(line)}
		sentences := par.GetParts()
		if len(par.Text) <= 1 || len(sentences) == 0 {
			continue
		}

		for i, s := range sentences {
			if len(s) == 0 {
				// Slicing off the last byte below would panic.
				continue
			}
			sentence := nlptk.Sentence{count, s[:len(s)-1]}
			output = append(output, []int{i + 1, len(sentence.GetParts())})
		}

		if count++; 2*count >= states {
			break
		}
	}
	return output
}
Exemple #2
0
// WordCount counts the words of the learning-set file SETDIR+filename and
// sends the resulting dictionary on dict.
//
// Every line of the file is split into sentences with
// nlptk.Paragraph.GetParts and every sentence into words with
// nlptk.Sentence.GetParts (any trimming of special signs is presumably done
// there -- confirm against nlptk). Each word is counted under its own key and
// additionally under the key "TOTAL", so "TOTAL" holds the overall number of
// words. A last line without a trailing newline is counted too.
//
// If the file cannot be opened, a message is printed and the process exits
// with status 1. The send on dict blocks until a receiver is ready unless the
// channel is buffered.
func WordCount(filename string, dict chan map[string]int) {
	file, err := os.Open(SETDIR + filename)
	if err != nil {
		fmt.Println("Error reading file", filename, err)
		os.Exit(1)
	}

	counts := make(map[string]int)
	reader := bufio.NewReader(file)

	for done := false; !done; {
		line, readErr := reader.ReadBytes('\n')
		// ReadBytes hands back the bytes read so far together with the
		// error, so an unterminated last line arrives with a non-nil error
		// and still has to be counted before the loop ends.
		done = readErr != nil
		if len(line) == 0 {
			continue
		}

		paragraph := nlptk.Paragraph{0, string(line)}
		for _, sentence := range paragraph.GetParts() {
			s := nlptk.Sentence{0, sentence}
			for _, word := range s.GetParts() {
				counts[word]++
				counts["TOTAL"]++
			}
		}
	}

	// Release the descriptor before the potentially blocking send rather
	// than deferring it; a close error on a file that was only read carries
	// no useful information, so it is ignored.
	file.Close()

	dict <- counts
}
Exemple #3
0
// ObserveFile compares a full text with its summarization and builds the
// observation sequence for the file.
//
// The full text is read from full_dir+filename, every line being treated as
// one paragraph; the summarization is the first line of summ_dir+filename.
// Each sentence contributes two consecutive entries to the result: its
// feature vector {1-based position in the paragraph, number of parts reported
// by nlptk.Sentence.GetParts} and a {0, 0} placeholder. The placeholder comes
// first when the sentence (without its last byte) occurs in the summarization
// line, and second otherwise.
//
// Paragraphs whose text is at most one byte long, or that yield no sentences,
// are skipped. A final paragraph without a trailing newline is processed as
// well. Processing stops as soon as 2*(sentences processed) >= states; in
// that case the result is returned without printing the "Sequence" trace
// line, which is only written when the whole input has been consumed.
func ObserveFile(filename, full_dir, summ_dir string, states int) [][]int {
	fullReader := GetReader(full_dir + filename)
	summReader := GetReader(summ_dir + filename)

	// Only the first line of the summarization is consulted. The error is
	// ignored on purpose: whatever bytes were returned are used as the
	// summary, and an empty summary simply matches nothing but the empty
	// string.
	summLine, _ := summReader.ReadBytes('\n')
	summary := string(summLine)

	observations := make([][]int, 0)
	sentenceCount, parCount := 0, 0

	for done := false; !done; {
		line, err := fullReader.ReadBytes('\n')
		// A read error (normally end of input) ends the loop after this
		// pass, but bytes returned with it still form a last, unterminated
		// paragraph.
		done = err != nil
		if len(line) == 0 {
			continue
		}
		// Strip the delimiter only when it is really there.
		if last := len(line) - 1; line[last] == '\n' {
			line = line[:last]
		}

		paragraph := nlptk.Paragraph{parCount, string(line)}
		sentences := paragraph.GetParts()
		if len(paragraph.Text) <= 1 || len(sentences) == 0 {
			continue
		}

		for i, s := range sentences {
			if len(s) == 0 {
				// Slicing off the last byte below would panic.
				continue
			}
			// NOTE(review): for a one-byte sentence trimmed is "", which
			// strings.Contains reports as present in any summary.
			trimmed := s[:len(s)-1]
			sentence := nlptk.Sentence{sentenceCount, trimmed}
			features := []int{i + 1, len(sentence.GetParts())}

			if strings.Contains(summary, trimmed) {
				// Sentence is part of the summary: placeholder first.
				observations = append(observations, []int{0, 0}, features)
			} else {
				// Sentence is not part of the summary: features first.
				observations = append(observations, features, []int{0, 0})
			}

			if sentenceCount++; 2*sentenceCount >= states {
				return observations
			}
		}
		parCount++
	}

	fmt.Println("Sequence", filename, observations)
	return observations
}