// Tokenize scans text with a text/scanner.Scanner in its default
// configuration and accumulates stemmed identifier counts into words.
//
// Quoted strings, raw (backtick) strings and character literals have their
// surrounding quote characters trimmed and are then tokenized recursively, so
// identifiers embedded in literals are counted as well. Every other
// non-identifier token is ignored.
//
// An identifier is counted only if it is longer than two bytes and neither the
// identifier itself nor its English stem is a key of stopwords (only key
// membership of stopwords is consulted; its values are never read). words is
// modified in place: the entry for each accepted stem is incremented by one.
func Tokenize(text string, stopwords map[string]int, words map[string]int) {
	var s scanner.Scanner
	s.Init(strings.NewReader(text))
	for tok := s.Scan(); tok != scanner.EOF; tok = s.Scan() {
		switch tok {
		case scanner.String, scanner.RawString:
			// text/scanner reports backtick-quoted literals as RawString, not
			// String, so both must be matched for the backtick trim to apply.
			Tokenize(strings.Trim(s.TokenText(), "\"`"), stopwords, words)
		case scanner.Char:
			Tokenize(strings.Trim(s.TokenText(), "'"), stopwords, words)
		case scanner.Ident:
			word := s.TokenText()
			if _, stop := stopwords[word]; stop || len(word) <= 2 {
				continue
			}
			stem, err := snowball.Stem(word, "english", true)
			if err != nil {
				// The signature offers no way to report the failure, so fall
				// back to counting the unstemmed word.
				stem = word
			}
			if _, stop := stopwords[stem]; !stop {
				words[stem]++
			}
		}
	}
}
示例#2
0
// main lower-cases the text value declared elsewhere in the package, splits it
// on separator, trims cutset characters from each piece, stems each piece with
// the Russian snowball stemmer and sums the Tone of the resulting
// SentimentWord values. It prints the total tone followed by the elapsed time.
//
// Stemming errors are passed to check. NOTE(review): GetTone presumably fills
// in word.Tone from a dictionary — confirm against its definition.
func main() {
	start := time.Now()

	tokens := strings.Split(strings.ToLower(text), separator)

	resultTone := 0
	for _, token := range tokens {
		stemmed, err := snowball.Stem(strings.Trim(token, cutset), "russian", true)
		check(err)

		word := SentimentWord{Word: stemmed}
		word.GetTone()
		resultTone += word.Tone
	}

	fmt.Println(resultTone)
	fmt.Println(time.Since(start))

	//combined := readFile("dictionary/combined_stemmed.csv")
	//stemmed := FindDuplicates(combined)
	//writeFile("dictionary/combined_stemmed.csv", stemmed)
}
示例#3
0
文件: stem.go 项目: golang-cjr/xep
// Stem records, for every item produced by split(msg), one occurrence of the
// item's word under its snowball stem in words.wm (stem -> word -> count).
// Items whose stemming returns an error are skipped. After the loop,
// words.last is set to the current Unix time in seconds.
//
// Any panic raised while processing is recovered and discarded, in which case
// words.last is not updated.
func Stem(msg string) {
	defer func() {
		recover()
	}()

	for item := range split(msg) {
		root, err := snowball.Stem(item.word, item.lang, true)
		if err != nil {
			continue
		}

		forms, known := words.wm[root]
		if !known {
			forms = make(map[string]int)
			words.wm[root] = forms
		}
		// A missing key reads as zero, so this both creates and bumps the count.
		forms[item.word]++
	}

	words.last = time.Now().Unix()
	//log.Println(words.wm)
}
示例#4
0
// readFile opens filename and converts each line into a SentimentWord via
// FromCSV, replacing the entry's Word with its Russian snowball stem. The
// entries are returned in file order; the result is an empty, non-nil slice
// when the file has no lines.
//
// Errors from opening the file, stemming a word, or scanning are all handed to
// check.
func readFile(filename string) []SentimentWord {
	f, err := os.Open(filename)
	check(err)
	defer f.Close()

	entries := []SentimentWord{}
	lines := bufio.NewScanner(f)
	for lines.Scan() {
		var entry SentimentWord
		entry.FromCSV(lines.Text())

		stem, stemErr := snowball.Stem(entry.Word, "russian", true)
		check(stemErr)
		entry.Word = stem

		entries = append(entries, entry)
	}
	check(lines.Err())

	return entries
}
示例#5
0
// main stems the contents of a file line by line and prints one stem per line
// to standard output.
//
// Flags:
//
//	-l  stemming language passed to snowball.Stem (default "english")
//	-i  path of the input file
//
// Each line is trimmed of surrounding whitespace before stemming. The program
// terminates via log.Fatal if the file cannot be opened, a read fails, or a
// line does not fit in the reader's buffer. If stemming fails, the error is
// logged and processing stops without reading further lines.
func main() {
	language := flag.String("l", "english", "Language")
	infile := flag.String("i", "", "Input file for stemming")
	flag.Parse()

	f, err := os.Open(*infile)
	if err != nil {
		log.Fatal(err)
	}
	defer f.Close()

	bf := bufio.NewReader(f)
	for {
		line, isPrefix, err := bf.ReadLine()
		if err == io.EOF {
			break
		}
		if err != nil {
			log.Fatal(err)
		}
		if isPrefix {
			// Fatalf rather than Fatal: Print-style formatting inserts no
			// space between two string operands, which glued the file name
			// onto the message.
			log.Fatalf("Error: Unexpected long line reading %s", f.Name())
		}

		stemmed, err := snowball.Stem(strings.TrimSpace(string(line)), *language, true)
		if err != nil {
			log.Println(err)
			break
		}
		fmt.Println(stemmed)
	}
}