func Tokenize(text string, stopwords map[string]int, words map[string]int) { var s scanner.Scanner s.Init(strings.NewReader(text)) tok := s.Scan() for tok != scanner.EOF { if tok == scanner.String { Tokenize(strings.Trim(s.TokenText(), "\"`"), stopwords, words) } else if tok == scanner.Char { Tokenize(strings.Trim(s.TokenText(), "'"), stopwords, words) } else if tok == scanner.Ident { word := s.TokenText() if _, ok := stopwords[word]; !ok && len(word) > 2 { stem, err := snowball.Stem(word, "english", true) if err != nil { fmt.Errorf("Couldnt stem word: %s", word) stem = word } if _, ok := stopwords[stem]; !ok { words[stem] += 1 } } } tok = s.Scan() } }
func main() { start := time.Now() text_lower := strings.ToLower(text) tokens := strings.Split(text_lower, separator) for key, token := range tokens { tokens[key] = strings.Trim(token, cutset) } text_words := make([]SentimentWord, 0) result_tone := 0 for key, token := range tokens { stemmed, err := snowball.Stem(token, "russian", true) check(err) tokens[key] = stemmed word := SentimentWord{Word: stemmed} word.GetTone() result_tone += word.Tone text_words = append(text_words, word) } fmt.Println(result_tone) fmt.Println(time.Since(start)) //combined := readFile("dictionary/combined_stemmed.csv") //stemmed := FindDuplicates(combined) //writeFile("dictionary/combined_stemmed.csv", stemmed) }
func Stem(msg string) { defer func() { recover() }() for w := range split(msg) { if stem, err := snowball.Stem(w.word, w.lang, true); err == nil { var m map[string]int ok := false if m, ok = words.wm[stem]; !ok { m = make(map[string]int) words.wm[stem] = m } if x, ok := m[w.word]; ok { m[w.word] = x + 1 } else { m[w.word] = 1 } } } words.last = time.Now().Unix() //log.Println(words.wm) }
func readFile(filename string) []SentimentWord { file, err := os.Open(filename) check(err) defer file.Close() words := make([]SentimentWord, 0) scanner := bufio.NewScanner(file) for scanner.Scan() { line := scanner.Text() word := SentimentWord{} word.FromCSV(line) stemmed, err := snowball.Stem(word.Word, "russian", true) check(err) word.Word = stemmed words = append(words, word) } err = scanner.Err() check(err) return words }
func main() { var language *string = flag.String("l", "english", "Language") var infile *string = flag.String("i", "", "Input file for stemming") flag.Parse() f, err := os.Open(*infile) if err != nil { log.Fatal(err) } bf := bufio.NewReader(f) for { line, isPrefix, err := bf.ReadLine() if err == io.EOF { break } if err != nil { log.Fatal(err) } if isPrefix { log.Fatal("Error: Unexpected long line reading", f.Name()) } word := strings.TrimSpace(string(line)) stemmed, err := snowball.Stem(word, *language, true) if err != nil { log.Println(err) break } fmt.Println(stemmed) } }