func main() { flag.Usage = func() { fmt.Fprintf(os.Stderr, "usage: %s FILENAME\n", os.Args[0]) flag.PrintDefaults() } lang := flag.String("lang", "english", "stemmer language") flag.Parse() if flag.NArg() != 1 { fmt.Fprintf(os.Stderr, "error: wrong number of arguments\n") os.Exit(1) } fmt.Println("Using snowball version", snowball.Version) stmr, err := snowball.New(*lang) if err != nil { fmt.Fprintf(os.Stderr, "error: %s\n", err) os.Exit(1) } data, err := ioutil.ReadFile(flag.Arg(0)) if err != nil { fmt.Fprintf(os.Stderr, "error: can't open %s - %s\n", flag.Arg(0), err) os.Exit(1) } re := regexp.MustCompile("[a-zA-Z]+") for _, field := range re.FindAll(data, -1) { word := string(bytes.ToLower(field)) fmt.Printf("%s -> %s\n", word, stmr.Stem(word)) } }
func NewStemmerFilter(lang string) (*StemmerFilter, error) { stemmer, err := snowball.New(lang) if err != nil { return nil, err } return &StemmerFilter{ lang: lang, stemmer: stemmer, }, nil }
func openGraph(path string) (*graph, error) { if _, err := os.Stat(path); err != nil { if !os.IsNotExist(err) { return nil, err } err = initGraph(path, defaultGraphOptions) if err != nil { return nil, err } } url := fmt.Sprintf("file:%s?cache=shared&mode=rwc", path) db, err := sql.Open("sqlite3", url) if err != nil { return nil, err } err = pragmas(db) if err != nil { return nil, err } stmts := new(stmts) err = prepareInfoSql(db, stmts) if err != nil { return nil, err } g := &graph{db: db, lock: sync.RWMutex{}, q: stmts} g.order = g.getOrder() err = prepareSql(db, stmts, g.order) if err != nil { return nil, err } lang, err := g.getInfoString("stemmer") if lang != "" { s, err := snowball.New(lang) if err != nil { log.Printf("Error initializing stemmer: %s", err) } else { g.stemmer = newCobeStemmer(s) } } g.endTokenID = g.getOrCreateToken("") g.endContextID = g.getOrCreateNode(g.endContext()) return g, nil }
func NewStemmerFilter(lang string) (*StemmerFilter, error) { stemmerPool := make(chan *snowball.Stemmer, 4) for i := 0; i < 4; i++ { stemmer, err := snowball.New(lang) if err != nil { return nil, err } stemmerPool <- stemmer } return &StemmerFilter{ lang: lang, stemmerPool: stemmerPool, }, nil }
func (g *graph) setStemmer(lang string) error { snow, err := snowball.New(lang) if err != nil { return err } stemmer := newCobeStemmer(snow) g.deleteTokenStems() g.updateTokenStems(stemmer) g.setInfoString("stemmer", lang) g.stemmer = stemmer return nil }
func TestCobeStemmer(t *testing.T) { snow, _ := snowball.New("english") s := newCobeStemmer(snow) // Straight port of the Python cobe stemmer. var tests = []struct { token string expected string }{ {"foo", "foo"}, {"jumping", "jump"}, {"running", "run"}, {"Foo", "foo"}, {"FOO", "foo"}, {"FOO'S'", "foo"}, {"FOOING", "foo"}, {"Fooing", "foo"}, {":)", ":)"}, {":-)", ":)"}, {": )", ":)"}, {":()", ":("}, {":-(", ":("}, {": (", ":("}, {":' (", ":("}, } for ti, tt := range tests { stem := s.Stem(tt.token) if tt.expected != stem { t.Errorf("[%d] %s\n%s !=\n%s", ti, tt.token, stem, tt.expected) } } }
// NewSnowball returns a Stemmer backed by the English snowball stemmer.
//
// On failure it returns a nil Stemmer together with the error from
// snowball.New.
func NewSnowball() (Stemmer, error) {
	s, err := snowball.New("english")
	if err != nil {
		// Return a literal nil and do not forward s: presumably Stemmer is an
		// interface, and a nil pointer stored in an interface value compares
		// non-nil to callers.
		return nil, err
	}
	return s, nil
}