Пример #1
0
// main stems every run of ASCII letters found in the file named on the
// command line and prints each lowercased word next to its stem.
//
// The stemmer language is selected with the -lang flag (default "english").
// Every failure is reported on stderr and ends the process with status 1.
func main() {
	// fail writes a formatted message to stderr and exits with status 1.
	fail := func(format string, args ...interface{}) {
		fmt.Fprintf(os.Stderr, format, args...)
		os.Exit(1)
	}

	language := flag.String("lang", "english", "stemmer language")
	flag.Usage = func() {
		fmt.Fprintf(os.Stderr, "usage: %s FILENAME\n", os.Args[0])
		flag.PrintDefaults()
	}
	flag.Parse()

	// Exactly one positional argument (the input file) is accepted.
	if flag.NArg() != 1 {
		fail("error: wrong number of arguments\n")
	}
	path := flag.Arg(0)

	fmt.Println("Using snowball version", snowball.Version)

	stemmer, err := snowball.New(*language)
	if err != nil {
		fail("error: %s\n", err)
	}

	contents, err := ioutil.ReadFile(path)
	if err != nil {
		fail("error: can't open %s - %s\n", path, err)
	}

	// A "word" is any maximal run of ASCII letters; everything else is skipped.
	wordRE := regexp.MustCompile("[a-zA-Z]+")
	matches := wordRE.FindAll(contents, -1)

	for i := range matches {
		word := string(bytes.ToLower(matches[i]))
		stem := stemmer.Stem(word)
		fmt.Printf("%s -> %s\n", word, stem)
	}
}
Пример #2
0
// NewStemmerFilter builds a StemmerFilter for the given language.
//
// It creates one snowball stemmer for lang and stores it, together with the
// language name, in the returned filter. If snowball.New fails, its error is
// returned unchanged and the filter is nil.
func NewStemmerFilter(lang string) (*StemmerFilter, error) {
	s, err := snowball.New(lang)
	if err != nil {
		return nil, err
	}

	filter := &StemmerFilter{lang: lang, stemmer: s}
	return filter, nil
}
Пример #3
0
// openGraph opens the sqlite3-backed graph stored at path and returns it
// ready for use.
//
// When path does not exist, it is first initialised through initGraph with
// defaultGraphOptions. Any other os.Stat failure is returned as is. After the
// database handle is opened, the pragmas are applied, the SQL statements are
// prepared, the order is loaded, and the end token and end context IDs are
// resolved.
//
// If the stored "stemmer" info string is non-empty, a snowball stemmer for
// that language is attached to the graph. Failure to create the stemmer is
// logged and otherwise ignored, leaving the graph without a stemmer.
//
// On any error after the database has been opened, the handle is closed
// before returning so it is not leaked.
func openGraph(path string) (*graph, error) {
	if _, err := os.Stat(path); err != nil {
		if !os.IsNotExist(err) {
			return nil, err
		}

		// The file is missing: set it up before opening it below.
		err = initGraph(path, defaultGraphOptions)
		if err != nil {
			return nil, err
		}
	}

	url := fmt.Sprintf("file:%s?cache=shared&mode=rwc", path)

	db, err := sql.Open("sqlite3", url)
	if err != nil {
		return nil, err
	}

	// This function owns db until it is handed to the caller inside the
	// returned graph; release it on every failure path in between.
	handedOff := false
	defer func() {
		if !handedOff {
			// The error that caused the failure is the one worth reporting.
			_ = db.Close()
		}
	}()

	err = pragmas(db)
	if err != nil {
		return nil, err
	}

	stmts := new(stmts)
	err = prepareInfoSql(db, stmts)
	if err != nil {
		return nil, err
	}

	g := &graph{db: db, lock: sync.RWMutex{}, q: stmts}
	// The order must be known before the remaining statements are prepared,
	// because prepareSql takes it as an argument.
	g.order = g.getOrder()

	err = prepareSql(db, stmts, g.order)
	if err != nil {
		return nil, err
	}

	// The lookup error is deliberately discarded: only a non-empty language
	// enables stemming.
	// NOTE(review): presumably a missing "stemmer" entry surfaces as an
	// error here — confirm before turning this into a hard failure.
	lang, _ := g.getInfoString("stemmer")
	if lang != "" {
		s, err := snowball.New(lang)
		if err != nil {
			log.Printf("Error initializing stemmer: %s", err)
		} else {
			g.stemmer = newCobeStemmer(s)
		}
	}

	g.endTokenID = g.getOrCreateToken("")
	g.endContextID = g.getOrCreateNode(g.endContext())

	handedOff = true
	return g, nil
}
Пример #4
0
// NewStemmerFilter builds a StemmerFilter backed by a fixed-size pool of
// snowball stemmers for the given language.
//
// The pool is a buffered channel that is filled to capacity before the
// filter is returned. If any stemmer cannot be created, the error from
// snowball.New is returned unchanged and the filter is nil.
func NewStemmerFilter(lang string) (*StemmerFilter, error) {
	// poolSize is the number of stemmers created up front.
	const poolSize = 4

	pool := make(chan *snowball.Stemmer, poolSize)
	for n := 0; n < cap(pool); n++ {
		s, err := snowball.New(lang)
		if err != nil {
			return nil, err
		}
		pool <- s
	}

	filter := &StemmerFilter{
		lang:        lang,
		stemmerPool: pool,
	}
	return filter, nil
}
Пример #5
0
// setStemmer switches the graph to a snowball stemmer for lang.
//
// If the stemmer cannot be created, the error from snowball.New is returned
// and the graph is left untouched. Otherwise the existing token stems are
// deleted, updateTokenStems is run with the new stemmer, the language is
// recorded under the "stemmer" info key, and the stemmer is installed on the
// graph.
func (g *graph) setStemmer(lang string) error {
	base, err := snowball.New(lang)
	if err != nil {
		return err
	}
	wrapped := newCobeStemmer(base)

	// Replace the stored stems first, then record the language.
	g.deleteTokenStems()
	g.updateTokenStems(wrapped)
	g.setInfoString("stemmer", lang)

	// Install the stemmer last, after the stored state has been updated.
	g.stemmer = wrapped
	return nil
}
Пример #6
0
// TestCobeStemmer checks the cobe stemmer against a table of tokens and
// their expected stems. The table covers plain words, mixed-case words and
// emoticon-like tokens.
func TestCobeStemmer(t *testing.T) {
	snow, err := snowball.New("english")
	if err != nil {
		// Without a stemmer every case below would fail or panic; stop here
		// with the real cause instead.
		t.Fatalf("snowball.New(%q): %v", "english", err)
	}
	s := newCobeStemmer(snow)

	// Straight port of the Python cobe stemmer.
	var tests = []struct {
		token    string
		expected string
	}{
		{"foo", "foo"},
		{"jumping", "jump"},
		{"running", "run"},

		{"Foo", "foo"},
		{"FOO", "foo"},
		{"FOO'S'", "foo"},
		{"FOOING", "foo"},
		{"Fooing", "foo"},

		{":)", ":)"},
		{":-)", ":)"},
		{":    )", ":)"},

		{":()", ":("},
		{":-(", ":("},
		{":    (", ":("},
		{":'    (", ":("},
	}

	for ti, tt := range tests {
		stem := s.Stem(tt.token)
		if tt.expected != stem {
			t.Errorf("[%d] %s\n%s !=\n%s", ti, tt.token, stem, tt.expected)
		}
	}
}
Пример #7
0
// NewSnowball returns a Stemmer backed by an English snowball stemmer.
//
// On failure the error from snowball.New is returned and the Stemmer is a
// literal nil. Returning the concrete pointer directly would leave a nil
// pointer wrapped in a non-nil Stemmer value, so the two results are
// separated explicitly.
func NewSnowball() (Stemmer, error) {
	s, err := snowball.New("english")
	if err != nil {
		return nil, err
	}
	return s, nil
}