var tokenizer *sentences.DefaultSentenceTokenizer

func init() {
    // Prepare the package-level tokenizer once at startup.
    // Errors are ignored here because the asset is compiled in.
    b, _ := data.Asset("data/english.json")

    // Load the bundled English training data.
    training, _ := sentences.LoadTraining(b)

    // Create the default sentence tokenizer from that training data.
    tokenizer = sentences.NewSentenceTokenizer(training)
}
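With the tokenizer prepared in init, splitting text is a single call. A minimal sketch of a helper built on it; splitSentences is a hypothetical name, and it assumes the Tokenize method of sentences.DefaultSentenceTokenizer, which returns sentences carrying a Text field:

// splitSentences is a hypothetical convenience wrapper around the
// package-level tokenizer prepared in init above.
func splitSentences(text string) []string {
    var out []string
    for _, s := range tokenizer.Tokenize(text) {
        out = append(out, s.Text)
    }
    return out
}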
// NewSentenceTokenizer creates a sentence tokenizer customized for English.
// If s is nil, the bundled English training data is loaded instead.
func NewSentenceTokenizer(s *sentences.Storage) (*sentences.DefaultSentenceTokenizer, error) {
    training := s
    if training == nil {
        b, err := data.Asset("data/english.json")
        if err != nil {
            return nil, err
        }

        training, err = sentences.LoadTraining(b)
        if err != nil {
            return nil, err
        }
    }

    // Register abbreviations that would otherwise be mistaken for
    // sentence boundaries (e.g. "Sgt.", "Gov.", "No.").
    abbrevs := []string{"sgt", "gov", "no"}
    for _, abbr := range abbrevs {
        training.AbbrevTypes.Add(abbr)
    }

    lang := sentences.NewPunctStrings()
    word := NewWordTokenizer(lang)
    annotations := sentences.NewAnnotations(training, lang, word)

    ortho := &sentences.OrthoContext{
        Storage:      training,
        PunctStrings: lang,
        TokenType:    word,
        TokenFirst:   word,
    }

    multiPunct := &MultiPunctWordAnnotation{
        Storage:      training,
        TokenParser:  word,
        TokenGrouper: &sentences.DefaultTokenGrouper{},
        Ortho:        ortho,
    }
    annotations = append(annotations, multiPunct)

    tokenizer := &sentences.DefaultSentenceTokenizer{
        Storage:       training,
        PunctStrings:  lang,
        WordTokenizer: word,
        Annotations:   annotations,
    }

    return tokenizer, nil
}
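For comparison, a minimal end-to-end sketch of calling this constructor from another package, assuming it lives in github.com/neurosnap/sentences/english; passing nil storage triggers the bundled-data fallback shown above, and the sample text exercises the added abbreviations:

package main

import (
    "fmt"

    "github.com/neurosnap/sentences/english"
)

func main() {
    // nil storage: fall back to the bundled data/english.json.
    tokenizer, err := english.NewSentenceTokenizer(nil)
    if err != nil {
        panic(err)
    }

    // "Sgt." and "No." should not be treated as sentence boundaries.
    text := "Sgt. Smith arrived at No. 10. He left an hour later."
    for _, s := range tokenizer.Tokenize(text) {
        fmt.Println(s.Text)
    }
}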