func TokenizePage(r io.Reader) ([]string, string) { res := []string{} z := html.NewTokenizer(r) isTitle := false title := "" loop: for { tt := z.Next() switch tt { case html.ErrorToken: break loop case html.TextToken: text := string(z.Text()) if isTitle { title = cleanTitle(text) continue } res = append(res, bstrings.TokenizeWords(text)...) case html.EndTagToken: tn, _ := z.TagName() if string(tn) == "title" { isTitle = false } case html.StartTagToken: tn, _ := z.TagName() if string(tn) == "title" { isTitle = true } } } return res, title }
func (bsState *BsState) evaluatePhrase(v interface{}) (*BsResult, error) { var phrase string switch tv := v.(type) { case io.Reader: bytes, err := ioutil.ReadAll(tv) if err != nil { return nil, err } phrase = string(bytes) case string: phrase = tv default: return nil, errors.New(fmt.Sprintf("Unkown type v, %q", v)) } words := bstrings.TokenizeWords(phrase) return &BsResult{bstrings.TruncatePhrase(phrase, 10), bsState.EvaluateBs(words)}, nil }
func (bsState *BsState) trainWithPhrase(phrase string, bs bool) { words := bstrings.TokenizeWords(phrase) bsState.enlargeCorpus(words, bs) }