Ejemplo n.º 1
0
// TokenizePage scans the HTML read from r and returns the words found in its
// text tokens together with the page title.
//
// Text that appears while a <title> element is open is passed to cleanTitle
// and stored as the title instead of being added to the word list; if several
// such text tokens occur, the last one wins. All other text is split with
// bstrings.TokenizeWords and appended to the result. Scanning stops at the
// first error token; the underlying error is not reported to the caller.
func TokenizePage(r io.Reader) ([]string, string) {
	var (
		words   = []string{}
		title   string
		inTitle bool
	)
	tokenizer := html.NewTokenizer(r)
	for {
		kind := tokenizer.Next()
		if kind == html.ErrorToken {
			return words, title
		}
		switch kind {
		case html.StartTagToken, html.EndTagToken:
			// Opening <title> turns title capture on, closing it turns it off.
			if name, _ := tokenizer.TagName(); string(name) == "title" {
				inTitle = kind == html.StartTagToken
			}
		case html.TextToken:
			text := string(tokenizer.Text())
			if inTitle {
				title = cleanTitle(text)
			} else {
				words = append(words, bstrings.TokenizeWords(text)...)
			}
		}
	}
}
Ejemplo n.º 2
0
// evaluatePhrase evaluates a phrase supplied either as a string or as an
// io.Reader, whose entire contents are read and used as the phrase.
//
// The phrase is split with bstrings.TokenizeWords and the words are handed to
// EvaluateBs; the returned BsResult pairs that outcome with the result of
// bstrings.TruncatePhrase(phrase, 10). A read failure is returned unchanged.
// Any other dynamic type of v (including nil) yields an error naming that
// type.
func (bsState *BsState) evaluatePhrase(v interface{}) (*BsResult, error) {
	var phrase string
	switch tv := v.(type) {
	case io.Reader:
		data, err := ioutil.ReadAll(tv)
		if err != nil {
			return nil, err
		}
		phrase = string(data)
	case string:
		phrase = tv
	default:
		// %T reports the dynamic type of v. %q would misformat non-string
		// values (for example, rendering an int as a quoted character).
		return nil, errors.New(fmt.Sprintf("Unknown type v, %T", v))
	}
	words := bstrings.TokenizeWords(phrase)
	return &BsResult{bstrings.TruncatePhrase(phrase, 10), bsState.EvaluateBs(words)}, nil
}
Ejemplo n.º 3
0
// trainWithPhrase splits phrase with bstrings.TokenizeWords and passes the
// resulting words, along with the bs flag, to enlargeCorpus.
func (bsState *BsState) trainWithPhrase(phrase string, bs bool) {
	bsState.enlargeCorpus(bstrings.TokenizeWords(phrase), bs)
}