/* Compute frequency of all words. */ func (ftr *WordsAllFrequency) Compute(dataset tabula.DatasetInterface) { allWords := GetAllWordList() col := dataset.GetColumnByName("additions") for _, rec := range col.Records { r := tabula.NewRecordReal(float64(0)) s := rec.String() if len(s) == 0 { ftr.PushBack(r) continue } s = clean.WikiText(s) if len(s) == 0 { ftr.PushBack(r) continue } inWords := tekstus.StringSplitWords(s, true, false) freq := tekstus.WordsFrequenciesOf(inWords, allWords, false) r.SetFloat(Round(freq)) ftr.PushBack(r) } }
/* Compute the longest word in inserted text. */ func (ftr *LongestWord) Compute(dataset tabula.DatasetInterface) { adds := dataset.GetColumnByName("additions") addslen := adds.Len() for x, rec := range adds.Records { text := rec.String() textlen := len(text) if textlen == 0 { ftr.PushBack(tabula.NewRecordInt(int64(0))) continue } text = clean.WikiText(text) inWords := tekstus.StringSplitWords(text, true, true) slong, _ := tekstus.WordsFindLongest(inWords) if DEBUG >= 2 { fmt.Printf("[feature] %d/%d longest word: %q\n", x, addslen, slong) } slonglen := int64(len(slong)) ftr.PushBack(tabula.NewRecordInt(slonglen)) } }
func TestStringSplitWords(t *testing.T) { for _, td := range dataStringSplitWords { got := tekstus.StringSplitWords(td.text, true, true) assert(t, td.exp, got, true) } }
/* Compute the frequency of inserted words. */ func (ftr *TermFrequency) Compute(dataset tabula.DatasetInterface) { newrevidx := dataset.GetColumnByName("newrevisionid") adds := dataset.GetColumnByName("additions") recordslen := len(adds.Records) for x, rec := range adds.Records { r := tabula.NewRecordReal(float64(0)) // Get inserted words. intext := rec.String() if len(intext) == 0 { ftr.PushBack(r) continue } intext = clean.WikiText(intext) inWords := tekstus.StringSplitWords(intext, true, true) // Get content of new revision. revid := newrevidx.Records[x].String() if DEBUG >= 2 { fmt.Printf("[feature] term_frequency: %d/%d processing %q\n", x, recordslen, revid) } newtext, e := revision.GetContentClean(revid) if e != nil { ftr.PushBack(r) continue } newWords := tekstus.StringSplitWords(newtext, true, false) freq := tekstus.WordsFrequenciesOf(newWords, inWords, false) r.SetFloat(Round(freq)) ftr.PushBack(r) } }