/*
Compute frequency of all words.
*/
func (ftr *WordsAllFrequency) Compute(dataset tabula.DatasetInterface) {
	allWords := GetAllWordList()
	col := dataset.GetColumnByName("additions")

	for _, rec := range col.Records {
		r := tabula.NewRecordReal(float64(0))

		s := rec.String()
		if len(s) == 0 {
			ftr.PushBack(r)
			continue
		}

		s = clean.WikiText(s)
		if len(s) == 0 {
			ftr.PushBack(r)
			continue
		}

		inWords := tekstus.StringSplitWords(s, true, false)

		freq := tekstus.WordsFrequenciesOf(inWords, allWords, false)

		r.SetFloat(Round(freq))

		ftr.PushBack(r)
	}
}
func TestWikiText(t *testing.T) {
	for _, td := range dataWikiText {
		got := clean.WikiText(td.text)

		assert(t, td.exp, got, true)
	}
}
/*
Compute the length of the longest word in inserted text.
*/
func (ftr *LongestWord) Compute(dataset tabula.DatasetInterface) {
	adds := dataset.GetColumnByName("additions")
	addslen := adds.Len()

	for x, rec := range adds.Records {
		text := rec.String()
		textlen := len(text)

		if textlen == 0 {
			ftr.PushBack(tabula.NewRecordInt(int64(0)))
			continue
		}

		text = clean.WikiText(text)

		inWords := tekstus.StringSplitWords(text, true, true)

		slong, _ := tekstus.WordsFindLongest(inWords)

		if DEBUG >= 2 {
			fmt.Printf("[feature] %d/%d longest word: %q\n", x, addslen, slong)
		}

		slonglen := int64(len(slong))

		ftr.PushBack(tabula.NewRecordInt(slonglen))
	}
}
/*
Compute frequency of vulgar words in inserted text.
*/
func (ftr *WordsVulgarFrequency) Compute(dataset tabula.DatasetInterface) {
	col := dataset.GetColumnByName("additions")

	for _, rec := range col.Records {
		s := clean.WikiText(rec.String())

		freq := tekstus.StringFrequenciesOf(s, tekstus.VulgarWords, false)

		ftr.PushBack(tabula.NewRecordReal(Round(freq)))
	}
}
/*
Compute the frequency of inserted words in the content of the new revision.
*/
func (ftr *TermFrequency) Compute(dataset tabula.DatasetInterface) {
	newrevidx := dataset.GetColumnByName("newrevisionid")
	adds := dataset.GetColumnByName("additions")
	recordslen := len(adds.Records)

	for x, rec := range adds.Records {
		r := tabula.NewRecordReal(float64(0))

		// Get inserted words.
		intext := rec.String()
		if len(intext) == 0 {
			ftr.PushBack(r)
			continue
		}

		intext = clean.WikiText(intext)

		inWords := tekstus.StringSplitWords(intext, true, true)

		// Get content of new revision.
		revid := newrevidx.Records[x].String()

		if DEBUG >= 2 {
			fmt.Printf("[feature] term_frequency: %d/%d processing %q\n", x, recordslen, revid)
		}

		newtext, e := revision.GetContentClean(revid)
		if e != nil {
			ftr.PushBack(r)
			continue
		}

		newWords := tekstus.StringSplitWords(newtext, true, false)

		freq := tekstus.WordsFrequenciesOf(newWords, inWords, false)

		r.SetFloat(Round(freq))

		ftr.PushBack(r)
	}
}
/*
Compute frequency of biased words.
*/
func (ftr *WordsBiasFrequency) Compute(dataset tabula.DatasetInterface) {
	col := dataset.GetColumnByName("additions")

	for _, rec := range col.Records {
		r := tabula.NewRecordReal(float64(0))

		text := rec.String()
		if len(text) == 0 {
			ftr.PushBack(r)
			continue
		}

		in := clean.WikiText(text)

		freq := tekstus.StringFrequenciesOf(in, tekstus.BiasedWords, false)

		r.SetFloat(freq)

		ftr.PushBack(r)
	}
}