/*
Compute the frequency of all words in the inserted text.
*/
func (ftr *WordsAllFrequency) Compute(dataset tabula.DatasetInterface) {
	allWords := GetAllWordList()
	col := dataset.GetColumnByName("additions")

	for _, rec := range col.Records {
		r := tabula.NewRecordReal(float64(0))

		s := rec.String()
		if len(s) == 0 {
			ftr.PushBack(r)
			continue
		}

		s = clean.WikiText(s)
		if len(s) == 0 {
			ftr.PushBack(r)
			continue
		}

		inWords := tekstus.StringSplitWords(s, true, false)

		freq := tekstus.WordsFrequenciesOf(inWords, allWords, false)

		r.SetFloat(Round(freq))

		ftr.PushBack(r)
	}
}
/*
Compute the compression rate of the inserted text.
*/
func (ftr *CompressRate) Compute(dataset tabula.DatasetInterface) {
	adds := dataset.GetColumnByName("additions")

	for _, rec := range adds.Records {
		v, _ := compressRateLzw(rec.String())

		ftr.PushBack(tabula.NewRecordReal(Round(v)))
	}
}
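// The following is a minimal, hypothetical sketch of how compressRateLzw
// could be implemented with the standard library ("bytes", "compress/lzw"),
// assuming the rate is compressed size divided by original size; the real
// helper in this package may differ.
func compressRateLzwSketch(text string) (float64, error) {
	if len(text) == 0 {
		return 0, nil
	}

	var buf bytes.Buffer

	// Compress the text with LZW, least-significant-bit first, 8-bit
	// literals.
	w := lzw.NewWriter(&buf, lzw.LSB, 8)
	if _, err := w.Write([]byte(text)); err != nil {
		return 0, err
	}
	if err := w.Close(); err != nil {
		return 0, err
	}

	return float64(buf.Len()) / float64(len(text)), nil
}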
/*
Compute the ratio of non-alphanumeric characters to all characters in the
inserted text.
*/
func (ftr *NonAlnumRatio) Compute(dataset tabula.DatasetInterface) {
	adds := dataset.GetColumnByName("additions")

	for _, rec := range adds.Records {
		ratio := tekstus.RatioNonAlnumChar(rec.String(), false)

		ftr.PushBack(tabula.NewRecordReal(Round(ratio)))
	}
}
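// A minimal sketch of the ratio computed by tekstus.RatioNonAlnumChar,
// assuming it divides the number of characters that are neither letters nor
// digits by the total number of characters (the boolean flag, which may
// control whitespace handling, is ignored here). ratioNonAlnumSketch is a
// hypothetical helper that needs only the standard "unicode" package.
func ratioNonAlnumSketch(text string) float64 {
	var nonAlnum, total float64

	for _, c := range text {
		total++
		if !unicode.IsLetter(c) && !unicode.IsDigit(c) {
			nonAlnum++
		}
	}

	if total == 0 {
		return 0
	}

	return nonAlnum / total
}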
/*
Compute the character diversity of the inserted text, defined as

	len(text) ^ (1 / (1 + number-of-unique-characters))
*/
func (ftr *CharDiversity) Compute(dataset tabula.DatasetInterface) {
	adds := dataset.GetColumnByName("additions")

	for _, rec := range adds.Records {
		intext := rec.String()
		textlen := float64(len(intext))
		nuniq := tekstus.CountUniqChar(intext)

		v := math.Pow(textlen, 1/float64(1+nuniq))

		ftr.PushBack(tabula.NewRecordReal(Round(v)))
	}
}
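// A self-contained sketch of the diversity formula above: a long text built
// from only a few unique characters gets a high value, while a short or
// diverse text stays close to 1 (e.g. "aaaaaaaaaa" gives 10^(1/2) = 3.16,
// "abcdefghij" gives 10^(1/11) = 1.23). charDiversitySketch is a
// hypothetical helper using only the standard "math" package.
func charDiversitySketch(text string) float64 {
	uniq := map[rune]bool{}
	for _, c := range text {
		uniq[c] = true
	}

	// len(text) ^ (1 / (1 + number of unique characters)).
	return math.Pow(float64(len(text)), 1/float64(1+len(uniq)))
}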
/*
Compute the frequency of vulgar words in the inserted text.
*/
func (ftr *WordsVulgarFrequency) Compute(dataset tabula.DatasetInterface) {
	col := dataset.GetColumnByName("additions")

	for _, rec := range col.Records {
		s := clean.WikiText(rec.String())

		freq := tekstus.StringFrequenciesOf(s, tekstus.VulgarWords, false)

		ftr.PushBack(tabula.NewRecordReal(Round(freq)))
	}
}
// createNeigboursByIdx creates neighbors from the rows of dataFloat64
// selected by indices, paired with their precomputed distances.
func createNeigboursByIdx(indices []int) (neighbors knn.Neighbors) {
	for x, idx := range indices {
		row := tabula.Row{}

		for _, v := range dataFloat64[idx] {
			rec := tabula.NewRecordReal(v)
			row.PushBack(rec)
		}

		neighbors.Add(&row, float64(distances[x]))
	}
	return
}
// createNeigbours creates neighbors from all rows of dataFloat64, paired
// with their precomputed distances.
func createNeigbours() (neighbors knn.Neighbors) {
	for x, d := range dataFloat64 {
		row := tabula.Row{}

		for _, v := range d {
			rec := tabula.NewRecordReal(v)
			row.PushBack(rec)
		}

		neighbors.Add(&row, float64(distances[x]))
	}
	return
}
/*
Compute checks whether the record in the "editor" column is an IP address.
If it is, the editor is anonymous and the value is set to 1; otherwise it is
set to 0.
*/
func (anon *Anonim) Compute(dataset tabula.DatasetInterface) {
	col := dataset.GetColumnByName("editor")

	for _, rec := range col.Records {
		r := tabula.NewRecordReal(0)

		IP := net.ParseIP(rec.String())
		if IP != nil {
			r.SetFloat(1.0)
		}

		anon.PushBack(r)
	}
}
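// net.ParseIP returns nil for anything that is not a valid IPv4 or IPv6
// address, so named editors fall through to 0. A small illustration with
// made-up editor values:
func exampleAnonimCheck() {
	for _, editor := range []string{"192.168.1.1", "::1", "SomeUser"} {
		if net.ParseIP(editor) != nil {
			fmt.Printf("%s: anonymous (1)\n", editor)
		} else {
			fmt.Printf("%s: registered (0)\n", editor)
		}
	}
	// Prints 1 for the two IP addresses and 0 for "SomeUser".
}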
//
// ToRow will convert the stat to a tabula.Row in the order of Stat fields.
//
func (stat *Stat) ToRow() (row *tabula.Row) {
	row = &tabula.Row{}

	row.PushBack(tabula.NewRecordInt(stat.ID))
	row.PushBack(tabula.NewRecordInt(stat.StartTime))
	row.PushBack(tabula.NewRecordInt(stat.EndTime))
	row.PushBack(tabula.NewRecordInt(stat.ElapsedTime))
	row.PushBack(tabula.NewRecordReal(stat.OobError))
	row.PushBack(tabula.NewRecordReal(stat.OobErrorMean))
	row.PushBack(tabula.NewRecordInt(stat.TP))
	row.PushBack(tabula.NewRecordInt(stat.FP))
	row.PushBack(tabula.NewRecordInt(stat.TN))
	row.PushBack(tabula.NewRecordInt(stat.FN))
	row.PushBack(tabula.NewRecordReal(stat.TPRate))
	row.PushBack(tabula.NewRecordReal(stat.FPRate))
	row.PushBack(tabula.NewRecordReal(stat.TNRate))
	row.PushBack(tabula.NewRecordReal(stat.Precision))
	row.PushBack(tabula.NewRecordReal(stat.FMeasure))
	row.PushBack(tabula.NewRecordReal(stat.Accuracy))
	row.PushBack(tabula.NewRecordReal(stat.AUC))

	return
}
/*
Compute the ratio of sizes between the new and the old revision.
*/
func (ftr *SizeRatio) Compute(dataset tabula.DatasetInterface) {
	oldid := dataset.GetColumnByName("oldrevisionid")
	newid := dataset.GetColumnByName("newrevisionid")
	oldidlen := oldid.Len()

	for x, rec := range newid.Records {
		if x >= oldidlen {
			// Guard against the new-revision column having more
			// records than the old-revision column.
			break
		}

		newlen := revision.GetSize(rec.String())
		oldlen := revision.GetSize(oldid.Records[x].String())

		difflen := float64(1+newlen) / float64(1+oldlen)

		ftr.PushBack(tabula.NewRecordReal(Round(difflen)))
	}
}
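// The add-one smoothing in the ratio above keeps the feature defined when a
// revision is missing or empty: (1+newlen)/(1+oldlen) is 1.0 when both sizes
// are zero, instead of dividing by zero. A hypothetical worked example:
//
//	newlen = 499, oldlen = 999  ->  (1+499)/(1+999) = 0.5
//	newlen = 0,   oldlen = 0    ->  (1+0)/(1+0)     = 1.0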
/*
Compute the frequency of inserted words in the content of the new revision.
*/
func (ftr *TermFrequency) Compute(dataset tabula.DatasetInterface) {
	newrevidx := dataset.GetColumnByName("newrevisionid")
	adds := dataset.GetColumnByName("additions")
	recordslen := len(adds.Records)

	for x, rec := range adds.Records {
		r := tabula.NewRecordReal(float64(0))

		// Get inserted words.
		intext := rec.String()
		if len(intext) == 0 {
			ftr.PushBack(r)
			continue
		}

		intext = clean.WikiText(intext)
		inWords := tekstus.StringSplitWords(intext, true, true)

		// Get the content of the new revision.
		revid := newrevidx.Records[x].String()

		if DEBUG >= 2 {
			fmt.Printf("[feature] term_frequency: %d/%d processing %q\n",
				x, recordslen, revid)
		}

		newtext, e := revision.GetContentClean(revid)
		if e != nil {
			ftr.PushBack(r)
			continue
		}

		newWords := tekstus.StringSplitWords(newtext, true, false)

		freq := tekstus.WordsFrequenciesOf(newWords, inWords, false)

		r.SetFloat(Round(freq))

		ftr.PushBack(r)
	}
}
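// A hypothetical sketch of the counting that tekstus.WordsFrequenciesOf is
// assumed to do here: the fraction of words in the new revision that also
// appear among the inserted words. wordsFrequenciesOfSketch is not the real
// tekstus API; it only illustrates the idea.
func wordsFrequenciesOfSketch(words, searched []string) float64 {
	if len(words) == 0 {
		return 0
	}

	set := map[string]bool{}
	for _, s := range searched {
		set[s] = true
	}

	var n float64
	for _, w := range words {
		if set[w] {
			n++
		}
	}

	return n / float64(len(words))
}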
/*
Compute the frequency of biased words in the inserted text.
*/
func (ftr *WordsBiasFrequency) Compute(dataset tabula.DatasetInterface) {
	col := dataset.GetColumnByName("additions")

	for _, rec := range col.Records {
		r := tabula.NewRecordReal(float64(0))

		text := rec.String()
		if len(text) == 0 {
			ftr.PushBack(r)
			continue
		}

		in := clean.WikiText(text)

		freq := tekstus.StringFrequenciesOf(in, tekstus.BiasedWords, false)

		r.SetFloat(freq)

		ftr.PushBack(r)
	}
}
/*
Compute the impact of biased words between the old and the new revision.
*/
func (ftr *WordsBiasImpact) Compute(dataset tabula.DatasetInterface) {
	oldrevs := dataset.GetColumnByName("oldrevisionid")
	newrevs := dataset.GetColumnByName("newrevisionid")
	oldrevslen := oldrevs.Len()

	for x, rec := range oldrevs.Records {
		v := tabula.NewRecordReal(float64(0))

		oldid := rec.String()
		newid := newrevs.Records[x].String()

		freq := ComputeImpact(oldid, newid, tekstus.BiasedWords)

		v.SetFloat(Round(freq))

		if DEBUG >= 2 {
			fmt.Printf("[feature] words_bias_impact: %d/%d freq: %f\n",
				x, oldrevslen, freq)
		}

		ftr.PushBack(v)
	}
}
/*
Compute the divergence between the character distribution of the old
revision and that of the inserted text, using Kullback-Leibler divergence.
*/
func (ftr *CharDistributionInsert) Compute(dataset tabula.DatasetInterface) {
	oldrevid := dataset.GetColumnByName("oldrevisionid")
	adds := dataset.GetColumnByName("additions")

	for x, rold := range oldrevid.Records {
		r := tabula.NewRecordReal(0.0)

		// Get the content of the old revision.
		oldText, e := revision.GetContent(rold.String())
		if e != nil {
			ftr.PushBack(r)
			continue
		}

		// Compare its character distribution with the inserted text.
		inText := adds.Records[x].String()

		divergence := KullbackLeiblerDivergence(oldText, inText)

		r.SetFloat(Round(divergence))

		ftr.PushBack(r)
	}
}
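// A self-contained sketch of a Kullback-Leibler divergence over character
// distributions, under the assumption that KullbackLeiblerDivergence in this
// package compares the relative frequency of each character in the two
// strings; terms whose probability is zero in either text are skipped here
// to avoid log(0). klDivergenceSketch and charDistribution are hypothetical
// helpers using only the standard "math" package.
func klDivergenceSketch(p, q string) (kl float64) {
	pDist := charDistribution(p)
	qDist := charDistribution(q)

	for c, pc := range pDist {
		qc := qDist[c]
		if pc == 0 || qc == 0 {
			continue
		}
		kl += pc * math.Log(pc/qc)
	}
	return kl
}

// charDistribution returns the relative frequency of each character in text.
func charDistribution(text string) map[rune]float64 {
	dist := map[rune]float64{}
	total := 0
	for _, c := range text {
		dist[c]++
		total++
	}
	for c := range dist {
		dist[c] /= float64(total)
	}
	return dist
}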