/* Compute the longest word in inserted text. */ func (ftr *LongestWord) Compute(dataset tabula.DatasetInterface) { adds := dataset.GetColumnByName("additions") addslen := adds.Len() for x, rec := range adds.Records { text := rec.String() textlen := len(text) if textlen == 0 { ftr.PushBack(tabula.NewRecordInt(int64(0))) continue } text = clean.WikiText(text) inWords := tekstus.StringSplitWords(text, true, true) slong, _ := tekstus.WordsFindLongest(inWords) if DEBUG >= 2 { fmt.Printf("[feature] %d/%d longest word: %q\n", x, addslen, slong) } slonglen := int64(len(slong)) ftr.PushBack(tabula.NewRecordInt(slonglen)) } }
/* Compute number of good token in inserted text. */ func (ftr *GoodToken) Compute(dataset tabula.DatasetInterface) { col := dataset.GetColumnByName("additions") for _, rec := range col.Records { cnt := tekstus.StringCountTokens(rec.String(), tokens, false) ftr.PushBack(tabula.NewRecordInt(int64(cnt))) } }
/* Compute maximum sequence of character at inserted text. */ func (ftr *LongestCharSeq) Compute(dataset tabula.DatasetInterface) { col := dataset.GetColumnByName("additions") for _, rec := range col.Records { text := rec.String() _, v := tekstus.GetMaxCharSequence(text) ftr.PushBack(tabula.NewRecordInt(int64(v))) } }
// Compute will count number of bytes that is used in comment, NOT including // the header content "/* ... */". func (ftr *CommentLength) Compute(dataset tabula.DatasetInterface) { col := dataset.GetColumnByName("editcomment") leftcap := []byte("/*") rightcap := []byte("*/") for _, rec := range col.Records { cmt := rec.Bytes() cmt, _ = tekstus.BytesRemoveUntil(cmt, leftcap, rightcap) ftr.PushBack(tabula.NewRecordInt(int64(len(cmt)))) } }
/* Compute change the classification from text to numeric. The "regular" edit will become 0 and the "vandalism" will become 1. */ func (ftr *Class) Compute(dataset tabula.DatasetInterface) { col := dataset.GetColumnByName("class") for _, rec := range col.Records { r := tabula.NewRecordInt(0) if rec.String() == "vandalism" { r.SetInteger(1) } ftr.PushBack(r) } }
// // ToRow will convert the stat to tabula.row in the order of Stat field. // func (stat *Stat) ToRow() (row *tabula.Row) { row = &tabula.Row{} row.PushBack(tabula.NewRecordInt(stat.ID)) row.PushBack(tabula.NewRecordInt(stat.StartTime)) row.PushBack(tabula.NewRecordInt(stat.EndTime)) row.PushBack(tabula.NewRecordInt(stat.ElapsedTime)) row.PushBack(tabula.NewRecordReal(stat.OobError)) row.PushBack(tabula.NewRecordReal(stat.OobErrorMean)) row.PushBack(tabula.NewRecordInt(stat.TP)) row.PushBack(tabula.NewRecordInt(stat.FP)) row.PushBack(tabula.NewRecordInt(stat.TN)) row.PushBack(tabula.NewRecordInt(stat.FN)) row.PushBack(tabula.NewRecordReal(stat.TPRate)) row.PushBack(tabula.NewRecordReal(stat.FPRate)) row.PushBack(tabula.NewRecordReal(stat.TNRate)) row.PushBack(tabula.NewRecordReal(stat.Precision)) row.PushBack(tabula.NewRecordReal(stat.FMeasure)) row.PushBack(tabula.NewRecordReal(stat.Accuracy)) row.PushBack(tabula.NewRecordReal(stat.AUC)) return }