Пример #1
0
// Compute pushes, for every record of the "additions" column, the byte
// length of the longest word found in that record's text.
//
// Records whose text is empty yield 0 without any further processing.
// Non-empty text is first passed through clean.WikiText and then split
// into words before the longest one is looked up.
func (ftr *LongestWord) Compute(dataset tabula.DatasetInterface) {
	col := dataset.GetColumnByName("additions")
	total := col.Len()

	for idx, rec := range col.Records {
		var longest int64

		if raw := rec.String(); raw != "" {
			words := tekstus.StringSplitWords(clean.WikiText(raw), true, true)

			// Only the word itself is needed; the second return value
			// is discarded.
			word, _ := tekstus.WordsFindLongest(words)

			if DEBUG >= 2 {
				fmt.Printf("[feature] %d/%d longest word: %q\n", idx, total,
					word)
			}

			longest = int64(len(word))
		}

		ftr.PushBack(tabula.NewRecordInt(longest))
	}
}
Пример #2
0
// Compute pushes, for every record of the "additions" column, the number
// of tokens counted by tekstus.StringCountTokens in the record's text.
// The token list is the package-level `tokens`.
func (ftr *GoodToken) Compute(dataset tabula.DatasetInterface) {
	additions := dataset.GetColumnByName("additions")

	for i := range additions.Records {
		inserted := additions.Records[i].String()
		n := tekstus.StringCountTokens(inserted, tokens, false)

		ftr.PushBack(tabula.NewRecordInt(int64(n)))
	}
}
Пример #3
0
// Compute pushes, for every record of the "additions" column, the second
// value returned by tekstus.GetMaxCharSequence for the record's text
// (presumably the length of the longest run of a repeated character;
// confirm against tekstus). The first return value is not used.
func (ftr *LongestCharSeq) Compute(dataset tabula.DatasetInterface) {
	additions := dataset.GetColumnByName("additions")

	for _, rec := range additions.Records {
		_, seqLen := tekstus.GetMaxCharSequence(rec.String())
		ftr.PushBack(tabula.NewRecordInt(int64(seqLen)))
	}
}
Пример #4
0
// Compute pushes, for every record of the "editcomment" column, the number
// of bytes in the comment, NOT including the header content "/* ... */".
//
// The header is stripped by tekstus.BytesRemoveUntil using "/*" and "*/"
// as the opening and closing delimiters.
func (ftr *CommentLength) Compute(dataset tabula.DatasetInterface) {
	var (
		headerOpen  = []byte("/*")
		headerClose = []byte("*/")
	)

	comments := dataset.GetColumnByName("editcomment")

	for _, rec := range comments.Records {
		// Only the stripped bytes matter; the second return value is
		// discarded.
		body, _ := tekstus.BytesRemoveUntil(rec.Bytes(), headerOpen, headerClose)

		ftr.PushBack(tabula.NewRecordInt(int64(len(body))))
	}
}
Пример #5
0
// Compute converts the textual "class" column into a numeric one: a record
// whose text is exactly "vandalism" becomes 1, and every other value
// (e.g. a "regular" edit) becomes 0.
func (ftr *Class) Compute(dataset tabula.DatasetInterface) {
	labels := dataset.GetColumnByName("class")

	for _, rec := range labels.Records {
		// Start from the non-vandalism value and flip it when needed.
		out := tabula.NewRecordInt(0)

		isVandalism := rec.String() == "vandalism"
		if isVandalism {
			out.SetInteger(1)
		}

		ftr.PushBack(out)
	}
}
Пример #6
0
//
// ToRow converts the stat into a tabula.Row, with one record per Stat field
// in this order: ID, StartTime, EndTime, ElapsedTime, OobError,
// OobErrorMean, TP, FP, TN, FN, TPRate, FPRate, TNRate, Precision,
// FMeasure, Accuracy, AUC.
//
func (stat *Stat) ToRow() (row *tabula.Row) {
	row = &tabula.Row{}

	// Small helpers so the field order below reads as a plain list.
	pushInt := func(v int64) {
		row.PushBack(tabula.NewRecordInt(v))
	}
	pushReal := func(v float64) {
		row.PushBack(tabula.NewRecordReal(v))
	}

	// Identity and timing.
	pushInt(stat.ID)
	pushInt(stat.StartTime)
	pushInt(stat.EndTime)
	pushInt(stat.ElapsedTime)

	// Out-of-bag error.
	pushReal(stat.OobError)
	pushReal(stat.OobErrorMean)

	// Confusion-matrix counts.
	pushInt(stat.TP)
	pushInt(stat.FP)
	pushInt(stat.TN)
	pushInt(stat.FN)

	// Derived rates and scores.
	pushReal(stat.TPRate)
	pushReal(stat.FPRate)
	pushReal(stat.TNRate)
	pushReal(stat.Precision)
	pushReal(stat.FMeasure)
	pushReal(stat.Accuracy)
	pushReal(stat.AUC)

	return row
}