Пример #1
0
/*
Compute frequency of all words.
*/
func (ftr *WordsAllFrequency) Compute(dataset tabula.DatasetInterface) {
	allWords := GetAllWordList()

	col := dataset.GetColumnByName("additions")

	for _, rec := range col.Records {
		r := tabula.NewRecordReal(float64(0))

		s := rec.String()
		if len(s) == 0 {
			ftr.PushBack(r)
			continue
		}

		s = clean.WikiText(s)
		if len(s) == 0 {
			ftr.PushBack(r)
			continue
		}

		inWords := tekstus.StringSplitWords(s, true, false)

		freq := tekstus.WordsFrequenciesOf(inWords, allWords, false)

		r.SetFloat(Round(freq))

		ftr.PushBack(r)
	}
}
Пример #2
0
/*
Compute compress rate of inserted text.
*/
func (ftr *CompressRate) Compute(dataset tabula.DatasetInterface) {
	adds := dataset.GetColumnByName("additions")

	for _, rec := range adds.Records {
		v, _ := compressRateLzw(rec.String())

		ftr.PushBack(tabula.NewRecordReal(Round(v)))
	}
}
Пример #3
0
/*
Compute non-alphanumeric ratio with all character in inserted text.
*/
func (ftr *NonAlnumRatio) Compute(dataset tabula.DatasetInterface) {
	adds := dataset.GetColumnByName("additions")

	for _, rec := range adds.Records {
		ratio := tekstus.RatioNonAlnumChar(rec.String(), false)

		ftr.PushBack(tabula.NewRecordReal(Round(ratio)))
	}
}
Пример #4
0
/*
Compute character diversity.
*/
func (ftr *CharDiversity) Compute(dataset tabula.DatasetInterface) {
	adds := dataset.GetColumnByName("additions")

	for _, rec := range adds.Records {
		intext := rec.String()
		textlen := float64(len(intext))
		nuniq := tekstus.CountUniqChar(intext)
		v := math.Pow(textlen, 1/float64(1+nuniq))

		ftr.PushBack(tabula.NewRecordReal(Round(v)))
	}
}
Пример #5
0
/*
Compute frequency vulgar words in inserted text.
*/
func (ftr *WordsVulgarFrequency) Compute(dataset tabula.DatasetInterface) {
	col := dataset.GetColumnByName("additions")

	for _, rec := range col.Records {
		s := clean.WikiText(rec.String())

		freq := tekstus.StringFrequenciesOf(s, tekstus.VulgarWords,
			false)

		ftr.PushBack(tabula.NewRecordReal(Round(freq)))
	}
}
Пример #6
0
func createNeigboursByIdx(indices []int) (neighbors knn.Neighbors) {
	for x, idx := range indices {
		row := tabula.Row{}

		for _, v := range dataFloat64[idx] {
			rec := tabula.NewRecordReal(v)
			row.PushBack(rec)
		}

		neighbors.Add(&row, float64(distances[x]))
	}
	return
}
Пример #7
0
func createNeigbours() (neighbors knn.Neighbors) {
	for x, d := range dataFloat64 {
		row := tabula.Row{}

		for _, v := range d {
			rec := tabula.NewRecordReal(v)
			row.PushBack(rec)
		}

		neighbors.Add(&row, float64(distances[x]))
	}
	return
}
Пример #8
0
/*
Compute if record in column is IP address then it is an anonim and set
their value to 1, otherwise set to 0.
*/
func (anon *Anonim) Compute(dataset tabula.DatasetInterface) {
	col := dataset.GetColumnByName("editor")

	for _, rec := range col.Records {
		r := tabula.NewRecordReal(0)

		IP := net.ParseIP(rec.String())

		if IP != nil {
			r.SetFloat(1.0)
		}

		anon.PushBack(r)
	}
}
Пример #9
0
//
// ToRow will convert the stat to tabula.row in the order of Stat field.
//
func (stat *Stat) ToRow() (row *tabula.Row) {
	row = &tabula.Row{}

	row.PushBack(tabula.NewRecordInt(stat.ID))
	row.PushBack(tabula.NewRecordInt(stat.StartTime))
	row.PushBack(tabula.NewRecordInt(stat.EndTime))
	row.PushBack(tabula.NewRecordInt(stat.ElapsedTime))
	row.PushBack(tabula.NewRecordReal(stat.OobError))
	row.PushBack(tabula.NewRecordReal(stat.OobErrorMean))
	row.PushBack(tabula.NewRecordInt(stat.TP))
	row.PushBack(tabula.NewRecordInt(stat.FP))
	row.PushBack(tabula.NewRecordInt(stat.TN))
	row.PushBack(tabula.NewRecordInt(stat.FN))
	row.PushBack(tabula.NewRecordReal(stat.TPRate))
	row.PushBack(tabula.NewRecordReal(stat.FPRate))
	row.PushBack(tabula.NewRecordReal(stat.TNRate))
	row.PushBack(tabula.NewRecordReal(stat.Precision))
	row.PushBack(tabula.NewRecordReal(stat.FMeasure))
	row.PushBack(tabula.NewRecordReal(stat.Accuracy))
	row.PushBack(tabula.NewRecordReal(stat.AUC))

	return
}
Пример #10
0
/*
Compute ratio of size between new and old revision.
*/
func (ftr *SizeRatio) Compute(dataset tabula.DatasetInterface) {
	oldid := dataset.GetColumnByName("oldrevisionid")
	newid := dataset.GetColumnByName("newrevisionid")

	oldidlen := newid.Len()

	for x, rec := range newid.Records {
		if x >= oldidlen {
			// Just in case additions is greater than deletions
			break
		}

		newlen := revision.GetSize(rec.String())
		oldlen := revision.GetSize(oldid.Records[x].String())
		difflen := float64(1+newlen) / float64(1+oldlen)

		ftr.PushBack(tabula.NewRecordReal(Round(difflen)))
	}
}
Пример #11
0
/*
Compute the frequency of inserted words.
*/
func (ftr *TermFrequency) Compute(dataset tabula.DatasetInterface) {
	newrevidx := dataset.GetColumnByName("newrevisionid")
	adds := dataset.GetColumnByName("additions")
	recordslen := len(adds.Records)

	for x, rec := range adds.Records {
		r := tabula.NewRecordReal(float64(0))
		// Get inserted words.
		intext := rec.String()

		if len(intext) == 0 {
			ftr.PushBack(r)
			continue
		}

		intext = clean.WikiText(intext)
		inWords := tekstus.StringSplitWords(intext, true, true)

		// Get content of new revision.
		revid := newrevidx.Records[x].String()

		if DEBUG >= 2 {
			fmt.Printf("[feature] term_frequency: %d/%d processing %q\n",
				x, recordslen, revid)
		}

		newtext, e := revision.GetContentClean(revid)
		if e != nil {
			ftr.PushBack(r)
			continue
		}

		newWords := tekstus.StringSplitWords(newtext, true, false)

		freq := tekstus.WordsFrequenciesOf(newWords, inWords, false)

		r.SetFloat(Round(freq))

		ftr.PushBack(r)
	}
}
Пример #12
0
/*
Compute frequency of biased words.
*/
func (ftr *WordsBiasFrequency) Compute(dataset tabula.DatasetInterface) {
	col := dataset.GetColumnByName("additions")

	for _, rec := range col.Records {
		r := tabula.NewRecordReal(float64(0))
		text := rec.String()
		if len(text) == 0 {
			ftr.PushBack(r)
			continue
		}

		in := clean.WikiText(text)

		freq := tekstus.StringFrequenciesOf(in,
			tekstus.BiasedWords, false)

		r.SetFloat(freq)

		ftr.PushBack(r)
	}
}
Пример #13
0
/*
Compute frequency bias words in inserted text.
*/
func (ftr *WordsBiasImpact) Compute(dataset tabula.DatasetInterface) {
	oldrevs := dataset.GetColumnByName("oldrevisionid")
	newrevs := dataset.GetColumnByName("newrevisionid")
	oldrevslen := oldrevs.Len()

	for x, rec := range oldrevs.Records {
		v := tabula.NewRecordReal(float64(0))

		oldid := rec.String()
		newid := newrevs.Records[x].String()

		freq := ComputeImpact(oldid, newid, tekstus.BiasedWords)

		v.SetFloat(Round(freq))

		if DEBUG >= 2 {
			fmt.Printf("[feature] words_bias_impact: %d/%d freq: %f\n",
				x, oldrevslen, freq)
		}

		ftr.PushBack(v)
	}
}
Пример #14
0
/*
Compute character distribution of inserted text.
*/
func (ftr *CharDistributionInsert) Compute(dataset tabula.DatasetInterface) {
	oldrevid := dataset.GetColumnByName("oldrevisionid")
	adds := dataset.GetColumnByName("additions")

	for x, rold := range oldrevid.Records {
		r := tabula.NewRecordReal(0.0)
		// count distribution of old revision
		oldText, e := revision.GetContent(rold.String())

		if e != nil {
			ftr.PushBack(r)
			continue
		}

		// count distribution of inserted text
		inText := adds.Records[x].String()

		divergence := KullbackLeiblerDivergence(oldText, inText)

		r.SetFloat(Round(divergence))

		ftr.PushBack(r)
	}
}