Пример #1
0
/*
Compute frequency of all words.
*/
func (ftr *WordsAllFrequency) Compute(dataset tabula.DatasetInterface) {
	allWords := GetAllWordList()

	col := dataset.GetColumnByName("additions")

	for _, rec := range col.Records {
		r := tabula.NewRecordReal(float64(0))

		s := rec.String()
		if len(s) == 0 {
			ftr.PushBack(r)
			continue
		}

		s = clean.WikiText(s)
		if len(s) == 0 {
			ftr.PushBack(r)
			continue
		}

		inWords := tekstus.StringSplitWords(s, true, false)

		freq := tekstus.WordsFrequenciesOf(inWords, allWords, false)

		r.SetFloat(Round(freq))

		ftr.PushBack(r)
	}
}
Пример #2
0
/*
Compute the longest word in inserted text.
*/
func (ftr *LongestWord) Compute(dataset tabula.DatasetInterface) {
	adds := dataset.GetColumnByName("additions")
	addslen := adds.Len()

	for x, rec := range adds.Records {
		text := rec.String()
		textlen := len(text)

		if textlen == 0 {
			ftr.PushBack(tabula.NewRecordInt(int64(0)))
			continue
		}

		text = clean.WikiText(text)
		inWords := tekstus.StringSplitWords(text, true, true)
		slong, _ := tekstus.WordsFindLongest(inWords)

		if DEBUG >= 2 {
			fmt.Printf("[feature] %d/%d longest word: %q\n", x, addslen,
				slong)
		}

		slonglen := int64(len(slong))

		ftr.PushBack(tabula.NewRecordInt(slonglen))
	}
}
Пример #3
0
/*
Compute describe what this feature do.
*/
func (ftr *Template) Compute(dataset tabula.DatasetInterface) {
	// Get the column from dataset. This is a reference to `InputMetadata`
	// in `features.dsv`.
	// To see the list of column that we can process, see `features.dsv`
	// for an example.
	col := dataset.GetColumnByName("editid")

	for _, rec := range col.Records {
		// This is where the computed value will be saved.
		r := &tabula.Record{}

		// Get the field value from dataset
		s := rec.String()

		// Process the field value `s`, (e.g. cleaning, etc).
		// ...

		// Set the feature value after processing
		e := r.SetValue(s, ftr.GetType())
		if e == nil {
			r.SetInteger(0)
		}

		// Save the record value
		ftr.PushBack(r)
	}
}
Пример #4
0
/*
Compute number of good token in inserted text.
*/
func (ftr *GoodToken) Compute(dataset tabula.DatasetInterface) {
	col := dataset.GetColumnByName("additions")

	for _, rec := range col.Records {
		cnt := tekstus.StringCountTokens(rec.String(), tokens, false)

		ftr.PushBack(tabula.NewRecordInt(int64(cnt)))
	}
}
Пример #5
0
/*
Compute compress rate of inserted text.
*/
func (ftr *CompressRate) Compute(dataset tabula.DatasetInterface) {
	adds := dataset.GetColumnByName("additions")

	for _, rec := range adds.Records {
		v, _ := compressRateLzw(rec.String())

		ftr.PushBack(tabula.NewRecordReal(Round(v)))
	}
}
Пример #6
0
/*
Compute non-alphanumeric ratio with all character in inserted text.
*/
func (ftr *NonAlnumRatio) Compute(dataset tabula.DatasetInterface) {
	adds := dataset.GetColumnByName("additions")

	for _, rec := range adds.Records {
		ratio := tekstus.RatioNonAlnumChar(rec.String(), false)

		ftr.PushBack(tabula.NewRecordReal(Round(ratio)))
	}
}
Пример #7
0
/*
Compute maximum sequence of character at inserted text.
*/
func (ftr *LongestCharSeq) Compute(dataset tabula.DatasetInterface) {
	col := dataset.GetColumnByName("additions")

	for _, rec := range col.Records {
		text := rec.String()

		_, v := tekstus.GetMaxCharSequence(text)

		ftr.PushBack(tabula.NewRecordInt(int64(v)))
	}
}
Пример #8
0
/*
Compute frequency vulgar words in inserted text.
*/
func (ftr *WordsVulgarFrequency) Compute(dataset tabula.DatasetInterface) {
	col := dataset.GetColumnByName("additions")

	for _, rec := range col.Records {
		s := clean.WikiText(rec.String())

		freq := tekstus.StringFrequenciesOf(s, tekstus.VulgarWords,
			false)

		ftr.PushBack(tabula.NewRecordReal(Round(freq)))
	}
}
Пример #9
0
/*
Compute character diversity.
*/
func (ftr *CharDiversity) Compute(dataset tabula.DatasetInterface) {
	adds := dataset.GetColumnByName("additions")

	for _, rec := range adds.Records {
		intext := rec.String()
		textlen := float64(len(intext))
		nuniq := tekstus.CountUniqChar(intext)
		v := math.Pow(textlen, 1/float64(1+nuniq))

		ftr.PushBack(tabula.NewRecordReal(Round(v)))
	}
}
Пример #10
0
// Compute will count number of bytes that is used in comment, NOT including
// the header content "/* ... */".
func (ftr *CommentLength) Compute(dataset tabula.DatasetInterface) {
	col := dataset.GetColumnByName("editcomment")
	leftcap := []byte("/*")
	rightcap := []byte("*/")

	for _, rec := range col.Records {
		cmt := rec.Bytes()

		cmt, _ = tekstus.BytesRemoveUntil(cmt, leftcap, rightcap)

		ftr.PushBack(tabula.NewRecordInt(int64(len(cmt))))
	}
}
Пример #11
0
/*
Compute change the classification from text to numeric. The "regular" edit
will become 0 and the "vandalism" will become 1.
*/
func (ftr *Class) Compute(dataset tabula.DatasetInterface) {
	col := dataset.GetColumnByName("class")

	for _, rec := range col.Records {
		r := tabula.NewRecordInt(0)

		if rec.String() == "vandalism" {
			r.SetInteger(1)
		}

		ftr.PushBack(r)
	}
}
Пример #12
0
/*
Compute if record in column is IP address then it is an anonim and set
their value to 1, otherwise set to 0.
*/
func (anon *Anonim) Compute(dataset tabula.DatasetInterface) {
	col := dataset.GetColumnByName("editor")

	for _, rec := range col.Records {
		r := tabula.NewRecordReal(0)

		IP := net.ParseIP(rec.String())

		if IP != nil {
			r.SetFloat(1.0)
		}

		anon.PushBack(r)
	}
}
Пример #13
0
/*
Compute ratio of size between new and old revision.
*/
func (ftr *SizeRatio) Compute(dataset tabula.DatasetInterface) {
	oldid := dataset.GetColumnByName("oldrevisionid")
	newid := dataset.GetColumnByName("newrevisionid")

	oldidlen := newid.Len()

	for x, rec := range newid.Records {
		if x >= oldidlen {
			// Just in case additions is greater than deletions
			break
		}

		newlen := revision.GetSize(rec.String())
		oldlen := revision.GetSize(oldid.Records[x].String())
		difflen := float64(1+newlen) / float64(1+oldlen)

		ftr.PushBack(tabula.NewRecordReal(Round(difflen)))
	}
}
Пример #14
0
/*
doDiff read old and new revisions from edit and compare both of them to get
deletions in old rev and additions in new rev.

Deletions and additions then combined into one string and appended to dataset.
*/
func doDiff(readset dsv.ReaderInterface, ds tabula.DatasetInterface) {
	oldids := ds.GetColumnByName("oldrevisionid").ToStringSlice()
	newids := ds.GetColumnByName("newrevisionid").ToStringSlice()

	revision.SetDir(dRevisions)

	diffset, e := revision.Diff(oldids, newids, ".txt")
	if e != nil {
		panic(e)
	}

	// Create input metadata for diff
	md := dsv.NewMetadata("deletions", "string", ",", "\"", "\"", nil)
	readset.AddInputMetadata(md)

	md = dsv.NewMetadata("additions", "string", ",", "\"", "\"", nil)
	readset.AddInputMetadata(md)

	ds.MergeColumns(diffset)
}
Пример #15
0
/*
Compute the frequency of inserted words.
*/
func (ftr *TermFrequency) Compute(dataset tabula.DatasetInterface) {
	newrevidx := dataset.GetColumnByName("newrevisionid")
	adds := dataset.GetColumnByName("additions")
	recordslen := len(adds.Records)

	for x, rec := range adds.Records {
		r := tabula.NewRecordReal(float64(0))
		// Get inserted words.
		intext := rec.String()

		if len(intext) == 0 {
			ftr.PushBack(r)
			continue
		}

		intext = clean.WikiText(intext)
		inWords := tekstus.StringSplitWords(intext, true, true)

		// Get content of new revision.
		revid := newrevidx.Records[x].String()

		if DEBUG >= 2 {
			fmt.Printf("[feature] term_frequency: %d/%d processing %q\n",
				x, recordslen, revid)
		}

		newtext, e := revision.GetContentClean(revid)
		if e != nil {
			ftr.PushBack(r)
			continue
		}

		newWords := tekstus.StringSplitWords(newtext, true, false)

		freq := tekstus.WordsFrequenciesOf(newWords, inWords, false)

		r.SetFloat(Round(freq))

		ftr.PushBack(r)
	}
}
Пример #16
0
/*
Compute frequency of biased words.
*/
func (ftr *WordsBiasFrequency) Compute(dataset tabula.DatasetInterface) {
	col := dataset.GetColumnByName("additions")

	for _, rec := range col.Records {
		r := tabula.NewRecordReal(float64(0))
		text := rec.String()
		if len(text) == 0 {
			ftr.PushBack(r)
			continue
		}

		in := clean.WikiText(text)

		freq := tekstus.StringFrequenciesOf(in,
			tekstus.BiasedWords, false)

		r.SetFloat(freq)

		ftr.PushBack(r)
	}
}
Пример #17
0
/*
Compute frequency bias words in inserted text.
*/
func (ftr *WordsBiasImpact) Compute(dataset tabula.DatasetInterface) {
	oldrevs := dataset.GetColumnByName("oldrevisionid")
	newrevs := dataset.GetColumnByName("newrevisionid")
	oldrevslen := oldrevs.Len()

	for x, rec := range oldrevs.Records {
		v := tabula.NewRecordReal(float64(0))

		oldid := rec.String()
		newid := newrevs.Records[x].String()

		freq := ComputeImpact(oldid, newid, tekstus.BiasedWords)

		v.SetFloat(Round(freq))

		if DEBUG >= 2 {
			fmt.Printf("[feature] words_bias_impact: %d/%d freq: %f\n",
				x, oldrevslen, freq)
		}

		ftr.PushBack(v)
	}
}
Пример #18
0
/*
Compute character distribution of inserted text.
*/
func (ftr *CharDistributionInsert) Compute(dataset tabula.DatasetInterface) {
	oldrevid := dataset.GetColumnByName("oldrevisionid")
	adds := dataset.GetColumnByName("additions")

	for x, rold := range oldrevid.Records {
		r := tabula.NewRecordReal(0.0)
		// count distribution of old revision
		oldText, e := revision.GetContent(rold.String())

		if e != nil {
			ftr.PushBack(r)
			continue
		}

		// count distribution of inserted text
		inText := adds.Records[x].String()

		divergence := KullbackLeiblerDivergence(oldText, inText)

		r.SetFloat(Round(divergence))

		ftr.PushBack(r)
	}
}