Example #1
/*
Compute the frequency of all words in the inserted text.
*/
func (ftr *WordsAllFrequency) Compute(dataset tabula.DatasetInterface) {
	allWords := GetAllWordList()

	col := dataset.GetColumnByName("additions")

	for _, rec := range col.Records {
		r := tabula.NewRecordReal(float64(0))

		s := rec.String()
		if len(s) == 0 {
			ftr.PushBack(r)
			continue
		}

		s = clean.WikiText(s)
		if len(s) == 0 {
			ftr.PushBack(r)
			continue
		}

		inWords := tekstus.StringSplitWords(s, true, false)

		freq := tekstus.WordsFrequenciesOf(inWords, allWords, false)

		r.SetFloat(Round(freq))

		ftr.PushBack(r)
	}
}
Example #2
/*
Compute the length of the longest word in the inserted text.
*/
func (ftr *LongestWord) Compute(dataset tabula.DatasetInterface) {
	adds := dataset.GetColumnByName("additions")
	addslen := adds.Len()

	for x, rec := range adds.Records {
		text := rec.String()
		textlen := len(text)

		if textlen == 0 {
			ftr.PushBack(tabula.NewRecordInt(int64(0)))
			continue
		}

		text = clean.WikiText(text)
		inWords := tekstus.StringSplitWords(text, true, true)
		slong, _ := tekstus.WordsFindLongest(inWords)

		if DEBUG >= 2 {
			fmt.Printf("[feature] %d/%d longest word: %q\n", x, addslen,
				slong)
		}

		slonglen := int64(len(slong))

		ftr.PushBack(tabula.NewRecordInt(slonglen))
	}
}
Example #3
/*
Compute computes the feature value. Describe what this feature does here.
*/
func (ftr *Template) Compute(dataset tabula.DatasetInterface) {
	// Get the column from the dataset. The column name refers to the
	// `InputMetadata` in `features.dsv`; see that file for the list of
	// columns that can be processed.
	col := dataset.GetColumnByName("editid")

	for _, rec := range col.Records {
		// This is where the computed value will be saved.
		r := &tabula.Record{}

		// Get the field value from the dataset.
		s := rec.String()

		// Process the field value `s` (e.g. cleaning, etc.).
		// ...

		// Set the feature value after processing. If the conversion
		// fails, fall back to a default integer value.
		e := r.SetValue(s, ftr.GetType())
		if e != nil {
			r.SetInteger(0)
		}

		// Save the record value
		ftr.PushBack(r)
	}
}
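To make the template concrete, a minimal filled-in version might look like the sketch below. The feature name `FieldLength` and its behavior (storing the byte length of each `editid` value) are purely illustrative and not part of the real package.

func (ftr *FieldLength) Compute(dataset tabula.DatasetInterface) {
	col := dataset.GetColumnByName("editid")

	for _, rec := range col.Records {
		// Store the byte length of the field value as the feature value.
		ftr.PushBack(tabula.NewRecordInt(int64(len(rec.String()))))
	}
}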
Example #4
/*
Compute the number of good tokens in the inserted text.
*/
func (ftr *GoodToken) Compute(dataset tabula.DatasetInterface) {
	col := dataset.GetColumnByName("additions")

	for _, rec := range col.Records {
		cnt := tekstus.StringCountTokens(rec.String(), tokens, false)

		ftr.PushBack(tabula.NewRecordInt(int64(cnt)))
	}
}
Example #5
/*
Compute the compression rate of the inserted text.
*/
func (ftr *CompressRate) Compute(dataset tabula.DatasetInterface) {
	adds := dataset.GetColumnByName("additions")

	for _, rec := range adds.Records {
		v, _ := compressRateLzw(rec.String())

		ftr.PushBack(tabula.NewRecordReal(Round(v)))
	}
}
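`compressRateLzw` is not shown here; a plausible stand-in using only the standard library's `compress/lzw` and `bytes` packages is sketched below. The real helper may use different LZW parameters or orient the ratio the other way, so treat this as an assumption.

// compressRateSketch compresses s with the standard-library LZW encoder
// and returns the ratio of compressed size to original size.
// Hypothetical stand-in for compressRateLzw.
func compressRateSketch(s string) (float64, error) {
	if len(s) == 0 {
		return 0, nil
	}

	var buf bytes.Buffer

	w := lzw.NewWriter(&buf, lzw.LSB, 8)
	if _, err := w.Write([]byte(s)); err != nil {
		return 0, err
	}
	if err := w.Close(); err != nil {
		return 0, err
	}

	return float64(buf.Len()) / float64(len(s)), nil
}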
Example #6
/*
Compute the ratio of non-alphanumeric characters to all characters in the inserted text.
*/
func (ftr *NonAlnumRatio) Compute(dataset tabula.DatasetInterface) {
	adds := dataset.GetColumnByName("additions")

	for _, rec := range adds.Records {
		ratio := tekstus.RatioNonAlnumChar(rec.String(), false)

		ftr.PushBack(tabula.NewRecordReal(Round(ratio)))
	}
}
Example #7
/*
Compute the length of the longest sequence of a repeated character in the inserted text.
*/
func (ftr *LongestCharSeq) Compute(dataset tabula.DatasetInterface) {
	col := dataset.GetColumnByName("additions")

	for _, rec := range col.Records {
		text := rec.String()

		_, v := tekstus.GetMaxCharSequence(text)

		ftr.PushBack(tabula.NewRecordInt(int64(v)))
	}
}
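`tekstus.GetMaxCharSequence` is assumed here to return a character and the length of its longest run; a plain-Go sketch of that computation, under a hypothetical name, is:

// maxCharRun returns the length of the longest run of one repeated
// character, e.g. "aaabbc" gives 3. Hypothetical helper for illustration.
func maxCharRun(s string) int {
	var best, cur int
	var prev rune

	for i, c := range s {
		if i == 0 || c != prev {
			cur = 0
		}
		cur++
		if cur > best {
			best = cur
		}
		prev = c
	}

	return best
}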
Example #8
/*
Compute the frequency of vulgar words in the inserted text.
*/
func (ftr *WordsVulgarFrequency) Compute(dataset tabula.DatasetInterface) {
	col := dataset.GetColumnByName("additions")

	for _, rec := range col.Records {
		s := clean.WikiText(rec.String())

		freq := tekstus.StringFrequenciesOf(s, tekstus.VulgarWords,
			false)

		ftr.PushBack(tabula.NewRecordReal(Round(freq)))
	}
}
Example #9
/*
Compute character diversity.
*/
func (ftr *CharDiversity) Compute(dataset tabula.DatasetInterface) {
	adds := dataset.GetColumnByName("additions")

	for _, rec := range adds.Records {
		intext := rec.String()
		textlen := float64(len(intext))
		nuniq := tekstus.CountUniqChar(intext)
		v := math.Pow(textlen, 1/float64(1+nuniq))

		ftr.PushBack(tabula.NewRecordReal(Round(v)))
	}
}
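The value above is len(text) raised to the power 1/(1+uniq), so highly repetitive text scores higher: "aaaaaaaa" gives 8^(1/2) ≈ 2.83, while "abcdefgh" gives 8^(1/9) ≈ 1.26. A standalone sketch of the same formula, counting distinct runes (which may differ slightly from whatever `tekstus.CountUniqChar` counts):

// charDiversity mirrors the formula used above: len(text)^(1/(1+uniq)).
// Hypothetical helper; uniq counts distinct runes here.
func charDiversity(text string) float64 {
	uniq := make(map[rune]bool)
	for _, c := range text {
		uniq[c] = true
	}

	return math.Pow(float64(len(text)), 1/float64(1+len(uniq)))
}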
Example #10
// Compute will count the number of bytes used in the comment, NOT including
// the header content "/* ... */".
func (ftr *CommentLength) Compute(dataset tabula.DatasetInterface) {
	col := dataset.GetColumnByName("editcomment")
	leftcap := []byte("/*")
	rightcap := []byte("*/")

	for _, rec := range col.Records {
		cmt := rec.Bytes()

		cmt, _ = tekstus.BytesRemoveUntil(cmt, leftcap, rightcap)

		ftr.PushBack(tabula.NewRecordInt(int64(len(cmt))))
	}
}
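For example, for the comment "/* Section title */ fixed a typo" only the bytes of " fixed a typo" are counted. Assuming `tekstus.BytesRemoveUntil` strips the first "/* ... */" span, a standard-library approximation (using `bytes`) would be:

// removeHeader strips the first "/* ... */" span from cmt, if present.
// Hypothetical approximation of the call above.
func removeHeader(cmt []byte) []byte {
	start := bytes.Index(cmt, []byte("/*"))
	if start < 0 {
		return cmt
	}
	end := bytes.Index(cmt[start:], []byte("*/"))
	if end < 0 {
		return cmt
	}

	out := make([]byte, 0, len(cmt))
	out = append(out, cmt[:start]...)
	out = append(out, cmt[start+end+2:]...)
	return out
}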
Example #11
/*
Compute changes the classification from text to numeric: a "regular" edit
becomes 0 and a "vandalism" edit becomes 1.
*/
func (ftr *Class) Compute(dataset tabula.DatasetInterface) {
	col := dataset.GetColumnByName("class")

	for _, rec := range col.Records {
		r := tabula.NewRecordInt(0)

		if rec.String() == "vandalism" {
			r.SetInteger(1)
		}

		ftr.PushBack(r)
	}
}
Example #12
/*
Compute checks whether the record in the column is an IP address. If it is,
the editor is anonymous and the value is set to 1, otherwise it is set to 0.
*/
func (anon *Anonim) Compute(dataset tabula.DatasetInterface) {
	col := dataset.GetColumnByName("editor")

	for _, rec := range col.Records {
		r := tabula.NewRecordReal(0)

		IP := net.ParseIP(rec.String())

		if IP != nil {
			r.SetFloat(1.0)
		}

		anon.PushBack(r)
	}
}
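`net.ParseIP` returns nil for anything that is not a valid IPv4 or IPv6 literal, so registered user names fall through to 0. The check can be isolated as a small helper (the name `isAnonymous` is illustrative):

// isAnonymous reports whether the editor name parses as an IP address,
// which is how anonymous editors are identified above.
func isAnonymous(editor string) bool {
	return net.ParseIP(editor) != nil
}

// isAnonymous("192.168.1.1") == true, isAnonymous("SomeUserName") == false.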
Example #13
/*
Compute the ratio of sizes between the new and old revisions.
*/
func (ftr *SizeRatio) Compute(dataset tabula.DatasetInterface) {
	oldid := dataset.GetColumnByName("oldrevisionid")
	newid := dataset.GetColumnByName("newrevisionid")

	oldidlen := oldid.Len()

	for x, rec := range newid.Records {
		if x >= oldidlen {
			// Just in case the new revision column has more records
			// than the old one.
			break
		}

		newlen := revision.GetSize(rec.String())
		oldlen := revision.GetSize(oldid.Records[x].String())
		difflen := float64(1+newlen) / float64(1+oldlen)

		ftr.PushBack(tabula.NewRecordReal(Round(difflen)))
	}
}
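The +1 on both sizes appears to be a smoothing term that avoids division by zero when the old revision is missing or empty: a 100-byte new revision against an empty old revision gives (1+100)/(1+0) = 101, while unchanged sizes give a ratio of 1.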
Example #14
/*
doDiff reads the old and new revisions of each edit and compares them to get
the deletions in the old revision and the additions in the new revision.

The deletions and additions are then each combined into one string and
appended to the dataset.
*/
func doDiff(readset dsv.ReaderInterface, ds tabula.DatasetInterface) {
	oldids := ds.GetColumnByName("oldrevisionid").ToStringSlice()
	newids := ds.GetColumnByName("newrevisionid").ToStringSlice()

	revision.SetDir(dRevisions)

	diffset, e := revision.Diff(oldids, newids, ".txt")
	if e != nil {
		panic(e)
	}

	// Create input metadata for diff
	md := dsv.NewMetadata("deletions", "string", ",", "\"", "\"", nil)
	readset.AddInputMetadata(md)

	md = dsv.NewMetadata("additions", "string", ",", "\"", "\"", nil)
	readset.AddInputMetadata(md)

	ds.MergeColumns(diffset)
}
Example #15
/*
Compute the frequency of the inserted words within the content of the new revision.
*/
func (ftr *TermFrequency) Compute(dataset tabula.DatasetInterface) {
	newrevidx := dataset.GetColumnByName("newrevisionid")
	adds := dataset.GetColumnByName("additions")
	recordslen := len(adds.Records)

	for x, rec := range adds.Records {
		r := tabula.NewRecordReal(float64(0))
		// Get inserted words.
		intext := rec.String()

		if len(intext) == 0 {
			ftr.PushBack(r)
			continue
		}

		intext = clean.WikiText(intext)
		inWords := tekstus.StringSplitWords(intext, true, true)

		// Get content of new revision.
		revid := newrevidx.Records[x].String()

		if DEBUG >= 2 {
			fmt.Printf("[feature] term_frequency: %d/%d processing %q\n",
				x, recordslen, revid)
		}

		newtext, e := revision.GetContentClean(revid)
		if e != nil {
			ftr.PushBack(r)
			continue
		}

		newWords := tekstus.StringSplitWords(newtext, true, false)

		freq := tekstus.WordsFrequenciesOf(newWords, inWords, false)

		r.SetFloat(Round(freq))

		ftr.PushBack(r)
	}
}
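`tekstus.WordsFrequenciesOf(words, refs, sensitive)` is assumed here to return the fraction of tokens in `words` that also occur in `refs`; that reading is not confirmed by this snippet, but a plain-Go sketch of it would be:

// wordsFrequenciesOfSketch returns the fraction of tokens in words that
// also appear in refs. Hypothetical reading of tekstus.WordsFrequenciesOf.
func wordsFrequenciesOfSketch(words, refs []string) float64 {
	if len(words) == 0 {
		return 0
	}

	refset := make(map[string]bool, len(refs))
	for _, w := range refs {
		refset[w] = true
	}

	var n int
	for _, w := range words {
		if refset[w] {
			n++
		}
	}

	return float64(n) / float64(len(words))
}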
Example #16
/*
Compute the frequency of biased words in the inserted text.
*/
func (ftr *WordsBiasFrequency) Compute(dataset tabula.DatasetInterface) {
	col := dataset.GetColumnByName("additions")

	for _, rec := range col.Records {
		r := tabula.NewRecordReal(float64(0))
		text := rec.String()
		if len(text) == 0 {
			ftr.PushBack(r)
			continue
		}

		in := clean.WikiText(text)

		freq := tekstus.StringFrequenciesOf(in,
			tekstus.BiasedWords, false)

		r.SetFloat(freq)

		ftr.PushBack(r)
	}
}
Example #17
//
// WriteRawDataset will write the content of the dataset to file without
// metadata, using separator `sep` for each record.
//
// We use a pointer for the separator parameter so that an empty string can
// be used as a separator.
//
func (writer *Writer) WriteRawDataset(dataset tabula.DatasetInterface,
	sep *string,
) (
	int, error,
) {
	if nil == writer.fWriter {
		return 0, ErrNotOpen
	}
	if nil == dataset {
		return 0, nil
	}
	if sep == nil {
		sep = new(string)
		*sep = DefSeparator
	}

	var rows *tabula.Rows

	switch dataset.GetMode() {
	case tabula.DatasetModeColumns:
		cols := dataset.GetDataAsColumns()
		return writer.WriteRawColumns(cols, sep)
	case tabula.DatasetModeRows, tabula.DatasetModeMatrix:
		fallthrough
	default:
		rows = dataset.GetDataAsRows()
	}

	return writer.WriteRawRows(rows, sep)
}
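A typical call, assuming `writer` has already been opened and `ds` holds data (the wrapper name below is illustrative):

// writeBoth writes the dataset once with the default separator and once
// with an empty separator, to show why `sep` is a pointer.
func writeBoth(writer *Writer, ds tabula.DatasetInterface) error {
	// A nil separator falls back to DefSeparator.
	if _, err := writer.WriteRawDataset(ds, nil); err != nil {
		return err
	}

	// An explicit empty string writes records with no separator at all.
	empty := ""
	_, err := writer.WriteRawDataset(ds, &empty)
	return err
}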
Example #18
/*
Compute the impact of the edit on the frequency of biased words by comparing
the old and new revisions.
*/
func (ftr *WordsBiasImpact) Compute(dataset tabula.DatasetInterface) {
	oldrevs := dataset.GetColumnByName("oldrevisionid")
	newrevs := dataset.GetColumnByName("newrevisionid")
	oldrevslen := oldrevs.Len()

	for x, rec := range oldrevs.Records {
		v := tabula.NewRecordReal(float64(0))

		oldid := rec.String()
		newid := newrevs.Records[x].String()

		freq := ComputeImpact(oldid, newid, tekstus.BiasedWords)

		v.SetFloat(Round(freq))

		if DEBUG >= 2 {
			fmt.Printf("[feature] words_bias_impact: %d/%d freq: %f\n",
				x, oldrevslen, freq)
		}

		ftr.PushBack(v)
	}
}
/*
Compute the divergence between the character distribution of the old revision
and that of the inserted text.
*/
func (ftr *CharDistributionInsert) Compute(dataset tabula.DatasetInterface) {
	oldrevid := dataset.GetColumnByName("oldrevisionid")
	adds := dataset.GetColumnByName("additions")

	for x, rold := range oldrevid.Records {
		r := tabula.NewRecordReal(0.0)
		// Get the content of the old revision.
		oldText, e := revision.GetContent(rold.String())

		if e != nil {
			ftr.PushBack(r)
			continue
		}

		// Get the inserted text whose character distribution will be
		// compared against the old revision's.
		inText := adds.Records[x].String()

		divergence := KullbackLeiblerDivergence(oldText, inText)

		r.SetFloat(Round(divergence))

		ftr.PushBack(r)
	}
}
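`KullbackLeiblerDivergence` is assumed to compare the character distributions of the two texts with the usual formula D(P||Q) = sum over c of p(c)*log(p(c)/q(c)). A sketch under that assumption, ignoring whatever smoothing or log base the real helper uses:

// klDivergenceSketch computes the Kullback-Leibler divergence between the
// character distributions of p and q. Hypothetical stand-in for
// KullbackLeiblerDivergence; characters missing from q are skipped here
// instead of smoothed.
func klDivergenceSketch(p, q string) float64 {
	pdist := charDist(p)
	qdist := charDist(q)

	var d float64
	for c, pc := range pdist {
		qc, ok := qdist[c]
		if !ok {
			continue
		}
		d += pc * math.Log(pc/qc)
	}
	return d
}

// charDist returns the relative frequency of each rune in s.
func charDist(s string) map[rune]float64 {
	dist := make(map[rune]float64)
	total := 0
	for _, c := range s {
		dist[c]++
		total++
	}
	for c := range dist {
		dist[c] /= float64(total)
	}
	return dist
}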