Exemplo n.º 1
0
/*
ComputeImpact returns the share of matched words contributed by the new
revision, where matches are counted against `wordlist` by
tekstus.StringCountTokens:

	count_of_words_in_new
	/
	(count_of_words_in_old + count_of_words_in_new)

The result is a ratio (not multiplied by 100). It is 0 when neither revision
contains any matching word.

Errors returned while fetching the revision contents are discarded; whatever
text is returned alongside the error is counted as-is.
*/
func ComputeImpact(oldrevid, newrevid string, wordlist []string) float64 {
	// Fetch old first, then new, so the lookups happen in a fixed order.
	prevText, _ := revision.GetContentClean(oldrevid)
	currText, _ := revision.GetContentClean(newrevid)

	nPrev := tekstus.StringCountTokens(prevText, wordlist, false)
	nCurr := tekstus.StringCountTokens(currText, wordlist, false)

	// Only divide when at least one match exists, to avoid a 0/0 result.
	if sum := float64(nPrev + nCurr); sum != 0 {
		return float64(nCurr) / sum
	}

	return 0
}
Exemplo n.º 2
0
/*
Compute fills the feature with one real-valued record per row of the
"additions" column of `dataset`.

For each row, the inserted text is cleaned from wiki markup and split into
words, the content of the revision named in the "newrevisionid" column is
fetched and split into words, and the rounded result of
tekstus.WordsFrequenciesOf over both word lists is stored.

A row yields 0 when its inserted text is empty or when the revision content
cannot be fetched.
*/
func (ftr *TermFrequency) Compute(dataset tabula.DatasetInterface) {
	revids := dataset.GetColumnByName("newrevisionid")
	additions := dataset.GetColumnByName("additions")
	total := len(additions.Records)

	for i, added := range additions.Records {
		// Starts at zero; only overwritten when a frequency can be computed.
		result := tabula.NewRecordReal(float64(0))

		if inserted := added.String(); len(inserted) > 0 {
			insertedWords := tekstus.StringSplitWords(
				clean.WikiText(inserted), true, true)

			// The revision id is looked up only for rows with inserted text.
			revid := revids.Records[i].String()

			if DEBUG >= 2 {
				fmt.Printf("[feature] term_frequency: %d/%d processing %q\n",
					i, total, revid)
			}

			if content, err := revision.GetContentClean(revid); err == nil {
				contentWords := tekstus.StringSplitWords(content, true, false)

				result.SetFloat(Round(tekstus.WordsFrequenciesOf(
					contentWords, insertedWords, false)))
			}
		}

		// Exactly one record is pushed per row, whatever the outcome above.
		ftr.PushBack(result)
	}
}