/* ComputeImpact return percentage of words in new revision compared to old revision, using count_of_words_in_new / (count_of_words_in_old + count_of_words_in_new) if no words are found in old and new revision, return 0. */ func ComputeImpact(oldrevid, newrevid string, wordlist []string) float64 { oldtext, _ := revision.GetContentClean(oldrevid) newtext, _ := revision.GetContentClean(newrevid) oldCnt := tekstus.StringCountTokens(oldtext, wordlist, false) newCnt := tekstus.StringCountTokens(newtext, wordlist, false) total := float64(oldCnt + newCnt) if total == 0 { return 0 } return float64(newCnt) / total }
/* Compute the frequency of inserted words. */ func (ftr *TermFrequency) Compute(dataset tabula.DatasetInterface) { newrevidx := dataset.GetColumnByName("newrevisionid") adds := dataset.GetColumnByName("additions") recordslen := len(adds.Records) for x, rec := range adds.Records { r := tabula.NewRecordReal(float64(0)) // Get inserted words. intext := rec.String() if len(intext) == 0 { ftr.PushBack(r) continue } intext = clean.WikiText(intext) inWords := tekstus.StringSplitWords(intext, true, true) // Get content of new revision. revid := newrevidx.Records[x].String() if DEBUG >= 2 { fmt.Printf("[feature] term_frequency: %d/%d processing %q\n", x, recordslen, revid) } newtext, e := revision.GetContentClean(revid) if e != nil { ftr.PushBack(r) continue } newWords := tekstus.StringSplitWords(newtext, true, false) freq := tekstus.WordsFrequenciesOf(newWords, inWords, false) r.SetFloat(Round(freq)) ftr.PushBack(r) } }