// firstToken returns the first token that survives trimming, stemming, and
// stop-word filtering, together with its index. If no token qualifies, it
// returns "" and len(tokens).
func firstToken(tokens [][]byte) (string, int) {
	for i := 0; i < len(tokens); i++ {
		first := stemmer.Stem(bytes.TrimFunc(tokens[i], notletter))
		if len(first) > 0 && !IsStopWord(string(first)) {
			return string(first), i
		}
	}
	return "", len(tokens)
}
// tokenize splits a sentence into tokens and streams the resulting bigrams
// over a channel. Tokens are trimmed of non-letters and stemmed, stop words
// are skipped, and the channel is closed once the sentence is exhausted.
func (doc *Document) tokenize(sentence []byte) <-chan NGram {
	c := make(chan NGram)
	tokens := SplitFunc(sentence, notletter)
	go func() {
		first, i := firstToken(tokens)
		for j := i + 1; j < len(tokens); j++ {
			last := string(stemmer.Stem(bytes.TrimFunc(tokens[j], notletter)))
			if len(last) > 0 && !IsStopWord(last) {
				c <- NewNGram(first, last)
				first = last
			}
		}
		close(c)
	}()
	return c
}
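
// printBigrams is a hedged usage sketch, not part of the original source: it
// shows how a caller might drain the NGram channel that tokenize returns. The
// function name and the assumption that NGram values format sensibly with %v
// are illustrative only.
func printBigrams(doc *Document, sentence []byte) {
	for ngram := range doc.tokenize(sentence) {
		fmt.Printf("%v\n", ngram)
	}
}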

// MakeReverseIndex scans data and records, for every indexed word, the
// positions at which it occurs in the document docName.
func (index *Index) MakeReverseIndex(count int, data []byte, docName string) {
	// numWords is the running word position inside this document; it is
	// stored in the postings list so phrase queries can check adjacency.
	numWords := 0
	i := 0
	word := ""
	docNameAdded := false
	for i < count {
		word, i = GetNextWord(data, count, i)
		if _, exists := index.stopwords[word]; (exists && *doStopwordsFlag) || word == "" {
			// Skip stop words and empty tokens; they are not indexed.
			continue
		}
		numWords++
		word = strings.ToLower(word)
		if *doStemmingFlag {
			word = string(stemmer.Stem([]byte(word)))
		}
		if _, exists := index.reverseIndex[word]; !exists {
			index.reverseIndex[word] = make(map[string][]int)
		}
		if _, entryExists := index.reverseIndex[word][docName]; !entryExists {
			index.reverseIndex[word][docName] = make([]int, 0, 10)
			docNameAdded = true
		}
		index.reverseIndex[word][docName] = append(index.reverseIndex[word][docName], numWords)
	}
	// Count this document once, but only if at least one word was indexed.
	if docNameAdded {
		index.stat.numDocs++
	}
}
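
// postings is a hedged helper sketch, not part of the original source: it
// looks up the positions recorded for a term in one document, applying the
// same lowercasing and optional stemming that MakeReverseIndex applies when
// building the index. The name postings is illustrative only.
func (index *Index) postings(word, docName string) []int {
	word = strings.ToLower(word)
	if *doStemmingFlag {
		word = string(stemmer.Stem([]byte(word)))
	}
	return index.reverseIndex[word][docName]
}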

/*
 * SearchQuery takes a query and returns the set of documents containing it,
 * mapped to their cosine similarity scores.
 * Terms are combined with the connectors "not", "or", "and", and "cand";
 * for phrase queries, the query format must be
 * "[word1] cand [word2] cand ...".
 */
func (index *Index) SearchQuery(query []string) map[string]float64 {
	// Find the weights of the query terms (map[string]float64).
	queryTermWeight := index.getQueryWeights(query)
	// norm_d stores the cosine normalization factor for each document.
	norm_d := make(map[string]float64)
	// rankingList stores the cosine similarity measure between the query and
	// each document. It is a fresh map, so it does not alias the index.
	rankingList := make(map[string]float64)
	for d := range index.reverseIndex[query[0]] {
		rankingList[d] = 0
		norm_d[d] = 0
	}
	i := 2 // connector mode: 0 = not, 1 = or, 2 = and (default), 3 = cand
	lastWord := ""
	for _, word := range query {
		word = strings.ToLower(word)
		// Connectors only switch the mode and are not themselves scored.
		switch word {
		case "not":
			i = 0
			continue
		case "or":
			i = 1
			continue
		case "and":
			i = 2
			continue
		case "cand":
			i = 3
			continue
		}
		if *doStemmingFlag {
			word = string(stemmer.Stem([]byte(word)))
		}
		// WARNING: templist aliases index.reverseIndex[word].
		// Do NOT modify it.
		templist := index.reverseIndex[word]
		docFreq := float64(len(templist))
		invDocFreq := math.Log10(float64(index.stat.numDocs) / docFreq)
		switch i {
		case 3:
			// 'cand': keep only documents where this term immediately
			// follows the previous term.
			for doc := range rankingList {
				if _, exists := templist[doc]; !exists {
					delete(rankingList, doc)
					delete(norm_d, doc)
				} else if lastWord != "" {
					// Check whether the two terms are consecutive in this document.
					if !consecutive(index.reverseIndex[lastWord][doc], templist[doc]) {
						delete(rankingList, doc)
						delete(norm_d, doc)
					} else {
						// Accumulate this term's contribution to the score.
						termFreq := float64(len(templist[doc]))
						termFreqLog := 1 + math.Log10(termFreq)
						docTermWeight := termFreqLog * invDocFreq
						rankingList[doc] += docTermWeight * queryTermWeight[word]
						norm_d[doc] += docTermWeight * docTermWeight
					}
				}
			}
		case 2:
			// 'and': keep only documents that also contain this term.
			for doc := range rankingList {
				if _, exists := templist[doc]; !exists {
					delete(rankingList, doc)
					delete(norm_d, doc)
				} else {
					termFreq := float64(len(templist[doc]))
					termFreqLog := 1 + math.Log10(termFreq)
					docTermWeight := termFreqLog * invDocFreq
					rankingList[doc] += docTermWeight * queryTermWeight[word]
					norm_d[doc] += docTermWeight * docTermWeight
				}
			}
		case 1:
			// 'or': add every document that contains this term.
			for doc := range templist {
				if _, exists := rankingList[doc]; !exists {
					rankingList[doc] = 0
					norm_d[doc] = 0
				}
				termFreq := float64(len(templist[doc]))
				termFreqLog := 1 + math.Log10(termFreq)
				docTermWeight := termFreqLog * invDocFreq
				rankingList[doc] += docTermWeight * queryTermWeight[word]
				norm_d[doc] += docTermWeight * docTermWeight
			}
		case 0:
			// 'not': drop documents that contain this term.
			for doc := range rankingList {
				if _, exists := templist[doc]; exists {
					delete(rankingList, doc)
					delete(norm_d, doc)
				}
			}
		}
		lastWord = word
	}
	// Calculate the cosine normalization factor for the query.
	norm_q := 0.0
	for _, word := range query {
		norm_q += queryTermWeight[word] * queryTermWeight[word]
	}
	norm_q = math.Sqrt(norm_q)
	fmt.Println("------------")
	// Divide each rank by the normalization factors.
	for doc := range rankingList {
		norm_d[doc] = math.Sqrt(norm_d[doc])
		rankingList[doc] = rankingList[doc] / (norm_d[doc] * norm_q)
	}
	return rankingList
}
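
// consecutiveSketch is a hedged illustration, not the repository's actual
// consecutive function (whose implementation lies outside this section): it
// shows the adjacency test that 'cand' relies on, assuming both position
// lists are sorted ascending. It reports whether some position in next is
// exactly one past a position in prev.
func consecutiveSketch(prev, next []int) bool {
	a, b := 0, 0
	for a < len(prev) && b < len(next) {
		switch {
		case prev[a]+1 == next[b]:
			return true
		case prev[a]+1 < next[b]:
			a++
		default:
			b++
		}
	}
	return false
}

// rankedDocs is a hedged usage sketch, not part of the original source: it
// runs SearchQuery and orders the matching documents by descending cosine
// score. It assumes the "sort" package is imported; the function name is
// illustrative only.
func rankedDocs(index *Index, query []string) []string {
	scores := index.SearchQuery(query)
	docs := make([]string, 0, len(scores))
	for doc := range scores {
		docs = append(docs, doc)
	}
	sort.Slice(docs, func(a, b int) bool { return scores[docs[a]] > scores[docs[b]] })
	return docs
}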