Пример #1
0
// firstToken returns the first stemmed, non-stopword token in tokens
// together with its index. If no usable token exists it returns ""
// and len(tokens), so a caller's follow-up loop starts past the end.
func firstToken(tokens [][]byte) (string, int) {
	for i := 0; i < len(tokens); i++ {
		stem := stemmer.Stem(bytes.TrimFunc(tokens[i], notletter))
		// Check the length first: an empty stem can never be a usable
		// token, and this avoids a pointless stopword lookup on "".
		if len(stem) > 0 && !IsStopWord(string(stem)) {
			return string(stem), i
		}
	}
	return "", len(tokens)
}
Пример #2
0
// tokenize splits sentance into tokens and streams the bigrams of
// consecutive non-stopword stems over the returned channel. The
// producing goroutine closes the channel after the last n-gram; the
// caller must drain the channel or the goroutine blocks forever.
func (d *Document) tokenize(sentance []byte) <-chan NGram {
	c := make(chan NGram)
	tokens := SplitFunc(sentance, notletter)
	go func() {
		// defer guarantees the channel is closed even if the loop
		// body grows a panic/early-return path later.
		defer close(c)
		first, i := firstToken(tokens)
		for j := i + 1; j < len(tokens); j++ {
			last := string(stemmer.Stem(bytes.TrimFunc(tokens[j], notletter)))
			// Skip stopwords and empty stems; they never form n-grams.
			if len(last) > 0 && !IsStopWord(last) {
				c <- NewNGram(first, last)
				first = last
			}
		}
	}()
	return c
}
Пример #3
0
// MakeReverseIndex scans data[0:count] word by word and records, for
// each (possibly stemmed, lowercased) term, the 1-based positions at
// which it occurs in docName. Stopwords (when *doStopwordsFlag is set)
// and empty words are skipped without advancing the position counter.
// If this call created at least one new (term, docName) posting list,
// the document-count statistic is incremented once.
func (index *Index) MakeReverseIndex(count int, data []byte, docName string) {
	// Position of the current term within this document; this is the
	// value appended to each posting list.
	numWords := 0
	i := 0
	word := ""
	docNameAdded := false

	for i < count {
		word, i = GetNextWord(data, count, i)

		// NOTE(review): the stopword lookup happens before ToLower, so a
		// capitalized stopword slips through — confirm this is intended.
		if _, exists := index.stopwords[word]; (exists && *doStopwordsFlag) || word == "" {
			// Ignore the word; it is not put in the index.
			continue
		}
		numWords++
		word = strings.ToLower(word)

		if *doStemmingFlag {
			word = string(stemmer.Stem([]byte(word)))
		}

		// Single insert path: create the inner map on first sight of the
		// word, then append — appending to a nil []int is valid Go, so
		// the three duplicated branches of the original collapse to one.
		postings, wordSeen := index.reverseIndex[word]
		if !wordSeen {
			postings = make(map[string][]int)
			index.reverseIndex[word] = postings
		}
		if _, entryExists := postings[docName]; !entryExists {
			// First posting of this document under any term this call:
			// remember so the doc counter is bumped exactly once below.
			docNameAdded = true
		}
		postings[docName] = append(postings[docName], numWords)
	}
	if docNameAdded {
		index.stat.numDocs++
	}
}
Пример #4
0
/*
 * SearchQuery takes a query and returns the set of matching documents,
 * each mapped to its cosine-similarity ranking score against the query.
 * Query terms are combined by the connector words "not", "or", "and",
 * and "cand" (consecutive-and for phrase queries); the default
 * connector is "and". For phrase queries, the query format must be
 * "[word1] cand [word2] cand ...".
 */
func (index *Index) SearchQuery(query []string) map[string]float64 {
	// Find weights of query terms. Type map[string]float64.
	queryTermWeight := index.getQueryWeights(query)

	// Stores the cosine normalization factor for each document.
	norm_d := make(map[string]float64)

	// Stores the cosine similarity measure between the query and document.
	// Seeded from the documents containing the first query term, since the
	// default "and" mode can only ever shrink this candidate set.
	rankingList := make(map[string]float64)
	for d := range index.reverseIndex[query[0]] {
		rankingList[d] = 0
		norm_d[d] = 0
	}

	// i is the connector mode applied to the NEXT non-connector term:
	// 0 = not, 1 = or, 2 = and (default), 3 = cand (phrase).
	i := 2
	lastWord := ""
	for _, word := range query {
		word = strings.ToLower(string(word))

		// If the word is a connector, record the mode and move on.
		if word == "not" {
			i = 0
			continue
		} else if word == "or" {
			i = 1
			continue
		} else if word == "and" {
			i = 2
			continue
		} else if word == "cand" {
			i = 3
			continue
		}
		// If the word is not a connector, stem it like the index did.
		if *doStemmingFlag {
			word = string(stemmer.Stem([]byte(word)))
		}
		// WARNING: templist is an alias pointing into index.reverseIndex.
		// Do NOT modify this variable.
		templist := index.reverseIndex[word]
		docFreq := float64(len(templist))
		// NOTE(review): if word is absent from the index, docFreq is 0 and
		// invDocFreq becomes +Inf — confirm callers can never hit this.
		invDocFreq := math.Log10(float64(index.stat.numDocs) / docFreq)
		switch i {
		case 3:
			// Perform 'cand' operation: keep a document only if it contains
			// the term AND the term appears directly after lastWord in it.
			// (Deleting from a map while ranging over it is legal in Go.)
			for doc := range rankingList {
				if _, exists := templist[doc]; !exists {
					delete(rankingList, doc)
					delete(norm_d, doc)
				} else if lastWord != "" {
					// Check if it is consecutive with lastWord in this document.
					if !consecutive(index.reverseIndex[lastWord][doc], templist[doc]) {
						delete(rankingList, doc)
						delete(norm_d, doc)
					} else {
						// Calculate the tf-idf contribution for this document.
						termFreq := float64(len(templist[doc]))
						termFreqLog := 1 + math.Log10(termFreq)
						docTermWeight := termFreqLog * invDocFreq
						rankingList[doc] += (docTermWeight * queryTermWeight[word])
						norm_d[doc] += (docTermWeight * docTermWeight)
					}
				}
			}
		case 2:
			// Perform 'and' operation: intersect the candidate set with the
			// documents containing this term, scoring the survivors.
			for doc := range rankingList {
				if _, exists := templist[doc]; !exists {
					delete(rankingList, doc)
					delete(norm_d, doc)
				} else {
					// Calculate the tf-idf contribution for this document.
					termFreq := float64(len(templist[doc]))
					termFreqLog := 1 + math.Log10(termFreq)
					docTermWeight := termFreqLog * invDocFreq
					rankingList[doc] += (docTermWeight * queryTermWeight[word])
					norm_d[doc] += (docTermWeight * docTermWeight)
				}
			}
		case 1:
			// Perform 'or' operation: union — add every document containing
			// this term, initializing scores for newcomers.
			for doc := range templist {
				if _, exists := rankingList[doc]; !exists {
					rankingList[doc] = 0
					norm_d[doc] = 0
				}
				// Calculate the tf-idf contribution for this document.
				termFreq := float64(len(templist[doc]))
				termFreqLog := 1 + math.Log10(termFreq)
				docTermWeight := termFreqLog * invDocFreq
				rankingList[doc] += (docTermWeight * queryTermWeight[word])
				norm_d[doc] += (docTermWeight * docTermWeight)
				//fmt.Printf("\n\ndoc: %s, W_dt: %.2f, w_qt: %.2f, norm_d: %.2f\n", doc, docTermWeight, queryTermWeight[word], norm_d[doc])
				//fmt.Printf("Rank: %.2f\n", rankingList[doc] )
			}

		case 0:
			// Perform 'not' operation: drop every candidate that contains
			// this term.
			for doc := range rankingList {
				if _, exists := templist[doc]; exists {
					delete(rankingList, doc)
					delete(norm_d, doc)
				}
			}
		}
		lastWord = word
	}

	// Calculate the cosine normalization factor for the query.
	// NOTE(review): this also sums weights for the connector words
	// themselves ("and", "or", ...) — confirm getQueryWeights gives
	// them zero weight.
	norm_q := 0.0
	for _, word := range query {
		norm_q += (queryTermWeight[word] * queryTermWeight[word])
	}
	norm_q = math.Sqrt(norm_q)
	fmt.Println("------------")
	// Divide each raw score by the document and query normalization factors.
	for doc := range rankingList {
		//fmt.Printf("\nRank: %.2f\tnorm_d: %.2f\tnorm_q: %.2f\n", rankingList[doc], norm_d[doc], norm_q)
		norm_d[doc] = math.Sqrt(norm_d[doc])
		rankingList[doc] = (rankingList[doc] / (norm_d[doc] * norm_q))
		//fmt.Printf("After normalization rank: %.2f\n", rankingList[doc] )
	}

	return rankingList
}