示例#1
0
// GetTFPhased returns the phrase term frequency of phaseterms in page: the
// number of positions at which the terms occur consecutively and in order.
//
// It returns 0 when phaseterms is empty, or when the number of page words
// matching the terms differs from the number of terms (the page does not
// contain every term).
func GetTFPhased(page *spider.Page, phaseterms []string) float64 {
	// An empty phrase can never match; without this guard the index
	// requiredWords[0] below would panic.
	if len(phaseterms) == 0 {
		return 0
	}

	words := page.Words()
	requiredWords := make([]*spider.Word, 0, len(phaseterms))
	for _, term := range phaseterms {
		for _, word := range words {
			if word.Word == term {
				requiredWords = append(requiredWords, word)
			}
		}
	}
	// Doc doesn't contain all terms.
	if len(requiredWords) != len(phaseterms) {
		return 0
	}

	// starts holds the candidate positions of the FIRST term. A candidate
	// survives only while the term at phrase offset k is found at start+k.
	// (Checking start+1 for every term, as was done before, only works for
	// two-word phrases.)
	starts := requiredWords[0].Positions()
	for offset := 1; offset < len(requiredWords) && len(starts) > 0; offset++ {
		positions := requiredWords[offset].Positions()
		var kept []int
		for _, start := range starts {
			if containInt(positions, start+offset) {
				kept = append(kept, start)
			}
		}
		starts = kept
	}

	return float64(len(starts))
}
示例#2
0
// GetTFIDFPhased computes the TF-IDF weight of the phrase phaseterms for page:
//
//	(phraseTF / maxTF(page)) * log2(N / df)
//
// where N is len(allStoredPages) and df is the value reported by GetDFPhased
// for the phrase. It returns 0 when the phrase does not occur in the page,
// when the page has no positive term frequency, or when df is not positive.
//
// invertedTable is not used by the computation; it is kept so existing
// callers continue to compile.
func GetTFIDFPhased(page *spider.Page, allStoredPages []*spider.Page, invertedTable map[int][]int64, phaseterms []string) float64 {
	tf := GetTFPhased(page, phaseterms)
	// A zero phrase frequency makes the whole product zero, so skip the
	// corpus-wide document-frequency computation entirely.
	if tf <= 0 {
		return 0
	}

	maxTF := GetMaxTF(page)
	if maxTF <= 0 {
		return 0
	}

	df := GetDFPhased(allStoredPages, phaseterms)
	if df <= 0 {
		return 0
	}

	n := float64(len(allStoredPages))
	return (tf / maxTF) * math.Log2(n/df)
}
示例#3
0
// GetMaxTF reports the largest term frequency found among the words of page,
// converted to float64. The result is 0 when the page has no words or no
// word has a positive term frequency.
func GetMaxTF(page *spider.Page) (maxTF float64) {
	for _, w := range page.Words() {
		if tf := float64(w.TF()); tf > maxTF {
			maxTF = tf
		}
	}
	return maxTF
}
示例#4
0
// GetMostFreqWord returns up to number words of page with the highest term
// frequency, in descending order of frequency, together with those
// frequencies. freqWords[i] and freq[i] always describe the same word.
//
// Fewer than number entries are returned when the page has fewer words; both
// results are nil when the page has no words or number is not positive. Among
// words with equal frequency, the one appearing first in page.Words() wins.
func GetMostFreqWord(page *spider.Page, number int) (freqWords []spider.Word, freq []int) {
	src := page.Words()

	// Select from a private copy: deleting elements in place from the slice
	// returned by page.Words() would overwrite its backing array, which may
	// be shared with the page itself.
	remaining := make([]*spider.Word, len(src))
	copy(remaining, src)

	for i := 0; i < number && len(remaining) > 0; i++ {
		// Find the first word with the maximum term frequency.
		best := 0
		bestTF := remaining[0].TF()
		for idx := 1; idx < len(remaining); idx++ {
			if tf := remaining[idx].TF(); tf > bestTF {
				best, bestTF = idx, tf
			}
		}

		freqWords = append(freqWords, *remaining[best])
		freq = append(freq, bestTF)

		// Drop the selected word so the next round finds the runner-up.
		remaining = append(remaining[:best], remaining[best+1:]...)
	}

	return freqWords, freq
}
示例#5
0
// SearchingResult scores every page in allPagesWithTFIDF against query and
// returns the best matches in descending score order.
//
// For each page the score is dq / dlen / qlen, where dq is the sum of the
// stored TFIDF weights of the page words equal to a query word (plus the
// phrase weight from GetTFIDFPhased for query words containing a space), dlen
// is GetDLen(page) and qlen is the number of query words. The score is
// doubled when the page title contains at least one query word. Pages whose
// score is not positive are never returned, and at most 50 pages are returned.
//
// The three results are parallel slices: resultPageIDs[i], resultScores[i]
// and top5FreqWord[i] describe the same page. top5FreqWord[i] lists up to five
// of the page's most frequent words as "word freq" items joined by ";".
//
// The function prints the query words, phrase-match diagnostics and the
// elapsed search time to standard output.
func SearchingResult(query *spider.Page) (resultPageIDs []int64, resultScores []float64, top5FreqWord []string) {
	const (
		maxNumPageReturn = 50
		numFreqWords     = 5
		titleBoost       = 2.0
	)

	start := time.Now()

	queryWords := query.Words()
	for _, word := range queryWords {
		fmt.Printf("Qword:%v\n", word.Word)
	}
	qlen := float64(len(queryWords))

	var (
		scores    []float64
		pageIDs   []int64
		freqWords []string
	)

	for _, page := range allPagesWithTFIDF {
		dlen := GetDLen(page)
		// NOTE(review): titles are matched case-insensitively on the
		// assumption that query words are normalized (e.g. lower-cased)
		// while titles are stored raw; confirm against the indexer.
		title := strings.ToLower(page.Page.Title)

		dq := 0.0
		matchTitle := false
		for _, queryWord := range queryWords {
			term := queryWord.Word

			// The title must contain an actual query word; testing for the
			// empty string would match every page.
			if !matchTitle && term != "" && strings.Contains(title, strings.ToLower(term)) {
				matchTitle = true
			}

			// A query word containing a space is a phrase: score it by
			// consecutive occurrence of its terms.
			if strings.Contains(term, " ") {
				phraseScore := GetTFIDFPhased(page.Page, allStoredPages, invertedTable, strings.Fields(term))
				dq += phraseScore
				if phraseScore != 0 {
					fmt.Printf("temp: %v\n", phraseScore)
					fmt.Printf("Page with phased:%v\n", page.Page.URL)
				}
			}

			for _, word := range page.myWord {
				if term == word.Word.Word {
					dq += word.TFIDF
				}
			}
		}

		// Guard the normalization: an empty query or a zero-length document
		// would otherwise produce NaN or Inf scores.
		score := 0.0
		if dlen > 0 && qlen > 0 {
			score = dq / dlen / qlen
			if matchTitle {
				score *= titleBoost
			}
		}
		scores = append(scores, score)
		pageIDs = append(pageIDs, page.Page.PageID)

		// Pages with fewer than numFreqWords words simply yield a shorter
		// list instead of indexing past the end of the result.
		topWords, topFreqs := GetMostFreqWord(page.Page, numFreqWords)
		items := make([]string, 0, len(topWords))
		for i := range topWords {
			items = append(items, topWords[i].Word+" "+strconv.Itoa(topFreqs[i]))
		}
		freqWords = append(freqWords, strings.Join(items, ";"))
	}

	// Repeatedly pick the highest remaining positive score. A picked entry is
	// marked consumed by zeroing its score, since only scores > 0 qualify.
	for i := 0; i < maxNumPageReturn; i++ {
		best := -1
		bestScore := 0.0
		for idx, score := range scores {
			if score > bestScore {
				best, bestScore = idx, score
			}
		}
		if best < 0 {
			break
		}
		resultScores = append(resultScores, bestScore)
		resultPageIDs = append(resultPageIDs, pageIDs[best])
		top5FreqWord = append(top5FreqWord, freqWords[best])
		scores[best] = 0
	}

	fmt.Printf("Time used in search=%v\n", time.Since(start))

	return resultPageIDs, resultScores, top5FreqWord
}