// GetTFPhased returns the number of times the phrase (the terms in
// phaseterms, in order and adjacent) occurs in the page.
func GetTFPhased(page *spider.Page, phaseterms []string) float64 {
	words := page.Words()

	// Collect the word entry for every phrase term.
	var requiredWords []*spider.Word
	for _, term := range phaseterms {
		for _, word := range words {
			if word.Word == term {
				requiredWords = append(requiredWords, word)
			}
		}
	}
	// The document does not contain all of the phrase terms.
	if len(phaseterms) != len(requiredWords) {
		return 0.0
	}

	// resultPos holds the starting positions of candidate phrase matches,
	// i.e. positions of the first term. For the index-th term, keep only the
	// starting positions whose index-th following position holds that term.
	resultPos := requiredWords[0].Positions()
	for index, requiredWord := range requiredWords {
		if index == 0 {
			continue
		}
		termPos := requiredWord.Positions()
		var filtered []int
		for _, pos := range resultPos {
			if containInt(termPos, pos+index) {
				filtered = append(filtered, pos)
			}
		}
		resultPos = filtered
	}
	return float64(len(resultPos))
}
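// GetTFPhased relies on a containInt helper that is not defined in this
// section. The function below is a minimal sketch of the assumed behaviour
// (a linear membership test over a position slice); the real helper may
// differ, e.g. by binary-searching sorted positions.
func containInt(positions []int, target int) bool {
	for _, p := range positions {
		if p == target {
			return true
		}
	}
	return false
}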
// GetTFIDFPhased scores a phrase query against a page using
// (TF / MaxTF) * log2(N / DF), where TF is the phrase frequency in the page,
// MaxTF is the highest single-term frequency in the page, N is the number of
// stored pages and DF is the number of pages containing the phrase.
func GetTFIDFPhased(page *spider.Page, allStoredPages []*spider.Page, invertedTable map[int][]int64, phaseterms []string) float64 {
	TF := GetTFPhased(page, phaseterms)
	N := float64(len(allStoredPages))
	df := GetDFPhased(allStoredPages, phaseterms)
	MaxTF := GetMaxTF(page)

	// A BM25-style alternative was considered but is not used:
	//   k1 := 2.0
	//   b := 0.75
	//   docLen := sum of word.TF() over page.Words()
	//   firstTerm := math.Log((N - df + 0.5) / (df + 0.5))
	//   secondTerm := (k1 + 1) * TF / ((k1*(1-b) + b*docLen/AveDocLen) + TF)

	if MaxTF <= 0 || df <= 0 {
		return 0
	}
	firstTerm := TF / MaxTF
	secondTerm := math.Log2(N / df)
	return firstTerm * secondTerm
}
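// GetTFIDFPhased also depends on GetDFPhased, which is defined elsewhere.
// The function below is a minimal sketch of the assumed behaviour: the phrase
// document frequency, i.e. the number of stored pages containing the phrase
// at least once, computed here by reusing GetTFPhased. The real version may
// instead consult the inverted index.
func GetDFPhased(allStoredPages []*spider.Page, phaseterms []string) float64 {
	df := 0.0
	for _, p := range allStoredPages {
		if GetTFPhased(p, phaseterms) > 0 {
			df++
		}
	}
	return df
}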
// GetMaxTF returns the highest term frequency of any word in the page.
func GetMaxTF(page *spider.Page) (maxTF float64) {
	for _, word := range page.Words() {
		if float64(word.TF()) > maxTF {
			maxTF = float64(word.TF())
		}
	}
	return
}
// GetMostFreqWord returns up to `number` of the most frequent words in the
// page, together with their term frequencies, in descending TF order.
func GetMostFreqWord(page *spider.Page, number int) (freqWords []spider.Word, freq []int) {
	// Work on a copy so the page's own word slice is not reordered.
	words := append([]*spider.Word(nil), page.Words()...)
	for i := 0; i < number && len(words) > 0; i++ {
		// Find the word with the highest TF among the remaining words.
		maxTermFreq := words[0].TF()
		maxIndex := 0
		for index, word := range words {
			if word.TF() > maxTermFreq {
				maxIndex = index
				maxTermFreq = word.TF()
			}
		}
		// Record it, then remove it before the next pass.
		freqWords = append(freqWords, *words[maxIndex])
		freq = append(freq, maxTermFreq)
		words = append(words[:maxIndex], words[maxIndex+1:]...)
	}
	return
}
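// Usage sketch for GetMostFreqWord, assuming `page` is an already-crawled
// *spider.Page (printTopWords is a hypothetical helper, not part of the
// package): print the five most frequent words with their frequencies, in
// the same "word count" pairing that SearchingResult serialises below.
func printTopWords(page *spider.Page) {
	words, freqs := GetMostFreqWord(page, 5)
	for i, w := range words {
		fmt.Printf("%s %d\n", w.Word, freqs[i])
	}
}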
// SearchingResult ranks all indexed pages against the query and returns, for
// at most the top 50 matches, the page IDs, their scores and a
// "word freq;word freq;..." string of each page's five most frequent words.
// A query word that contains a space is treated as a phrase and scored with
// GetTFIDFPhased; all other words use the precomputed per-word TFIDF. Pages
// whose title contains a query term have their score doubled, and every
// score is normalised by document length and query length.
func SearchingResult(query *spider.Page) (resultPageIDs []int64, resultScores []float64, top5FreqWord []string) {
	Start := time.Now()

	var scores []float64
	var pageIDs []int64
	var freq5Word []string

	QWords := query.Words()
	for _, word := range QWords {
		fmt.Printf("Qword:%v\n", word.Word)
	}

	// Score every indexed page against the query.
	for _, page := range allPagesWithTFIDF {
		dq := 0.0
		dlen := GetDLen(page)
		qlen := float64(len(QWords))
		matchTitle := false

		for _, queryWord := range QWords {
			// Boost pages whose title contains the query term.
			if strings.Contains(strings.ToLower(page.Page.Title), queryWord.Word) {
				matchTitle = true
			}

			str := queryWord.Word
			if strings.Contains(str, " ") {
				// A query word containing a space is a phrase query.
				phaseterms := strings.Fields(str)
				dq += GetTFIDFPhased(page.Page, allStoredPages, invertedTable, phaseterms)
			}

			// Accumulate the precomputed TFIDF of matching single terms.
			for _, word := range page.myWord {
				if queryWord.Word == word.Word.Word {
					dq += word.TFIDF
				}
			}
		}

		if matchTitle {
			scores = append(scores, 2*dq/dlen/qlen)
		} else {
			scores = append(scores, dq/dlen/qlen)
		}
		pageIDs = append(pageIDs, page.Page.PageID)

		// Build the "word freq;word freq;..." summary for this page.
		freqWords, freqs := GetMostFreqWord(page.Page, 5)
		var newStr string
		for i := range freqWords {
			newStr += freqWords[i].Word + " " + strconv.Itoa(freqs[i])
			if i != len(freqWords)-1 {
				newStr += ";"
			}
		}
		freq5Word = append(freq5Word, newStr)
	}

	// Repeatedly extract the highest-scoring page, up to MaxNumPageReturn results.
	MaxNumPageReturn := 50
	for i := 0; i < MaxNumPageReturn; i++ {
		maxIndex := 0
		maxValue := 0.0
		for index, score := range scores {
			if score > maxValue {
				maxIndex = index
				maxValue = score
			}
		}
		if maxValue <= 0 {
			break
		}
		resultScores = append(resultScores, maxValue)
		resultPageIDs = append(resultPageIDs, pageIDs[maxIndex])
		top5FreqWord = append(top5FreqWord, freq5Word[maxIndex])

		scores = append(scores[:maxIndex], scores[maxIndex+1:]...)
		pageIDs = append(pageIDs[:maxIndex], pageIDs[maxIndex+1:]...)
		freq5Word = append(freq5Word[:maxIndex], freq5Word[maxIndex+1:]...)
	}

	elapse := time.Since(Start)
	fmt.Printf("Time used in search=%v\n", elapse)
	return
}
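// SearchingResult refers to several package-level names that are declared
// outside this section: allPagesWithTFIDF, allStoredPages, invertedTable,
// GetDLen and the per-page TFIDF records. The declarations below are a
// minimal sketch of what they are assumed to look like, so the function can
// be read in isolation; the real definitions may differ.
type wordWithTFIDF struct {
	Word  *spider.Word // the underlying word entry from the page
	TFIDF float64      // precomputed TFIDF weight of this word in this page
}

type pageWithTFIDF struct {
	Page   *spider.Page
	myWord []wordWithTFIDF
}

var (
	allPagesWithTFIDF []pageWithTFIDF // every indexed page with its TFIDF-weighted words
	allStoredPages    []*spider.Page  // all crawled pages
	invertedTable     map[int][]int64 // assumed: word ID -> IDs of the pages containing it
)

// GetDLen is assumed to return the document length used for score
// normalisation, e.g. the page's total term count.
func GetDLen(page pageWithTFIDF) float64 {
	dlen := 0.0
	for _, w := range page.Page.Words() {
		dlen += float64(w.TF())
	}
	return dlen
}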