func selectSnippets(text string, tokens villa.StrSet, maxBytes int) string { text = strings.TrimSpace(text) if len(text) <= maxBytes { return text } // return text[:maxBytes] + "..." lines := splitToLines(text) var hitTokens villa.StrSet type lineinfo struct { idx int line string } var selLines []lineinfo count := 0 for i, line := range lines { line = strings.TrimSpace(line) lines[i] = line lineTokens := gcse.AppendTokens(nil, []byte(line)) reserve := false for token := range tokens { if !hitTokens.In(token) && lineTokens.In(token) { reserve = true hitTokens.Put(token) } } if i == 0 || reserve && (count+len(line)+1 < maxBytes) { selLines = append(selLines, lineinfo{ idx: i, line: line, }) count += len(line) + 1 if count == maxBytes { break } lines[i] = "" } } if count < maxBytes { for i, line := range lines { if len(line) == 0 { continue } if count+len(line) >= maxBytes { break } selLines = append(selLines, lineinfo{ idx: i, line: line, }) count += len(line) + 1 } villa.SortF(len(selLines), func(i, j int) bool { return selLines[i].idx < selLines[j].idx }, func(i, j int) { selLines[i], selLines[j] = selLines[j], selLines[i] }) } var outBuf villa.ByteSlice for i, line := range selLines { if line.idx > 1 && (i < 1 || line.idx != selLines[i-1].idx+1) { outBuf.WriteString("...") } else { if i > 0 { outBuf.WriteString(" ") } } outBuf.WriteString(line.line) } if selLines[len(selLines)-1].idx != len(lines)-1 { outBuf.WriteString("...") } return string(outBuf) }
func search(q string) (*SearchResult, villa.StrSet, error) { tokens := gcse.AppendTokens(nil, []byte(q)) log.Printf("tokens for query %s: %v", q, tokens) indexDB := indexDBBox.Get().(*index.TokenSetSearcher) if indexDB == nil { return &SearchResult{}, tokens, nil } var hits []*Hit N := indexDB.DocCount() Df := func(token string) int { return len(indexDB.TokenDocList(gcse.IndexTextField, token)) } _, _ = N, Df indexDB.Search(map[string]villa.StrSet{gcse.IndexTextField: tokens}, func(docID int32, data interface{}) error { hitInfo, _ := data.(gcse.HitInfo) hit := &Hit{ HitInfo: hitInfo, } hit.MatchScore = gcse.CalcMatchScore(&hitInfo, tokens, N, Df) hit.Score = hit.StaticScore * hit.MatchScore hits = append(hits, hit) return nil }) log.Printf("Got %d hits for query %q", len(hits), q) villa.SortF(len(hits), func(i, j int) bool { // true if doc i is before doc j ssi, ssj := hits[i].Score, hits[j].Score if ssi > ssj { return true } if ssi < ssj { return false } sci, scj := hits[i].StarCount, hits[j].StarCount if sci > scj { return true } if sci < scj { return false } pi, pj := hits[i].Package, hits[j].Package if len(pi) < len(pj) { return true } if len(pi) > len(pj) { return false } return pi < pj }, func(i, j int) { // Swap hits[i], hits[j] = hits[j], hits[i] }) return &SearchResult{ TotalResults: len(hits), Hits: hits, }, tokens, nil }
func search(q string) (*SearchResult, villa.StrSet, error) { tokens := gcse.AppendTokens(nil, []byte(q)) tokenList := tokens.Elements() log.Printf("tokens for query %s: %v", q, tokens) indexDB := indexDBBox.Get().(*index.TokenSetSearcher) if indexDB == nil { return &SearchResult{}, tokens, nil } var hits []*Hit N := indexDB.DocCount() TextDf := func(token string) int { return len(indexDB.TokenDocList(gcse.IndexTextField, token)) } NameDf := func(token string) int { return len(indexDB.TokenDocList(gcse.IndexNameField, token)) } textIdfs := make([]float64, len(tokenList)) nameIdfs := make([]float64, len(tokenList)) for i := range textIdfs { textIdfs[i] = idf(TextDf(tokenList[i]), N) nameIdfs[i] = idf(NameDf(tokenList[i]), N) } indexDB.Search(map[string]villa.StrSet{gcse.IndexTextField: tokens}, func(docID int32, data interface{}) error { hitInfo, _ := data.(gcse.HitInfo) hit := &Hit{ HitInfo: hitInfo, } hit.MatchScore = gcse.CalcMatchScore(&hitInfo, tokenList, textIdfs, nameIdfs) hit.Score = maxF(hit.StaticScore, hit.TestStaticScore) * hit.MatchScore hits = append(hits, hit) return nil }) log.Printf("Got %d hits for query %q", len(hits), q) swapHits := func(i, j int) { hits[i], hits[j] = hits[j], hits[i] } sortp.SortF(len(hits), func(i, j int) bool { // true if doc i is before doc j ssi, ssj := hits[i].Score, hits[j].Score if ssi > ssj { return true } if ssi < ssj { return false } sci, scj := hits[i].StarCount, hits[j].StarCount if sci > scj { return true } if sci < scj { return false } pi, pj := hits[i].Package, hits[j].Package if len(pi) < len(pj) { return true } if len(pi) > len(pj) { return false } return pi < pj }, swapHits) if len(hits) < 5000 { // Adjust Score by down ranking duplicated packages pkgCount := make(map[string]int) for _, hit := range hits { cnt := pkgCount[hit.Name] + 1 pkgCount[hit.Name] = cnt if cnt > 1 && len(hit.Imported) == 0 && len(hit.TestImported) == 0 { hit.Score /= float64(cnt) } } // Re-sort sortp.BubbleF(len(hits), func(i, j int) bool { return hits[i].Score > hits[j].Score }, swapHits) } return &SearchResult{ TotalResults: len(hits), Hits: hits, }, tokens, nil }
func search(tr trace.Trace, db database, q string) (*SearchResult, stringsp.Set, error) { tokens := gcse.AppendTokens(nil, []byte(q)) tokenList := tokens.Elements() log.Printf("tokens for query %s: %v", q, tokens) var hits []*Hit N := db.PackageCount() textIdfs := make([]float64, len(tokenList)) nameIdfs := make([]float64, len(tokenList)) for i := range textIdfs { textIdfs[i] = idf(db.PackageCountOfToken(gcse.IndexTextField, tokenList[i]), N) nameIdfs[i] = idf(db.PackageCountOfToken(gcse.IndexNameField, tokenList[i]), N) } db.Search(map[string]stringsp.Set{gcse.IndexTextField: tokens}, func(docID int32, data interface{}) error { hit := &Hit{} var ok bool hit.HitInfo, ok = data.(gcse.HitInfo) if !ok { log.Print("ok = false") } hit.MatchScore = gcse.CalcMatchScore(&hit.HitInfo, tokenList, textIdfs, nameIdfs) hit.Score = math.Max(hit.StaticScore, hit.TestStaticScore) * hit.MatchScore hits = append(hits, hit) return nil }) tr.LazyPrintf("Got %d hits for query %q", len(hits), q) swapHits := func(i, j int) { hits[i], hits[j] = hits[j], hits[i] } sortp.SortF(len(hits), func(i, j int) bool { // true if doc i is before doc j ssi, ssj := hits[i].Score, hits[j].Score if ssi > ssj { return true } if ssi < ssj { return false } sci, scj := hits[i].StarCount, hits[j].StarCount if sci > scj { return true } if sci < scj { return false } pi, pj := hits[i].Package, hits[j].Package if len(pi) < len(pj) { return true } if len(pi) > len(pj) { return false } return pi < pj }, swapHits) tr.LazyPrintf("Results sorted") if len(hits) < 5000 { // Adjust Score by down ranking duplicated packages pkgCount := make(map[string]int) for _, hit := range hits { cnt := pkgCount[hit.Name] + 1 pkgCount[hit.Name] = cnt if cnt > 1 && hit.ImportedLen == 0 && hit.TestImportedLen == 0 { hit.Score /= float64(cnt) } } // Re-sort sortp.BubbleF(len(hits), func(i, j int) bool { return hits[i].Score > hits[j].Score }, swapHits) tr.LazyPrintf("Results reranked") } return &SearchResult{ TotalResults: len(hits), Hits: hits, }, tokens, nil }