Exemple #1
0
// selectSnippets condenses text to roughly maxBytes bytes, favouring lines
// that mention the given tokens.
//
// The text is trimmed first and returned unchanged when it already fits.
// Otherwise it is split with splitToLines and lines are picked in two passes:
//
//  1. The first line is always kept, even when it alone exceeds maxBytes.
//     Any later line that is the first one to contain some token is kept as
//     long as it still fits into the budget. A token counts as seen as soon
//     as a line contains it, whether or not that line was kept.
//  2. If budget remains, the remaining non-empty lines are added in their
//     original order until one no longer fits.
//
// Every selected line is charged len(line)+1 bytes against the budget. The
// selected lines are emitted in their original order: adjacent lines are
// joined by a single space, a gap between lines is rendered as "...", and a
// trailing "..." is appended when the last selected line is not the last line
// of the text.
//
// An empty string is returned when splitToLines yields no lines.
func selectSnippets(text string, tokens villa.StrSet, maxBytes int) string {
	text = strings.TrimSpace(text)
	if len(text) <= maxBytes {
		return text
	}

	lines := splitToLines(text)
	if len(lines) == 0 {
		// Nothing can be selected. Without this guard the trailing-ellipsis
		// check at the end would index an empty selLines and panic.
		return ""
	}

	type lineinfo struct {
		idx  int
		line string
	}
	var (
		hitTokens villa.StrSet // tokens already seen in some earlier line
		selLines  []lineinfo   // lines chosen for the snippet
		count     int          // bytes charged so far (len(line)+1 per line)
	)

	// Pass 1: the first line plus every line that introduces a new token.
	for i, line := range lines {
		line = strings.TrimSpace(line)
		lines[i] = line

		lineTokens := gcse.AppendTokens(nil, []byte(line))
		reserve := false
		for token := range tokens {
			if !hitTokens.In(token) && lineTokens.In(token) {
				reserve = true
				hitTokens.Put(token)
			}
		}

		if i == 0 || (reserve && count+len(line)+1 < maxBytes) {
			selLines = append(selLines, lineinfo{
				idx:  i,
				line: line,
			})
			count += len(line) + 1
			if count == maxBytes {
				break
			}

			// Blank the slot so that pass 2 skips this line.
			lines[i] = ""
		}
	}

	// Pass 2: fill the remaining budget with lines that were not selected yet.
	if count < maxBytes {
		for i, line := range lines {
			if len(line) == 0 {
				continue
			}

			if count+len(line) >= maxBytes {
				break
			}

			selLines = append(selLines, lineinfo{
				idx:  i,
				line: line,
			})

			count += len(line) + 1
		}

		// Pass 2 appended out of order relative to pass 1; restore the
		// original line order.
		villa.SortF(len(selLines), func(i, j int) bool {
			return selLines[i].idx < selLines[j].idx
		}, func(i, j int) {
			selLines[i], selLines[j] = selLines[j], selLines[i]
		})
	}

	var outBuf villa.ByteSlice
	for i, line := range selLines {
		if line.idx > 1 && (i < 1 || line.idx != selLines[i-1].idx+1) {
			// Lines were skipped between the previous selection and this one.
			outBuf.WriteString("...")
		} else if i > 0 {
			outBuf.WriteString(" ")
		}
		outBuf.WriteString(line.line)
	}

	if selLines[len(selLines)-1].idx != len(lines)-1 {
		outBuf.WriteString("...")
	}

	return string(outBuf)
}
Exemple #2
0
// search tokenizes q, runs the tokens against the text field of the current
// index and returns the hits ordered best-first, together with the tokens
// extracted from q.
//
// When no index is available an empty SearchResult is returned. The returned
// error is always nil.
//
// Hits are ordered by, in turn: higher Score, higher StarCount, shorter
// Package path, and finally lexicographically smaller Package path.
func search(q string) (*SearchResult, villa.StrSet, error) {
	tokens := gcse.AppendTokens(nil, []byte(q))
	log.Printf("tokens for query %s: %v", q, tokens)

	// Use the comma-ok form: a single-value assertion panics when the box
	// holds a nil interface, which would make the nil check below useless for
	// exactly the "no index loaded" case.
	indexDB, _ := indexDBBox.Get().(*index.TokenSetSearcher)
	if indexDB == nil {
		return &SearchResult{}, tokens, nil
	}

	var hits []*Hit

	docCount := indexDB.DocCount()
	// docFreq reports how many documents list token in the text field.
	docFreq := func(token string) int {
		return len(indexDB.TokenDocList(gcse.IndexTextField, token))
	}

	// NOTE(review): any value returned by Search is discarded here, as in the
	// original code — confirm whether it reports an error worth propagating.
	indexDB.Search(map[string]villa.StrSet{gcse.IndexTextField: tokens},
		func(docID int32, data interface{}) error {
			// A failed assertion leaves hitInfo zero-valued; the hit is still
			// recorded.
			hitInfo, _ := data.(gcse.HitInfo)
			hit := &Hit{
				HitInfo: hitInfo,
			}

			hit.MatchScore = gcse.CalcMatchScore(&hitInfo, tokens, docCount, docFreq)
			hit.Score = hit.StaticScore * hit.MatchScore

			hits = append(hits, hit)
			return nil
		})

	log.Printf("Got %d hits for query %q", len(hits), q)

	villa.SortF(len(hits), func(i, j int) bool {
		// Reports whether hit i ranks before hit j.
		ssi, ssj := hits[i].Score, hits[j].Score
		if ssi > ssj {
			return true
		}
		if ssi < ssj {
			return false
		}

		sci, scj := hits[i].StarCount, hits[j].StarCount
		if sci > scj {
			return true
		}
		if sci < scj {
			return false
		}

		pi, pj := hits[i].Package, hits[j].Package
		if len(pi) < len(pj) {
			return true
		}
		if len(pi) > len(pj) {
			return false
		}

		return pi < pj
	}, func(i, j int) {
		hits[i], hits[j] = hits[j], hits[i]
	})

	return &SearchResult{
		TotalResults: len(hits),
		Hits:         hits,
	}, tokens, nil
}
Exemple #3
0
// search tokenizes q, runs the tokens against the text field of the current
// index and returns the hits ordered best-first, together with the tokens
// extracted from q.
//
// When no index is available an empty SearchResult is returned. The returned
// error is always nil.
//
// Each hit's Score is its match score multiplied by the larger of
// StaticScore and TestStaticScore. Hits are first ordered by higher Score,
// higher StarCount, shorter Package path and lexicographically smaller
// Package path. For result sets below 5000 hits, repeated package names are
// then down-ranked and the list is re-sorted by Score alone.
func search(q string) (*SearchResult, villa.StrSet, error) {
	tokens := gcse.AppendTokens(nil, []byte(q))
	tokenList := tokens.Elements()
	log.Printf("tokens for query %s: %v", q, tokens)

	// Use the comma-ok form: a single-value assertion panics when the box
	// holds a nil interface, which would make the nil check below useless for
	// exactly the "no index loaded" case.
	indexDB, _ := indexDBBox.Get().(*index.TokenSetSearcher)
	if indexDB == nil {
		return &SearchResult{}, tokens, nil
	}

	var hits []*Hit

	// Compute the per-token idf values for the text and name fields once, so
	// the per-hit callback does not repeat the lookups.
	docCount := indexDB.DocCount()
	textIdfs := make([]float64, len(tokenList))
	nameIdfs := make([]float64, len(tokenList))
	for i := range textIdfs {
		token := tokenList[i]
		textIdfs[i] = idf(len(indexDB.TokenDocList(gcse.IndexTextField, token)), docCount)
		nameIdfs[i] = idf(len(indexDB.TokenDocList(gcse.IndexNameField, token)), docCount)
	}

	// NOTE(review): any value returned by Search is discarded here, as in the
	// original code — confirm whether it reports an error worth propagating.
	indexDB.Search(map[string]villa.StrSet{gcse.IndexTextField: tokens},
		func(docID int32, data interface{}) error {
			// A failed assertion leaves hitInfo zero-valued; the hit is still
			// recorded.
			hitInfo, _ := data.(gcse.HitInfo)
			hit := &Hit{
				HitInfo: hitInfo,
			}

			hit.MatchScore = gcse.CalcMatchScore(&hitInfo, tokenList,
				textIdfs, nameIdfs)
			hit.Score = maxF(hit.StaticScore, hit.TestStaticScore) *
				hit.MatchScore

			hits = append(hits, hit)
			return nil
		})

	log.Printf("Got %d hits for query %q", len(hits), q)

	swapHits := func(i, j int) {
		hits[i], hits[j] = hits[j], hits[i]
	}

	sortp.SortF(len(hits), func(i, j int) bool {
		// Reports whether hit i ranks before hit j.
		ssi, ssj := hits[i].Score, hits[j].Score
		if ssi > ssj {
			return true
		}
		if ssi < ssj {
			return false
		}

		sci, scj := hits[i].StarCount, hits[j].StarCount
		if sci > scj {
			return true
		}
		if sci < scj {
			return false
		}

		pi, pj := hits[i].Package, hits[j].Package
		if len(pi) < len(pj) {
			return true
		}
		if len(pi) > len(pj) {
			return false
		}

		return pi < pj
	}, swapHits)

	if len(hits) < 5000 {
		// Down-rank duplicated package names: walking the hits in rank order,
		// the cnt-th hit (cnt > 1) with a given Name has its Score divided by
		// cnt, but only if nothing imports it (Imported and TestImported are
		// both empty).
		pkgCount := make(map[string]int)
		for _, hit := range hits {
			cnt := pkgCount[hit.Name] + 1
			pkgCount[hit.Name] = cnt
			if cnt > 1 && len(hit.Imported) == 0 && len(hit.TestImported) == 0 {
				hit.Score /= float64(cnt)
			}
		}

		// Re-sort by the adjusted Score only.
		sortp.BubbleF(len(hits), func(i, j int) bool {
			return hits[i].Score > hits[j].Score
		}, swapHits)
	}

	return &SearchResult{
		TotalResults: len(hits),
		Hits:         hits,
	}, tokens, nil
}
Exemple #4
0
// search tokenizes q, queries db on the text field and returns the hits
// ordered best-first, together with the tokens extracted from q. Progress is
// reported on tr. The returned error is always nil.
//
// Each hit's Score is its match score multiplied by the larger of
// StaticScore and TestStaticScore. Hits are ordered by higher Score, higher
// StarCount, shorter Package path and lexicographically smaller Package
// path. For result sets below 5000 hits, repeated package names are then
// down-ranked and the list is re-sorted by Score alone.
func search(tr trace.Trace, db database, q string) (*SearchResult, stringsp.Set, error) {
	tokens := gcse.AppendTokens(nil, []byte(q))
	tokenList := tokens.Elements()
	log.Printf("tokens for query %s: %v", q, tokens)

	// Per-token idf values for the text and name fields, computed once.
	total := db.PackageCount()
	textIdfs := make([]float64, len(tokenList))
	nameIdfs := make([]float64, len(tokenList))
	for i := 0; i < len(tokenList); i++ {
		textIdfs[i] = idf(db.PackageCountOfToken(gcse.IndexTextField, tokenList[i]), total)
		nameIdfs[i] = idf(db.PackageCountOfToken(gcse.IndexNameField, tokenList[i]), total)
	}

	var hits []*Hit

	// collect scores one search result and appends it to hits.
	collect := func(docID int32, data interface{}) error {
		info, ok := data.(gcse.HitInfo)
		if !ok {
			// The hit is still recorded, with a zero-valued HitInfo.
			log.Print("ok = false")
		}
		hit := &Hit{}
		hit.HitInfo = info

		hit.MatchScore = gcse.CalcMatchScore(&hit.HitInfo, tokenList, textIdfs, nameIdfs)
		hit.Score = math.Max(hit.StaticScore, hit.TestStaticScore) * hit.MatchScore

		hits = append(hits, hit)
		return nil
	}
	db.Search(map[string]stringsp.Set{gcse.IndexTextField: tokens}, collect)
	tr.LazyPrintf("Got %d hits for query %q", len(hits), q)

	swapHits := func(i, j int) {
		hits[i], hits[j] = hits[j], hits[i]
	}

	// ranksBefore reports whether hit i belongs ahead of hit j.
	ranksBefore := func(i, j int) bool {
		a, b := hits[i], hits[j]

		switch {
		case a.Score > b.Score:
			return true
		case a.Score < b.Score:
			return false
		}

		switch {
		case a.StarCount > b.StarCount:
			return true
		case a.StarCount < b.StarCount:
			return false
		}

		switch {
		case len(a.Package) < len(b.Package):
			return true
		case len(a.Package) > len(b.Package):
			return false
		}

		return a.Package < b.Package
	}
	sortp.SortF(len(hits), ranksBefore, swapHits)

	tr.LazyPrintf("Results sorted")

	if len(hits) < 5000 {
		// Walking the hits in rank order, divide the Score of the cnt-th hit
		// (cnt > 1) sharing a Name by cnt, unless something imports it.
		seen := make(map[string]int)
		for _, hit := range hits {
			seen[hit.Name]++
			cnt := seen[hit.Name]
			if cnt <= 1 || hit.ImportedLen != 0 || hit.TestImportedLen != 0 {
				continue
			}
			hit.Score /= float64(cnt)
		}

		// Restore descending Score order after the adjustment.
		byScoreDesc := func(i, j int) bool {
			return hits[i].Score > hits[j].Score
		}
		sortp.BubbleF(len(hits), byScoreDesc, swapHits)
		tr.LazyPrintf("Results reranked")
	}

	result := &SearchResult{
		TotalResults: len(hits),
		Hits:         hits,
	}
	return result, tokens, nil
}