func Index(docDB mr.Input, outDir string) (*index.TokenSetSearcher, error) { DumpMemStats() docPartCnt, err := docDB.PartCount() if err != nil { return nil, err } docCount := 0 log.Printf("Generating importsDB ...") importsDB := NewTokenIndexer("", "") testImportsDB := NewTokenIndexer("", "") // per project imported by projects prjImportsDB := NewTokenIndexer("", "") type projectStart struct { StarCount int LastUpdated time.Time } prjStars := make(map[string]projectStart) // generate importsDB for i := 0; i < docPartCnt; i++ { it, err := docDB.Iterator(i) if err != nil { return nil, err } var pkg sophie.RawString var docInfo DocInfo for { if err := it.Next(&pkg, &docInfo); err != nil { if errorsp.Cause(err) == io.EOF { break } it.Close() return nil, err } filterDocInfo(&docInfo) importsDB.PutTokens(string(pkg), stringsp.NewSet(docInfo.Imports...)) testImportsDB.PutTokens(string(pkg), stringsp.NewSet(docInfo.TestImports...)) var projects stringsp.Set for _, imp := range docInfo.Imports { projects.Add(FullProjectOfPackage(imp)) } for _, imp := range docInfo.TestImports { projects.Add(FullProjectOfPackage(imp)) } prj := FullProjectOfPackage(string(pkg)) orgProjects := prjImportsDB.TokensOfId(prj) projects.Add(orgProjects...) prjImportsDB.PutTokens(prj, projects) // update stars if cur, ok := prjStars[prj]; !ok || docInfo.LastUpdated.After(cur.LastUpdated) { prjStars[prj] = projectStart{ StarCount: docInfo.StarCount, LastUpdated: docInfo.LastUpdated, } } docCount++ } it.Close() } DumpMemStats() log.Printf("Making HitInfos ...") hits := make([]HitInfo, 0, docCount) for i := 0; i < docPartCnt; i++ { it, err := docDB.Iterator(i) if err != nil { return nil, err } var pkg sophie.RawString var hitInfo HitInfo for { if err := it.Next(&pkg, &hitInfo.DocInfo); err != nil { if errorsp.Cause(err) == io.EOF { break } it.Close() return nil, err } filterDocInfo(&hitInfo.DocInfo) hitInfo.Imported = importsDB.IdsOfToken(hitInfo.Package) hitInfo.ImportedLen = len(hitInfo.Imported) hitInfo.TestImported = testImportsDB.IdsOfToken(hitInfo.Package) hitInfo.TestImportedLen = len(hitInfo.TestImported) realTestImported := excludeImports(testImportsDB.IdsOfToken(hitInfo.Package), hitInfo.Imported) prj := FullProjectOfPackage(hitInfo.Package) impPrjsCnt := len(prjImportsDB.IdsOfToken(prj)) var assignedStarCount = float64(prjStars[prj].StarCount) if prj != hitInfo.Package { if impPrjsCnt == 0 { assignedStarCount = 0 } else { perStarCount := float64(prjStars[prj].StarCount) / float64(impPrjsCnt) var projects stringsp.Set for _, imp := range hitInfo.Imported { projects.Add(FullProjectOfPackage(imp)) } for _, imp := range hitInfo.TestImported { projects.Add(FullProjectOfPackage(imp)) } assignedStarCount = perStarCount * float64(len(projects)) } } hitInfo.AssignedStarCount = assignedStarCount readme := ReadmeToText(hitInfo.ReadmeFn, hitInfo.ReadmeData) hitInfo.ImportantSentences = ChooseImportantSentenses(readme, hitInfo.Name, hitInfo.Package) // StaticScore is calculated after setting all other fields of // hitInfo hitInfo.StaticScore = CalcStaticScore(&hitInfo) hitInfo.TestStaticScore = CalcTestStaticScore(&hitInfo, realTestImported) hits = append(hits, hitInfo) } it.Close() } DumpMemStats() importsDB = nil testImportsDB = nil DumpMemStats() log.Printf("%d hits collected, sorting static-scores in descending order", len(hits)) idxs := sortp.IndexSortF(len(hits), func(i, j int) bool { return hits[i].StaticScore > hits[j].StaticScore }) ts := &index.TokenSetSearcher{} DumpMemStats() log.Printf("Indexing %d packages to TokenSetSearcher ...", len(idxs)) hitsArr, err := index.CreateConstArray(path.Join(outDir, HitsArrFn)) if err != nil { return nil, err } defer hitsArr.Close() if err := indexAndSaveHits(ts, hits, idxs, func(hit *HitInfo) error { _, err := hitsArr.AppendGob(*hit) return err }); err != nil { return nil, err } return ts, nil }
func Index(docDB mr.Input) (*index.TokenSetSearcher, error) { DumpMemStats() docPartCnt, err := docDB.PartCount() if err != nil { return nil, err } docCount := 0 log.Printf("Generating importsDB ...") importsDB := NewTokenIndexer("", "") testImportsDB := NewTokenIndexer("", "") // per project imported by projects prjImportsDB := NewTokenIndexer("", "") prjStars := make(map[string]struct { StarCount int LastUpdated time.Time }) // generate importsDB for i := 0; i < docPartCnt; i++ { it, err := docDB.Iterator(i) if err != nil { return nil, err } var pkg sophie.RawString var docInfo DocInfo for { if err := it.Next(&pkg, &docInfo); err != nil { if err == sophie.EOF { break } it.Close() return nil, err } filterDocInfo(&docInfo) importsDB.PutTokens(string(pkg), stringsp.NewSet(docInfo.Imports...)) testImportsDB.PutTokens(string(pkg), stringsp.NewSet(docInfo.TestImports...)) var projects stringsp.Set for _, imp := range docInfo.Imports { projects.Add(FullProjectOfPackage(imp)) } for _, imp := range docInfo.TestImports { projects.Add(FullProjectOfPackage(imp)) } prj := FullProjectOfPackage(string(pkg)) orgProjects := prjImportsDB.TokensOfId(prj) projects.Add(orgProjects...) prjImportsDB.PutTokens(prj, projects) // update stars if cur, ok := prjStars[prj]; !ok || docInfo.LastUpdated.After(cur.LastUpdated) { prjStars[prj] = struct { StarCount int LastUpdated time.Time }{ docInfo.StarCount, docInfo.LastUpdated, } } docCount++ } it.Close() } DumpMemStats() log.Printf("Making HitInfos ...") hits := make([]HitInfo, 0, docCount) for i := 0; i < docPartCnt; i++ { it, err := docDB.Iterator(i) if err != nil { return nil, err } var pkg sophie.RawString var hitInfo HitInfo for { if err := it.Next(&pkg, &hitInfo.DocInfo); err != nil { if err == sophie.EOF { break } it.Close() return nil, err } filterDocInfo(&hitInfo.DocInfo) hitInfo.Imported = importsDB.IdsOfToken(hitInfo.Package) hitInfo.TestImported = testImportsDB.IdsOfToken(hitInfo.Package) realTestImported := excludeImports(testImportsDB.IdsOfToken(hitInfo.Package), hitInfo.Imported) prj := FullProjectOfPackage(hitInfo.Package) impPrjsCnt := len(prjImportsDB.IdsOfToken(prj)) var assignedStarCount = float64(prjStars[prj].StarCount) if prj != hitInfo.Package { if impPrjsCnt == 0 { assignedStarCount = 0 } else { perStarCount := float64(prjStars[prj].StarCount) / float64(impPrjsCnt) var projects stringsp.Set for _, imp := range hitInfo.Imported { projects.Add(FullProjectOfPackage(imp)) } for _, imp := range hitInfo.TestImported { projects.Add(FullProjectOfPackage(imp)) } assignedStarCount = perStarCount * float64(len(projects)) } } hitInfo.AssignedStarCount = assignedStarCount readme := ReadmeToText(hitInfo.ReadmeFn, hitInfo.ReadmeData) hitInfo.ImportantSentences = ChooseImportantSentenses(readme, hitInfo.Name, hitInfo.Package) // StaticScore is calculated after setting all other fields of // hitInfo hitInfo.StaticScore = CalcStaticScore(&hitInfo) hitInfo.TestStaticScore = CalcTestStaticScore(&hitInfo, realTestImported) hits = append(hits, hitInfo) } it.Close() } DumpMemStats() importsDB = nil DumpMemStats() log.Printf("%d hits collected, sorting static-scores in descending order", len(hits)) idxs := make([]int, len(hits)) for i := range idxs { idxs[i] = i } sortp.SortF(len(idxs), func(i, j int) bool { return hits[idxs[i]].StaticScore > hits[idxs[j]].StaticScore }, func(i, j int) { idxs[i], idxs[j] = idxs[j], idxs[i] }) ts := &index.TokenSetSearcher{} DumpMemStats() log.Printf("Indexing %d packages to TokenSetSearcher ...", len(idxs)) rank := 0 for i := range idxs { hit := &hits[idxs[i]] if i > 0 && hit.StaticScore < hits[idxs[i-1]].StaticScore { rank = i } hit.StaticRank = rank var nameTokens stringsp.Set nameTokens = AppendTokens(nameTokens, []byte(hit.Name)) var tokens stringsp.Set tokens.Add(nameTokens.Elements()...) tokens = AppendTokens(tokens, []byte(hit.Package)) tokens = AppendTokens(tokens, []byte(hit.Description)) tokens = AppendTokens(tokens, []byte(hit.ReadmeData)) tokens = AppendTokens(tokens, []byte(hit.Author)) for _, word := range hit.Exported { AppendTokens(tokens, []byte(word)) } ts.AddDoc(map[string]stringsp.Set{ IndexTextField: tokens, IndexNameField: nameTokens, IndexPkgField: stringsp.NewSet(hit.Package), }, *hit) } DumpMemStats() return ts, nil }