Пример #1
0
// Index makes two passes over every partition of docDB and returns a
// TokenSetSearcher populated through indexAndSaveHits.
//
// The first pass records, per package, its imports and test imports, the set
// of projects each project imports, and the star count taken from the most
// recently updated document seen for each project. The second pass turns
// every document into a HitInfo carrying its importers, assigned star count,
// important README sentences and static scores. An index order over the hits
// is then computed by comparing StaticScore (greater first) and passed to
// indexAndSaveHits together with a callback that appends each hit to a const
// array created at path.Join(outDir, HitsArrFn).
//
// Any error from docDB, its iterators, the const array or indexAndSaveHits
// is returned as-is together with a nil searcher.
func Index(docDB mr.Input, outDir string) (*index.TokenSetSearcher, error) {
	DumpMemStats()

	partCount, err := docDB.PartCount()
	if err != nil {
		return nil, err
	}
	totalDocs := 0

	log.Printf("Generating importsDB ...")
	// package -> packages it imports
	importsDB := NewTokenIndexer("", "")
	// package -> packages its tests import
	testImportsDB := NewTokenIndexer("", "")
	// project -> projects it imports (through code or tests)
	prjImportsDB := NewTokenIndexer("", "")

	// starRecord is the star count of a project as reported by the document
	// with the given update time.
	type starRecord struct {
		StarCount   int
		LastUpdated time.Time
	}
	starsByProject := make(map[string]starRecord)

	// Pass 1: fill the import indexes and the per-project star table.
	for part := 0; part < partCount; part++ {
		iter, err := docDB.Iterator(part)
		if err != nil {
			return nil, err
		}
		var (
			pkgKey sophie.RawString
			info   DocInfo
		)
		for {
			if err := iter.Next(&pkgKey, &info); err != nil {
				if errorsp.Cause(err) != io.EOF {
					iter.Close()
					return nil, err
				}
				// End of this partition.
				break
			}
			filterDocInfo(&info)

			pkgPath := string(pkgKey)
			importsDB.PutTokens(pkgPath, stringsp.NewSet(info.Imports...))
			testImportsDB.PutTokens(pkgPath, stringsp.NewSet(info.TestImports...))

			// Projects referenced by this package, merged with whatever was
			// already recorded for the package's own project.
			var importedPrjs stringsp.Set
			for _, deps := range [][]string{info.Imports, info.TestImports} {
				for _, dep := range deps {
					importedPrjs.Add(FullProjectOfPackage(dep))
				}
			}
			project := FullProjectOfPackage(pkgPath)
			importedPrjs.Add(prjImportsDB.TokensOfId(project)...)
			prjImportsDB.PutTokens(project, importedPrjs)

			// Keep the stars reported by the newest document of the project.
			prev, seen := starsByProject[project]
			if !seen || info.LastUpdated.After(prev.LastUpdated) {
				starsByProject[project] = starRecord{
					StarCount:   info.StarCount,
					LastUpdated: info.LastUpdated,
				}
			}
			totalDocs++
		}
		iter.Close()
	}

	DumpMemStats()
	log.Printf("Making HitInfos ...")
	// Pass 2: build one HitInfo per document.
	hits := make([]HitInfo, 0, totalDocs)
	for part := 0; part < partCount; part++ {
		iter, err := docDB.Iterator(part)
		if err != nil {
			return nil, err
		}
		var (
			pkgKey sophie.RawString
			hit    HitInfo
		)
		for {
			if err := iter.Next(&pkgKey, &hit.DocInfo); err != nil {
				if errorsp.Cause(err) != io.EOF {
					iter.Close()
					return nil, err
				}
				break
			}
			filterDocInfo(&hit.DocInfo)

			hit.Imported = importsDB.IdsOfToken(hit.Package)
			hit.ImportedLen = len(hit.Imported)
			hit.TestImported = testImportsDB.IdsOfToken(hit.Package)
			hit.TestImportedLen = len(hit.TestImported)
			testOnlyImporters := excludeImports(testImportsDB.IdsOfToken(hit.Package), hit.Imported)

			project := FullProjectOfPackage(hit.Package)
			importerCnt := len(prjImportsDB.IdsOfToken(project))
			projectStars := float64(starsByProject[project].StarCount)

			stars := projectStars
			switch {
			case project == hit.Package:
				// The package is the project itself: it keeps every star.
			case importerCnt == 0:
				stars = 0
			default:
				// A sub-package gets a share proportional to the number of
				// distinct projects importing it.
				starsPerImporter := projectStars / float64(importerCnt)

				var importerPrjs stringsp.Set
				for _, importers := range [][]string{hit.Imported, hit.TestImported} {
					for _, importer := range importers {
						importerPrjs.Add(FullProjectOfPackage(importer))
					}
				}
				stars = starsPerImporter * float64(len(importerPrjs))
			}
			hit.AssignedStarCount = stars

			readmeText := ReadmeToText(hit.ReadmeFn, hit.ReadmeData)
			hit.ImportantSentences = ChooseImportantSentenses(readmeText,
				hit.Name, hit.Package)

			// The scores read the other fields of hit, so they are computed
			// last.
			hit.StaticScore = CalcStaticScore(&hit)
			hit.TestStaticScore = CalcTestStaticScore(&hit, testOnlyImporters)
			hits = append(hits, hit)
		}
		iter.Close()
	}

	DumpMemStats()
	// The import indexes are not read past this point.
	importsDB = nil
	testImportsDB = nil
	DumpMemStats()
	log.Printf("%d hits collected, sorting static-scores in descending order", len(hits))

	higherScoreFirst := func(a, b int) bool {
		return hits[a].StaticScore > hits[b].StaticScore
	}
	order := sortp.IndexSortF(len(hits), higherScoreFirst)

	searcher := &index.TokenSetSearcher{}
	DumpMemStats()
	log.Printf("Indexing %d packages to TokenSetSearcher ...", len(order))
	hitsArr, err := index.CreateConstArray(path.Join(outDir, HitsArrFn))
	if err != nil {
		return nil, err
	}
	defer hitsArr.Close()

	// saveHit appends one hit to the const array as a gob.
	saveHit := func(hit *HitInfo) error {
		_, err := hitsArr.AppendGob(*hit)
		return err
	}
	if err := indexAndSaveHits(searcher, hits, order, saveHit); err != nil {
		return nil, err
	}

	return searcher, nil
}
Пример #2
0
// Index makes two passes over every partition of docDB and returns a
// TokenSetSearcher holding one document per collected hit.
//
// The first pass records, per package, its imports and test imports, the set
// of projects each project imports, and the star count taken from the most
// recently updated document seen for each project. The second pass turns
// every document into a HitInfo carrying its importers, assigned star count,
// important README sentences and static scores.
//
// The hits are then visited in descending StaticScore order. Each hit gets a
// StaticRank equal to the position of the first hit with the same score, so
// ties share a rank. It is added to the searcher with three token fields:
// IndexTextField (name, package, description, README, author and exported
// identifiers), IndexNameField (name only) and IndexPkgField (the package
// path).
//
// Any error from docDB or its iterators is returned as-is together with a
// nil searcher.
func Index(docDB mr.Input) (*index.TokenSetSearcher, error) {
	DumpMemStats()

	docPartCnt, err := docDB.PartCount()
	if err != nil {
		return nil, err
	}
	docCount := 0

	log.Printf("Generating importsDB ...")
	importsDB := NewTokenIndexer("", "")
	testImportsDB := NewTokenIndexer("", "")
	// per project imported by projects
	prjImportsDB := NewTokenIndexer("", "")
	// projectStar is the star count of a project as reported by the document
	// with the given update time.
	type projectStar struct {
		StarCount   int
		LastUpdated time.Time
	}
	prjStars := make(map[string]projectStar)
	// generate importsDB
	for i := 0; i < docPartCnt; i++ {
		it, err := docDB.Iterator(i)
		if err != nil {
			return nil, err
		}

		var pkg sophie.RawString
		var docInfo DocInfo
		for {
			if err := it.Next(&pkg, &docInfo); err != nil {
				if err == sophie.EOF {
					// End of this partition.
					break
				}
				it.Close()
				return nil, err
			}
			filterDocInfo(&docInfo)

			importsDB.PutTokens(string(pkg), stringsp.NewSet(docInfo.Imports...))
			testImportsDB.PutTokens(string(pkg),
				stringsp.NewSet(docInfo.TestImports...))

			// Projects referenced by this package, merged with whatever was
			// already recorded for the package's own project.
			var projects stringsp.Set
			for _, imp := range docInfo.Imports {
				projects.Add(FullProjectOfPackage(imp))
			}
			for _, imp := range docInfo.TestImports {
				projects.Add(FullProjectOfPackage(imp))
			}
			prj := FullProjectOfPackage(string(pkg))
			orgProjects := prjImportsDB.TokensOfId(prj)
			projects.Add(orgProjects...)
			prjImportsDB.PutTokens(prj, projects)

			// update stars: keep the value from the newest document
			if cur, ok := prjStars[prj]; !ok ||
				docInfo.LastUpdated.After(cur.LastUpdated) {
				prjStars[prj] = projectStar{
					StarCount:   docInfo.StarCount,
					LastUpdated: docInfo.LastUpdated,
				}
			}

			docCount++
		}

		it.Close()
	}

	DumpMemStats()
	log.Printf("Making HitInfos ...")
	hits := make([]HitInfo, 0, docCount)
	for i := 0; i < docPartCnt; i++ {
		it, err := docDB.Iterator(i)
		if err != nil {
			return nil, err
		}

		var pkg sophie.RawString
		var hitInfo HitInfo
		for {
			if err := it.Next(&pkg, &hitInfo.DocInfo); err != nil {
				if err == sophie.EOF {
					break
				}
				it.Close()
				return nil, err
			}
			filterDocInfo(&hitInfo.DocInfo)

			hitInfo.Imported = importsDB.IdsOfToken(hitInfo.Package)
			hitInfo.TestImported = testImportsDB.IdsOfToken(hitInfo.Package)
			realTestImported := excludeImports(testImportsDB.IdsOfToken(hitInfo.Package), hitInfo.Imported)

			prj := FullProjectOfPackage(hitInfo.Package)
			impPrjsCnt := len(prjImportsDB.IdsOfToken(prj))
			// A package that is its own project keeps the full star count;
			// a sub-package gets a share proportional to the number of
			// distinct projects importing it.
			var assignedStarCount = float64(prjStars[prj].StarCount)
			if prj != hitInfo.Package {
				if impPrjsCnt == 0 {
					assignedStarCount = 0
				} else {
					perStarCount :=
						float64(prjStars[prj].StarCount) / float64(impPrjsCnt)

					var projects stringsp.Set
					for _, imp := range hitInfo.Imported {
						projects.Add(FullProjectOfPackage(imp))
					}
					for _, imp := range hitInfo.TestImported {
						projects.Add(FullProjectOfPackage(imp))
					}
					assignedStarCount = perStarCount * float64(len(projects))
				}
			}
			hitInfo.AssignedStarCount = assignedStarCount

			readme := ReadmeToText(hitInfo.ReadmeFn, hitInfo.ReadmeData)

			hitInfo.ImportantSentences = ChooseImportantSentenses(readme,
				hitInfo.Name, hitInfo.Package)
			// StaticScore is calculated after setting all other fields of
			// hitInfo
			hitInfo.StaticScore = CalcStaticScore(&hitInfo)
			hitInfo.TestStaticScore = CalcTestStaticScore(&hitInfo, realTestImported)
			hits = append(hits, hitInfo)
		}

		it.Close()
	}

	DumpMemStats()
	importsDB = nil
	DumpMemStats()
	log.Printf("%d hits collected, sorting static-scores in descending order",
		len(hits))
	// Sort a permutation of indexes rather than the hits themselves.
	idxs := make([]int, len(hits))
	for i := range idxs {
		idxs[i] = i
	}
	sortp.SortF(len(idxs), func(i, j int) bool {
		return hits[idxs[i]].StaticScore > hits[idxs[j]].StaticScore
	}, func(i, j int) {
		idxs[i], idxs[j] = idxs[j], idxs[i]
	})
	ts := &index.TokenSetSearcher{}

	DumpMemStats()
	log.Printf("Indexing %d packages to TokenSetSearcher ...", len(idxs))
	rank := 0
	for i := range idxs {
		hit := &hits[idxs[i]]
		// The rank only advances when the score drops, so equal scores
		// share the rank of the first hit with that score.
		if i > 0 && hit.StaticScore < hits[idxs[i-1]].StaticScore {
			rank = i
		}
		hit.StaticRank = rank

		var nameTokens stringsp.Set
		nameTokens = AppendTokens(nameTokens, []byte(hit.Name))

		var tokens stringsp.Set
		tokens.Add(nameTokens.Elements()...)
		tokens = AppendTokens(tokens, []byte(hit.Package))
		tokens = AppendTokens(tokens, []byte(hit.Description))
		tokens = AppendTokens(tokens, []byte(hit.ReadmeData))
		tokens = AppendTokens(tokens, []byte(hit.Author))
		for _, word := range hit.Exported {
			// Keep the returned set like every other call above does;
			// discarding it would presumably lose these tokens whenever
			// tokens is still nil here.
			tokens = AppendTokens(tokens, []byte(word))
		}

		ts.AddDoc(map[string]stringsp.Set{
			IndexTextField: tokens,
			IndexNameField: nameTokens,
			IndexPkgField:  stringsp.NewSet(hit.Package),
		}, *hit)
	}

	DumpMemStats()
	return ts, nil
}