Example #1
0
func doIndex() bool {
	idxSegm, err := gcse.IndexSegments.GenMaxSegment()
	if err != nil {
		log.Printf("GenMaxSegment failed: %v", err)
		return false
	}

	runtime.GC()
	gcse.DumpMemStats()

	log.Printf("Indexing to %v ...", idxSegm)

	fpDocDB := sophie.LocalFsPath(configs.DocsDBPath().S())
	ts, err := gcse.Index(kv.DirInput(fpDocDB), idxSegm.Join("").S())
	if err != nil {
		log.Printf("Indexing failed: %v", err)
		return false
	}

	if !func() bool {
		f, err := idxSegm.Join(gcse.IndexFn).Create()
		if err != nil {
			log.Printf("Create index file failed: %v", err)
			return false
		}
		defer f.Close()

		log.Printf("Saving index to %v ...", idxSegm)
		if err := ts.Save(f); err != nil {
			log.Printf("ts.Save failed: %v", err)
			return false
		}
		return true
	}() {
		return false
	}
	runtime.GC()
	gcse.DumpMemStats()

	storePath := idxSegm.Join(configs.FnStore)
	log.Printf("Saving store snapshot to %v", storePath)
	if err := store.SaveSnapshot(storePath.S()); err != nil {
		log.Printf("SaveSnapshot %v failed: %v", storePath, err)
	}

	if err := idxSegm.Done(); err != nil {
		log.Printf("segm.Done failed: %v", err)
		return false
	}

	log.Printf("Indexing success: %s (%d)", idxSegm, ts.DocCount())
	gcse.AddBiValueAndProcess(bi.Average, "index.doc-count", ts.DocCount())

	ts = nil
	gcse.DumpMemStats()
	runtime.GC()
	gcse.DumpMemStats()

	return true
}
Example #2
0
func main() {
	log.Println("Running tocrawl tool, to generate crawling list")
	log.Println("NonCrawlHosts: ", configs.NonCrawlHosts)
	log.Println("CrawlGithubUpdate: ", configs.CrawlGithubUpdate)
	log.Println("CrawlByGodocApi: ", configs.CrawlByGodocApi)

	log.Printf("Using personal: %v", configs.CrawlerGithubPersonal)
	gcse.GithubSpider = github.NewSpiderWithToken(configs.CrawlerGithubPersonal)

	// Load CrawlerDB
	cDB = gcse.LoadCrawlerDB()

	if configs.CrawlGithubUpdate || configs.CrawlByGodocApi {
		// load pkgUTs
		pkgUTs, err := loadPackageUpdateTimes(
			sophie.LocalFsPath(configs.DocsDBPath().S()))
		if err != nil {
			log.Fatalf("loadPackageUpdateTimes failed: %v", err)
		}

		if configs.CrawlGithubUpdate {
			touchByGithubUpdates(pkgUTs)
		}

		if configs.CrawlByGodocApi {
			httpClient := gcse.GenHttpClient("")
			pkgs, err := gcse.FetchAllPackagesInGodoc(httpClient)
			if err != nil {
				log.Fatalf("FetchAllPackagesInGodoc failed: %v", err)
			}
			gcse.AddBiValueAndProcess(bi.Max, "godoc.doc-count", len(pkgs))
			log.Printf("FetchAllPackagesInGodoc returns %d entries", len(pkgs))
			now := time.Now()
			for _, pkg := range pkgs {
				cDB.AppendPackage(pkg, func(pkg string) bool {
					_, ok := pkgUTs[pkg]
					return ok
				})
				site, path := utils.SplitPackage(pkg)
				if err := store.AppendPackageEvent(site, path, "godoc", now, sppb.HistoryEvent_Action_None); err != nil {
					log.Printf("UpdatePackageHistory %s %s failed: %v", site, path, err)
				}
			}
		}
		syncDatabases()
	}

	log.Printf("Package DB: %d entries", cDB.PackageDB.Count())
	log.Printf("Person DB: %d entries", cDB.PersonDB.Count())

	pathToCrawl := configs.DataRoot.Join(configs.FnToCrawl)

	kvPackage := kv.DirOutput(sophie.LocalFsPath(
		pathToCrawl.Join(configs.FnPackage).S()))
	kvPackage.Clean()
	if err := generateCrawlEntries(cDB.PackageDB, gcse.HostOfPackage, kvPackage); err != nil {
		log.Fatalf("generateCrawlEntries %v failed: %v", kvPackage.Path, err)
	}

	kvPerson := kv.DirOutput(sophie.LocalFsPath(
		pathToCrawl.Join(configs.FnPerson).S()))

	kvPerson.Clean()
	if err := generateCrawlEntries(cDB.PersonDB, func(id string) string {
		site, _ := gcse.ParsePersonId(id)
		return site
	}, kvPerson); err != nil {
		log.Fatalf("generateCrawlEntries %v failed: %v", kvPerson.Path, err)
	}
}