// doIndex generates the next index segment from the docs DB, saves the
// index and a store snapshot into it, and marks the segment done.
func doIndex() bool {
	idxSegm, err := gcse.IndexSegments.GenMaxSegment()
	if err != nil {
		log.Printf("GenMaxSegment failed: %v", err)
		return false
	}

	runtime.GC()
	gcse.DumpMemStats()

	log.Printf("Indexing to %v ...", idxSegm)
	fpDocDB := sophie.LocalFsPath(configs.DocsDBPath().S())
	ts, err := gcse.Index(kv.DirInput(fpDocDB), idxSegm.Join("").S())
	if err != nil {
		log.Printf("Indexing failed: %v", err)
		return false
	}

	if !func() bool {
		f, err := idxSegm.Join(gcse.IndexFn).Create()
		if err != nil {
			log.Printf("Create index file failed: %v", err)
			return false
		}
		defer f.Close()

		log.Printf("Saving index to %v ...", idxSegm)
		if err := ts.Save(f); err != nil {
			log.Printf("ts.Save failed: %v", err)
			return false
		}
		return true
	}() {
		return false
	}
	runtime.GC()
	gcse.DumpMemStats()

	storePath := idxSegm.Join(configs.FnStore)
	log.Printf("Saving store snapshot to %v", storePath)
	if err := store.SaveSnapshot(storePath.S()); err != nil {
		log.Printf("SaveSnapshot %v failed: %v", storePath, err)
	}

	if err := idxSegm.Done(); err != nil {
		log.Printf("segm.Done failed: %v", err)
		return false
	}

	log.Printf("Indexing success: %s (%d)", idxSegm, ts.DocCount())
	gcse.AddBiValueAndProcess(bi.Average, "index.doc-count", ts.DocCount())

	// Release the index before the final GC so its memory can be reclaimed.
	ts = nil
	gcse.DumpMemStats()
	runtime.GC()
	gcse.DumpMemStats()
	return true
}
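// A minimal sketch, not part of the original tool: one way doIndex could be
// driven on a fixed cadence. The name indexLoop and the one-hour interval are
// hypothetical; the real scheduler may differ.
func indexLoop() {
	for {
		if !doIndex() {
			log.Printf("doIndex failed; retrying next cycle")
		}
		time.Sleep(time.Hour) // hypothetical cadence
	}
}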
// generateCrawlEntries scans a crawler database, groups the entries that are
// due for crawling by host, and writes each group to the kv output. It also
// logs per-host age statistics of the scheduled entries.
func generateCrawlEntries(db *gcse.MemDB, hostFromID func(id string) string, out kv.DirOutput) error {
	now := time.Now()
	type idAndCrawlingEntry struct {
		id  string
		ent *gcse.CrawlingEntry
	}
	groups := make(map[string][]idAndCrawlingEntry)
	count := 0
	type nameAndAges struct {
		maxName     string
		maxAge      time.Duration
		sumAgeHours float64
		cnt         int
	}
	ages := make(map[string]nameAndAges)
	if err := db.Iterate(func(id string, val interface{}) error {
		ent, ok := val.(gcse.CrawlingEntry)
		if !ok {
			log.Printf("Wrong entry: %+v", val)
			return nil
		}
		if ent.Version == gcse.CrawlerVersion && ent.ScheduleTime.After(now) {
			return nil
		}
		host := hostFromID(id)
		// Check the host blacklist.
		if configs.NonCrawlHosts.Contain(host) {
			return nil
		}
		if rand.Intn(10) == 0 {
			// Randomly clear the Etag to force a full fetch (e.g. to refresh stars).
			ent.Etag = ""
		}
		groups[host] = append(groups[host], idAndCrawlingEntry{id, &ent})

		age := now.Sub(ent.ScheduleTime)
		na := ages[host]
		if age > na.maxAge {
			na.maxName, na.maxAge = id, age
		}
		na.sumAgeHours += age.Hours()
		na.cnt++
		ages[host] = na

		count++
		return nil
	}); err != nil {
		return errorsp.WithStacks(err)
	}
	index := 0
	for _, g := range groups {
		// Within each host, crawl the longest-overdue entries first.
		sortp.SortF(len(g), func(i, j int) bool {
			return g[i].ent.ScheduleTime.Before(g[j].ent.ScheduleTime)
		}, func(i, j int) {
			g[i], g[j] = g[j], g[i]
		})
		if err := func(index int, ies []idAndCrawlingEntry) error {
			c, err := out.Collector(index)
			if err != nil {
				return err
			}
			defer c.Close()

			for _, ie := range ies {
				if err := c.Collect(sophie.RawString(ie.id), ie.ent); err != nil {
					return err
				}
			}
			return nil
		}(index, g); err != nil {
			log.Printf("Saving ents failed: %v", err)
		}
		index++
	}
	for host, na := range ages {
		aveAge := time.Duration(na.sumAgeHours / float64(na.cnt) * float64(time.Hour))
		log.Printf("%s age: max -> %v(%s), ave -> %v", host, na.maxAge, na.maxName, aveAge)
		if host == "github.com" && strings.Contains(out.Path, configs.FnPackage) {
			gcse.AddBiValueAndProcess(bi.Average, "crawler.github_max_age.hours", int(na.maxAge.Hours()))
			gcse.AddBiValueAndProcess(bi.Average, "crawler.github_max_age.days", int(na.maxAge/timep.Day))
			gcse.AddBiValueAndProcess(bi.Average, "crawler.github_ave_age.hours", int(aveAge.Hours()))
			gcse.AddBiValueAndProcess(bi.Average, "crawler.github_ave_age.days", int(aveAge/timep.Day))
		}
	}
	log.Printf("%d entries to crawl for folder %v", count, out.Path)
	return nil
}
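// Hypothetical illustration of the hostFromID contract assumed above: the
// callback maps an entry ID to its host. For package IDs like
// "github.com/daviddengcn/gcse" that is the first path segment; in main
// below, gcse.HostOfPackage plays this role for packages.
func exampleHostFromID(id string) string {
	if i := strings.Index(id, "/"); i >= 0 {
		return id[:i]
	}
	return id
}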
func main() {
	log.Println("Running tocrawl tool, to generate crawling list")
	log.Println("NonCrawlHosts: ", configs.NonCrawlHosts)
	log.Println("CrawlGithubUpdate: ", configs.CrawlGithubUpdate)
	log.Println("CrawlByGodocApi: ", configs.CrawlByGodocApi)
	log.Printf("Using personal: %v", configs.CrawlerGithubPersonal)
	gcse.GithubSpider = github.NewSpiderWithToken(configs.CrawlerGithubPersonal)

	// Load the crawler database.
	cDB = gcse.LoadCrawlerDB()

	if configs.CrawlGithubUpdate || configs.CrawlByGodocApi {
		// Load the package update times from the docs DB.
		pkgUTs, err := loadPackageUpdateTimes(
			sophie.LocalFsPath(configs.DocsDBPath().S()))
		if err != nil {
			log.Fatalf("loadPackageUpdateTimes failed: %v", err)
		}

		if configs.CrawlGithubUpdate {
			touchByGithubUpdates(pkgUTs)
		}

		if configs.CrawlByGodocApi {
			httpClient := gcse.GenHttpClient("")
			pkgs, err := gcse.FetchAllPackagesInGodoc(httpClient)
			if err != nil {
				log.Fatalf("FetchAllPackagesInGodoc failed: %v", err)
			}
			gcse.AddBiValueAndProcess(bi.Max, "godoc.doc-count", len(pkgs))
			log.Printf("FetchAllPackagesInGodoc returns %d entries", len(pkgs))
			now := time.Now()
			for _, pkg := range pkgs {
				// Only append packages whose update time is already known.
				cDB.AppendPackage(pkg, func(pkg string) bool {
					_, ok := pkgUTs[pkg]
					return ok
				})
				site, path := utils.SplitPackage(pkg)
				if err := store.AppendPackageEvent(site, path, "godoc", now, sppb.HistoryEvent_Action_None); err != nil {
					log.Printf("AppendPackageEvent %s %s failed: %v", site, path, err)
				}
			}
		}
		syncDatabases()
	}

	log.Printf("Package DB: %d entries", cDB.PackageDB.Count())
	log.Printf("Person DB: %d entries", cDB.PersonDB.Count())

	pathToCrawl := configs.DataRoot.Join(configs.FnToCrawl)

	kvPackage := kv.DirOutput(sophie.LocalFsPath(
		pathToCrawl.Join(configs.FnPackage).S()))
	kvPackage.Clean()
	if err := generateCrawlEntries(cDB.PackageDB, gcse.HostOfPackage, kvPackage); err != nil {
		log.Fatalf("generateCrawlEntries %v failed: %v", kvPackage.Path, err)
	}

	kvPerson := kv.DirOutput(sophie.LocalFsPath(
		pathToCrawl.Join(configs.FnPerson).S()))
	kvPerson.Clean()
	if err := generateCrawlEntries(cDB.PersonDB, func(id string) string {
		site, _ := gcse.ParsePersonId(id)
		return site
	}, kvPerson); err != nil {
		log.Fatalf("generateCrawlEntries %v failed: %v", kvPerson.Path, err)
	}
}
// loadIndex finds the newest finished index segment and, if it is newer than
// the one currently served, loads it and swaps it in atomically.
func loadIndex() error {
	segm, err := gcse.IndexSegments.FindMaxDone()
	if segm == nil || err != nil {
		return err
	}
	if indexSegment != nil && !gcse.SegmentLess(indexSegment, segm) {
		// no new index
		return nil
	}
	db := &searcherDB{}
	if err := func() error {
		f, err := segm.Join(gcse.IndexFn).Open()
		if err != nil {
			return err
		}
		defer f.Close()

		return db.ts.Load(f)
	}(); err != nil {
		return err
	}
	db.storeDB = &bh.RefCountBox{
		DataPath: func() string {
			return segm.Join(configs.FnStore).S()
		},
	}
	hitsPath := segm.Join(gcse.HitsArrFn)
	if db.hits, err = index.OpenConstArray(hitsPath.S()); err != nil {
		log.Printf("OpenConstArray %v failed: %v", hitsPath, err)
		return err
	}
	// Calculate db.projectCount
	var projects stringsp.Set
	db.ts.Search(nil, func(docID int32, data interface{}) error {
		hit := data.(gcse.HitInfo)
		projects.Add(hit.ProjectURL)
		return nil
	})
	db.projectCount = len(projects)
	gcse.AddBiValueAndProcess(bi.Max, "index.proj-count", db.projectCount)

	// Update db.indexUpdated
	db.indexUpdated = time.Now()
	if st, err := segm.Join(gcse.IndexFn).Stat(); err == nil {
		db.indexUpdated = st.ModTime()
	}

	indexSegment = segm
	log.Printf("Load index from %v (%d packages)", segm, db.PackageCount())

	// Exchange new/old database and close the old one.
	oldDB := getDatabase()
	databaseValue.Store(db)
	oldDB.Close()
	oldDB = nil

	gcse.DumpMemStats()
	runtime.GC()
	gcse.DumpMemStats()
	return nil
}
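// A minimal sketch, not in the original server: loadIndex returns early when
// no newer segment is done, so it can safely be polled on a timer to pick up
// segments as the indexer finishes them. The name reloadLoop and the
// one-minute interval are hypothetical.
func reloadLoop() {
	for range time.Tick(time.Minute) {
		if err := loadIndex(); err != nil {
			log.Printf("loadIndex failed: %v", err)
		}
	}
}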