// generateCrawlEntries iterates the crawling database, groups entries that are
// due for crawling by host, and streams each host's entries to its own collector.
func generateCrawlEntries(db *gcse.MemDB, hostFromID func(id string) string, out kv.DirOutput) error {
	now := time.Now()
	groups := make(map[string]sophie.CollectCloser)
	count := 0
	if err := db.Iterate(func(id string, val interface{}) error {
		ent, ok := val.(gcse.CrawlingEntry)
		if !ok {
			// Log val, not ent: ent is the zero value after a failed assertion.
			log.Printf("Wrong entry: %+v", val)
			return nil
		}
		if ent.Version == gcse.CrawlerVersion && ent.ScheduleTime.After(now) {
			// Up-to-date entry not yet due for recrawling.
			return nil
		}
		host := hostFromID(id)
		// Check the host blacklist.
		if gcse.NonCrawlHosts.In(host) {
			return nil
		}
		// Lazily open one collector per host, indexed in discovery order.
		c, ok := groups[host]
		if !ok {
			index := len(groups)
			var err error
			c, err = out.Collector(index)
			if err != nil {
				return err
			}
			groups[host] = c
		}
		if rand.Intn(10) == 0 {
			// Randomly clear the Etag to force a full fetch (refreshes stars).
			ent.Etag = ""
		}
		count++
		return c.Collect(sophie.RawString(id), &ent)
	}); err != nil {
		return err
	}
	for _, c := range groups {
		c.Close()
	}
	log.Printf("%d entries to crawl for folder %v", count, out.Path)
	return nil
}
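// The lazy one-collector-per-host pattern above can be isolated as below.
// This is a hedged sketch with hypothetical types (collector, shardByHost),
// not the sophie API: it only illustrates opening an output shard the first
// time a host is seen and reusing it afterwards.
type collector interface {
	collect(id string) error
	close() error
}

func shardByHost(ids []string, hostOf func(string) string, open func(index int) (collector, error)) error {
	groups := make(map[string]collector)
	for _, id := range ids {
		host := hostOf(id)
		c, ok := groups[host]
		if !ok {
			var err error
			// A new host gets the next shard index, in discovery order.
			if c, err = open(len(groups)); err != nil {
				return err
			}
			groups[host] = c
		}
		if err := c.collect(id); err != nil {
			return err
		}
	}
	for _, c := range groups {
		c.close()
	}
	return nil
}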
// listCrawlEntriesByHost collects entries that are due for crawling, grouped
// by host. maxHosts limits the number of hosts and numPerHost limits the
// entries per host; zero means no limit for either.
func listCrawlEntriesByHost(db *gcse.MemDB, hostFromID func(id string) string, maxHosts, numPerHost int) (groups map[string][]EntryInfo) {
	now := time.Now()
	groups = make(map[string][]EntryInfo)
	fullGroups := 0
	db.Iterate(func(id string, val interface{}) error {
		ent, ok := val.(gcse.CrawlingEntry)
		if !ok {
			return nil
		}
		if ent.ScheduleTime.After(now) {
			return nil
		}
		host := hostFromID(id)
		entryInfos := groups[host]
		if maxHosts > 0 {
			// Check the host limit.
			if len(entryInfos) == 0 && len(groups) == maxHosts {
				// No quota for a new group.
				return nil
			}
		}
		if numPerHost > 0 {
			// Check the per-host limit.
			if len(entryInfos) == numPerHost-1 {
				// This group is about to be full; count it.
				fullGroups++
			} else if len(entryInfos) == numPerHost {
				// No quota left for this group.
				return nil
			}
		}
		etag := ent.Etag
		if ent.Version != gcse.CrawlerVersion {
			// Entries from an older crawler version are refetched in full.
			etag = ""
		}
		groups[host] = append(entryInfos, EntryInfo{
			ID:   id,
			Etag: etag,
		})
		// Guard on maxHosts > 0: otherwise fullGroups == maxHosts holds
		// immediately when both are zero. errStop is a sentinel error that
		// aborts the iteration early once all allowed groups are full.
		if maxHosts > 0 && fullGroups == maxHosts {
			return errStop
		}
		return nil
	})
	return groups
}
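// A hedged usage sketch (exampleListByHost and its inline hostFromID are
// illustrative, not from the original source): list at most 5 hosts with at
// most 100 due entries each, assuming IDs look like "github.com/user/repo".
func exampleListByHost(db *gcse.MemDB) {
	hostFromID := func(id string) string {
		if i := strings.Index(id, "/"); i >= 0 {
			return id[:i]
		}
		return id
	}
	groups := listCrawlEntriesByHost(db, hostFromID, 5, 100)
	for host, infos := range groups {
		log.Printf("%s: %d entries due for crawling", host, len(infos))
	}
}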
// This variant of generateCrawlEntries buffers the due entries per host,
// sorts each host's entries by schedule time, writes each group to its own
// collector, and reports per-host age statistics.
func generateCrawlEntries(db *gcse.MemDB, hostFromID func(id string) string, out kv.DirOutput) error {
	now := time.Now()
	type idAndCrawlingEntry struct {
		id  string
		ent *gcse.CrawlingEntry
	}
	groups := make(map[string][]idAndCrawlingEntry)
	count := 0
	type nameAndAges struct {
		maxName     string
		maxAge      time.Duration
		sumAgeHours float64
		cnt         int
	}
	ages := make(map[string]nameAndAges)
	if err := db.Iterate(func(id string, val interface{}) error {
		ent, ok := val.(gcse.CrawlingEntry)
		if !ok {
			// Log val, not ent: ent is the zero value after a failed assertion.
			log.Printf("Wrong entry: %+v", val)
			return nil
		}
		if ent.Version == gcse.CrawlerVersion && ent.ScheduleTime.After(now) {
			return nil
		}
		host := hostFromID(id)
		// Check the host blacklist.
		if configs.NonCrawlHosts.Contain(host) {
			return nil
		}
		if rand.Intn(10) == 0 {
			// Randomly clear the Etag to force a full fetch (refreshes stars).
			ent.Etag = ""
		}
		groups[host] = append(groups[host], idAndCrawlingEntry{id, &ent})

		// Track the oldest entry and the age sum per host for statistics.
		age := now.Sub(ent.ScheduleTime)
		na := ages[host]
		if age > na.maxAge {
			na.maxName, na.maxAge = id, age
		}
		na.sumAgeHours += age.Hours()
		na.cnt++
		ages[host] = na

		count++
		return nil
	}); err != nil {
		return errorsp.WithStacks(err)
	}
	index := 0
	for _, g := range groups {
		// Oldest schedule time first.
		sortp.SortF(len(g), func(i, j int) bool {
			return g[i].ent.ScheduleTime.Before(g[j].ent.ScheduleTime)
		}, func(i, j int) {
			g[i], g[j] = g[j], g[i]
		})
		if err := func(index int, ies []idAndCrawlingEntry) error {
			c, err := out.Collector(index)
			if err != nil {
				return err
			}
			defer c.Close()
			for _, ie := range ies {
				if err := c.Collect(sophie.RawString(ie.id), ie.ent); err != nil {
					return err
				}
			}
			return nil
		}(index, g); err != nil {
			log.Printf("Saving ents failed: %v", err)
		}
		index++
	}
	for host, na := range ages {
		aveAge := time.Duration(na.sumAgeHours / float64(na.cnt) * float64(time.Hour))
		log.Printf("%s age: max -> %v(%s), ave -> %v", host, na.maxAge, na.maxName, aveAge)
		if host == "github.com" && strings.Contains(out.Path, configs.FnPackage) {
			gcse.AddBiValueAndProcess(bi.Average, "crawler.github_max_age.hours", int(na.maxAge.Hours()))
			gcse.AddBiValueAndProcess(bi.Average, "crawler.github_max_age.days", int(na.maxAge/timep.Day))
			gcse.AddBiValueAndProcess(bi.Average, "crawler.github_ave_age.hours", int(aveAge.Hours()))
			gcse.AddBiValueAndProcess(bi.Average, "crawler.github_ave_age.days", int(aveAge/timep.Day))
		}
	}
	log.Printf("%d entries to crawl for folder %v", count, out.Path)
	return nil
}
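// The per-host age bookkeeping above reduces to a running max plus a running
// sum/count. A minimal standalone sketch (ageStats is illustrative, not from
// the original source), using only the standard library:
func ageStats(schedules map[string]time.Time, now time.Time) (maxAge, aveAge time.Duration) {
	var sumHours float64
	for _, t := range schedules {
		age := now.Sub(t)
		if age > maxAge {
			maxAge = age
		}
		sumHours += age.Hours()
	}
	if len(schedules) > 0 {
		aveAge = time.Duration(sumHours / float64(len(schedules)) * float64(time.Hour))
	}
	return maxAge, aveAge
}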