Example #1
func generateCrawlEntries(db *gcse.MemDB, hostFromID func(id string) string,
	out kv.DirOutput) error {
	now := time.Now()
	groups := make(map[string]sophie.CollectCloser)
	count := 0
	if err := db.Iterate(func(id string, val interface{}) error {
		ent, ok := val.(gcse.CrawlingEntry)
		if !ok {
			log.Printf("Wrong entry: %+v", val)
			return nil
		}

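		// skip entries already crawled with the current crawler version and not yet due for re-crawl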
		if ent.Version == gcse.CrawlerVersion &&
			ent.ScheduleTime.After(now) {
			return nil
		}

		host := hostFromID(id)

		// check host black list
		if gcse.NonCrawlHosts.In(host) {
			return nil
		}

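		// lazily create one output collector per host; each new host gets the next index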
		c, ok := groups[host]
		if !ok {
			index := len(groups)
			var err error
			c, err = out.Collector(index)
			if err != nil {
				return err
			}
			groups[host] = c
		}

		if rand.Intn(10) == 0 {
			// randomly set Etag to empty to fetch stars
			ent.Etag = ""
		}

		count++
		return c.Collect(sophie.RawString(id), &ent)
	}); err != nil {
		return err
	}

	for _, c := range groups {
		c.Close()
	}

	log.Printf("%d entries to crawl for folder %v", count, out.Path)
	return nil
}
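All three examples take a hostFromID callback that maps a package ID to its host, which is then used for black-listing and per-host grouping. Below is a minimal, self-contained sketch of what such a callback might look like, assuming IDs are import paths whose first path segment is the host; this implementation is an illustration, not the project's actual helper.

package main

import (
	"fmt"
	"strings"
)

// hostFromID returns the host part of a package ID, assuming the ID is an
// import path such as "github.com/daviddengcn/gcse" whose first path segment
// is the host. Illustrative only; the real callback passed to
// generateCrawlEntries is not shown in the examples above.
func hostFromID(id string) string {
	if i := strings.Index(id, "/"); i >= 0 {
		return id[:i]
	}
	return id
}

func main() {
	fmt.Println(hostFromID("github.com/daviddengcn/gcse")) // github.com
	fmt.Println(hostFromID("golang.org/x/tools"))          // golang.org
}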
Example #2
func listCrawlEntriesByHost(db *gcse.MemDB, hostFromID func(id string) string,
	maxHosts, numPerHost int) (groups map[string][]EntryInfo) {
	now := time.Now()
	groups = make(map[string][]EntryInfo)
	fullGroups := 0
	db.Iterate(func(id string, val interface{}) error {
		ent, ok := val.(CrawlingEntry)
		if !ok {
			return nil
		}

		if ent.ScheduleTime.After(now) {
			return nil
		}

		host := hostFromID(id)
		entryInfos := groups[host]
		if maxHosts > 0 {
			// check host limit
			if len(entryInfos) == 0 && len(groups) == maxHosts {
				// no quota for new group
				return nil
			}
		}
		if numPerHost > 0 {
			// check per host limit
			if len(entryInfos) == numPerHost-1 {
				// this group is about to be full, count it
				fullGroups++
			} else if len(entryInfos) == numPerHost {
				// no quota for this group
				return nil
			}
		}

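		// force a full re-fetch (empty Etag) for entries saved by an older crawler version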
		etag := ent.Etag
		if ent.Version != gcse.CrawlerVersion {
			etag = ""
		}
		groups[host] = append(entryInfos, EntryInfo{
			ID:   id,
			Etag: etag,
		})

		if fullGroups == maxHosts {
			return errStop
		}
		return nil
	})

	return groups
}
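Example #2 stops the scan early by returning a sentinel error (errStop) from the iteration callback once maxHosts groups are full; the return value of db.Iterate is then deliberately ignored. The following is a self-contained sketch of that early-stop pattern, using a stand-in iterate helper instead of gcse.MemDB (the helper and data here are hypothetical).

package main

import (
	"errors"
	"fmt"
)

// errStop is a sentinel error returned by the callback to abort iteration
// early; the caller treats it as "done" rather than as a failure.
var errStop = errors.New("stop iteration")

// iterate is a stand-in for something like MemDB.Iterate: it invokes fn for
// every key/value pair and stops at the first non-nil error.
func iterate(data map[string]int, fn func(k string, v int) error) error {
	for k, v := range data {
		if err := fn(k, v); err != nil {
			return err
		}
	}
	return nil
}

func main() {
	data := map[string]int{"a": 1, "b": 2, "c": 3, "d": 4}
	collected := 0
	if err := iterate(data, func(k string, v int) error {
		collected++
		if collected == 2 {
			return errStop // enough entries collected, abort the scan
		}
		return nil
	}); err != nil && err != errStop {
		fmt.Println("iteration failed:", err)
	}
	fmt.Println("collected:", collected) // collected: 2
}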
Example #3
func generateCrawlEntries(db *gcse.MemDB, hostFromID func(id string) string, out kv.DirOutput) error {
	now := time.Now()
	type idAndCrawlingEntry struct {
		id  string
		ent *gcse.CrawlingEntry
	}
	groups := make(map[string][]idAndCrawlingEntry)
	count := 0
	type nameAndAges struct {
		maxName string
		maxAge  time.Duration

		sumAgeHours float64
		cnt         int
	}
	ages := make(map[string]nameAndAges)
	if err := db.Iterate(func(id string, val interface{}) error {
		ent, ok := val.(gcse.CrawlingEntry)
		if !ok {
		log.Printf("Wrong entry: %+v", val)
			return nil
		}
		if ent.Version == gcse.CrawlerVersion && ent.ScheduleTime.After(now) {
			return nil
		}
		host := hostFromID(id)

		// check host black list
		if configs.NonCrawlHosts.Contain(host) {
			return nil
		}
		if rand.Intn(10) == 0 {
			// randomly set Etag to empty to fetch stars
			ent.Etag = ""
		}
		groups[host] = append(groups[host], idAndCrawlingEntry{id, &ent})

		age := now.Sub(ent.ScheduleTime)
		na := ages[host]
		if age > na.maxAge {
			na.maxName, na.maxAge = id, age
		}
		na.sumAgeHours += age.Hours()
		na.cnt++
		ages[host] = na

		count++
		return nil
	}); err != nil {
		return errorsp.WithStacks(err)
	}
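	// write each host's group to its own collector, sorted by ScheduleTime (oldest first)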
	index := 0
	for _, g := range groups {
		sortp.SortF(len(g), func(i, j int) bool {
			return g[i].ent.ScheduleTime.Before(g[j].ent.ScheduleTime)
		}, func(i, j int) {
			g[i], g[j] = g[j], g[i]
		})
		if err := func(index int, ies []idAndCrawlingEntry) error {
			c, err := out.Collector(index)
			if err != nil {
				return err
			}
			defer c.Close()

			for _, ie := range ies {
				if err := c.Collect(sophie.RawString(ie.id), ie.ent); err != nil {
					return err
				}
			}
			return nil
		}(index, g); err != nil {
			log.Printf("Saving ents failed: %v", err)
		}
		index++
	}
	for host, na := range ages {
		aveAge := time.Duration(na.sumAgeHours / float64(na.cnt) * float64(time.Hour))
		log.Printf("%s age: max -> %v(%s), ave -> %v", host, na.maxAge, na.maxName, aveAge)
		if host == "github.com" && strings.Contains(out.Path, configs.FnPackage) {
			gcse.AddBiValueAndProcess(bi.Average, "crawler.github_max_age.hours", int(na.maxAge.Hours()))
			gcse.AddBiValueAndProcess(bi.Average, "crawler.github_max_age.days", int(na.maxAge/timep.Day))
			gcse.AddBiValueAndProcess(bi.Average, "crawler.github_ave_age.hours", int(aveAge.Hours()))
			gcse.AddBiValueAndProcess(bi.Average, "crawler.github_ave_age.days", int(aveAge/timep.Day))
		}
	}
	log.Printf("%d entries to crawl for folder %v", count, out.Path)
	return nil
}
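Besides writing the output, Example #3 tracks per host the single oldest entry and the average age relative to now, and logs both. Below is a standard-library-only sketch of that aggregation, with made-up schedule times standing in for gcse.CrawlingEntry values.

package main

import (
	"fmt"
	"strings"
	"time"
)

// hostAges mirrors the nameAndAges bookkeeping in Example #3: the single
// oldest entry per host plus a running sum used to compute the average age.
type hostAges struct {
	maxName     string
	maxAge      time.Duration
	sumAgeHours float64
	cnt         int
}

func main() {
	now := time.Now()
	// Hypothetical schedule times keyed by entry ID; in the real code these
	// come from CrawlingEntry.ScheduleTime.
	schedule := map[string]time.Time{
		"github.com/a/x":    now.Add(-48 * time.Hour),
		"github.com/b/y":    now.Add(-2 * time.Hour),
		"bitbucket.org/c/z": now.Add(-24 * time.Hour),
	}

	ages := make(map[string]hostAges)
	for id, t := range schedule {
		host := id[:strings.Index(id, "/")] // IDs here always contain a "/"
		age := now.Sub(t)

		na := ages[host]
		if age > na.maxAge {
			na.maxName, na.maxAge = id, age
		}
		na.sumAgeHours += age.Hours()
		na.cnt++
		ages[host] = na
	}

	for host, na := range ages {
		aveAge := time.Duration(na.sumAgeHours / float64(na.cnt) * float64(time.Hour))
		fmt.Printf("%s age: max -> %v(%s), ave -> %v\n", host, na.maxAge, na.maxName, aveAge)
	}
}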