Example #1
0
func doIndex() bool {
	idxSegm, err := gcse.IndexSegments.GenMaxSegment()
	if err != nil {
		log.Printf("GenMaxSegment failed: %v", err)
		return false
	}

	runtime.GC()
	gcse.DumpMemStats()

	log.Printf("Indexing to %v ...", idxSegm)

	fpDocDB := sophie.LocalFsPath(configs.DocsDBPath().S())
	ts, err := gcse.Index(kv.DirInput(fpDocDB), idxSegm.Join("").S())
	if err != nil {
		log.Printf("Indexing failed: %v", err)
		return false
	}

	if !func() bool {
		f, err := idxSegm.Join(gcse.IndexFn).Create()
		if err != nil {
			log.Printf("Create index file failed: %v", err)
			return false
		}
		defer f.Close()

		log.Printf("Saving index to %v ...", idxSegm)
		if err := ts.Save(f); err != nil {
			log.Printf("ts.Save failed: %v", err)
			return false
		}
		return true
	}() {
		return false
	}
	runtime.GC()
	gcse.DumpMemStats()

	storePath := idxSegm.Join(configs.FnStore)
	log.Printf("Saving store snapshot to %v", storePath)
	if err := store.SaveSnapshot(storePath.S()); err != nil {
		log.Printf("SaveSnapshot %v failed: %v", storePath, err)
	}

	if err := idxSegm.Done(); err != nil {
		log.Printf("segm.Done failed: %v", err)
		return false
	}

	log.Printf("Indexing success: %s (%d)", idxSegm, ts.DocCount())
	gcse.AddBiValueAndProcess(bi.Average, "index.doc-count", ts.DocCount())

	ts = nil
	gcse.DumpMemStats()
	runtime.GC()
	gcse.DumpMemStats()

	return true
}
Example #2
0
func generateCrawlEntries(db *gcse.MemDB, hostFromID func(id string) string, out kv.DirOutput) error {
	now := time.Now()
	type idAndCrawlingEntry struct {
		id  string
		ent *gcse.CrawlingEntry
	}
	groups := make(map[string][]idAndCrawlingEntry)
	count := 0
	type nameAndAges struct {
		maxName string
		maxAge  time.Duration

		sumAgeHours float64
		cnt         int
	}
	ages := make(map[string]nameAndAges)
	if err := db.Iterate(func(id string, val interface{}) error {
		ent, ok := val.(gcse.CrawlingEntry)
		if !ok {
			log.Printf("Wrong entry: %+v", ent)
			return nil
		}
		if ent.Version == gcse.CrawlerVersion && ent.ScheduleTime.After(now) {
			return nil
		}
		host := hostFromID(id)

		// check host black list
		if configs.NonCrawlHosts.Contain(host) {
			return nil
		}
		if rand.Intn(10) == 0 {
			// randomly set Etag to empty to fetch stars
			ent.Etag = ""
		}
		groups[host] = append(groups[host], idAndCrawlingEntry{id, &ent})

		age := now.Sub(ent.ScheduleTime)
		na := ages[host]
		if age > na.maxAge {
			na.maxName, na.maxAge = id, age
		}
		na.sumAgeHours += age.Hours()
		na.cnt++
		ages[host] = na

		count++
		return nil
	}); err != nil {
		return errorsp.WithStacks(err)
	}
	index := 0
	for _, g := range groups {
		sortp.SortF(len(g), func(i, j int) bool {
			return g[i].ent.ScheduleTime.Before(g[j].ent.ScheduleTime)
		}, func(i, j int) {
			g[i], g[j] = g[j], g[i]
		})
		if err := func(index int, ies []idAndCrawlingEntry) error {
			c, err := out.Collector(index)
			if err != nil {
				return err
			}
			defer c.Close()

			for _, ie := range ies {
				if err := c.Collect(sophie.RawString(ie.id), ie.ent); err != nil {
					return err
				}
			}
			return nil
		}(index, g); err != nil {
			log.Printf("Saving ents failed: %v", err)
		}
		index++
	}
	for host, na := range ages {
		aveAge := time.Duration(na.sumAgeHours / float64(na.cnt) * float64(time.Hour))
		log.Printf("%s age: max -> %v(%s), ave -> %v", host, na.maxAge, na.maxName, aveAge)
		if host == "github.com" && strings.Contains(out.Path, configs.FnPackage) {
			gcse.AddBiValueAndProcess(bi.Average, "crawler.github_max_age.hours", int(na.maxAge.Hours()))
			gcse.AddBiValueAndProcess(bi.Average, "crawler.github_max_age.days", int(na.maxAge/timep.Day))
			gcse.AddBiValueAndProcess(bi.Average, "crawler.github_ave_age.hours", int(aveAge.Hours()))
			gcse.AddBiValueAndProcess(bi.Average, "crawler.github_ave_age.days", int(aveAge/timep.Day))
		}
	}
	log.Printf("%d entries to crawl for folder %v", count, out.Path)
	return nil
}
Example #3
0
func main() {
	log.Println("Running tocrawl tool, to generate crawling list")
	log.Println("NonCrawlHosts: ", configs.NonCrawlHosts)
	log.Println("CrawlGithubUpdate: ", configs.CrawlGithubUpdate)
	log.Println("CrawlByGodocApi: ", configs.CrawlByGodocApi)

	log.Printf("Using personal: %v", configs.CrawlerGithubPersonal)
	gcse.GithubSpider = github.NewSpiderWithToken(configs.CrawlerGithubPersonal)

	// Load CrawlerDB
	cDB = gcse.LoadCrawlerDB()

	if configs.CrawlGithubUpdate || configs.CrawlByGodocApi {
		// load pkgUTs
		pkgUTs, err := loadPackageUpdateTimes(
			sophie.LocalFsPath(configs.DocsDBPath().S()))
		if err != nil {
			log.Fatalf("loadPackageUpdateTimes failed: %v", err)
		}

		if configs.CrawlGithubUpdate {
			touchByGithubUpdates(pkgUTs)
		}

		if configs.CrawlByGodocApi {
			httpClient := gcse.GenHttpClient("")
			pkgs, err := gcse.FetchAllPackagesInGodoc(httpClient)
			if err != nil {
				log.Fatalf("FetchAllPackagesInGodoc failed: %v", err)
			}
			gcse.AddBiValueAndProcess(bi.Max, "godoc.doc-count", len(pkgs))
			log.Printf("FetchAllPackagesInGodoc returns %d entries", len(pkgs))
			now := time.Now()
			for _, pkg := range pkgs {
				cDB.AppendPackage(pkg, func(pkg string) bool {
					_, ok := pkgUTs[pkg]
					return ok
				})
				site, path := utils.SplitPackage(pkg)
				if err := store.AppendPackageEvent(site, path, "godoc", now, sppb.HistoryEvent_Action_None); err != nil {
					log.Printf("UpdatePackageHistory %s %s failed: %v", site, path, err)
				}
			}
		}
		syncDatabases()
	}

	log.Printf("Package DB: %d entries", cDB.PackageDB.Count())
	log.Printf("Person DB: %d entries", cDB.PersonDB.Count())

	pathToCrawl := configs.DataRoot.Join(configs.FnToCrawl)

	kvPackage := kv.DirOutput(sophie.LocalFsPath(
		pathToCrawl.Join(configs.FnPackage).S()))
	kvPackage.Clean()
	if err := generateCrawlEntries(cDB.PackageDB, gcse.HostOfPackage, kvPackage); err != nil {
		log.Fatalf("generateCrawlEntries %v failed: %v", kvPackage.Path, err)
	}

	kvPerson := kv.DirOutput(sophie.LocalFsPath(
		pathToCrawl.Join(configs.FnPerson).S()))

	kvPerson.Clean()
	if err := generateCrawlEntries(cDB.PersonDB, func(id string) string {
		site, _ := gcse.ParsePersonId(id)
		return site
	}, kvPerson); err != nil {
		log.Fatalf("generateCrawlEntries %v failed: %v", kvPerson.Path, err)
	}
}
Example #4
0
func loadIndex() error {
	segm, err := gcse.IndexSegments.FindMaxDone()
	if segm == nil || err != nil {
		return err
	}
	if indexSegment != nil && !gcse.SegmentLess(indexSegment, segm) {
		// no new index
		return nil
	}
	db := &searcherDB{}
	if err := func() error {
		f, err := segm.Join(gcse.IndexFn).Open()
		if err != nil {
			return err
		}
		defer f.Close()

		return db.ts.Load(f)
	}(); err != nil {
		return err
	}
	db.storeDB = &bh.RefCountBox{
		DataPath: func() string {
			return segm.Join(configs.FnStore).S()
		},
	}
	hitsPath := segm.Join(gcse.HitsArrFn)
	if db.hits, err = index.OpenConstArray(hitsPath.S()); err != nil {
		log.Printf("OpenConstArray %v failed: %v", hitsPath, err)
		return err
	}
	// Calculate db.projectCount
	var projects stringsp.Set
	db.ts.Search(nil, func(docID int32, data interface{}) error {
		hit := data.(gcse.HitInfo)
		projects.Add(hit.ProjectURL)
		return nil
	})
	db.projectCount = len(projects)
	gcse.AddBiValueAndProcess(bi.Max, "index.proj-count", db.projectCount)

	// Update db.indexUpdated
	db.indexUpdated = time.Now()
	if st, err := segm.Join(gcse.IndexFn).Stat(); err == nil {
		db.indexUpdated = st.ModTime()
	}
	indexSegment = segm
	log.Printf("Load index from %v (%d packages)", segm, db.PackageCount())

	// Exchange new/old database and close the old one.
	oldDB := getDatabase()
	databaseValue.Store(db)
	oldDB.Close()
	oldDB = nil
	gcse.DumpMemStats()

	runtime.GC()
	gcse.DumpMemStats()

	return nil
}