func touchByGithubUpdates(pkgUTs map[string]time.Time) {
	log.Printf("touchByGithubUpdates ...")

	rs, err := gcse.GithubSpider.SearchRepositories("")
	if err != nil {
		log.Printf("SearchRepositories failed: %v", err)
		return
	}

	count := 0
	now := time.Now()
	for _, r := range rs {
		if r.Owner == nil || r.UpdatedAt == nil {
			continue
		}
		user := stringsp.Get(r.Owner.Name)
		path := stringsp.Get(r.Name)
		if user == "" || path == "" {
			continue
		}
		touchPackage(fmt.Sprintf("github.com/%s/%s", user, path), r.UpdatedAt.Time, pkgUTs)
		if err := store.AppendPackageEvent("github.com", user+"/"+path,
			"githubupdate", now, sppb.HistoryEvent_Action_None); err != nil {
			log.Printf("AppendPackageEvent %s %s failed: %v", "github.com", user+"/"+path, err)
		}
		count++
	}
	log.Printf("%d updates found!", count)
}
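// touchPackage is called above but defined elsewhere in this package. A
// minimal sketch of its assumed behavior: schedule a package for an immediate
// re-crawl when the repository's update time is newer than the one recorded
// in pkgUTs. The body below is an illustration, not the actual implementation.
func touchPackage(pkg string, updatedAt time.Time, pkgUTs map[string]time.Time) {
	pkg = strings.TrimSpace(pkg)
	if lastUT, ok := pkgUTs[pkg]; ok && !lastUT.Before(updatedAt) {
		// Already crawled at or after this update; nothing to do.
		return
	}
	log.Printf("New update of package %s", pkg)
	// An empty etag forces a full fetch on the next crawl.
	cDB.SchedulePackage(pkg, time.Now(), "")
}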
func doFill() error {
	cDB := gcse.LoadCrawlerDB()
	return cDB.PackageDB.Iterate(func(pkg string, val interface{}) error {
		ent, ok := val.(gcse.CrawlingEntry)
		if !ok {
			// Log the raw value: ent is the zero value when the assertion fails.
			log.Printf("Wrong entry, ignored: %+v", val)
			return nil
		}
		site, path := utils.SplitPackage(pkg)
		return store.AppendPackageEvent(site, path, "unknown",
			ent.ScheduleTime.Add(-10*timep.Day), sppb.HistoryEvent_Action_None)
	})
}
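// utils.SplitPackage is used throughout this file. A sketch of its assumed
// behavior, splitting an import path at the first slash into the hosting site
// and the in-site path, e.g. "github.com/daviddengcn/gcse" ->
// ("github.com", "daviddengcn/gcse"):
func splitPackage(pkg string) (site, path string) {
	if i := strings.Index(pkg, "/"); i >= 0 {
		return pkg[:i], pkg[i+1:]
	}
	return pkg, ""
}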
func appendNewPackage(pkg, foundWay string) {
	cDB.AppendPackage(pkg, allDocsPkgs.Contain)
	site, path := utils.SplitPackage(pkg)
	if err := store.UpdatePackage(site, path, func(*stpb.PackageInfo) error {
		return nil
	}); err != nil {
		log.Printf("UpdatePackage %s %s failed: %v", site, path, err)
	}
	if err := store.AppendPackageEvent(site, path, foundWay, time.Now(),
		sppb.HistoryEvent_Action_None); err != nil {
		log.Printf("AppendPackageEvent %s %s failed: %v", site, path, err)
	}
}
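// allDocsPkgs is referenced by appendNewPackage but declared elsewhere.
// Judging from how its Contain method is passed to cDB.AppendPackage, it is
// assumed to be a string set of the packages already present in the docs DB;
// the declaration below is a guess at its shape, not the actual one.
//
// Typical call site (see PackageCrawler.Map below), registering a discovered
// sub-folder with "parent" recorded as the way it was found:
//
//	appendNewPackage(pkg+"/"+fld.Path, "parent")
var allDocsPkgs stringsp.Set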
func main() {
	log.Println("Running tocrawl tool, to generate crawling list")
	log.Println("NonCrawlHosts: ", configs.NonCrawlHosts)
	log.Println("CrawlGithubUpdate: ", configs.CrawlGithubUpdate)
	log.Println("CrawlByGodocApi: ", configs.CrawlByGodocApi)
	log.Printf("Using personal: %v", configs.CrawlerGithubPersonal)
	gcse.GithubSpider = github.NewSpiderWithToken(configs.CrawlerGithubPersonal)

	// Load CrawlerDB
	cDB = gcse.LoadCrawlerDB()

	if configs.CrawlGithubUpdate || configs.CrawlByGodocApi {
		// load pkgUTs
		pkgUTs, err := loadPackageUpdateTimes(
			sophie.LocalFsPath(configs.DocsDBPath().S()))
		if err != nil {
			log.Fatalf("loadPackageUpdateTimes failed: %v", err)
		}

		if configs.CrawlGithubUpdate {
			touchByGithubUpdates(pkgUTs)
		}
		if configs.CrawlByGodocApi {
			httpClient := gcse.GenHttpClient("")
			pkgs, err := gcse.FetchAllPackagesInGodoc(httpClient)
			if err != nil {
				log.Fatalf("FetchAllPackagesInGodoc failed: %v", err)
			}
			gcse.AddBiValueAndProcess(bi.Max, "godoc.doc-count", len(pkgs))
			log.Printf("FetchAllPackagesInGodoc returns %d entries", len(pkgs))
			now := time.Now()
			for _, pkg := range pkgs {
				// Only append packages already known to the docs DB.
				cDB.AppendPackage(pkg, func(pkg string) bool {
					_, ok := pkgUTs[pkg]
					return ok
				})
				site, path := utils.SplitPackage(pkg)
				if err := store.AppendPackageEvent(site, path, "godoc", now,
					sppb.HistoryEvent_Action_None); err != nil {
					log.Printf("AppendPackageEvent %s %s failed: %v", site, path, err)
				}
			}
		}
		syncDatabases()
	}

	log.Printf("Package DB: %d entries", cDB.PackageDB.Count())
	log.Printf("Person DB: %d entries", cDB.PersonDB.Count())

	pathToCrawl := configs.DataRoot.Join(configs.FnToCrawl)

	kvPackage := kv.DirOutput(sophie.LocalFsPath(
		pathToCrawl.Join(configs.FnPackage).S()))
	kvPackage.Clean()
	if err := generateCrawlEntries(cDB.PackageDB, gcse.HostOfPackage, kvPackage); err != nil {
		log.Fatalf("generateCrawlEntries %v failed: %v", kvPackage.Path, err)
	}

	kvPerson := kv.DirOutput(sophie.LocalFsPath(
		pathToCrawl.Join(configs.FnPerson).S()))
	kvPerson.Clean()
	if err := generateCrawlEntries(cDB.PersonDB, func(id string) string {
		site, _ := gcse.ParsePersonId(id)
		return site
	}, kvPerson); err != nil {
		log.Fatalf("generateCrawlEntries %v failed: %v", kvPerson.Path, err)
	}
}
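// syncDatabases is called by main after the crawl list is refreshed, but
// defined elsewhere. A minimal sketch under the assumption that it simply
// flushes the in-memory crawler DB to disk via CrawlerDB.Sync:
func syncDatabases() {
	log.Printf("Synchronizing databases to disk...")
	if err := cDB.Sync(); err != nil {
		log.Fatalf("cDB.Sync failed: %v", err)
	}
}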
// OnlyMapper.Map
func (pc *PackageCrawler) Map(key, val sophie.SophieWriter, c []sophie.Collector) error {
	if time.Now().After(AppStopTime) {
		log.Printf("[Part %d] Timeout(key = %v), PackageCrawler returns EOM", pc.part, key)
		return mr.EOM
	}

	pkg := string(*key.(*sophie.RawString))
	ent := val.(*gcse.CrawlingEntry)
	if ent.Version < gcse.CrawlerVersion {
		// If the entry was written by an older crawler version, ignore its
		// Etag to force a full re-crawl.
		ent.Etag = ""
	}
	log.Printf("[Part %d] Crawling package %v with etag %s", pc.part, pkg, ent.Etag)

	p, flds, err := gcse.CrawlPackage(pc.httpClient, pkg, ent.Etag)
	// Register any discovered sub-folders that look like Go packages.
	for _, fld := range flds {
		if spider.LikeGoSubFolder(fld.Name) {
			appendNewPackage(pkg+"/"+fld.Path, "parent")
		}
	}
	site, path := utils.SplitPackage(pkg)
	if err != nil && errorsp.Cause(err) != gcse.ErrPackageNotModifed {
		log.Printf("[Part %d] Crawling pkg %s failed: %v", pc.part, pkg, err)
		if gcse.IsBadPackage(err) {
			// A permanently wrong import path: record the event, tell the
			// indexer to delete the doc, and drop it from the crawler DB.
			utils.LogError(store.AppendPackageEvent(site, path, "", time.Now(),
				sppb.HistoryEvent_Action_Invalid),
				"AppendPackageEvent %v %v failed", site, path)
			bi.AddValue(bi.Sum, "crawler.package.wrong-package", 1)
			nda := gcse.NewDocAction{
				Action: gcse.NDA_DEL,
			}
			c[0].Collect(sophie.RawString(pkg), &nda)
			cDB.PackageDB.Delete(pkg)
			log.Printf("[Part %d] Removed wrong package %s", pc.part, pkg)
		} else {
			utils.LogError(store.AppendPackageEvent(site, path, "", time.Now(),
				sppb.HistoryEvent_Action_Failed),
				"AppendPackageEvent %v %v failed", site, path)
			bi.Inc("crawler.package.failed")
			if strings.HasPrefix(pkg, "github.com/") {
				bi.Inc("crawler.package.failed.github")
			}
			pc.failCount++

			cDB.SchedulePackage(pkg, time.Now().Add(12*time.Hour), ent.Etag)

			// Back off after repeated failures or rate limiting (HTTP 403).
			if pc.failCount >= 10 || strings.Contains(err.Error(), "403") {
				durToSleep := 10 * time.Minute
				if time.Now().Add(durToSleep).After(AppStopTime) {
					log.Printf("[Part %d] Timeout(key = %v), PackageCrawler returns EOM", pc.part, key)
					return mr.EOM
				}
				log.Printf("[Part %d] Crawling of the last ten packages failed, sleeping for a while... (current: %s)",
					pc.part, pkg)
				time.Sleep(durToSleep)
				pc.failCount = 0
			}
		}
		return nil
	}

	utils.LogError(store.AppendPackageEvent(site, path, "", time.Now(),
		sppb.HistoryEvent_Action_Success),
		"AppendPackageEvent %v %v failed", site, path)
	pc.failCount = 0
	if errorsp.Cause(err) == gcse.ErrPackageNotModifed {
		// TODO: crawl stars for unchanged projects as well.
		log.Printf("[Part %d] Package %s unchanged!", pc.part, pkg)
		schedulePackageNextCrawl(pkg, ent.Etag)
		bi.AddValue(bi.Sum, "crawler.package.not-modified", 1)
		return nil
	}
	bi.AddValue(bi.Sum, "crawler.package.success", 1)
	if strings.HasPrefix(pkg, "github.com/") {
		bi.AddValue(bi.Sum, "crawler.package.success.github", 1)
	}

	log.Printf("[Part %d] Crawled package %s successfully!", pc.part, pkg)

	var pkgInfo *stpb.PackageInfo
	if err := store.UpdatePackage(site, path, func(pi *stpb.PackageInfo) error {
		fillPackageInfo(p, pi)
		pkgInfo = pi
		return nil
	}); err != nil {
		log.Printf("UpdatePackage %v %v failed: %v", site, path, err)
	}
	saveRelatedInfo(pkgInfo)

	nda := gcse.NewDocAction{
		Action:  gcse.NDA_UPDATE,
		DocInfo: packageToDoc(p),
	}
	c[0].Collect(sophie.RawString(pkg), &nda)
	log.Printf("[Part %d] Package %s saved!", pc.part, pkg)

	if !strings.HasPrefix(pkg, "github.com/") {
		// github.com throttling is done within the GithubSpider.
		time.Sleep(10 * time.Second)
	}
	return nil
}
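// schedulePackageNextCrawl is called for unchanged packages above but defined
// elsewhere. It is assumed to re-schedule the package one crawl period ahead,
// with random jitter so re-crawls spread out instead of arriving in bursts.
// Both the period and the jitter factor below are illustrative assumptions.
func schedulePackageNextCrawl(pkg, etag string) {
	const crawlPeriod = 10 * 24 * time.Hour // assumed default re-crawl period
	jitter := time.Duration(rand.Int63n(int64(crawlPeriod / 10)))
	cDB.SchedulePackage(pkg, time.Now().Add(crawlPeriod+jitter), etag)
}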