func main() {
	defer func() {
		tmpFn := villa.Path("/tmp/gddo")
		if err := tmpFn.RemoveAll(); err != nil {
			log.Printf("Delete %v failed: %v", tmpFn, err)
		}
	}()

	singlePackage := ""
	singleETag := ""
	flag.StringVar(&singlePackage, "pkg", singlePackage, "Crawling a single package")
	flag.StringVar(&singleETag, "etag", singleETag, "ETag for single package crawling")
	flag.Parse()

	httpClient := gcse.GenHttpClient("")

	if singlePackage != "" {
		log.Printf("Crawling single package %s ...", singlePackage)
		p, err := gcse.CrawlPackage(httpClient, singlePackage, singleETag)
		if err != nil {
			fmtp.Printfln("Crawling package %s failed: %v", singlePackage, err)
		} else {
			fmtp.Printfln("Package %s: %+v", singlePackage, p)
		}
		return
	}

	log.Println("crawler started...")

	// Load CrawlerDB
	cDB = gcse.LoadCrawlerDB()

	fpDataRoot := sophie.FsPath{
		Fs:   sophie.LocalFS,
		Path: gcse.DataRoot.S(),
	}

	fpDocs := fpDataRoot.Join(gcse.FnDocs)
	if err := loadAllDocsPkgs(kv.DirInput(fpDocs)); err != nil {
		log.Fatalf("loadAllDocsPkgs: %v", err)
	}
	log.Printf("%d docs loaded!", len(allDocsPkgs))

	AppStopTime = time.Now().Add(gcse.CrawlerDuePerRun)

	//pathToCrawl := gcse.DataRoot.Join(gcse.FnToCrawl)
	fpCrawler := fpDataRoot.Join(gcse.FnCrawlerDB)
	fpToCrawl := fpDataRoot.Join(gcse.FnToCrawl)

	fpNewDocs := fpCrawler.Join(gcse.FnNewDocs)
	fpNewDocs.Remove()

	pkgEnd := make(chan error, 1)
	go crawlPackages(httpClient, fpToCrawl.Join(gcse.FnPackage), fpNewDocs, pkgEnd)

	psnEnd := make(chan error, 1)
	go crawlPersons(httpClient, fpToCrawl.Join(gcse.FnPerson), psnEnd)

	errPkg, errPsn := <-pkgEnd, <-psnEnd
	if errPkg != nil || errPsn != nil {
		log.Fatalf("Some job may have failed, package: %v, person: %v", errPkg, errPsn)
	}

	if err := processImports(); err != nil {
		log.Printf("processImports failed: %v", err)
	}

	syncDatabases()
	log.Println("crawler stopped...")
}
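This main launches the package and person crawls in separate goroutines and collects their final errors over buffered channels, so either goroutine can finish even if the other is still running. Below is a minimal standalone sketch of that fan-out pattern, with a hypothetical runJob standing in for crawlPackages and crawlPersons; it is an illustration, not the project's code.

package main

import (
	"errors"
	"fmt"
	"time"
)

// runJob is a hypothetical stand-in for crawlPackages / crawlPersons: it does
// some work and reports its final error (or nil) on done.
func runJob(name string, fail bool, done chan<- error) {
	time.Sleep(100 * time.Millisecond)
	if fail {
		done <- errors.New(name + " crawl failed")
		return
	}
	done <- nil
}

func main() {
	pkgEnd := make(chan error, 1)
	go runJob("package", false, pkgEnd)

	psnEnd := make(chan error, 1)
	go runJob("person", true, psnEnd)

	// Receive both results; the buffered channels let either goroutine
	// exit as soon as it is done, regardless of receive order.
	errPkg, errPsn := <-pkgEnd, <-psnEnd
	if errPkg != nil || errPsn != nil {
		fmt.Printf("some job failed, package: %v, person: %v\n", errPkg, errPsn)
	}
}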
// OnlyMapper.Map
func (pc *PackageCrawler) Map(key, val sophie.SophieWriter, c []sophie.Collector) error {
	if time.Now().After(AppStopTime) {
		log.Printf("[Part %d] Timeout(key = %v), PackageCrawler returns EOM", pc.part, key)
		return mr.EOM
	}

	pkg := string(*key.(*sophie.RawString))
	ent := val.(*gcse.CrawlingEntry)
	if ent.Version < gcse.CrawlerVersion {
		// if gcse.CrawlerVersion is larger than Version, Etag is ignored.
		ent.Etag = ""
	}
	log.Printf("[Part %d] Crawling package %v with etag %s\n", pc.part, pkg, ent.Etag)

	p, flds, err := gcse.CrawlPackage(pc.httpClient, pkg, ent.Etag)
	for _, fld := range flds {
		if spider.LikeGoSubFolder(fld.Name) {
			appendNewPackage(pkg+"/"+fld.Path, "parent")
		}
	}
	site, path := utils.SplitPackage(pkg)
	if err != nil && errorsp.Cause(err) != gcse.ErrPackageNotModifed {
		log.Printf("[Part %d] Crawling pkg %s failed: %v", pc.part, pkg, err)
		if gcse.IsBadPackage(err) {
			// a wrong path
			utils.LogError(store.AppendPackageEvent(site, path, "", time.Now(), sppb.HistoryEvent_Action_Invalid),
				"AppendPackageEvent %v %v failed", site, path)
			bi.AddValue(bi.Sum, "crawler.package.wrong-package", 1)
			nda := gcse.NewDocAction{
				Action: gcse.NDA_DEL,
			}
			c[0].Collect(sophie.RawString(pkg), &nda)
			cDB.PackageDB.Delete(pkg)
			log.Printf("[Part %d] Remove wrong package %s", pc.part, pkg)
		} else {
			utils.LogError(store.AppendPackageEvent(site, path, "", time.Now(), sppb.HistoryEvent_Action_Failed),
				"AppendPackageEvent %v %v failed", site, path)
			bi.Inc("crawler.package.failed")
			if strings.HasPrefix(pkg, "github.com/") {
				bi.Inc("crawler.package.failed.github")
			}
			pc.failCount++
			cDB.SchedulePackage(pkg, time.Now().Add(12*time.Hour), ent.Etag)
			if pc.failCount >= 10 || strings.Contains(err.Error(), "403") {
				durToSleep := 10 * time.Minute
				if time.Now().Add(durToSleep).After(AppStopTime) {
					log.Printf("[Part %d] Timeout(key = %v), PackageCrawler returns EOM", pc.part, key)
					return mr.EOM
				}
				log.Printf("[Part %d] Last ten crawling packages failed, sleep for a while...(current: %s)",
					pc.part, pkg)
				time.Sleep(durToSleep)
				pc.failCount = 0
			}
		}
		return nil
	}

	utils.LogError(store.AppendPackageEvent(site, path, "", time.Now(), sppb.HistoryEvent_Action_Success),
		"AppendPackageEvent %v %v failed", site, path)
	pc.failCount = 0

	if errorsp.Cause(err) == gcse.ErrPackageNotModifed {
		// TODO crawling stars for unchanged project
		log.Printf("[Part %d] Package %s unchanged!", pc.part, pkg)
		schedulePackageNextCrawl(pkg, ent.Etag)
		bi.AddValue(bi.Sum, "crawler.package.not-modified", 1)
		return nil
	}

	bi.AddValue(bi.Sum, "crawler.package.success", 1)
	if strings.HasPrefix(pkg, "github.com/") {
		bi.AddValue(bi.Sum, "crawler.package.success.github", 1)
	}
	log.Printf("[Part %d] Crawled package %s success!", pc.part, pkg)

	var pkgInfo *stpb.PackageInfo
	if err := store.UpdatePackage(site, path, func(pi *stpb.PackageInfo) error {
		fillPackageInfo(p, pi)
		pkgInfo = pi
		return nil
	}); err != nil {
		log.Printf("UpdatePackage %v %v failed: %v", site, path, err)
	}
	saveRelatedInfo(pkgInfo)

	nda := gcse.NewDocAction{
		Action:  gcse.NDA_UPDATE,
		DocInfo: packageToDoc(p),
	}
	c[0].Collect(sophie.RawString(pkg), &nda)
	log.Printf("[Part %d] Package %s saved!", pc.part, pkg)

	if !strings.HasPrefix(pkg, "github.com/") {
		// github.com throttling is done within the GithubSpider.
		time.Sleep(10 * time.Second)
	}
	return nil
}
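The mapper above counts consecutive failures per partition and, once ten in a row have failed (or a 403 shows up), sleeps before continuing, unless the sleep would overrun AppStopTime. The following standalone sketch shows only that backoff shape; the names, thresholds, and durations are illustrative, not the project's API.

package main

import (
	"errors"
	"log"
	"time"
)

// appStopTime is a hard deadline for the run, mirroring the crawler's
// AppStopTime; names and durations here are illustrative only.
var appStopTime = time.Now().Add(3 * time.Second)

// crawlOnce stands in for a single CrawlPackage call; it always fails in
// this sketch so that the backoff path is exercised.
func crawlOnce(id string) error {
	return errors.New("simulated crawl failure")
}

func main() {
	const maxFails = 3                     // the real crawler uses 10
	const backoff = 500 * time.Millisecond // the real crawler sleeps 10 minutes

	failCount := 0
	for time.Now().Before(appStopTime) {
		if err := crawlOnce("example.com/some/pkg"); err != nil {
			failCount++
			log.Printf("crawl failed (%d consecutive): %v", failCount, err)
			if failCount >= maxFails {
				// Too many consecutive failures: back off, but never
				// sleep past the hard deadline.
				if time.Now().Add(backoff).After(appStopTime) {
					log.Print("backoff would pass the deadline, stopping")
					return
				}
				time.Sleep(backoff)
				failCount = 0
			}
			continue
		}
		failCount = 0 // any success resets the streak
	}
}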
func crawlEntriesLoop() {
	httpClient := gcse.GenHttpClient("")

	for time.Now().Before(AppStopTime) {
		checkImports()

		if gcse.CrawlByGodocApi {
			processGodoc(httpClient)
		}

		didSomething := false
		var wg sync.WaitGroup

		pkgGroups := listPackagesByHost(5, 50)
		if len(pkgGroups) > 0 {
			didSomething = true

			log.Printf("Crawling packages of %d groups", len(pkgGroups))

			wg.Add(len(pkgGroups))
			for host, ents := range pkgGroups {
				go func(host string, ents []EntryInfo) {
					failCount := 0
					for _, ent := range ents {
						if time.Now().After(AppStopTime) {
							break
						}
						runtime.GC()
						p, err := gcse.CrawlPackage(httpClient, ent.ID, ent.Etag)
						if err != nil && err != gcse.ErrPackageNotModifed {
							log.Printf("Crawling pkg %s failed: %v", ent.ID, err)
							if gcse.IsBadPackage(err) {
								// a wrong path
								deletePackage(ent.ID)
								log.Printf("Remove wrong package %s", ent.ID)
							} else {
								failCount++
								schedulePackage(ent.ID, time.Now().Add(12*time.Hour), ent.Etag)
								if failCount >= 10 {
									durToSleep := 10 * time.Minute
									if time.Now().Add(durToSleep).After(AppStopTime) {
										break
									}
									log.Printf("Last ten crawling %s packages failed, sleep for a while...", host)
									time.Sleep(durToSleep)
									failCount = 0
								}
							}
							continue
						}

						failCount = 0

						if err == gcse.ErrPackageNotModifed {
							log.Printf("Package %s unchanged!", ent.ID)
							schedulePackageNextCrawl(ent.ID, ent.Etag)
							continue
						}

						log.Printf("Crawled package %s success!", ent.ID)

						pushPackage(p)
						log.Printf("Package %s saved!", ent.ID)
					}
					wg.Done()
				}(host, ents)
			}
		}

		personGroups := listPersonsByHost(5, 100)
		if len(personGroups) > 0 {
			didSomething = true

			log.Printf("Crawling persons of %d groups", len(personGroups))

			wg.Add(len(personGroups))
			for host, ents := range personGroups {
				go func(host string, ents []EntryInfo) {
					failCount := 0
					for _, ent := range ents {
						if time.Now().After(AppStopTime) {
							break
						}
						p, err := gcse.CrawlPerson(httpClient, ent.ID)
						if err != nil {
							failCount++
							log.Printf("Crawling person %s failed: %v", ent.ID, err)
							schedulePerson(ent.ID, time.Now().Add(12*time.Hour))
							if failCount >= 10 {
								durToSleep := 10 * time.Minute
								if time.Now().Add(durToSleep).After(AppStopTime) {
									break
								}
								log.Printf("Last ten crawling %s persons failed, sleep for a while...", host)
								time.Sleep(durToSleep)
								failCount = 0
							}
							continue
						}
						log.Printf("Crawled person %s success!", ent.ID)
						pushPerson(p)
						log.Printf("Push person %s success", ent.ID)
						failCount = 0
					}
					wg.Done()
				}(host, ents)
			}
		}
		wg.Wait()

		syncDatabases()

		if gcse.CrawlGithubUpdate {
			if touchByGithubUpdates() {
				didSomething = true
			}
		}

		if !didSomething {
			log.Printf("Nothing to crawl, sleep for a while...")
			time.Sleep(2 * time.Minute)
		}
	}
}
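crawlEntriesLoop fans entries out by host, one goroutine per host group, and uses a sync.WaitGroup to wait for every group before syncing databases. A minimal standalone sketch of that grouping pattern follows; the EntryInfo here only mirrors the fields the sketch needs, and the group contents are made up for illustration.

package main

import (
	"fmt"
	"sync"
)

// EntryInfo mirrors the per-entry record the loop iterates over; only the
// fields this sketch needs are included.
type EntryInfo struct {
	ID   string
	Etag string
}

func main() {
	// Entries pre-grouped by host, in the shape listPackagesByHost returns.
	groups := map[string][]EntryInfo{
		"github.com":    {{ID: "github.com/a/x"}, {ID: "github.com/b/y"}},
		"bitbucket.org": {{ID: "bitbucket.org/c/z"}},
	}

	var wg sync.WaitGroup
	wg.Add(len(groups))
	for host, ents := range groups {
		// One goroutine per host, so a slow or throttled host does not
		// block crawling the others.
		go func(host string, ents []EntryInfo) {
			defer wg.Done()
			for _, ent := range ents {
				fmt.Printf("[%s] crawl %s\n", host, ent.ID)
			}
		}(host, ents)
	}
	wg.Wait()
}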
// OnlyMapper.Map
func (pc *PackageCrawler) Map(key, val sophie.SophieWriter, c []sophie.Collector) error {
	if time.Now().After(AppStopTime) {
		log.Printf("[Part %d] Timeout(key = %v), PackageCrawler returns EOM", pc.part, key)
		return mr.EOM
	}

	pkg := string(*key.(*sophie.RawString))
	ent := val.(*gcse.CrawlingEntry)
	if ent.Version < gcse.CrawlerVersion {
		// if gcse.CrawlerVersion is larger than Version, Etag is ignored.
		ent.Etag = ""
	}
	log.Printf("[Part %d] Crawling package %v with etag %s\n", pc.part, pkg, ent.Etag)

	p, err := gcse.CrawlPackage(pc.httpClient, pkg, ent.Etag)
	if err != nil && err != gcse.ErrPackageNotModifed {
		log.Printf("[Part %d] Crawling pkg %s failed: %v", pc.part, pkg, err)
		if gcse.IsBadPackage(err) {
			// a wrong path
			nda := gcse.NewDocAction{
				Action: gcse.NDA_DEL,
			}
			c[0].Collect(sophie.RawString(pkg), &nda)
			cDB.PackageDB.Delete(pkg)
			log.Printf("[Part %d] Remove wrong package %s", pc.part, pkg)
		} else {
			pc.failCount++
			cDB.SchedulePackage(pkg, time.Now().Add(12*time.Hour), ent.Etag)
			if pc.failCount >= 10 || strings.Contains(err.Error(), "403") {
				durToSleep := 10 * time.Minute
				if time.Now().Add(durToSleep).After(AppStopTime) {
					log.Printf("[Part %d] Timeout(key = %v), PackageCrawler returns EOM", pc.part, key)
					return mr.EOM
				}
				log.Printf("[Part %d] Last ten crawling packages failed, sleep for a while...(current: %s)",
					pc.part, pkg)
				time.Sleep(durToSleep)
				pc.failCount = 0
			}
		}
		return nil
	}

	pc.failCount = 0

	if err == gcse.ErrPackageNotModifed {
		// TODO crawling stars for unchanged project
		log.Printf("[Part %d] Package %s unchanged!", pc.part, pkg)
		schedulePackageNextCrawl(pkg, ent.Etag)
		return nil
	}

	log.Printf("[Part %d] Crawled package %s success!", pc.part, pkg)

	nda := gcse.NewDocAction{
		Action:  gcse.NDA_UPDATE,
		DocInfo: packageToDoc(p),
	}
	c[0].Collect(sophie.RawString(pkg), &nda)
	log.Printf("[Part %d] Package %s saved!", pc.part, pkg)

	time.Sleep(10 * time.Second)
	return nil
}
func main() {
	runtime.GOMAXPROCS(2)

	log.Printf("Using personal: %v", configs.CrawlerGithubPersonal)
	gcse.GithubSpider = github.NewSpiderWithToken(configs.CrawlerGithubPersonal)

	if db, err := bh.Open(configs.DataRoot.Join("filecache.bolt").S(), 0644, nil); err == nil {
		log.Print("Using file cache!")
		gcse.GithubSpider.FileCache = spider.BoltFileCache{
			DB:         db,
			IncCounter: bi.Inc,
		}
	} else {
		log.Printf("Open file cache failed: %v", err)
	}

	cleanTempDir()
	defer cleanTempDir()

	singlePackage := flag.String("pkg", "", "Crawling a single package")
	singleETag := flag.String("etag", "", "ETag for the single package crawling")
	singlePerson := flag.String("person", "", "Crawling a single person")

	flag.Parse()

	httpClient := gcse.GenHttpClient("")

	if *singlePerson != "" {
		log.Printf("Crawling single person %s ...", *singlePerson)
		p, err := gcse.CrawlPerson(httpClient, *singlePerson)
		if err != nil {
			fmtp.Printfln("Crawling person %s failed: %v", *singlePerson, err)
		} else {
			fmtp.Printfln("Person %s: %+v", *singlePerson, p)
		}
	}
	if *singlePackage != "" {
		log.Printf("Crawling single package %s ...", *singlePackage)
		p, flds, err := gcse.CrawlPackage(httpClient, *singlePackage, *singleETag)
		if err != nil {
			fmtp.Printfln("Crawling package %s failed: %v, folders: %v", *singlePackage, err, flds)
		} else {
			fmtp.Printfln("Package %s: %+v, folders: %v", *singlePackage, p, flds)
		}
	}
	if *singlePackage != "" || *singlePerson != "" {
		return
	}

	log.Println("crawler started...")

	// Load CrawlerDB
	cDB = gcse.LoadCrawlerDB()

	fpDataRoot := sophie.FsPath{
		Fs:   sophie.LocalFS,
		Path: configs.DataRoot.S(),
	}

	fpDocs := fpDataRoot.Join(configs.FnDocs)
	if err := loadAllDocsPkgs(kv.DirInput(fpDocs)); err != nil {
		log.Fatalf("loadAllDocsPkgs: %v", err)
	}
	log.Printf("%d docs loaded!", len(allDocsPkgs))

	AppStopTime = time.Now().Add(configs.CrawlerDuePerRun)

	//pathToCrawl := gcse.DataRoot.Join(gcse.FnToCrawl)
	fpCrawler := fpDataRoot.Join(configs.FnCrawlerDB)
	fpToCrawl := fpDataRoot.Join(configs.FnToCrawl)

	fpNewDocs := fpCrawler.Join(configs.FnNewDocs)
	fpNewDocs.Remove()

	if err := processImports(); err != nil {
		log.Printf("processImports failed: %v", err)
	}

	pkgEnd := make(chan error, 1)
	go crawlPackages(httpClient, fpToCrawl.Join(configs.FnPackage), fpNewDocs, pkgEnd)

	psnEnd := make(chan error, 1)
	go crawlPersons(httpClient, fpToCrawl.Join(configs.FnPerson), psnEnd)

	errPkg, errPsn := <-pkgEnd, <-psnEnd

	bi.Flush()
	bi.Process()
	syncDatabases()

	if errPkg != nil || errPsn != nil {
		log.Fatalf("Some job may have failed, package: %v, person: %v", errPkg, errPsn)
	}
	log.Println("crawler stopped...")
}