func main() { log.Println("Running tocrawl tool, to generate crawling list") log.Println("NonCrawlHosts: ", gcse.NonCrawlHosts) log.Println("CrawlGithubUpdate: ", gcse.CrawlGithubUpdate) log.Println("CrawlByGodocApi: ", gcse.CrawlByGodocApi) // Load CrawlerDB cDB = gcse.LoadCrawlerDB() if gcse.CrawlGithubUpdate || gcse.CrawlByGodocApi { // load pkgUTs pkgUTs, err := loadPackageUpdateTimes( sophie.LocalFsPath(gcse.DocsDBPath.S())) if err != nil { log.Fatalf("loadPackageUpdateTimes failed: %v", err) } if gcse.CrawlGithubUpdate { touchByGithubUpdates(pkgUTs) } if gcse.CrawlByGodocApi { httpClient := gcse.GenHttpClient("") pkgs, err := gcse.FetchAllPackagesInGodoc(httpClient) if err != nil { log.Fatalf("FetchAllPackagesInGodoc failed: %v", err) } log.Printf("FetchAllPackagesInGodoc returns %d entries", len(pkgs)) for _, pkg := range pkgs { cDB.AppendPackage(pkg, func(pkg string) bool { _, ok := pkgUTs[pkg] return ok }) } } syncDatabases() } log.Printf("Package DB: %d entries", cDB.PackageDB.Count()) log.Printf("Person DB: %d entries", cDB.PersonDB.Count()) pathToCrawl := gcse.DataRoot.Join(gcse.FnToCrawl) kvPackage := kv.DirOutput(sophie.LocalFsPath( pathToCrawl.Join(gcse.FnPackage).S())) kvPackage.Clean() if err := generateCrawlEntries(cDB.PackageDB, gcse.HostOfPackage, kvPackage); err != nil { log.Fatalf("generateCrawlEntries %v failed: %v", kvPackage.Path, err) } kvPerson := kv.DirOutput(sophie.LocalFsPath( pathToCrawl.Join(gcse.FnPerson).S())) kvPerson.Clean() if err := generateCrawlEntries(cDB.PersonDB, func(id string) string { site, _ := gcse.ParsePersonId(id) return site }, kvPerson); err != nil { log.Fatalf("generateCrawlEntries %v failed: %v", kvPerson.Path, err) } }
func doFill() error {
	cDB := gcse.LoadCrawlerDB()
	return cDB.PackageDB.Iterate(func(pkg string, val interface{}) error {
		ent, ok := val.(gcse.CrawlingEntry)
		if !ok {
			log.Printf("Wrong entry, ignored: %+v", val)
			return nil
		}
		site, path := utils.SplitPackage(pkg)
		return store.AppendPackageEvent(site, path, "unknown",
			ent.ScheduleTime.Add(-10*timep.Day), sppb.HistoryEvent_Action_None)
	})
}
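// dumpCrawler prints crawler DB entries: with no keys it dumps the whole
// PackageDB, otherwise only the entries for the given keys.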
func dumpCrawler(keys []string) {
	cDB := gcse.LoadCrawlerDB()
	if len(keys) == 0 {
		// Full dump
		log.Printf("Dumping PackageDB...")
		cDB.PackageDB.Iterate(func(k string, v interface{}) error {
			fmtp.Printfln("Package %v: %+v", k, v)
			return nil
		})
		return
	}
	for _, key := range keys {
		var ent gcse.CrawlingEntry
		if cDB.PackageDB.Get(key, &ent) {
			fmtp.Printfln("Package %v: %+v", key, ent)
		}
	}
}
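// TestDoFill verifies that doFill records a found event with way "unknown"
// dated ten days before the stored ScheduleTime, i.e. at the original tm.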
func TestDoFill(t *testing.T) {
	const (
		site = "github.com"
		path = "daviddengcn/gcse"
	)
	tm := time.Now().Add(-20 * timep.Day)

	cDB := gcse.LoadCrawlerDB()
	cDB.PackageDB.Put(site+"/"+path, gcse.CrawlingEntry{
		ScheduleTime: tm.Add(10 * timep.Day),
	})
	assert.NoError(t, cDB.Sync())
	assert.NoError(t, doFill())

	h, err := store.ReadPackageHistory(site, path)
	assert.NoError(t, err)
	ts, _ := ptypes.TimestampProto(tm)
	assert.Equal(t, "h", h, &sppb.HistoryInfo{
		FoundTime: ts,
		FoundWay:  "unknown",
	})
}
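// main of the crawler: it either crawls a single package given with -pkg, or
// loads the crawler DB and crawls the scheduled packages and persons
// concurrently before syncing the databases.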
func main() {
	defer func() {
		tmpFn := villa.Path("/tmp/gddo")
		if err := tmpFn.RemoveAll(); err != nil {
			log.Printf("Delete %v failed: %v", tmpFn, err)
		}
	}()

	singlePackage := ""
	singleETag := ""
	flag.StringVar(&singlePackage, "pkg", singlePackage, "Crawling single package")
	flag.StringVar(&singleETag, "etag", singleETag, "ETag for single package crawling")
	flag.Parse()

	httpClient := gcse.GenHttpClient("")

	if singlePackage != "" {
		log.Printf("Crawling single package %s ...", singlePackage)
		p, err := gcse.CrawlPackage(httpClient, singlePackage, singleETag)
		if err != nil {
			fmtp.Printfln("Crawling package %s failed: %v", singlePackage, err)
		} else {
			fmtp.Printfln("Package %s: %+v", singlePackage, p)
		}
		return
	}

	log.Println("crawler started...")

	// Load CrawlerDB
	cDB = gcse.LoadCrawlerDB()

	fpDataRoot := sophie.FsPath{
		Fs:   sophie.LocalFS,
		Path: gcse.DataRoot.S(),
	}

	fpDocs := fpDataRoot.Join(gcse.FnDocs)
	if err := loadAllDocsPkgs(kv.DirInput(fpDocs)); err != nil {
		log.Fatalf("loadAllDocsPkgs: %v", err)
	}
	log.Printf("%d docs loaded!", len(allDocsPkgs))

	AppStopTime = time.Now().Add(gcse.CrawlerDuePerRun)

	//pathToCrawl := gcse.DataRoot.Join(gcse.FnToCrawl)
	fpCrawler := fpDataRoot.Join(gcse.FnCrawlerDB)
	fpToCrawl := fpDataRoot.Join(gcse.FnToCrawl)

	fpNewDocs := fpCrawler.Join(gcse.FnNewDocs)
	fpNewDocs.Remove()

	pkgEnd := make(chan error, 1)
	go crawlPackages(httpClient, fpToCrawl.Join(gcse.FnPackage), fpNewDocs, pkgEnd)

	psnEnd := make(chan error, 1)
	go crawlPersons(httpClient, fpToCrawl.Join(gcse.FnPerson), psnEnd)

	errPkg, errPsn := <-pkgEnd, <-psnEnd
	if errPkg != nil || errPsn != nil {
		log.Fatalf("Some job may have failed, package: %v, person: %v", errPkg, errPsn)
	}

	if err := processImports(); err != nil {
		log.Printf("processImports failed: %v", err)
	}

	syncDatabases()
	log.Println("crawler stopped...")
}
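// main of the tocrawl tool: it refreshes the crawler DB from GitHub updates
// and the godoc.org package list, records a "godoc" history event per
// package, then writes the package and person crawl lists for the crawler.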
func main() { log.Println("Running tocrawl tool, to generate crawling list") log.Println("NonCrawlHosts: ", configs.NonCrawlHosts) log.Println("CrawlGithubUpdate: ", configs.CrawlGithubUpdate) log.Println("CrawlByGodocApi: ", configs.CrawlByGodocApi) log.Printf("Using personal: %v", configs.CrawlerGithubPersonal) gcse.GithubSpider = github.NewSpiderWithToken(configs.CrawlerGithubPersonal) // Load CrawlerDB cDB = gcse.LoadCrawlerDB() if configs.CrawlGithubUpdate || configs.CrawlByGodocApi { // load pkgUTs pkgUTs, err := loadPackageUpdateTimes( sophie.LocalFsPath(configs.DocsDBPath().S())) if err != nil { log.Fatalf("loadPackageUpdateTimes failed: %v", err) } if configs.CrawlGithubUpdate { touchByGithubUpdates(pkgUTs) } if configs.CrawlByGodocApi { httpClient := gcse.GenHttpClient("") pkgs, err := gcse.FetchAllPackagesInGodoc(httpClient) if err != nil { log.Fatalf("FetchAllPackagesInGodoc failed: %v", err) } gcse.AddBiValueAndProcess(bi.Max, "godoc.doc-count", len(pkgs)) log.Printf("FetchAllPackagesInGodoc returns %d entries", len(pkgs)) now := time.Now() for _, pkg := range pkgs { cDB.AppendPackage(pkg, func(pkg string) bool { _, ok := pkgUTs[pkg] return ok }) site, path := utils.SplitPackage(pkg) if err := store.AppendPackageEvent(site, path, "godoc", now, sppb.HistoryEvent_Action_None); err != nil { log.Printf("UpdatePackageHistory %s %s failed: %v", site, path, err) } } } syncDatabases() } log.Printf("Package DB: %d entries", cDB.PackageDB.Count()) log.Printf("Person DB: %d entries", cDB.PersonDB.Count()) pathToCrawl := configs.DataRoot.Join(configs.FnToCrawl) kvPackage := kv.DirOutput(sophie.LocalFsPath( pathToCrawl.Join(configs.FnPackage).S())) kvPackage.Clean() if err := generateCrawlEntries(cDB.PackageDB, gcse.HostOfPackage, kvPackage); err != nil { log.Fatalf("generateCrawlEntries %v failed: %v", kvPackage.Path, err) } kvPerson := kv.DirOutput(sophie.LocalFsPath( pathToCrawl.Join(configs.FnPerson).S())) kvPerson.Clean() if err := generateCrawlEntries(cDB.PersonDB, func(id string) string { site, _ := gcse.ParsePersonId(id) return site }, kvPerson); err != nil { log.Fatalf("generateCrawlEntries %v failed: %v", kvPerson.Path, err) } }
func main() {
	runtime.GOMAXPROCS(2)

	log.Printf("Using personal: %v", configs.CrawlerGithubPersonal)
	gcse.GithubSpider = github.NewSpiderWithToken(configs.CrawlerGithubPersonal)

	if db, err := bh.Open(configs.DataRoot.Join("filecache.bolt").S(), 0644, nil); err == nil {
		log.Print("Using file cache!")
		gcse.GithubSpider.FileCache = spider.BoltFileCache{
			DB:         db,
			IncCounter: bi.Inc,
		}
	} else {
		log.Printf("Open file cache failed: %v", err)
	}

	cleanTempDir()
	defer cleanTempDir()

	singlePackage := flag.String("pkg", "", "Crawling a single package")
	singleETag := flag.String("etag", "", "ETag for the single package crawling")
	singlePerson := flag.String("person", "", "Crawling a single person")

	flag.Parse()

	httpClient := gcse.GenHttpClient("")

	if *singlePerson != "" {
		log.Printf("Crawling single person %s ...", *singlePerson)
		p, err := gcse.CrawlPerson(httpClient, *singlePerson)
		if err != nil {
			fmtp.Printfln("Crawling person %s failed: %v", *singlePerson, err)
		} else {
			fmtp.Printfln("Person %s: %+v", *singlePerson, p)
		}
	}
	if *singlePackage != "" {
		log.Printf("Crawling single package %s ...", *singlePackage)
		p, flds, err := gcse.CrawlPackage(httpClient, *singlePackage, *singleETag)
		if err != nil {
			fmtp.Printfln("Crawling package %s failed: %v, folders: %v", *singlePackage, err, flds)
		} else {
			fmtp.Printfln("Package %s: %+v, folders: %v", *singlePackage, p, flds)
		}
	}
	if *singlePackage != "" || *singlePerson != "" {
		return
	}

	log.Println("crawler started...")

	// Load CrawlerDB
	cDB = gcse.LoadCrawlerDB()

	fpDataRoot := sophie.FsPath{
		Fs:   sophie.LocalFS,
		Path: configs.DataRoot.S(),
	}

	fpDocs := fpDataRoot.Join(configs.FnDocs)
	if err := loadAllDocsPkgs(kv.DirInput(fpDocs)); err != nil {
		log.Fatalf("loadAllDocsPkgs: %v", err)
	}
	log.Printf("%d docs loaded!", len(allDocsPkgs))

	AppStopTime = time.Now().Add(configs.CrawlerDuePerRun)

	//pathToCrawl := gcse.DataRoot.Join(gcse.FnToCrawl)
	fpCrawler := fpDataRoot.Join(configs.FnCrawlerDB)
	fpToCrawl := fpDataRoot.Join(configs.FnToCrawl)

	fpNewDocs := fpCrawler.Join(configs.FnNewDocs)
	fpNewDocs.Remove()

	if err := processImports(); err != nil {
		log.Printf("processImports failed: %v", err)
	}

	pkgEnd := make(chan error, 1)
	go crawlPackages(httpClient, fpToCrawl.Join(configs.FnPackage), fpNewDocs, pkgEnd)

	psnEnd := make(chan error, 1)
	go crawlPersons(httpClient, fpToCrawl.Join(configs.FnPerson), psnEnd)

	errPkg, errPsn := <-pkgEnd, <-psnEnd

	bi.Flush()
	bi.Process()
	syncDatabases()

	if errPkg != nil || errPsn != nil {
		log.Fatalf("Some job may have failed, package: %v, person: %v", errPkg, errPsn)
	}
	log.Println("crawler stopped...")
}