Example #1
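// main runs the tocrawl tool: it loads the CrawlerDB, optionally refreshes it
// from GitHub updates and the godoc.org package list, then writes the package
// and person crawl lists under DataRoot/FnToCrawl.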
func main() {
	log.Println("Running tocrawl tool, to generate crawling list")
	log.Println("NonCrawlHosts: ", gcse.NonCrawlHosts)
	log.Println("CrawlGithubUpdate: ", gcse.CrawlGithubUpdate)
	log.Println("CrawlByGodocApi: ", gcse.CrawlByGodocApi)
	// Load CrawlerDB
	cDB = gcse.LoadCrawlerDB()

	if gcse.CrawlGithubUpdate || gcse.CrawlByGodocApi {
		// load pkgUTs
		pkgUTs, err := loadPackageUpdateTimes(
			sophie.LocalFsPath(gcse.DocsDBPath.S()))
		if err != nil {
			log.Fatalf("loadPackageUpdateTimes failed: %v", err)
		}

		if gcse.CrawlGithubUpdate {
			touchByGithubUpdates(pkgUTs)
		}

		if gcse.CrawlByGodocApi {
			httpClient := gcse.GenHttpClient("")
			pkgs, err := gcse.FetchAllPackagesInGodoc(httpClient)
			if err != nil {
				log.Fatalf("FetchAllPackagesInGodoc failed: %v", err)
			}
			log.Printf("FetchAllPackagesInGodoc returns %d entries", len(pkgs))
			for _, pkg := range pkgs {
				cDB.AppendPackage(pkg, func(pkg string) bool {
					_, ok := pkgUTs[pkg]
					return ok
				})
			}
		}
		syncDatabases()
	}

	log.Printf("Package DB: %d entries", cDB.PackageDB.Count())
	log.Printf("Person DB: %d entries", cDB.PersonDB.Count())

	pathToCrawl := gcse.DataRoot.Join(gcse.FnToCrawl)

	kvPackage := kv.DirOutput(sophie.LocalFsPath(
		pathToCrawl.Join(gcse.FnPackage).S()))
	kvPackage.Clean()
	if err := generateCrawlEntries(cDB.PackageDB, gcse.HostOfPackage, kvPackage); err != nil {
		log.Fatalf("generateCrawlEntries %v failed: %v", kvPackage.Path, err)
	}

	kvPerson := kv.DirOutput(sophie.LocalFsPath(
		pathToCrawl.Join(gcse.FnPerson).S()))

	kvPerson.Clean()
	if err := generateCrawlEntries(cDB.PersonDB, func(id string) string {
		site, _ := gcse.ParsePersonId(id)
		return site
	}, kvPerson); err != nil {
		log.Fatalf("generateCrawlEntries %v failed: %v", kvPerson.Path, err)
	}
}
Example #2
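// doFill backfills the history store: for every package in the CrawlerDB it
// appends an "unknown" found event dated ten days before its scheduled crawl time.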
func doFill() error {
	cDB := gcse.LoadCrawlerDB()
	return cDB.PackageDB.Iterate(func(pkg string, val interface{}) error {
		ent, ok := val.(gcse.CrawlingEntry)
		if !ok {
			log.Printf("Wrong entry, ignored: %+v", ent)
			return nil
		}
		site, path := utils.SplitPackage(pkg)
		return store.AppendPackageEvent(site, path, "unknown", ent.ScheduleTime.Add(-10*timep.Day), sppb.HistoryEvent_Action_None)
	})
}
Example #3
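// dumpCrawler prints CrawlingEntry records from the crawler's PackageDB: with
// no keys it dumps the whole database, otherwise only the requested packages.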
func dumpCrawler(keys []string) {
	cDB := gcse.LoadCrawlerDB()
	if len(keys) == 0 {
		// Full dump
		log.Printf("Dumping PackageDB...")
		cDB.PackageDB.Iterate(func(k string, v interface{}) error {
			fmtp.Printfln("Package %v: %+v", k, v)
			return nil
		})
		return
	}
	for _, key := range keys {
		var ent gcse.CrawlingEntry
		if cDB.PackageDB.Get(key, &ent) {
			fmtp.Printfln("Package %v: %+v", key, ent)
		}
	}
}
Example #4
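// TestDoFill stores a single package in the CrawlerDB and verifies that doFill
// records the expected "unknown" found event in the package history.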
func TestDoFill(t *testing.T) {
	const (
		site = "github.com"
		path = "daviddengcn/gcse"
	)
	tm := time.Now().Add(-20 * timep.Day)
	cDB := gcse.LoadCrawlerDB()
	cDB.PackageDB.Put(site+"/"+path, gcse.CrawlingEntry{
		ScheduleTime: tm.Add(10 * timep.Day),
	})
	assert.NoError(t, cDB.Sync())

	assert.NoError(t, doFill())

	h, err := store.ReadPackageHistory(site, path)
	assert.NoError(t, err)
	ts, _ := ptypes.TimestampProto(tm)
	assert.Equal(t, "h", h, &sppb.HistoryInfo{
		FoundTime: ts,
		FoundWay:  "unknown",
	})
}
Example #5
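// main runs the crawler: with -pkg/-etag it crawls a single package and exits;
// otherwise it loads the CrawlerDB and the docs DB, crawls the pending package
// and person lists concurrently, processes imports, and syncs the databases.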
func main() {
	defer func() {
		tmpFn := villa.Path("/tmp/gddo")
		if err := tmpFn.RemoveAll(); err != nil {
			log.Printf("Delete %v failed: %v", tmpFn, err)
		}
	}()

	singlePackge := ""
	singleETag := ""
	flag.StringVar(&singlePackge, "pkg", singlePackge, "Crawling single package")
	flag.StringVar(&singleETag, "etag", singleETag, "ETag for single package crawling")

	flag.Parse()

	httpClient := gcse.GenHttpClient("")

	if singlePackge != "" {
		log.Printf("Crawling single package %s ...", singlePackge)
		p, err := gcse.CrawlPackage(httpClient, singlePackge, singleETag)
		if err != nil {
			fmtp.Printfln("Crawling package %s failured: %v", singlePackge, err)
		} else {
			fmtp.Printfln("Package %s: %+v", singlePackge, p)
		}
		return
	}

	log.Println("crawler started...")

	// Load CrawlerDB
	cDB = gcse.LoadCrawlerDB()

	fpDataRoot := sophie.FsPath{
		Fs:   sophie.LocalFS,
		Path: gcse.DataRoot.S(),
	}

	fpDocs := fpDataRoot.Join(gcse.FnDocs)
	if err := loadAllDocsPkgs(kv.DirInput(fpDocs)); err != nil {
		log.Fatalf("loadAllDocsPkgs: %v", err)
	}
	log.Printf("%d docs loaded!", len(allDocsPkgs))

	AppStopTime = time.Now().Add(gcse.CrawlerDuePerRun)

	//pathToCrawl := gcse.DataRoot.Join(gcse.FnToCrawl)
	fpCrawler := fpDataRoot.Join(gcse.FnCrawlerDB)
	fpToCrawl := fpDataRoot.Join(gcse.FnToCrawl)

	fpNewDocs := fpCrawler.Join(gcse.FnNewDocs)
	fpNewDocs.Remove()

	pkgEnd := make(chan error, 1)
	go crawlPackages(httpClient, fpToCrawl.Join(gcse.FnPackage), fpNewDocs,
		pkgEnd)

	psnEnd := make(chan error, 1)
	go crawlPersons(httpClient, fpToCrawl.Join(gcse.FnPerson), psnEnd)

	errPkg, errPsn := <-pkgEnd, <-psnEnd
	if errPkg != nil || errPsn != nil {
		log.Fatalf("Some job may failed, package: %v, person: %v",
			errPkg, errPsn)
	}

	if err := processImports(); err != nil {
		log.Printf("processImports failed: %v", err)
	}

	syncDatabases()
	log.Println("crawler stopped...")
}
Example #6
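// main runs the tocrawl tool: it loads the CrawlerDB, optionally refreshes it
// from GitHub updates and the godoc.org package list (recording "godoc" found
// events in the history store), then writes the package and person crawl lists
// under DataRoot/FnToCrawl.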
func main() {
	log.Println("Running tocrawl tool, to generate crawling list")
	log.Println("NonCrawlHosts: ", configs.NonCrawlHosts)
	log.Println("CrawlGithubUpdate: ", configs.CrawlGithubUpdate)
	log.Println("CrawlByGodocApi: ", configs.CrawlByGodocApi)

	log.Printf("Using personal: %v", configs.CrawlerGithubPersonal)
	gcse.GithubSpider = github.NewSpiderWithToken(configs.CrawlerGithubPersonal)

	// Load CrawlerDB
	cDB = gcse.LoadCrawlerDB()

	if configs.CrawlGithubUpdate || configs.CrawlByGodocApi {
		// load pkgUTs
		pkgUTs, err := loadPackageUpdateTimes(
			sophie.LocalFsPath(configs.DocsDBPath().S()))
		if err != nil {
			log.Fatalf("loadPackageUpdateTimes failed: %v", err)
		}

		if configs.CrawlGithubUpdate {
			touchByGithubUpdates(pkgUTs)
		}

		if configs.CrawlByGodocApi {
			httpClient := gcse.GenHttpClient("")
			pkgs, err := gcse.FetchAllPackagesInGodoc(httpClient)
			if err != nil {
				log.Fatalf("FetchAllPackagesInGodoc failed: %v", err)
			}
			gcse.AddBiValueAndProcess(bi.Max, "godoc.doc-count", len(pkgs))
			log.Printf("FetchAllPackagesInGodoc returns %d entries", len(pkgs))
			now := time.Now()
			for _, pkg := range pkgs {
				cDB.AppendPackage(pkg, func(pkg string) bool {
					_, ok := pkgUTs[pkg]
					return ok
				})
				site, path := utils.SplitPackage(pkg)
				if err := store.AppendPackageEvent(site, path, "godoc", now, sppb.HistoryEvent_Action_None); err != nil {
					log.Printf("UpdatePackageHistory %s %s failed: %v", site, path, err)
				}
			}
		}
		syncDatabases()
	}

	log.Printf("Package DB: %d entries", cDB.PackageDB.Count())
	log.Printf("Person DB: %d entries", cDB.PersonDB.Count())

	pathToCrawl := configs.DataRoot.Join(configs.FnToCrawl)

	kvPackage := kv.DirOutput(sophie.LocalFsPath(
		pathToCrawl.Join(configs.FnPackage).S()))
	kvPackage.Clean()
	if err := generateCrawlEntries(cDB.PackageDB, gcse.HostOfPackage, kvPackage); err != nil {
		log.Fatalf("generateCrawlEntries %v failed: %v", kvPackage.Path, err)
	}

	kvPerson := kv.DirOutput(sophie.LocalFsPath(
		pathToCrawl.Join(configs.FnPerson).S()))

	kvPerson.Clean()
	if err := generateCrawlEntries(cDB.PersonDB, func(id string) string {
		site, _ := gcse.ParsePersonId(id)
		return site
	}, kvPerson); err != nil {
		log.Fatalf("generateCrawlEntries %v failed: %v", kvPerson.Path, err)
	}
}
Example #7
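// main runs the crawler with a GitHub spider and an optional Bolt file cache:
// -pkg/-etag or -person crawl a single target and exit; otherwise it processes
// imports, then crawls the pending package and person lists concurrently and
// syncs the databases.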
func main() {
	runtime.GOMAXPROCS(2)

	log.Printf("Using personal: %v", configs.CrawlerGithubPersonal)
	gcse.GithubSpider = github.NewSpiderWithToken(configs.CrawlerGithubPersonal)

	if db, err := bh.Open(configs.DataRoot.Join("filecache.bolt").S(), 0644, nil); err == nil {
		log.Print("Using file cache!")
		gcse.GithubSpider.FileCache = spider.BoltFileCache{
			DB:         db,
			IncCounter: bi.Inc,
		}
	} else {
		log.Printf("Open file cache failed: %v", err)
	}

	cleanTempDir()
	defer cleanTempDir()

	singlePackage := flag.String("pkg", "", "Crawling a single package")
	singleETag := flag.String("etag", "", "ETag for the single package crawling")
	singlePerson := flag.String("person", "", "Crawling a single person")

	flag.Parse()

	httpClient := gcse.GenHttpClient("")

	if *singlePerson != "" {
		log.Printf("Crawling single person %s ...", *singlePerson)
		p, err := gcse.CrawlPerson(httpClient, *singlePerson)
		if err != nil {
			fmtp.Printfln("Crawling person %s failed: %v", *singlePerson, err)
		} else {
			fmtp.Printfln("Person %s: %+v", *singlePerson, p)
		}
	}
	if *singlePackage != "" {
		log.Printf("Crawling single package %s ...", *singlePackage)
		p, flds, err := gcse.CrawlPackage(httpClient, *singlePackage, *singleETag)
		if err != nil {
			fmtp.Printfln("Crawling package %s failed: %v, folders: %v", *singlePackage, err, flds)
		} else {
			fmtp.Printfln("Package %s: %+v, folders: %v", *singlePackage, p, flds)
		}
	}
	if *singlePackage != "" || *singlePerson != "" {
		return
	}

	log.Println("crawler started...")

	// Load CrawlerDB
	cDB = gcse.LoadCrawlerDB()

	fpDataRoot := sophie.FsPath{
		Fs:   sophie.LocalFS,
		Path: configs.DataRoot.S(),
	}

	fpDocs := fpDataRoot.Join(configs.FnDocs)
	if err := loadAllDocsPkgs(kv.DirInput(fpDocs)); err != nil {
		log.Fatalf("loadAllDocsPkgs: %v", err)
	}
	log.Printf("%d docs loaded!", len(allDocsPkgs))

	AppStopTime = time.Now().Add(configs.CrawlerDuePerRun)

	//pathToCrawl := gcse.DataRoot.Join(gcse.FnToCrawl)
	fpCrawler := fpDataRoot.Join(configs.FnCrawlerDB)
	fpToCrawl := fpDataRoot.Join(configs.FnToCrawl)

	fpNewDocs := fpCrawler.Join(configs.FnNewDocs)
	fpNewDocs.Remove()

	if err := processImports(); err != nil {
		log.Printf("processImports failed: %v", err)
	}

	pkgEnd := make(chan error, 1)
	go crawlPackages(httpClient, fpToCrawl.Join(configs.FnPackage), fpNewDocs, pkgEnd)

	psnEnd := make(chan error, 1)
	go crawlPersons(httpClient, fpToCrawl.Join(configs.FnPerson), psnEnd)

	errPkg, errPsn := <-pkgEnd, <-psnEnd
	bi.Flush()
	bi.Process()
	syncDatabases()
	if errPkg != nil || errPsn != nil {
		log.Fatalf("Some job may failed, package: %v, person: %v", errPkg, errPsn)
	}
	log.Println("crawler stopped...")
}