// processTimeRange calls gs.GetLatestGSDirs to get a list of func (xformer *pdfXformer) processTimeRange(start time.Time, end time.Time) { glog.Infof("Processing time range: (%s, %s)", start.Truncate(time.Second), end.Truncate(time.Second)) for _, dir := range gs.GetLatestGSDirs(start.Unix(), end.Unix(), *storageJsonDirectory) { glog.Infof("> Reading gs://%s/%s\n", *storageBucket, dir) requestedObjects := xformer.client.storageService.Objects.List(*storageBucket).Prefix(dir).Fields( "nextPageToken", "items/updated", "items/md5Hash", "items/mediaLink", "items/name", "items/metadata") for requestedObjects != nil { responseObjects, err := requestedObjects.Do() if err != nil { glog.Errorf("request %#v failed: %s", requestedObjects, err) } else { for _, jsonObject := range responseObjects.Items { xformer.counter++ glog.Infof("> > Processing object: gs://%s/%s {%d}", *storageBucket, jsonObject.Name, xformer.counter) xformer.processJsonFile(jsonObject) } } if len(responseObjects.NextPageToken) > 0 { requestedObjects.PageToken(responseObjects.NextPageToken) } else { requestedObjects = nil } } } glog.Infof("finished time range.") }
// GetGSResultsFileLocations retrieves a list of ResultsFileLocations from Cloud Storage, each one // corresponding to a single JSON file. func getGSResultsFileLocations(startTS int64, endTS int64, storage *storage.Service, bucket, dir string) ([]*ResultsFileLocation, error) { dirs := gs.GetLatestGSDirs(startTS, endTS, dir) glog.Infof("getGSResultsFileLocations: Looking in bucket %s and dirs: %v ", bucket, dirs) retval := []*ResultsFileLocation{} for _, dir := range dirs { files, err := getFilesFromGSDir(bucket, dir, startTS, storage) if err != nil { return nil, err } retval = append(retval, files...) } return retval, nil }
func getLocalResultsFileLocations(startTS, endTS int64, localDir string) ([]*ResultsFileLocation, error) { retval := []*ResultsFileLocation{} glog.Infof("getLocalResultsFileLocations: Looking in local directory %s", localDir) // although GetLatestGSDirs is in the "gs" package, there's nothing specific about // its operation that makes it not re-usable here. dirs := gs.GetLatestGSDirs(startTS, endTS, localDir) for _, dir := range dirs { files, err := getFilesFromLocalDir(dir, startTS) if err != nil { return nil, err } retval = append(retval, files...) } return retval, nil }
// See Source interface. func (f *FileSystemSource) Poll(startTime, endTime int64) ([]ResultFileLocation, error) { retval := []ResultFileLocation{} // although GetLatestGSDirs is in the "gs" package, there's nothing specific about // its operation that makes it not re-usable here. dirs := gs.GetLatestGSDirs(startTime, endTime, f.rootDir) for _, dir := range dirs { // Inject dir into a closure. func(dir string) { walkFn := func(path string, info os.FileInfo, err error) error { if err != nil { // We swallow the error to continue processing, but make sure it's // shows up in the logs. glog.Errorf("Error walking %s: %s", path, err) return nil } if info.IsDir() { return nil } updateTimestamp := info.ModTime().Unix() if updateTimestamp > startTime { rf, err := FileSystemResult(path) if err != nil { glog.Errorf("Unable to create file system result: %s", err) return nil } retval = append(retval, rf) } return nil } // Only walk the tree if the top directory exists. if fileutil.FileExists(dir) { if err := filepath.Walk(dir, walkFn); err != nil { glog.Infof("Unable to read the local dir %s: %s", dir, err) return } } }(dir) } return retval, nil }
// See Source interface. func (g *GoogleStorageSource) Poll(startTime, endTime int64) ([]ResultFileLocation, error) { dirs := gs.GetLatestGSDirs(startTime, endTime, g.rootDir) retval := []ResultFileLocation{} for _, dir := range dirs { glog.Infof("Opening bucket/directory: %s/%s", g.bucket, dir) req := g.gStorage.Objects.List(g.bucket).Prefix(dir).Fields("nextPageToken", "items/updated", "items/md5Hash", "items/mediaLink", "items/name") for req != nil { resp, err := req.Do() if err != nil { return nil, fmt.Errorf("Error occurred while listing JSON files: %s", err) } for _, result := range resp.Items { updateDate, err := time.Parse(time.RFC3339, result.Updated) if err != nil { glog.Errorf("Unable to parse date %s: %s", result.Updated, err) continue } updateTimestamp := updateDate.Unix() if updateTimestamp > startTime { // Decode the MD5 hash from base64. md5Bytes, err := base64.StdEncoding.DecodeString(result.Md5Hash) if err != nil { glog.Errorf("Unable to decode base64-encoded MD5: %s", err) continue } // We re-encode the md5 hash as a hex string to make debugging and testing easier. retval = append(retval, newGSResultFileLocation(result, updateTimestamp, hex.EncodeToString(md5Bytes), g.client)) } } if len(resp.NextPageToken) > 0 { req.PageToken(resp.NextPageToken) } else { req = nil } } } return retval, nil }