Example #1
0
// processTimeRange calls gs.GetLatestGSDirs to get a list of directory
// prefixes covering [start, end), then lists and processes every JSON
// object found under each prefix in the configured storage bucket.
func (xformer *pdfXformer) processTimeRange(start time.Time, end time.Time) {
	glog.Infof("Processing time range: (%s, %s)", start.Truncate(time.Second), end.Truncate(time.Second))
	for _, dir := range gs.GetLatestGSDirs(start.Unix(), end.Unix(), *storageJsonDirectory) {
		glog.Infof("> Reading gs://%s/%s\n", *storageBucket, dir)
		// Request only the fields we actually consume to keep responses small.
		requestedObjects := xformer.client.storageService.Objects.List(*storageBucket).Prefix(dir).Fields(
			"nextPageToken", "items/updated", "items/md5Hash", "items/mediaLink", "items/name", "items/metadata")
		// Page through the listing; requestedObjects is set to nil when the
		// last page has been consumed.
		for requestedObjects != nil {
			responseObjects, err := requestedObjects.Do()
			if err != nil {
				glog.Errorf("request %#v failed: %s", requestedObjects, err)
				// responseObjects is nil on error, so reading
				// NextPageToken below would panic — skip to the next dir.
				break
			}
			for _, jsonObject := range responseObjects.Items {
				xformer.counter++
				glog.Infof("> > Processing object:  gs://%s/%s {%d}", *storageBucket, jsonObject.Name, xformer.counter)
				xformer.processJsonFile(jsonObject)
			}
			if len(responseObjects.NextPageToken) > 0 {
				requestedObjects.PageToken(responseObjects.NextPageToken)
			} else {
				requestedObjects = nil
			}
		}
	}
	glog.Infof("finished time range.")
}
Example #2
0
// getGSResultsFileLocations retrieves a list of ResultsFileLocations from
// Cloud Storage, each one corresponding to a single JSON file.
func getGSResultsFileLocations(startTS int64, endTS int64, storage *storage.Service, bucket, dir string) ([]*ResultsFileLocation, error) {
	searchDirs := gs.GetLatestGSDirs(startTS, endTS, dir)
	glog.Infof("getGSResultsFileLocations: Looking in bucket %s and dirs: %v ", bucket, searchDirs)

	locations := []*ResultsFileLocation{}
	for _, searchDir := range searchDirs {
		found, err := getFilesFromGSDir(bucket, searchDir, startTS, storage)
		if err != nil {
			return nil, err
		}
		locations = append(locations, found...)
	}
	return locations, nil
}
Example #3
0
// getLocalResultsFileLocations retrieves a list of ResultsFileLocations from
// a local directory tree laid out like the Cloud Storage buckets.
func getLocalResultsFileLocations(startTS, endTS int64, localDir string) ([]*ResultsFileLocation, error) {
	glog.Infof("getLocalResultsFileLocations: Looking in local directory %s", localDir)

	// GetLatestGSDirs lives in the "gs" package, but nothing about its
	// operation is storage-specific, so it is re-usable for local paths too.
	locations := []*ResultsFileLocation{}
	for _, candidate := range gs.GetLatestGSDirs(startTS, endTS, localDir) {
		found, err := getFilesFromLocalDir(candidate, startTS)
		if err != nil {
			return nil, err
		}
		locations = append(locations, found...)
	}

	return locations, nil
}
Example #4
0
// See Source interface.
func (f *FileSystemSource) Poll(startTime, endTime int64) ([]ResultFileLocation, error) {
	results := []ResultFileLocation{}

	// GetLatestGSDirs lives in the "gs" package, but nothing about its
	// operation is storage-specific, so it is re-usable for local paths too.
	for _, dir := range gs.GetLatestGSDirs(startTime, endTime, f.rootDir) {
		// Only walk the tree if the top directory exists.
		if !fileutil.FileExists(dir) {
			continue
		}

		err := filepath.Walk(dir, func(path string, info os.FileInfo, walkErr error) error {
			if walkErr != nil {
				// Swallow the error to continue processing, but make sure it
				// shows up in the logs.
				glog.Errorf("Error walking %s: %s", path, walkErr)
				return nil
			}
			if info.IsDir() {
				return nil
			}

			// Only pick up files modified after the start of the window.
			if info.ModTime().Unix() > startTime {
				rf, resErr := FileSystemResult(path)
				if resErr != nil {
					glog.Errorf("Unable to create file system result: %s", resErr)
					return nil
				}
				results = append(results, rf)
			}
			return nil
		})
		if err != nil {
			glog.Infof("Unable to read the local dir %s: %s", dir, err)
		}
	}

	return results, nil
}
Example #5
0
// See Source interface.
func (g *GoogleStorageSource) Poll(startTime, endTime int64) ([]ResultFileLocation, error) {
	retval := []ResultFileLocation{}
	for _, dir := range gs.GetLatestGSDirs(startTime, endTime, g.rootDir) {
		glog.Infof("Opening bucket/directory: %s/%s", g.bucket, dir)

		// Request only the fields we consume; page through the listing until
		// there is no NextPageToken.
		listCall := g.gStorage.Objects.List(g.bucket).Prefix(dir).Fields("nextPageToken", "items/updated", "items/md5Hash", "items/mediaLink", "items/name")
		for listCall != nil {
			page, err := listCall.Do()
			if err != nil {
				return nil, fmt.Errorf("Error occurred while listing JSON files: %s", err)
			}
			for _, obj := range page.Items {
				updated, err := time.Parse(time.RFC3339, obj.Updated)
				if err != nil {
					glog.Errorf("Unable to parse date %s: %s", obj.Updated, err)
					continue
				}
				// Skip objects not updated after the start of the window.
				ts := updated.Unix()
				if ts <= startTime {
					continue
				}
				// Decode the MD5 hash from base64.
				md5Bytes, err := base64.StdEncoding.DecodeString(obj.Md5Hash)
				if err != nil {
					glog.Errorf("Unable to decode base64-encoded MD5: %s", err)
					continue
				}
				// We re-encode the md5 hash as a hex string to make debugging and testing easier.
				retval = append(retval, newGSResultFileLocation(obj, ts, hex.EncodeToString(md5Bytes), g.client))
			}
			if len(page.NextPageToken) > 0 {
				listCall.PageToken(page.NextPageToken)
			} else {
				listCall = nil
			}
		}
	}
	return retval, nil
}