// GetCollections gets collections from api func GetCollections(params GetCollectionsParams) (results ReadCollections) { if ¶ms.Client == nil { log.Fatalf("params.Client passed to GetCollections() should " + "contain a valid ArvadosClient, but instead it is nil.") } fieldsWanted := []string{"manifest_text", "owner_uuid", "uuid", "redundancy", "modified_at"} sdkParams := arvadosclient.Dict{ "select": fieldsWanted, "order": []string{"modified_at ASC"}, "filters": [][]string{[]string{"modified_at", ">=", "1900-01-01T00:00:00Z"}}} if params.BatchSize > 0 { sdkParams["limit"] = params.BatchSize } var defaultReplicationLevel int { value, err := params.Client.Discovery("defaultCollectionReplication") if err != nil { loggerutil.FatalWithMessage(params.Logger, fmt.Sprintf("Error querying default collection replication: %v", err)) } defaultReplicationLevel = int(value.(float64)) if defaultReplicationLevel <= 0 { loggerutil.FatalWithMessage(params.Logger, fmt.Sprintf("Default collection replication returned by arvados SDK "+ "should be a positive integer but instead it was %d.", defaultReplicationLevel)) } } initialNumberOfCollectionsAvailable, err := util.NumberItemsAvailable(params.Client, "collections") if err != nil { loggerutil.FatalWithMessage(params.Logger, fmt.Sprintf("Error querying collection count: %v", err)) } // Include a 1% margin for collections added while we're reading so // that we don't have to grow the map in most cases. maxExpectedCollections := int( float64(initialNumberOfCollectionsAvailable) * 1.01) results.UUIDToCollection = make(map[string]Collection, maxExpectedCollections) if params.Logger != nil { params.Logger.Update(func(p map[string]interface{}, e map[string]interface{}) { collectionInfo := logger.GetOrCreateMap(p, "collection_info") collectionInfo["num_collections_at_start"] = initialNumberOfCollectionsAvailable collectionInfo["batch_size"] = params.BatchSize collectionInfo["default_replication_level"] = defaultReplicationLevel }) } // These values are just for getting the loop to run the first time, // afterwards they'll be set to real values. previousTotalCollections := -1 totalCollections := 0 for totalCollections > previousTotalCollections { // We're still finding new collections // Write the heap profile for examining memory usage WriteHeapProfile() // Get next batch of collections. var collections SdkCollectionList err := params.Client.List("collections", sdkParams, &collections) if err != nil { loggerutil.FatalWithMessage(params.Logger, fmt.Sprintf("Error querying collections: %v", err)) } // Process collection and update our date filter. sdkParams["filters"].([][]string)[0][2] = ProcessCollections(params.Logger, collections.Items, defaultReplicationLevel, results.UUIDToCollection).Format(time.RFC3339) // update counts previousTotalCollections = totalCollections totalCollections = len(results.UUIDToCollection) log.Printf("%d collections read, %d new in last batch, "+ "%s latest modified date, %.0f %d %d avg,max,total manifest size", totalCollections, totalCollections-previousTotalCollections, sdkParams["filters"].([][]string)[0][2], float32(totalManifestSize)/float32(totalCollections), maxManifestSize, totalManifestSize) if params.Logger != nil { params.Logger.Update(func(p map[string]interface{}, e map[string]interface{}) { collectionInfo := logger.GetOrCreateMap(p, "collection_info") collectionInfo["collections_read"] = totalCollections collectionInfo["latest_modified_date_seen"] = sdkParams["filters"].([][]string)[0][2] collectionInfo["total_manifest_size"] = totalManifestSize collectionInfo["max_manifest_size"] = maxManifestSize }) } } // Write the heap profile for examining memory usage WriteHeapProfile() return }
// GetCollections gets collections from api func GetCollections(params GetCollectionsParams) (results ReadCollections, err error) { if ¶ms.Client == nil { err = fmt.Errorf("params.Client passed to GetCollections() should " + "contain a valid ArvadosClient, but instead it is nil.") return } fieldsWanted := []string{"manifest_text", "owner_uuid", "uuid", "replication_desired", "modified_at"} sdkParams := arvadosclient.Dict{ "select": fieldsWanted, "order": []string{"modified_at ASC", "uuid ASC"}, "filters": [][]string{[]string{"modified_at", ">=", "1900-01-01T00:00:00Z"}}, "offset": 0} if params.BatchSize > 0 { sdkParams["limit"] = params.BatchSize } var defaultReplicationLevel int { var value interface{} value, err = params.Client.Discovery("defaultCollectionReplication") if err != nil { return } defaultReplicationLevel = int(value.(float64)) if defaultReplicationLevel <= 0 { err = fmt.Errorf("Default collection replication returned by arvados SDK "+ "should be a positive integer but instead it was %d.", defaultReplicationLevel) return } } initialNumberOfCollectionsAvailable, err := util.NumberItemsAvailable(params.Client, "collections") if err != nil { return } // Include a 1% margin for collections added while we're reading so // that we don't have to grow the map in most cases. maxExpectedCollections := int( float64(initialNumberOfCollectionsAvailable) * 1.01) results.UUIDToCollection = make(map[string]Collection, maxExpectedCollections) if params.Logger != nil { params.Logger.Update(func(p map[string]interface{}, e map[string]interface{}) { collectionInfo := logger.GetOrCreateMap(p, "collection_info") collectionInfo["num_collections_at_start"] = initialNumberOfCollectionsAvailable collectionInfo["batch_size"] = params.BatchSize collectionInfo["default_replication_level"] = defaultReplicationLevel }) } // These values are just for getting the loop to run the first time, // afterwards they'll be set to real values. remainingCollections := 1 var totalCollections int var previousTotalCollections int for remainingCollections > 0 { // We're still finding new collections // Write the heap profile for examining memory usage err = WriteHeapProfile() if err != nil { return } // Get next batch of collections. var collections SdkCollectionList err = params.Client.List("collections", sdkParams, &collections) if err != nil { return } batchCollections := len(collections.Items) // We must always have at least one collection in the batch if batchCollections < 1 { err = fmt.Errorf("API query returned no collections for %+v", sdkParams) return } // Update count of remaining collections remainingCollections = collections.ItemsAvailable - sdkParams["offset"].(int) - batchCollections // Process collection and update our date filter. latestModificationDate, maxManifestSize, totalManifestSize, err := ProcessCollections(params.Logger, collections.Items, defaultReplicationLevel, results.UUIDToCollection) if err != nil { return results, err } if sdkParams["filters"].([][]string)[0][2] != latestModificationDate.Format(time.RFC3339) { sdkParams["filters"].([][]string)[0][2] = latestModificationDate.Format(time.RFC3339) sdkParams["offset"] = 0 } else { sdkParams["offset"] = sdkParams["offset"].(int) + batchCollections } // update counts previousTotalCollections = totalCollections totalCollections = len(results.UUIDToCollection) log.Printf("%d collections read, %d (%d new) in last batch, "+ "%d remaining, "+ "%s latest modified date, %.0f %d %d avg,max,total manifest size", totalCollections, batchCollections, totalCollections-previousTotalCollections, remainingCollections, sdkParams["filters"].([][]string)[0][2], float32(totalManifestSize)/float32(totalCollections), maxManifestSize, totalManifestSize) if params.Logger != nil { params.Logger.Update(func(p map[string]interface{}, e map[string]interface{}) { collectionInfo := logger.GetOrCreateMap(p, "collection_info") collectionInfo["collections_read"] = totalCollections collectionInfo["latest_modified_date_seen"] = sdkParams["filters"].([][]string)[0][2] collectionInfo["total_manifest_size"] = totalManifestSize collectionInfo["max_manifest_size"] = maxManifestSize }) } } // Make one final API request to verify that we have processed all collections available up to the latest modification date var collections SdkCollectionList sdkParams["filters"].([][]string)[0][1] = "<=" sdkParams["limit"] = 0 err = params.Client.List("collections", sdkParams, &collections) if err != nil { return } finalNumberOfCollectionsAvailable, err := util.NumberItemsAvailable(params.Client, "collections") if err != nil { return } if totalCollections < finalNumberOfCollectionsAvailable { err = fmt.Errorf("API server indicates a total of %d collections "+ "available up to %v, but we only retrieved %d. "+ "Refusing to continue as this could indicate an "+ "otherwise undetected failure.", finalNumberOfCollectionsAvailable, sdkParams["filters"].([][]string)[0][2], totalCollections) return } // Write the heap profile for examining memory usage err = WriteHeapProfile() return }