Example #1
0
// GetCollections reads collections through params.Client and returns them in
// results.UUIDToCollection, which is filled in by ProcessCollections.
//
// Collections are requested in batches ordered by modified_at ascending. After
// each batch the "modified_at >=" filter is advanced to the timestamp returned
// by ProcessCollections, and the loop ends as soon as a batch adds no new
// entry to the map. When params.BatchSize is positive it is sent as the API
// "limit". Progress is written with log.Printf and, when params.Logger is
// non-nil, recorded in its "collection_info" map.
//
// Failures (discovery lookup, collection count, list request, or an unusable
// default replication level) are reported through loggerutil.FatalWithMessage
// instead of being returned; that helper presumably terminates the process.
//
// NOTE(review): because the loop stops when a batch adds nothing new, it looks
// like more collections than the batch limit sharing one modified_at value
// would end the scan early — confirm against ProcessCollections and the API.
func GetCollections(params GetCollectionsParams) (results ReadCollections) {
	// NOTE(review): the address of a struct field is never nil, so this guard
	// can never fire. The intended check depends on the declared type of
	// Client, which is not visible here.
	if &params.Client == nil {
		log.Fatalf("params.Client passed to GetCollections() should " +
			"contain a valid ArvadosClient, but instead it is nil.")
	}

	fieldsWanted := []string{"manifest_text",
		"owner_uuid",
		"uuid",
		"redundancy",
		"modified_at"}

	// The single filter's value (index [0][2]) is rewritten after every batch
	// so the next request resumes from the latest modification date seen.
	sdkParams := arvadosclient.Dict{
		"select":  fieldsWanted,
		"order":   []string{"modified_at ASC"},
		"filters": [][]string{[]string{"modified_at", ">=", "1900-01-01T00:00:00Z"}}}

	if params.BatchSize > 0 {
		sdkParams["limit"] = params.BatchSize
	}

	var defaultReplicationLevel int
	{
		value, err := params.Client.Discovery("defaultCollectionReplication")
		if err != nil {
			loggerutil.FatalWithMessage(params.Logger,
				fmt.Sprintf("Error querying default collection replication: %v", err))
		}

		// Use a checked assertion so an unexpected discovery value is reported
		// through the same fatal path as other failures instead of panicking.
		level, ok := value.(float64)
		if !ok {
			loggerutil.FatalWithMessage(params.Logger,
				fmt.Sprintf("Default collection replication returned by arvados SDK "+
					"should be a number but instead it was %T (%v).",
					value, value))
		}

		defaultReplicationLevel = int(level)
		if defaultReplicationLevel <= 0 {
			loggerutil.FatalWithMessage(params.Logger,
				fmt.Sprintf("Default collection replication returned by arvados SDK "+
					"should be a positive integer but instead it was %d.",
					defaultReplicationLevel))
		}
	}

	initialNumberOfCollectionsAvailable, err :=
		util.NumberItemsAvailable(params.Client, "collections")
	if err != nil {
		loggerutil.FatalWithMessage(params.Logger,
			fmt.Sprintf("Error querying collection count: %v", err))
	}
	// Include a 1% margin for collections added while we're reading so
	// that we don't have to grow the map in most cases.
	maxExpectedCollections := int(
		float64(initialNumberOfCollectionsAvailable) * 1.01)
	results.UUIDToCollection = make(map[string]Collection, maxExpectedCollections)

	if params.Logger != nil {
		params.Logger.Update(func(p map[string]interface{}, e map[string]interface{}) {
			collectionInfo := logger.GetOrCreateMap(p, "collection_info")
			collectionInfo["num_collections_at_start"] = initialNumberOfCollectionsAvailable
			collectionInfo["batch_size"] = params.BatchSize
			collectionInfo["default_replication_level"] = defaultReplicationLevel
		})
	}

	// These values are just for getting the loop to run the first time,
	// afterwards they'll be set to real values.
	previousTotalCollections := -1
	totalCollections := 0
	for totalCollections > previousTotalCollections {
		// We're still finding new collections

		// Write the heap profile for examining memory usage
		WriteHeapProfile()

		// Get next batch of collections.
		var collections SdkCollectionList
		if err := params.Client.List("collections", sdkParams, &collections); err != nil {
			loggerutil.FatalWithMessage(params.Logger,
				fmt.Sprintf("Error querying collections: %v", err))
		}

		// Process collection and update our date filter.
		sdkParams["filters"].([][]string)[0][2] =
			ProcessCollections(params.Logger,
				collections.Items,
				defaultReplicationLevel,
				results.UUIDToCollection).Format(time.RFC3339)

		// update counts
		previousTotalCollections = totalCollections
		totalCollections = len(results.UUIDToCollection)

		// totalManifestSize and maxManifestSize are not declared in this
		// function; presumably they are package-level counters maintained by
		// ProcessCollections. Guard the average so an empty result set logs 0
		// rather than NaN.
		var avgManifestSize float32
		if totalCollections > 0 {
			avgManifestSize = float32(totalManifestSize) / float32(totalCollections)
		}

		log.Printf("%d collections read, %d new in last batch, "+
			"%s latest modified date, %.0f %d %d avg,max,total manifest size",
			totalCollections,
			totalCollections-previousTotalCollections,
			sdkParams["filters"].([][]string)[0][2],
			avgManifestSize,
			maxManifestSize, totalManifestSize)

		if params.Logger != nil {
			params.Logger.Update(func(p map[string]interface{}, e map[string]interface{}) {
				collectionInfo := logger.GetOrCreateMap(p, "collection_info")
				collectionInfo["collections_read"] = totalCollections
				collectionInfo["latest_modified_date_seen"] = sdkParams["filters"].([][]string)[0][2]
				collectionInfo["total_manifest_size"] = totalManifestSize
				collectionInfo["max_manifest_size"] = maxManifestSize
			})
		}
	}

	// Write the heap profile for examining memory usage
	WriteHeapProfile()

	return
}
Example #2
0
// GetCollections reads collections through params.Client and returns them in
// results.UUIDToCollection, which is filled in by ProcessCollections.
//
// Collections are requested in batches ordered by modified_at and then uuid,
// both ascending. After each batch the "modified_at >=" filter is advanced to
// the timestamp returned by ProcessCollections and the offset is reset; when
// that timestamp does not change, the offset is advanced by the batch size
// instead so that the same page is not requested again. The loop runs until
// the API reports no remaining items for the current filter. When
// params.BatchSize is positive it is sent as the API "limit". Progress is
// written with log.Printf and, when params.Logger is non-nil, recorded in its
// "collection_info" map.
//
// Once the loop ends, one more request with a "modified_at <=" filter and a
// limit of 0 asks how many collections exist up to the latest modification
// date seen; an error is returned if fewer were retrieved.
//
// A non-nil err is returned when the discovery lookup, the collection count,
// a list request, ProcessCollections or WriteHeapProfile fails, when the
// default replication level is not a positive number, when a batch comes back
// empty, or when the final verification fails. results may be partially
// populated in that case and should not be used.
func GetCollections(params GetCollectionsParams) (results ReadCollections, err error) {
	// NOTE(review): the address of a struct field is never nil, so this guard
	// can never fire. The intended check depends on the declared type of
	// Client, which is not visible here.
	if &params.Client == nil {
		err = fmt.Errorf("params.Client passed to GetCollections() should " +
			"contain a valid ArvadosClient, but instead it is nil.")
		return
	}

	fieldsWanted := []string{"manifest_text",
		"owner_uuid",
		"uuid",
		"replication_desired",
		"modified_at"}

	// The single filter's value (index [0][2]) and "offset" are rewritten
	// after every batch to page through the results.
	sdkParams := arvadosclient.Dict{
		"select":  fieldsWanted,
		"order":   []string{"modified_at ASC", "uuid ASC"},
		"filters": [][]string{[]string{"modified_at", ">=", "1900-01-01T00:00:00Z"}},
		"offset":  0}

	if params.BatchSize > 0 {
		sdkParams["limit"] = params.BatchSize
	}

	var defaultReplicationLevel int
	{
		var value interface{}
		value, err = params.Client.Discovery("defaultCollectionReplication")
		if err != nil {
			return
		}

		// Use a checked assertion so an unexpected discovery value becomes a
		// returned error instead of a panic.
		level, ok := value.(float64)
		if !ok {
			err = fmt.Errorf("Default collection replication returned by arvados SDK "+
				"should be a number but instead it was %T (%v).",
				value, value)
			return
		}

		defaultReplicationLevel = int(level)
		if defaultReplicationLevel <= 0 {
			err = fmt.Errorf("Default collection replication returned by arvados SDK "+
				"should be a positive integer but instead it was %d.",
				defaultReplicationLevel)
			return
		}
	}

	initialNumberOfCollectionsAvailable, err :=
		util.NumberItemsAvailable(params.Client, "collections")
	if err != nil {
		return
	}
	// Include a 1% margin for collections added while we're reading so
	// that we don't have to grow the map in most cases.
	maxExpectedCollections := int(
		float64(initialNumberOfCollectionsAvailable) * 1.01)
	results.UUIDToCollection = make(map[string]Collection, maxExpectedCollections)

	if params.Logger != nil {
		params.Logger.Update(func(p map[string]interface{}, e map[string]interface{}) {
			collectionInfo := logger.GetOrCreateMap(p, "collection_info")
			collectionInfo["num_collections_at_start"] = initialNumberOfCollectionsAvailable
			collectionInfo["batch_size"] = params.BatchSize
			collectionInfo["default_replication_level"] = defaultReplicationLevel
		})
	}

	// remainingCollections starts at 1 only so the loop runs the first time;
	// afterwards it is set from the API response.
	remainingCollections := 1
	var totalCollections int
	var previousTotalCollections int
	for remainingCollections > 0 {
		// We're still finding new collections

		// Write the heap profile for examining memory usage
		err = WriteHeapProfile()
		if err != nil {
			return
		}

		// Get next batch of collections.
		var collections SdkCollectionList
		err = params.Client.List("collections", sdkParams, &collections)
		if err != nil {
			return
		}
		batchCollections := len(collections.Items)

		// We must always have at least one collection in the batch
		if batchCollections < 1 {
			err = fmt.Errorf("API query returned no collections for %+v", sdkParams)
			return
		}

		// Update count of remaining collections
		remainingCollections = collections.ItemsAvailable - sdkParams["offset"].(int) - batchCollections

		// Process collection and update our date filter. A separate error
		// variable is used so the named result err is not shadowed.
		latestModificationDate, maxManifestSize, totalManifestSize, processErr := ProcessCollections(params.Logger,
			collections.Items,
			defaultReplicationLevel,
			results.UUIDToCollection)
		if processErr != nil {
			err = processErr
			return
		}
		latest := latestModificationDate.Format(time.RFC3339)
		if sdkParams["filters"].([][]string)[0][2] != latest {
			// The date moved forward: restart paging from the new date.
			sdkParams["filters"].([][]string)[0][2] = latest
			sdkParams["offset"] = 0
		} else {
			// Same date as before: skip past what this filter already returned.
			sdkParams["offset"] = sdkParams["offset"].(int) + batchCollections
		}

		// update counts
		previousTotalCollections = totalCollections
		totalCollections = len(results.UUIDToCollection)

		log.Printf("%d collections read, %d (%d new) in last batch, "+
			"%d remaining, "+
			"%s latest modified date, %.0f %d %d avg,max,total manifest size",
			totalCollections,
			batchCollections,
			totalCollections-previousTotalCollections,
			remainingCollections,
			sdkParams["filters"].([][]string)[0][2],
			float32(totalManifestSize)/float32(totalCollections),
			maxManifestSize, totalManifestSize)

		if params.Logger != nil {
			params.Logger.Update(func(p map[string]interface{}, e map[string]interface{}) {
				collectionInfo := logger.GetOrCreateMap(p, "collection_info")
				collectionInfo["collections_read"] = totalCollections
				collectionInfo["latest_modified_date_seen"] = sdkParams["filters"].([][]string)[0][2]
				collectionInfo["total_manifest_size"] = totalManifestSize
				collectionInfo["max_manifest_size"] = maxManifestSize
			})
		}
	}

	// Make one final API request to verify that we have processed all
	// collections available up to the latest modification date. With limit 0
	// only the count reported by the server is of interest.
	var collections SdkCollectionList
	sdkParams["filters"].([][]string)[0][1] = "<="
	sdkParams["limit"] = 0
	err = params.Client.List("collections", sdkParams, &collections)
	if err != nil {
		return
	}
	// Use the count from the filtered request above, so the comparison really
	// is "up to the latest modification date" as the error message states.
	finalNumberOfCollectionsAvailable := collections.ItemsAvailable
	if totalCollections < finalNumberOfCollectionsAvailable {
		err = fmt.Errorf("API server indicates a total of %d collections "+
			"available up to %v, but we only retrieved %d. "+
			"Refusing to continue as this could indicate an "+
			"otherwise undetected failure.",
			finalNumberOfCollectionsAvailable,
			sdkParams["filters"].([][]string)[0][2],
			totalCollections)
		return
	}

	// Write the heap profile for examining memory usage
	err = WriteHeapProfile()

	return
}