Example #1
0
// countCollections reports how many collections the API server says
// match params.
//
// It issues a collection list request with Limit forced to zero and
// returns the ItemsAvailable field of the decoded response, together
// with any error from the request.
func countCollections(c *arvados.Client, params arvados.ResourceListParams) (int, error) {
	// params was passed by value, so overriding Limit here does
	// not touch the caller's copy.
	noItems := 0
	params.Limit = &noItems

	var resp arvados.CollectionList
	err := c.RequestAndDecode(&resp, "GET", "arvados/v1/collections", nil, params)
	return resp.ItemsAvailable, err
}
Example #2
0
// Perform a PUT request at path, with data (as JSON) in the request
// body.
//
// The request URL is srv.URLBase() + "/" + path. The response is
// not decoded into anything (nil is passed to DoAndDecode).
//
// put returns an error if the request cannot be built, if data
// cannot be encoded, or if DoAndDecode reports an error.
func (srv *KeepService) put(c *arvados.Client, path string, data interface{}) error {
	// We'll start a goroutine to do the JSON encoding, so we can
	// stream it to the http client through a Pipe, rather than
	// keeping the entire encoded version in memory.
	jsonR, jsonW := io.Pipe()

	// errC communicates any encoding errors back to our main
	// goroutine. It is buffered so the encoder goroutine can
	// always exit, even if nobody receives its result.
	errC := make(chan error, 1)

	go func() {
		encErr := json.NewEncoder(jsonW).Encode(data)
		// Hand the encoder's result to the reading side:
		// CloseWithError(nil) behaves like Close (the reader
		// sees EOF), while a non-nil error makes the reader
		// fail instead of seeing a truncated body as complete.
		jsonW.CloseWithError(encErr)
		errC <- encErr
	}()

	url := srv.URLBase() + "/" + path
	req, err := http.NewRequest("PUT", url, ioutil.NopCloser(jsonR))
	if err != nil {
		// Nobody will ever read from the pipe: close the read
		// side so the encoder goroutine doesn't block forever.
		jsonR.Close()
		return fmt.Errorf("building request for %s: %v", url, err)
	}
	err = c.DoAndDecode(nil, req)

	// The request is over. The body was wrapped in a NopCloser,
	// so the pipe is still open from the encoder's point of view;
	// if the body was not read to the end, the encoder is blocked
	// in Write. Closing the read side makes that Write fail with
	// io.ErrClosedPipe so the receive below cannot deadlock.
	jsonR.Close()

	// If there was an error encoding the request body, report
	// that instead of the response: obviously we won't get a
	// useful response if our request wasn't properly encoded.
	switch encErr := <-errC; {
	case encErr == nil:
		return err
	case encErr == io.ErrClosedPipe && err != nil:
		// The encoder was merely interrupted by the Close
		// above because the request itself failed; the
		// request error is the informative one.
		return err
	default:
		return fmt.Errorf("encoding data for %s: %v", url, encErr)
	}
}
Example #3
0
// CheckSanityEarly looks for configuration and runtime problems that
// can be detected before GetCurrentState() and ComputeChangeSets()
// are called.
//
// A non-nil return value means there is no point in running
// GetCurrentState or ComputeChangeSets: the resulting statistics
// would be meaningless and it would be dangerous to run any Commit
// methods.
//
// It fails if the current user cannot be retrieved, if that user is
// not both active and admin, or if any entry in bal.KeepServices has
// service type "proxy".
func (bal *Balancer) CheckSanityEarly(c *arvados.Client) error {
	user, err := c.CurrentUser()
	switch {
	case err != nil:
		return fmt.Errorf("CurrentUser(): %v", err)
	case !(user.IsActive && user.IsAdmin):
		return fmt.Errorf("current user (%s) is not an active admin user", user.UUID)
	}

	for _, ks := range bal.KeepServices {
		if ks.ServiceType != "proxy" {
			continue
		}
		return fmt.Errorf("config error: %s: proxy servers cannot be balanced", ks)
	}
	return nil
}
Example #4
0
// DiscoverKeepServices replaces bal.KeepServices with the services
// reported by the API whose ServiceType appears in okTypes. Each
// selected service is stored under its UUID with a fresh, empty
// ChangeSet; every other service is logged and skipped.
//
// It returns whatever error c.EachKeepService returns.
func (bal *Balancer) DiscoverKeepServices(c *arvados.Client, okTypes []string) error {
	// Turn the slice into a set for constant-time lookups.
	wanted := make(map[string]bool)
	for _, svcType := range okTypes {
		wanted[svcType] = true
	}

	bal.KeepServices = make(map[string]*KeepService)
	visit := func(srv arvados.KeepService) error {
		if !wanted[srv.ServiceType] {
			bal.logf("skipping %v with service type %q", srv.UUID, srv.ServiceType)
			return nil
		}
		bal.KeepServices[srv.UUID] = &KeepService{
			KeepService: srv,
			ChangeSet:   &ChangeSet{},
		}
		return nil
	}
	return c.EachKeepService(visit)
}
Example #5
0
// GetCurrentState determines the current replication state, and the
// desired replication level, for every block that is either
// retrievable or referenced.
//
// It determines the current replication state by reading the block index
// from every known Keep service.
//
// It determines the desired replication level by retrieving all
// collection manifests in the database (API server).
//
// It encodes the resulting information in BlockStateMap.
//
// It also sets bal.DefaultReplication and bal.MinMtime from the
// discovery document.
//
// The return value is the first error reported by any of the worker
// goroutines, or nil if they all finish without reporting one. Note
// that when an error is returned, goroutines that have not finished
// yet keep running in the background.
func (bal *Balancer) GetCurrentState(c *arvados.Client) error {
	defer timeMe(bal.Logger, "GetCurrentState")()
	bal.BlockStateMap = NewBlockStateMap()

	dd, err := c.DiscoveryDocument()
	if err != nil {
		return err
	}
	bal.DefaultReplication = dd.DefaultCollectionReplication
	bal.MinMtime = time.Now().Unix() - dd.BlobSignatureTTL

	// Every worker goroutine below sends at most one error, and
	// there are 2+len(bal.KeepServices) of them, so a worker never
	// blocks while reporting an error -- even if GetCurrentState
	// has already returned.
	errs := make(chan error, 2+len(bal.KeepServices))
	wg := sync.WaitGroup{}

	// Start one goroutine for each KeepService: retrieve the
	// index, and add the returned blocks to BlockStateMap.
	for _, srv := range bal.KeepServices {
		wg.Add(1)
		go func(srv *KeepService) {
			defer wg.Done()
			bal.logf("%s: retrieve index", srv)
			idx, err := srv.Index(c, "")
			if err != nil {
				errs <- fmt.Errorf("%s: %v", srv, err)
				return
			}
			bal.logf("%s: add %d replicas to map", srv, len(idx))
			bal.BlockStateMap.AddReplicas(srv, idx)
			bal.logf("%s: done", srv)
		}(srv)
	}

	// collQ buffers incoming collections so we can start fetching
	// the next page without waiting for the current page to
	// finish processing. (1000 happens to match the page size
	// used by (*arvados.Client)EachCollection(), but it's OK if
	// they don't match.)
	collQ := make(chan arvados.Collection, 1000)

	// Start a goroutine to process collections. (We could use a
	// worker pool here, but even with a single worker we already
	// process collections much faster than we can retrieve them.)
	wg.Add(1)
	go func() {
		defer wg.Done()
		for coll := range collQ {
			err := bal.addCollection(coll)
			if err != nil {
				errs <- err
				// Keep draining collQ until it is
				// closed, so the sending goroutine
				// never blocks on a full queue.
				for range collQ {
				}
				return
			}
			bal.collScanned++
		}
	}()

	// Start a goroutine to retrieve all collections from the
	// Arvados database and send them to collQ for processing.
	wg.Add(1)
	go func() {
		defer wg.Done()
		// Use a goroutine-local err: this goroutine can
		// outlive GetCurrentState, so it must not write to
		// the enclosing function's err variable.
		err := EachCollection(c,
			func(coll arvados.Collection) error {
				collQ <- coll
				if len(errs) > 0 {
					// some other GetCurrentState
					// error happened: no point
					// getting any more
					// collections.
					return fmt.Errorf("")
				}
				return nil
			}, func(done, total int) {
				bal.logf("collections: %d/%d", done, total)
			})
		// Closing collQ lets the processing goroutine's range
		// loop terminate.
		close(collQ)
		if err != nil {
			errs <- err
		}
	}()

	go func() {
		// Send a nil error when all goroutines finish. If
		// this is the first error sent to errs, then
		// everything worked.
		wg.Wait()
		errs <- nil
	}()
	return <-errs
}
Example #6
0
// EachCollection calls f once for every readable
// collection. EachCollection stops if it encounters an error, such as
// f returning a non-nil error.
//
// The progress function is called periodically with done (number of
// times f has been called) and total (number of times f is expected
// to be called). A nil progress function is allowed.
//
// Collections are fetched in pages of up to 1000, ordered by
// modified_at and then uuid. After each page, the next request is
// filtered to modified_at >= the last timestamp seen, excluding the
// last UUID seen; items already handled on an earlier page are
// skipped.
//
// After the last page, EachCollection asks the server how many
// collections have modified_at <= the last timestamp seen, and
// returns an error if that count exceeds the number of calls made to
// f.
func EachCollection(c *arvados.Client, f func(arvados.Collection) error, progress func(done, total int)) error {
	if progress == nil {
		progress = func(_, _ int) {}
	}

	expectCount, err := countCollections(c, arvados.ResourceListParams{})
	if err != nil {
		return err
	}

	limit := 1000
	params := arvados.ResourceListParams{
		Limit:  &limit,
		Order:  "modified_at, uuid",
		Select: []string{"uuid", "manifest_text", "modified_at", "portable_data_hash", "replication_desired"},
	}
	// last is the most recent collection passed to f; filterTime
	// is the modified_at lower bound used for the current page.
	var last arvados.Collection
	var filterTime time.Time
	callCount := 0
	for {
		progress(callCount, expectCount)
		var page arvados.CollectionList
		err := c.RequestAndDecode(&page, "GET", "arvados/v1/collections", nil, params)
		if err != nil {
			return err
		}
		for _, coll := range page.Items {
			// Skip items we have already seen: same
			// timestamp as the last one processed, and
			// not after it in UUID order. Timestamps are
			// compared with Equal so that the same
			// instant matches regardless of how its
			// location was decoded; a missing timestamp
			// never matches (and is never dereferenced).
			if last.ModifiedAt != nil && coll.ModifiedAt != nil &&
				last.ModifiedAt.Equal(*coll.ModifiedAt) && last.UUID >= coll.UUID {
				continue
			}
			callCount++
			err = f(coll)
			if err != nil {
				return err
			}
			last = coll
		}
		// If the newest timestamp did not advance past the
		// current filter, another request with the same filter
		// cannot make progress.
		if last.ModifiedAt == nil || last.ModifiedAt.Equal(filterTime) {
			if page.ItemsAvailable > len(page.Items) {
				// TODO: use "mtime=X && UUID>Y"
				// filters to get all collections with
				// this timestamp, then use "mtime>X"
				// to get the next timestamp.
				return fmt.Errorf("BUG: Received an entire page with the same modified_at timestamp (%v), cannot make progress", filterTime)
			}
			break
		}
		filterTime = *last.ModifiedAt
		params.Filters = []arvados.Filter{{
			Attr:     "modified_at",
			Operator: ">=",
			Operand:  filterTime,
		}, {
			Attr:     "uuid",
			Operator: "!=",
			Operand:  last.UUID,
		}}
	}
	progress(callCount, expectCount)

	// Sanity check: the server should not know about more
	// collections up to filterTime than we passed to f.
	checkCount, err := countCollections(c, arvados.ResourceListParams{Filters: []arvados.Filter{{
		Attr:     "modified_at",
		Operator: "<=",
		Operand:  filterTime}}})
	if err != nil {
		return err
	}
	if callCount < checkCount {
		return fmt.Errorf("Retrieved %d collections with modtime <= T=%q, but server now reports there are %d collections with modtime <= T", callCount, filterTime, checkCount)
	}

	return nil
}