예제 #1
0
func (rq *replicateQueue) process(
	ctx context.Context, now hlc.Timestamp, repl *Replica, sysCfg config.SystemConfig,
) error {
	retryOpts := retry.Options{
		InitialBackoff: 50 * time.Millisecond,
		MaxBackoff:     1 * time.Second,
		Multiplier:     2,
		MaxRetries:     5,
	}

	// Use a retry loop in order to backoff in the case of preemptive
	// snapshot errors, usually signalling that a rebalancing
	// reservation could not be made with the selected target.
	for r := retry.StartWithCtx(ctx, retryOpts); r.Next(); {
		if err := rq.processOneChange(ctx, now, repl, sysCfg); err != nil {
			if IsPreemptiveSnapshotError(err) {
				// If ChangeReplicas failed because the preemptive snapshot failed, we
				// log the error but then return success indicating we should retry the
				// operation. The most likely causes of the preemptive snapshot failing are
				// a declined reservation or the remote node being unavailable. In either
				// case we don't want to wait another scanner cycle before reconsidering
				// the range.
				log.Info(ctx, err)
				continue
			}
			return err
		}
		// Enqueue this replica again to see if there are more changes to be made.
		rq.MaybeAdd(repl, rq.clock.Now())
		return nil
	}
	return errors.Errorf("failed to replicate %s after %d retries", repl, retryOpts.MaxRetries)
}
예제 #2
0
// GetSnapshot wraps Snapshot() but does not require the replica lock
// to be held and it will block instead of returning
// ErrSnapshotTemporaryUnavailable. The caller is directly responsible for
// calling r.CloseOutSnap.
func (r *Replica) GetSnapshot(ctx context.Context, snapType string) (*OutgoingSnapshot, error) {
	// Use shorter-than-usual backoffs because this rarely succeeds on
	// the first attempt and this method is used a lot in tests.
	// Unsuccessful attempts are cheap, so we can have a low MaxBackoff.
	retryOpts := retry.Options{
		InitialBackoff: 1 * time.Millisecond,
		MaxBackoff:     100 * time.Millisecond,
		Multiplier:     2,
	}
	for retryObj := retry.StartWithCtx(ctx, retryOpts); retryObj.Next(); {
		log.Eventf(ctx, "snapshot retry loop pass %d", retryObj.CurrentAttempt())

		r.mu.Lock()
		doneChan := r.mu.outSnapDone
		r.mu.Unlock()

		<-doneChan

		r.mu.Lock()
		snap, err := r.snapshotWithContext(ctx, snapType)
		if err == nil {
			r.mu.outSnap.claimed = true
		}
		r.mu.Unlock()
		if err == raft.ErrSnapshotTemporarilyUnavailable {
			continue
		} else {
			return snap, err
		}
	}
	return nil, ctx.Err() // the only loop exit condition
}
예제 #3
0
// Seek positions the iterator at the specified key.
func (ri *RangeIterator) Seek(ctx context.Context, key roachpb.RKey, scanDir ScanDirection) {
	log.Eventf(ctx, "querying next range at %s", key)
	ri.scanDir = scanDir
	ri.init = true // the iterator is now initialized
	ri.pErr = nil  // clear any prior error
	ri.key = key   // set the key

	// Retry loop for looking up next range in the span. The retry loop
	// deals with retryable range descriptor lookups.
	for r := retry.StartWithCtx(ctx, ri.ds.rpcRetryOptions); r.Next(); {
		log.Event(ctx, "meta descriptor lookup")
		var err error
		ri.desc, ri.token, err = ri.ds.getDescriptor(
			ctx, ri.key, ri.token, ri.scanDir == Descending)

		// getDescriptor may fail retryably if, for example, the first
		// range isn't available via Gossip. Assume that all errors at
		// this level are retryable. Non-retryable errors would be for
		// things like malformed requests which we should have checked
		// for before reaching this point.
		if err != nil {
			log.VEventf(ctx, 1, "range descriptor lookup failed: %s", err)
			continue
		}

		// It's possible that the returned descriptor misses parts of the
		// keys it's supposed to include after it's truncated to match the
		// descriptor. Example revscan [a,g), first desc lookup for "g"
		// returns descriptor [c,d) -> [d,g) is never scanned.
		// We evict and retry in such a case.
		// TODO: this code is subject to removal. See
		// https://groups.google.com/d/msg/cockroach-db/DebjQEgU9r4/_OhMe7atFQAJ
		reverse := ri.scanDir == Descending
		if (reverse && !ri.desc.ContainsExclusiveEndKey(ri.key)) ||
			(!reverse && !ri.desc.ContainsKey(ri.key)) {
			log.Eventf(ctx, "addressing error: %s does not include key %s", ri.desc, ri.key)
			if err := ri.token.Evict(ctx); err != nil {
				ri.pErr = roachpb.NewError(err)
				return
			}
			// On addressing errors, don't backoff; retry immediately.
			r.Reset()
			continue
		}
		return
	}

	// Check for an early exit from the retry loop.
	if pErr := ri.ds.deduceRetryEarlyExitError(ctx); pErr != nil {
		ri.pErr = pErr
	} else {
		ri.pErr = roachpb.NewErrorf("RangeIterator failed to seek to %s", key)
	}
}
예제 #4
0
// heartbeat is called to update a node's expiration timestamp. This
// method does a conditional put on the node liveness record, and if
// successful, stores the updated liveness record in the nodes map.
func (nl *NodeLiveness) heartbeat(ctx context.Context) error {
	nodeID := nl.gossip.NodeID.Get()

	var newLiveness Liveness
	var oldLiveness *Liveness
	liveness, err := nl.GetLiveness(nodeID)
	if err == nil {
		oldLiveness = &liveness
		newLiveness = liveness
	} else {
		newLiveness = Liveness{
			NodeID: nodeID,
			Epoch:  1,
		}
	}

	// Retry heartbeat in the event the conditional put fails.
	for r := retry.StartWithCtx(ctx, base.DefaultRetryOptions()); r.Next(); {
		newLiveness.Expiration = nl.clock.Now().Add(nl.livenessThreshold.Nanoseconds(), 0)
		tryAgain := false
		if err := nl.updateLiveness(ctx, nodeID, &newLiveness, oldLiveness, func(actual Liveness) {
			oldLiveness = &actual
			newLiveness = actual
			tryAgain = true
		}); err != nil {
			nl.metrics.HeartbeatFailures.Inc(1)
			return err
		}
		if !tryAgain {
			break
		}
	}

	log.VEventf(ctx, 1, "heartbeat node %d liveness with expiration %s", nodeID, newLiveness.Expiration)
	nl.mu.Lock()
	defer nl.mu.Unlock()
	nl.mu.self = newLiveness
	nl.metrics.HeartbeatSuccesses.Inc(1)
	return nil
}
예제 #5
0
// StartHeartbeat starts a periodic heartbeat to refresh this node's
// last heartbeat in the node liveness table.
func (nl *NodeLiveness) StartHeartbeat(ctx context.Context, stopper *stop.Stopper) {
	log.VEventf(ctx, 1, "starting liveness heartbeat")
	retryOpts := base.DefaultRetryOptions()
	retryOpts.Closer = stopper.ShouldQuiesce()

	stopper.RunWorker(func() {
		ambient := nl.ambientCtx
		ambient.AddLogTag("hb", nil)
		ticker := time.NewTicker(nl.heartbeatInterval)
		defer ticker.Stop()
		for {
			if !nl.pauseHeartbeat.Load().(bool) {
				ctx, sp := ambient.AnnotateCtxWithSpan(context.Background(), "heartbeat")
				ctx, cancel := context.WithTimeout(ctx, nl.heartbeatInterval)
				// Retry heartbeat in the event the conditional put fails.
				for r := retry.StartWithCtx(ctx, retryOpts); r.Next(); {
					liveness, err := nl.Self()
					if err != nil && err != ErrNoLivenessRecord {
						log.Errorf(ctx, "unexpected error getting liveness: %v", err)
					}
					if err := nl.Heartbeat(ctx, liveness); err != nil {
						if err == errSkippedHeartbeat {
							continue
						}
						log.Errorf(ctx, "failed liveness heartbeat: %v", err)
					}
					break
				}
				cancel()
				sp.Finish()
			}
			select {
			case <-ticker.C:
			case <-stopper.ShouldStop():
				return
			}
		}
	})
}
예제 #6
0
// sendPartialBatch sends the supplied batch to the range specified by
// desc. The batch request is first truncated so that it contains only
// requests which intersect the range descriptor and keys for each
// request are limited to the range's key span. The send occurs in a
// retry loop to handle send failures. On failure to send to any
// replicas, we backoff and retry by refetching the range
// descriptor. If the underlying range seems to have split, we
// recursively invoke divideAndSendBatchToRanges to re-enumerate the
// ranges in the span and resend to each.
func (ds *DistSender) sendPartialBatch(
	ctx context.Context,
	ba roachpb.BatchRequest,
	rs roachpb.RSpan,
	desc *roachpb.RangeDescriptor,
	evictToken *EvictionToken,
	isFirst bool,
) response {
	var reply *roachpb.BatchResponse
	var pErr *roachpb.Error
	isReverse := ba.IsReverse()

	// Truncate the request to range descriptor.
	intersected, err := rs.Intersect(desc)
	if err != nil {
		return response{pErr: roachpb.NewError(err)}
	}
	truncBA, numActive, err := truncate(ba, intersected)
	if numActive == 0 && err == nil {
		// This shouldn't happen in the wild, but some tests exercise it.
		return response{
			pErr: roachpb.NewErrorf("truncation resulted in empty batch on %s: %s", intersected, ba),
		}
	}
	if err != nil {
		return response{pErr: roachpb.NewError(err)}
	}

	// Start a retry loop for sending the batch to the range.
	for r := retry.StartWithCtx(ctx, ds.rpcRetryOptions); r.Next(); {
		// If we've cleared the descriptor on a send failure, re-lookup.
		if desc == nil {
			var descKey roachpb.RKey
			if isReverse {
				descKey = intersected.EndKey
			} else {
				descKey = intersected.Key
			}
			desc, evictToken, err = ds.getDescriptor(ctx, descKey, nil, isReverse)
			if err != nil {
				log.ErrEventf(ctx, "range descriptor re-lookup failed: %s", err)
				continue
			}
		}

		reply, pErr = ds.sendSingleRange(ctx, truncBA, desc)

		// If sending succeeded, return immediately.
		if pErr == nil {
			return response{reply: reply}
		}

		log.ErrEventf(ctx, "reply error %s: %s", ba, pErr)

		// Error handling: If the error indicates that our range
		// descriptor is out of date, evict it from the cache and try
		// again. Errors that apply only to a single replica were
		// handled in send().
		//
		// TODO(bdarnell): Don't retry endlessly. If we fail twice in a
		// row and the range descriptor hasn't changed, return the error
		// to our caller.
		switch tErr := pErr.GetDetail().(type) {
		case *roachpb.SendError:
			// We've tried all the replicas without success. Either
			// they're all down, or we're using an out-of-date range
			// descriptor. Invalidate the cache and try again with the new
			// metadata.
			log.Event(ctx, "evicting range descriptor on send error and backoff for re-lookup")
			if err := evictToken.Evict(ctx); err != nil {
				return response{pErr: roachpb.NewError(err)}
			}
			// Clear the descriptor to reload on the next attempt.
			desc = nil
			continue
		case *roachpb.RangeKeyMismatchError:
			// Range descriptor might be out of date - evict it. This is
			// likely the result of a range split. If we have new range
			// descriptors, insert them instead as long as they are different
			// from the last descriptor to avoid endless loops.
			var replacements []roachpb.RangeDescriptor
			different := func(rd *roachpb.RangeDescriptor) bool {
				return !desc.RSpan().Equal(rd.RSpan())
			}
			if tErr.MismatchedRange != nil && different(tErr.MismatchedRange) {
				replacements = append(replacements, *tErr.MismatchedRange)
			}
			if tErr.SuggestedRange != nil && different(tErr.SuggestedRange) {
				if includesFrontOfCurSpan(isReverse, tErr.SuggestedRange, rs) {
					replacements = append(replacements, *tErr.SuggestedRange)
				}
			}
			// Same as Evict() if replacements is empty.
			if err := evictToken.EvictAndReplace(ctx, replacements...); err != nil {
				return response{pErr: roachpb.NewError(err)}
			}
			// On addressing errors (likely a split), we need to re-invoke
			// the range descriptor lookup machinery, so we recurse by
			// sending batch to just the partial span this descriptor was
			// supposed to cover.
			log.VEventf(ctx, 1, "likely split; resending batch to span: %s", tErr)
			reply, pErr = ds.divideAndSendBatchToRanges(ctx, ba, intersected, isFirst)
			return response{reply: reply, pErr: pErr}
		}
		break
	}

	// Propagate error if either the retry closer or context done
	// channels were closed.
	if pErr == nil {
		if pErr = ds.deduceRetryEarlyExitError(ctx); pErr == nil {
			log.Fatal(ctx, "exited retry loop without an error")
		}
	}

	return response{pErr: pErr}
}
예제 #7
0
// restoreTable inserts the given DatabaseDescriptor. If the name conflicts with
// an existing table, the one being restored is rekeyed with a new ID and the
// old data is deleted.
func restoreTable(
	ctx context.Context,
	db client.DB,
	database sqlbase.DatabaseDescriptor,
	table *sqlbase.TableDescriptor,
	ranges []sqlbase.BackupRangeDescriptor,
) error {
	if log.V(1) {
		log.Infof(ctx, "Restoring Table %q", table.Name)
	}

	var newTableID sqlbase.ID
	if err := db.Txn(ctx, func(txn *client.Txn) error {
		// Make sure there's a database with a name that matches the original.
		if _, err := getDescriptorID(txn, tableKey{name: database.Name}); err != nil {
			return errors.Wrapf(err, "a database named %q needs to exist to restore table %q",
				database.Name, table.Name)
		}

		// Assign a new ID for the table. TODO(dan): For now, we're always
		// generating a new ID, but varints get longer as they get bigger and so
		// our keys will, too. We should someday figure out how to overwrite an
		// existing table and steal its ID.
		var err error
		newTableID, err = GenerateUniqueDescID(txn)
		return err
	}); err != nil {
		return err
	}

	// Create the iteration keys before we give the table its new ID.
	tableStartKeyOld := roachpb.Key(sqlbase.MakeIndexKeyPrefix(table, table.PrimaryIndex.ID))
	tableEndKeyOld := tableStartKeyOld.PrefixEnd()

	// This loop makes restoring multiple tables O(N*M), where N is the number
	// of tables and M is the number of ranges. We could reduce this using an
	// interval tree if necessary.
	var wg sync.WaitGroup
	result := struct {
		syncutil.Mutex
		firstErr error
		numErrs  int
	}{}
	for _, rangeDesc := range ranges {
		if len(rangeDesc.Path) == 0 {
			// Empty path means empty range.
			continue
		}

		intersectBegin, intersectEnd := IntersectHalfOpen(
			rangeDesc.StartKey, rangeDesc.EndKey, tableStartKeyOld, tableEndKeyOld)
		if intersectBegin != nil && intersectEnd != nil {
			// Write the data under the new ID.
			// TODO(dan): There's no SQL descriptors that point at this yet, so it
			// should be possible to remove it from the one txn this is all currently
			// run under. If we do that, make sure this data gets cleaned up on errors.
			wg.Add(1)
			go func(desc sqlbase.BackupRangeDescriptor) {
				for r := retry.StartWithCtx(ctx, base.DefaultRetryOptions()); r.Next(); {
					err := db.Txn(ctx, func(txn *client.Txn) error {
						return Ingest(ctx, txn, desc.Path, desc.CRC, intersectBegin, intersectEnd, newTableID)
					})
					if _, ok := err.(*client.AutoCommitError); ok {
						log.Errorf(ctx, "auto commit error during ingest: %s", err)
						// TODO(dan): Ingest currently does not rely on the
						// range being empty, but the plan is that it will. When
						// that change happens, this will have to delete any
						// partially ingested data or something.
						continue
					}

					if err != nil {
						log.Errorf(ctx, "%T %s", err, err)
						result.Lock()
						defer result.Unlock()
						if result.firstErr != nil {
							result.firstErr = err
						}
						result.numErrs++
					}
					break
				}
				wg.Done()
			}(rangeDesc)
		}
	}
	wg.Wait()
	// All concurrent accesses have finished, we don't need the lock anymore.
	if result.firstErr != nil {
		// This leaves the data that did get imported in case the user wants to
		// retry.
		// TODO(dan): Build tooling to allow a user to restart a failed restore.
		return errors.Wrapf(result.firstErr, "ingest encountered %d errors", result.numErrs)
	}

	table.ID = newTableID
	return db.Txn(ctx, func(txn *client.Txn) error {
		// Pass the descriptors by value to keep this idempotent.
		return restoreTableDesc(ctx, txn, database, *table)
	})
}
예제 #8
0
// EnsureMigrations should be run during node startup to ensure that all
// required migrations have been run (and running all those that are definitely
// safe to run).
func (m *Manager) EnsureMigrations(ctx context.Context) error {
	// First, check whether there are any migrations that need to be run.
	completedMigrations, err := m.getCompletedMigrations(ctx)
	if err != nil {
		return err
	}
	allMigrationsCompleted := true
	for _, migration := range backwardCompatibleMigrations {
		key := migrationKey(migration)
		if _, ok := completedMigrations[string(key)]; !ok {
			allMigrationsCompleted = false
		}
	}
	if allMigrationsCompleted {
		return nil
	}

	// If there are any, grab the migration lease to ensure that only one
	// node is ever doing migrations at a time.
	// Note that we shouldn't ever let client.LeaseNotAvailableErrors cause us
	// to stop trying, because if we return an error the server will be shut down,
	// and this server being down may prevent the leaseholder from finishing.
	var lease *client.Lease
	if log.V(1) {
		log.Info(ctx, "trying to acquire lease")
	}
	for r := retry.StartWithCtx(ctx, base.DefaultRetryOptions()); r.Next(); {
		lease, err = m.leaseManager.AcquireLease(ctx, keys.MigrationLease)
		if err == nil {
			break
		}
		log.Errorf(ctx, "failed attempt to acquire migration lease: %s", err)
	}
	if err != nil {
		return errors.Wrapf(err, "failed to acquire lease for running necessary migrations")
	}

	// Ensure that we hold the lease throughout the migration process and release
	// it when we're done.
	done := make(chan interface{}, 1)
	defer func() {
		done <- nil
		if log.V(1) {
			log.Info(ctx, "trying to release the lease")
		}
		if err := m.leaseManager.ReleaseLease(ctx, lease); err != nil {
			log.Errorf(ctx, "failed to release migration lease: %s", err)
		}
	}()
	if err := m.stopper.RunAsyncTask(ctx, func(ctx context.Context) {
		select {
		case <-done:
			return
		case <-time.After(leaseRefreshInterval):
			if err := m.leaseManager.ExtendLease(ctx, lease); err != nil {
				log.Warningf(ctx, "unable to extend ownership of expiration lease: %s", err)
			}
			if m.leaseManager.TimeRemaining(lease) < leaseRefreshInterval {
				// Note that we may be able to do better than this by influencing the
				// deadline of migrations' transactions based on the least expiration
				// time, but simply kill the process for now for the sake of simplicity.
				log.Fatal(ctx, "not enough time left on migration lease, terminating for safety")
			}
		}
	}); err != nil {
		return err
	}

	// Re-get the list of migrations in case any of them were completed between
	// our initial check and our grabbing of the lease.
	completedMigrations, err = m.getCompletedMigrations(ctx)
	if err != nil {
		return err
	}

	startTime := timeutil.Now().String()
	r := runner{
		db:          m.db,
		sqlExecutor: m.sqlExecutor,
	}
	for _, migration := range backwardCompatibleMigrations {
		key := migrationKey(migration)
		if _, ok := completedMigrations[string(key)]; ok {
			continue
		}

		if log.V(1) {
			log.Infof(ctx, "running migration %q", migration.name)
		}
		if err := migration.workFn(ctx, r); err != nil {
			return errors.Wrapf(err, "failed to run migration %q", migration.name)
		}

		if log.V(1) {
			log.Infof(ctx, "trying to persist record of completing migration %s", migration.name)
		}
		if err := m.db.Put(ctx, key, startTime); err != nil {
			return errors.Wrapf(err, "failed to persist record of completing migration %q",
				migration.name)
		}
	}

	return nil
}