func (rq *replicateQueue) process(
	ctx context.Context, now hlc.Timestamp, repl *Replica, sysCfg config.SystemConfig,
) error {
	retryOpts := retry.Options{
		InitialBackoff: 50 * time.Millisecond,
		MaxBackoff:     1 * time.Second,
		Multiplier:     2,
		MaxRetries:     5,
	}

	// Use a retry loop in order to back off in the case of preemptive
	// snapshot errors, usually signalling that a rebalancing
	// reservation could not be made with the selected target.
	for r := retry.StartWithCtx(ctx, retryOpts); r.Next(); {
		if err := rq.processOneChange(ctx, now, repl, sysCfg); err != nil {
			if IsPreemptiveSnapshotError(err) {
				// If ChangeReplicas failed because the preemptive snapshot failed, we
				// log the error and retry the operation. The most likely causes of the
				// preemptive snapshot failing are a declined reservation or the remote
				// node being unavailable. In either case we don't want to wait another
				// scanner cycle before reconsidering the range.
				log.Info(ctx, err)
				continue
			}
			return err
		}

		// Enqueue this replica again to see if there are more changes to be made.
		rq.MaybeAdd(repl, rq.clock.Now())
		return nil
	}
	return errors.Errorf("failed to replicate %s after %d retries", repl, retryOpts.MaxRetries)
}
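// The call sites collected here all share the same retry-loop idiom from the
// util/retry package: configure retry.Options, then loop with
// retry.StartWithCtx(ctx, opts) and r.Next(), continuing on retryable errors
// and returning on permanent ones. The helper below is a minimal sketch of
// that idiom, not code from the repository; doWithRetry and its parameters
// are hypothetical, and only the retry.Options fields and methods already
// used above are assumed.
func doWithRetry(ctx context.Context, op func() error, retryable func(error) bool) error {
	retryOpts := retry.Options{
		InitialBackoff: 50 * time.Millisecond,
		MaxBackoff:     1 * time.Second,
		Multiplier:     2,
		MaxRetries:     5,
	}
	for r := retry.StartWithCtx(ctx, retryOpts); r.Next(); {
		err := op()
		if err == nil {
			return nil
		}
		if retryable(err) {
			// Transient failure: back off and try again.
			continue
		}
		// Permanent failure: give up immediately.
		return err
	}
	// The loop exits when the context is canceled or MaxRetries is exhausted.
	if err := ctx.Err(); err != nil {
		return err
	}
	return errors.Errorf("retries exhausted after %d attempts", retryOpts.MaxRetries)
}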
// GetSnapshot wraps Snapshot() but does not require the replica lock
// to be held and it will block instead of returning
// ErrSnapshotTemporarilyUnavailable. The caller is directly responsible for
// calling r.CloseOutSnap.
func (r *Replica) GetSnapshot(ctx context.Context, snapType string) (*OutgoingSnapshot, error) {
	// Use shorter-than-usual backoffs because this rarely succeeds on
	// the first attempt and this method is used a lot in tests.
	// Unsuccessful attempts are cheap, so we can have a low MaxBackoff.
	retryOpts := retry.Options{
		InitialBackoff: 1 * time.Millisecond,
		MaxBackoff:     100 * time.Millisecond,
		Multiplier:     2,
	}
	for retryObj := retry.StartWithCtx(ctx, retryOpts); retryObj.Next(); {
		log.Eventf(ctx, "snapshot retry loop pass %d", retryObj.CurrentAttempt())

		r.mu.Lock()
		doneChan := r.mu.outSnapDone
		r.mu.Unlock()

		<-doneChan

		r.mu.Lock()
		snap, err := r.snapshotWithContext(ctx, snapType)
		if err == nil {
			r.mu.outSnap.claimed = true
		}
		r.mu.Unlock()
		if err == raft.ErrSnapshotTemporarilyUnavailable {
			continue
		}
		return snap, err
	}
	return nil, ctx.Err() // the only loop exit condition
}
// Seek positions the iterator at the specified key.
func (ri *RangeIterator) Seek(ctx context.Context, key roachpb.RKey, scanDir ScanDirection) {
	log.Eventf(ctx, "querying next range at %s", key)
	ri.scanDir = scanDir
	ri.init = true // the iterator is now initialized
	ri.pErr = nil  // clear any prior error
	ri.key = key   // set the key

	// Retry loop for looking up next range in the span. The retry loop
	// deals with retryable range descriptor lookups.
	for r := retry.StartWithCtx(ctx, ri.ds.rpcRetryOptions); r.Next(); {
		log.Event(ctx, "meta descriptor lookup")
		var err error
		ri.desc, ri.token, err = ri.ds.getDescriptor(
			ctx, ri.key, ri.token, ri.scanDir == Descending)

		// getDescriptor may fail retryably if, for example, the first
		// range isn't available via Gossip. Assume that all errors at
		// this level are retryable. Non-retryable errors would be for
		// things like malformed requests which we should have checked
		// for before reaching this point.
		if err != nil {
			log.VEventf(ctx, 1, "range descriptor lookup failed: %s", err)
			continue
		}

		// It's possible that the returned descriptor misses parts of the
		// keys it's supposed to include after it's truncated to match the
		// descriptor. Example: revscan [a,g), first desc lookup for "g"
		// returns descriptor [c,d) -> [d,g) is never scanned.
		// We evict and retry in such a case.
		// TODO: this code is subject to removal. See
		// https://groups.google.com/d/msg/cockroach-db/DebjQEgU9r4/_OhMe7atFQAJ
		reverse := ri.scanDir == Descending
		if (reverse && !ri.desc.ContainsExclusiveEndKey(ri.key)) ||
			(!reverse && !ri.desc.ContainsKey(ri.key)) {
			log.Eventf(ctx, "addressing error: %s does not include key %s", ri.desc, ri.key)
			if err := ri.token.Evict(ctx); err != nil {
				ri.pErr = roachpb.NewError(err)
				return
			}
			// On addressing errors, don't back off; retry immediately.
			r.Reset()
			continue
		}
		return
	}

	// Check for an early exit from the retry loop.
	if pErr := ri.ds.deduceRetryEarlyExitError(ctx); pErr != nil {
		ri.pErr = pErr
	} else {
		ri.pErr = roachpb.NewErrorf("RangeIterator failed to seek to %s", key)
	}
}
// heartbeat is called to update a node's expiration timestamp. This
// method does a conditional put on the node liveness record, and if
// successful, stores the updated liveness record in the nodes map.
func (nl *NodeLiveness) heartbeat(ctx context.Context) error {
	nodeID := nl.gossip.NodeID.Get()

	var newLiveness Liveness
	var oldLiveness *Liveness
	liveness, err := nl.GetLiveness(nodeID)
	if err == nil {
		oldLiveness = &liveness
		newLiveness = liveness
	} else {
		newLiveness = Liveness{
			NodeID: nodeID,
			Epoch:  1,
		}
	}

	// Retry heartbeat in the event the conditional put fails.
	for r := retry.StartWithCtx(ctx, base.DefaultRetryOptions()); r.Next(); {
		newLiveness.Expiration = nl.clock.Now().Add(nl.livenessThreshold.Nanoseconds(), 0)
		tryAgain := false
		if err := nl.updateLiveness(ctx, nodeID, &newLiveness, oldLiveness, func(actual Liveness) {
			oldLiveness = &actual
			newLiveness = actual
			tryAgain = true
		}); err != nil {
			nl.metrics.HeartbeatFailures.Inc(1)
			return err
		}
		if !tryAgain {
			break
		}
	}

	log.VEventf(ctx, 1, "heartbeat node %d liveness with expiration %s", nodeID, newLiveness.Expiration)
	nl.mu.Lock()
	defer nl.mu.Unlock()
	nl.mu.self = newLiveness
	nl.metrics.HeartbeatSuccesses.Inc(1)
	return nil
}
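// A minimal sketch (not repository code) of the conditional-put retry idiom
// used by heartbeat above: when the condition fails, the callback reports the
// actual stored value, we adopt it as the new expected value, and retry the
// put. writeWithCPutRetry and its cput parameter are hypothetical stand-ins
// for the liveness update machinery; only retry.StartWithCtx and
// base.DefaultRetryOptions from the snippets above are assumed.
func writeWithCPutRetry(
	ctx context.Context,
	expected string,
	makeNew func(old string) string,
	cput func(newVal, expVal string, onConditionFailed func(actual string)) error,
) error {
	for r := retry.StartWithCtx(ctx, base.DefaultRetryOptions()); r.Next(); {
		newVal := makeNew(expected)
		tryAgain := false
		if err := cput(newVal, expected, func(actual string) {
			// The stored value didn't match our expectation; adopt the
			// actual value and loop to retry the conditional put.
			expected = actual
			tryAgain = true
		}); err != nil {
			return err
		}
		if !tryAgain {
			return nil
		}
	}
	// The loop only exits early if the context is canceled or the retry
	// options' closer fires.
	if err := ctx.Err(); err != nil {
		return err
	}
	return errors.New("conditional put retries exhausted")
}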
// StartHeartbeat starts a periodic heartbeat to refresh this node's
// last heartbeat in the node liveness table.
func (nl *NodeLiveness) StartHeartbeat(ctx context.Context, stopper *stop.Stopper) {
	log.VEventf(ctx, 1, "starting liveness heartbeat")
	retryOpts := base.DefaultRetryOptions()
	retryOpts.Closer = stopper.ShouldQuiesce()

	stopper.RunWorker(func() {
		ambient := nl.ambientCtx
		ambient.AddLogTag("hb", nil)
		ticker := time.NewTicker(nl.heartbeatInterval)
		defer ticker.Stop()
		for {
			if !nl.pauseHeartbeat.Load().(bool) {
				ctx, sp := ambient.AnnotateCtxWithSpan(context.Background(), "heartbeat")
				ctx, cancel := context.WithTimeout(ctx, nl.heartbeatInterval)
				// Retry heartbeat in the event the conditional put fails.
				for r := retry.StartWithCtx(ctx, retryOpts); r.Next(); {
					liveness, err := nl.Self()
					if err != nil && err != ErrNoLivenessRecord {
						log.Errorf(ctx, "unexpected error getting liveness: %v", err)
					}
					if err := nl.Heartbeat(ctx, liveness); err != nil {
						if err == errSkippedHeartbeat {
							continue
						}
						log.Errorf(ctx, "failed liveness heartbeat: %v", err)
					}
					break
				}
				cancel()
				sp.Finish()
			}
			select {
			case <-ticker.C:
			case <-stopper.ShouldStop():
				return
			}
		}
	})
}
// sendPartialBatch sends the supplied batch to the range specified by
// desc. The batch request is first truncated so that it contains only
// requests which intersect the range descriptor and keys for each
// request are limited to the range's key span. The send occurs in a
// retry loop to handle send failures. On failure to send to any
// replicas, we back off and retry by refetching the range
// descriptor. If the underlying range seems to have split, we
// recursively invoke divideAndSendBatchToRanges to re-enumerate the
// ranges in the span and resend to each.
func (ds *DistSender) sendPartialBatch(
	ctx context.Context,
	ba roachpb.BatchRequest,
	rs roachpb.RSpan,
	desc *roachpb.RangeDescriptor,
	evictToken *EvictionToken,
	isFirst bool,
) response {
	var reply *roachpb.BatchResponse
	var pErr *roachpb.Error
	isReverse := ba.IsReverse()

	// Truncate the request to range descriptor.
	intersected, err := rs.Intersect(desc)
	if err != nil {
		return response{pErr: roachpb.NewError(err)}
	}
	truncBA, numActive, err := truncate(ba, intersected)
	if numActive == 0 && err == nil {
		// This shouldn't happen in the wild, but some tests exercise it.
		return response{
			pErr: roachpb.NewErrorf("truncation resulted in empty batch on %s: %s", intersected, ba),
		}
	}
	if err != nil {
		return response{pErr: roachpb.NewError(err)}
	}

	// Start a retry loop for sending the batch to the range.
	for r := retry.StartWithCtx(ctx, ds.rpcRetryOptions); r.Next(); {
		// If we've cleared the descriptor on a send failure, re-lookup.
		if desc == nil {
			var descKey roachpb.RKey
			if isReverse {
				descKey = intersected.EndKey
			} else {
				descKey = intersected.Key
			}
			desc, evictToken, err = ds.getDescriptor(ctx, descKey, nil, isReverse)
			if err != nil {
				log.ErrEventf(ctx, "range descriptor re-lookup failed: %s", err)
				continue
			}
		}

		reply, pErr = ds.sendSingleRange(ctx, truncBA, desc)

		// If sending succeeded, return immediately.
		if pErr == nil {
			return response{reply: reply}
		}

		log.ErrEventf(ctx, "reply error %s: %s", ba, pErr)

		// Error handling: If the error indicates that our range
		// descriptor is out of date, evict it from the cache and try
		// again. Errors that apply only to a single replica were
		// handled in send().
		//
		// TODO(bdarnell): Don't retry endlessly. If we fail twice in a
		// row and the range descriptor hasn't changed, return the error
		// to our caller.
		switch tErr := pErr.GetDetail().(type) {
		case *roachpb.SendError:
			// We've tried all the replicas without success. Either
			// they're all down, or we're using an out-of-date range
			// descriptor. Invalidate the cache and try again with the new
			// metadata.
			log.Event(ctx, "evicting range descriptor on send error and backoff for re-lookup")
			if err := evictToken.Evict(ctx); err != nil {
				return response{pErr: roachpb.NewError(err)}
			}
			// Clear the descriptor to reload on the next attempt.
			desc = nil
			continue
		case *roachpb.RangeKeyMismatchError:
			// Range descriptor might be out of date - evict it. This is
			// likely the result of a range split. If we have new range
			// descriptors, insert them instead as long as they are different
			// from the last descriptor to avoid endless loops.
			var replacements []roachpb.RangeDescriptor
			different := func(rd *roachpb.RangeDescriptor) bool {
				return !desc.RSpan().Equal(rd.RSpan())
			}
			if tErr.MismatchedRange != nil && different(tErr.MismatchedRange) {
				replacements = append(replacements, *tErr.MismatchedRange)
			}
			if tErr.SuggestedRange != nil && different(tErr.SuggestedRange) {
				if includesFrontOfCurSpan(isReverse, tErr.SuggestedRange, rs) {
					replacements = append(replacements, *tErr.SuggestedRange)
				}
			}
			// Same as Evict() if replacements is empty.
			if err := evictToken.EvictAndReplace(ctx, replacements...); err != nil {
				return response{pErr: roachpb.NewError(err)}
			}
			// On addressing errors (likely a split), we need to re-invoke
			// the range descriptor lookup machinery, so we recurse by
			// sending batch to just the partial span this descriptor was
			// supposed to cover.
			log.VEventf(ctx, 1, "likely split; resending batch to span: %s", tErr)
			reply, pErr = ds.divideAndSendBatchToRanges(ctx, ba, intersected, isFirst)
			return response{reply: reply, pErr: pErr}
		}
		break
	}

	// Propagate error if either the retry closer or context done
	// channels were closed.
	if pErr == nil {
		if pErr = ds.deduceRetryEarlyExitError(ctx); pErr == nil {
			log.Fatal(ctx, "exited retry loop without an error")
		}
	}

	return response{pErr: pErr}
}
// restoreTable restores the given TableDescriptor into the given database. If
// the name conflicts with an existing table, the one being restored is rekeyed
// with a new ID and the old data is deleted.
func restoreTable(
	ctx context.Context,
	db client.DB,
	database sqlbase.DatabaseDescriptor,
	table *sqlbase.TableDescriptor,
	ranges []sqlbase.BackupRangeDescriptor,
) error {
	if log.V(1) {
		log.Infof(ctx, "Restoring Table %q", table.Name)
	}

	var newTableID sqlbase.ID
	if err := db.Txn(ctx, func(txn *client.Txn) error {
		// Make sure there's a database with a name that matches the original.
		if _, err := getDescriptorID(txn, tableKey{name: database.Name}); err != nil {
			return errors.Wrapf(err,
				"a database named %q needs to exist to restore table %q",
				database.Name, table.Name)
		}
		// Assign a new ID for the table. TODO(dan): For now, we're always
		// generating a new ID, but varints get longer as they get bigger and so
		// our keys will, too. We should someday figure out how to overwrite an
		// existing table and steal its ID.
		var err error
		newTableID, err = GenerateUniqueDescID(txn)
		return err
	}); err != nil {
		return err
	}

	// Create the iteration keys before we give the table its new ID.
	tableStartKeyOld := roachpb.Key(sqlbase.MakeIndexKeyPrefix(table, table.PrimaryIndex.ID))
	tableEndKeyOld := tableStartKeyOld.PrefixEnd()

	// This loop makes restoring multiple tables O(N*M), where N is the number
	// of tables and M is the number of ranges. We could reduce this using an
	// interval tree if necessary.
	var wg sync.WaitGroup
	result := struct {
		syncutil.Mutex
		firstErr error
		numErrs  int
	}{}
	for _, rangeDesc := range ranges {
		if len(rangeDesc.Path) == 0 {
			// Empty path means empty range.
			continue
		}

		intersectBegin, intersectEnd := IntersectHalfOpen(
			rangeDesc.StartKey, rangeDesc.EndKey, tableStartKeyOld, tableEndKeyOld)
		if intersectBegin != nil && intersectEnd != nil {
			// Write the data under the new ID.
			// TODO(dan): There's no SQL descriptors that point at this yet, so it
			// should be possible to remove it from the one txn this is all currently
			// run under. If we do that, make sure this data gets cleaned up on errors.
			wg.Add(1)
			go func(desc sqlbase.BackupRangeDescriptor) {
				for r := retry.StartWithCtx(ctx, base.DefaultRetryOptions()); r.Next(); {
					err := db.Txn(ctx, func(txn *client.Txn) error {
						return Ingest(ctx, txn, desc.Path, desc.CRC, intersectBegin, intersectEnd, newTableID)
					})
					if _, ok := err.(*client.AutoCommitError); ok {
						log.Errorf(ctx, "auto commit error during ingest: %s", err)
						// TODO(dan): Ingest currently does not rely on the
						// range being empty, but the plan is that it will. When
						// that change happens, this will have to delete any
						// partially ingested data or something.
						continue
					}
					if err != nil {
						log.Errorf(ctx, "%T %s", err, err)
						result.Lock()
						// Keep only the first error, but count all of them.
						if result.firstErr == nil {
							result.firstErr = err
						}
						result.numErrs++
						result.Unlock()
					}
					break
				}
				wg.Done()
			}(rangeDesc)
		}
	}
	wg.Wait()

	// All concurrent accesses have finished, we don't need the lock anymore.
	if result.firstErr != nil {
		// This leaves the data that did get imported in case the user wants to
		// retry.
		// TODO(dan): Build tooling to allow a user to restart a failed restore.
		return errors.Wrapf(result.firstErr, "ingest encountered %d errors", result.numErrs)
	}

	table.ID = newTableID
	return db.Txn(ctx, func(txn *client.Txn) error {
		// Pass the descriptors by value to keep this idempotent.
		return restoreTableDesc(ctx, txn, database, *table)
	})
}
// EnsureMigrations should be run during node startup to ensure that all
// required migrations have been run (and to run all those that are definitely
// safe to run).
func (m *Manager) EnsureMigrations(ctx context.Context) error {
	// First, check whether there are any migrations that need to be run.
	completedMigrations, err := m.getCompletedMigrations(ctx)
	if err != nil {
		return err
	}
	allMigrationsCompleted := true
	for _, migration := range backwardCompatibleMigrations {
		key := migrationKey(migration)
		if _, ok := completedMigrations[string(key)]; !ok {
			allMigrationsCompleted = false
		}
	}
	if allMigrationsCompleted {
		return nil
	}

	// If there are any, grab the migration lease to ensure that only one
	// node is ever doing migrations at a time.
	// Note that we shouldn't ever let client.LeaseNotAvailableErrors cause us
	// to stop trying, because if we return an error the server will be shut down,
	// and this server being down may prevent the leaseholder from finishing.
	var lease *client.Lease
	if log.V(1) {
		log.Info(ctx, "trying to acquire lease")
	}
	for r := retry.StartWithCtx(ctx, base.DefaultRetryOptions()); r.Next(); {
		lease, err = m.leaseManager.AcquireLease(ctx, keys.MigrationLease)
		if err == nil {
			break
		}
		log.Errorf(ctx, "failed attempt to acquire migration lease: %s", err)
	}
	if err != nil {
		return errors.Wrapf(err, "failed to acquire lease for running necessary migrations")
	}

	// Ensure that we hold the lease throughout the migration process and release
	// it when we're done.
	done := make(chan interface{}, 1)
	defer func() {
		done <- nil
		if log.V(1) {
			log.Info(ctx, "trying to release the lease")
		}
		if err := m.leaseManager.ReleaseLease(ctx, lease); err != nil {
			log.Errorf(ctx, "failed to release migration lease: %s", err)
		}
	}()
	if err := m.stopper.RunAsyncTask(ctx, func(ctx context.Context) {
		select {
		case <-done:
			return
		case <-time.After(leaseRefreshInterval):
			if err := m.leaseManager.ExtendLease(ctx, lease); err != nil {
				log.Warningf(ctx, "unable to extend ownership of expiration lease: %s", err)
			}
			if m.leaseManager.TimeRemaining(lease) < leaseRefreshInterval {
				// Note that we may be able to do better than this by influencing the
				// deadline of migrations' transactions based on the lease expiration
				// time, but simply kill the process for now for the sake of simplicity.
				log.Fatal(ctx, "not enough time left on migration lease, terminating for safety")
			}
		}
	}); err != nil {
		return err
	}

	// Re-get the list of migrations in case any of them were completed between
	// our initial check and our grabbing of the lease.
	completedMigrations, err = m.getCompletedMigrations(ctx)
	if err != nil {
		return err
	}

	startTime := timeutil.Now().String()
	r := runner{
		db:          m.db,
		sqlExecutor: m.sqlExecutor,
	}
	for _, migration := range backwardCompatibleMigrations {
		key := migrationKey(migration)
		if _, ok := completedMigrations[string(key)]; ok {
			continue
		}
		if log.V(1) {
			log.Infof(ctx, "running migration %q", migration.name)
		}
		if err := migration.workFn(ctx, r); err != nil {
			return errors.Wrapf(err, "failed to run migration %q", migration.name)
		}
		if log.V(1) {
			log.Infof(ctx, "trying to persist record of completing migration %s", migration.name)
		}
		if err := m.db.Put(ctx, key, startTime); err != nil {
			return errors.Wrapf(err, "failed to persist record of completing migration %q", migration.name)
		}
	}

	return nil
}