Example No. 1
func (q *consistencyQueue) shouldQueue(
	ctx context.Context, now hlc.Timestamp, repl *Replica, _ config.SystemConfig,
) (bool, float64) {
	shouldQ, priority := true, float64(0)
	if !repl.store.cfg.TestingKnobs.DisableLastProcessedCheck {
		lpTS, err := repl.getQueueLastProcessed(ctx, q.name)
		if err != nil {
			log.ErrEventf(ctx, "consistency queue last processed timestamp: %s", err)
		}
		if shouldQ, priority = shouldQueueAgain(now, lpTS, q.interval); !shouldQ {
			return false, 0
		}
	}
	// Check if all replicas are live. Some tests run without a NodeLiveness configured.
	if repl.store.cfg.NodeLiveness != nil {
		for _, rep := range repl.Desc().Replicas {
			if live, err := repl.store.cfg.NodeLiveness.IsLive(rep.NodeID); err != nil {
				log.ErrEventf(ctx, "node %d liveness failed: %s", rep.NodeID, err)
				return false, 0
			} else if !live {
				return false, 0
			}
		}
	}
	return true, priority
}
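
All of the shouldQueue implementations in these examples share the same gate: read the queue's last-processed timestamp, log (but tolerate) a lookup failure via log.ErrEventf, and only re-queue once a configured interval has elapsed. Below is a minimal, self-contained sketch of that gate using plain time.Time values; shouldQueueAgain here is an illustrative stand-in with simplified types, not the CockroachDB helper.

package main

import (
	"fmt"
	"log"
	"time"
)

// shouldQueueAgain reports whether enough time has passed since lastProcessed
// to justify re-queuing, and derives a priority from how overdue the item is.
// This mirrors the shape of the helper used above, not its exact implementation.
func shouldQueueAgain(now, lastProcessed time.Time, interval time.Duration) (bool, float64) {
	if lastProcessed.IsZero() {
		// Never processed: queue with a baseline priority.
		return true, 1.0
	}
	elapsed := now.Sub(lastProcessed)
	if elapsed < interval {
		return false, 0
	}
	// The more overdue the item, the higher the priority.
	return true, float64(elapsed) / float64(interval)
}

func main() {
	last := time.Now().Add(-36 * time.Hour)
	ok, prio := shouldQueueAgain(time.Now(), last, 24*time.Hour)
	if !ok {
		log.Println("not due yet")
		return
	}
	fmt.Printf("queue again, priority %.2f\n", prio)
}
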
Example No. 2
// process() is called on every range for which this node is a lease holder.
func (q *consistencyQueue) process(ctx context.Context, repl *Replica, _ config.SystemConfig) error {
	req := roachpb.CheckConsistencyRequest{}
	if _, pErr := repl.CheckConsistency(ctx, req); pErr != nil {
		log.Error(ctx, pErr.GoError())
	}
	// Update the last processed time for this queue.
	if err := repl.setQueueLastProcessed(ctx, q.name, repl.store.Clock().Now()); err != nil {
		log.ErrEventf(ctx, "failed to update last processed time: %v", err)
	}
	return nil
}
Example No. 3
func (q *consistencyQueue) shouldQueue(
	ctx context.Context, now hlc.Timestamp, repl *Replica, _ config.SystemConfig,
) (bool, float64) {
	if !repl.store.cfg.TestingKnobs.DisableLastProcessedCheck {
		lpTS, err := repl.getQueueLastProcessed(ctx, q.name)
		if err != nil {
			log.ErrEventf(ctx, "consistency queue last processed timestamp: %s", err)
		}
		return shouldQueueAgain(now, lpTS, q.interval)
	}
	return true, 0
}
Example No. 4
func (q *timeSeriesMaintenanceQueue) process(
	ctx context.Context, now hlc.Timestamp, repl *Replica, sysCfg config.SystemConfig,
) error {
	desc := repl.Desc()
	snap := repl.store.Engine().NewSnapshot()
	defer snap.Close()
	if err := q.tsData.PruneTimeSeries(ctx, snap, desc.StartKey, desc.EndKey, q.db, now); err != nil {
		return err
	}
	// Update the last processed time for this queue.
	if err := repl.setQueueLastProcessed(ctx, q.name, now); err != nil {
		log.ErrEventf(ctx, "failed to update last processed time: %v", err)
	}
	return nil
}
Example No. 5
func (q *timeSeriesMaintenanceQueue) shouldQueue(
	ctx context.Context, now hlc.Timestamp, repl *Replica, _ config.SystemConfig,
) (shouldQ bool, priority float64) {
	if !repl.store.cfg.TestingKnobs.DisableLastProcessedCheck {
		lpTS, err := repl.getQueueLastProcessed(ctx, q.name)
		if err != nil {
			log.ErrEventf(ctx, "time series maintenance queue last processed timestamp: %s", err)
		}
		shouldQ, priority = shouldQueueAgain(now, lpTS, TimeSeriesMaintenanceInterval)
		if !shouldQ {
			return
		}
	}
	desc := repl.Desc()
	if q.tsData.ContainsTimeSeries(desc.StartKey, desc.EndKey) {
		return
	}
	return false, 0
}
Example No. 6
// sendPartialBatch sends the supplied batch to the range specified by
// desc. The batch request is first truncated so that it contains only
// requests which intersect the range descriptor and keys for each
// request are limited to the range's key span. The send occurs in a
// retry loop to handle send failures. On failure to send to any
// replicas, we backoff and retry by refetching the range
// descriptor. If the underlying range seems to have split, we
// recursively invoke divideAndSendBatchToRanges to re-enumerate the
// ranges in the span and resend to each.
func (ds *DistSender) sendPartialBatch(
	ctx context.Context,
	ba roachpb.BatchRequest,
	rs roachpb.RSpan,
	desc *roachpb.RangeDescriptor,
	evictToken *EvictionToken,
	isFirst bool,
) response {
	var reply *roachpb.BatchResponse
	var pErr *roachpb.Error
	isReverse := ba.IsReverse()

	// Truncate the request to range descriptor.
	intersected, err := rs.Intersect(desc)
	if err != nil {
		return response{pErr: roachpb.NewError(err)}
	}
	truncBA, numActive, err := truncate(ba, intersected)
	if numActive == 0 && err == nil {
		// This shouldn't happen in the wild, but some tests exercise it.
		return response{
			pErr: roachpb.NewErrorf("truncation resulted in empty batch on %s: %s", intersected, ba),
		}
	}
	if err != nil {
		return response{pErr: roachpb.NewError(err)}
	}

	// Start a retry loop for sending the batch to the range.
	for r := retry.StartWithCtx(ctx, ds.rpcRetryOptions); r.Next(); {
		// If we've cleared the descriptor on a send failure, re-lookup.
		if desc == nil {
			var descKey roachpb.RKey
			if isReverse {
				descKey = intersected.EndKey
			} else {
				descKey = intersected.Key
			}
			desc, evictToken, err = ds.getDescriptor(ctx, descKey, nil, isReverse)
			if err != nil {
				log.ErrEventf(ctx, "range descriptor re-lookup failed: %s", err)
				continue
			}
		}

		reply, pErr = ds.sendSingleRange(ctx, truncBA, desc)

		// If sending succeeded, return immediately.
		if pErr == nil {
			return response{reply: reply}
		}

		log.ErrEventf(ctx, "reply error %s: %s", ba, pErr)

		// Error handling: If the error indicates that our range
		// descriptor is out of date, evict it from the cache and try
		// again. Errors that apply only to a single replica were
		// handled in send().
		//
		// TODO(bdarnell): Don't retry endlessly. If we fail twice in a
		// row and the range descriptor hasn't changed, return the error
		// to our caller.
		switch tErr := pErr.GetDetail().(type) {
		case *roachpb.SendError:
			// We've tried all the replicas without success. Either
			// they're all down, or we're using an out-of-date range
			// descriptor. Invalidate the cache and try again with the new
			// metadata.
			log.Event(ctx, "evicting range descriptor on send error and backoff for re-lookup")
			if err := evictToken.Evict(ctx); err != nil {
				return response{pErr: roachpb.NewError(err)}
			}
			// Clear the descriptor to reload on the next attempt.
			desc = nil
			continue
		case *roachpb.RangeKeyMismatchError:
			// Range descriptor might be out of date - evict it. This is
			// likely the result of a range split. If we have new range
			// descriptors, insert them instead as long as they are different
			// from the last descriptor to avoid endless loops.
			var replacements []roachpb.RangeDescriptor
			different := func(rd *roachpb.RangeDescriptor) bool {
				return !desc.RSpan().Equal(rd.RSpan())
			}
			if tErr.MismatchedRange != nil && different(tErr.MismatchedRange) {
				replacements = append(replacements, *tErr.MismatchedRange)
			}
			if tErr.SuggestedRange != nil && different(tErr.SuggestedRange) {
				if includesFrontOfCurSpan(isReverse, tErr.SuggestedRange, rs) {
					replacements = append(replacements, *tErr.SuggestedRange)
				}
			}
			// Same as Evict() if replacements is empty.
			if err := evictToken.EvictAndReplace(ctx, replacements...); err != nil {
				return response{pErr: roachpb.NewError(err)}
			}
			// On addressing errors (likely a split), we need to re-invoke
			// the range descriptor lookup machinery, so we recurse by
			// sending batch to just the partial span this descriptor was
			// supposed to cover.
			log.VEventf(ctx, 1, "likely split; resending batch to span: %s", tErr)
			reply, pErr = ds.divideAndSendBatchToRanges(ctx, ba, intersected, isFirst)
			return response{reply: reply, pErr: pErr}
		}
		break
	}

	// Propagate error if either the retry closer or context done
	// channels were closed.
	if pErr == nil {
		if pErr = ds.deduceRetryEarlyExitError(ctx); pErr == nil {
			log.Fatal(ctx, "exited retry loop without an error")
		}
	}

	return response{pErr: pErr}
}
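
The core of sendPartialBatch is a retry loop around a cached range descriptor: on a send failure it evicts the descriptor, backs off, re-fetches it, and tries again. The sketch below shows that shape in isolation; lookupDescriptor and sendOnce are illustrative stubs rather than the DistSender API, and the backoff numbers are arbitrary.

package main

import (
	"context"
	"errors"
	"fmt"
	"time"
)

// lookupDescriptor and sendOnce stand in for the descriptor cache lookup and
// the single-range RPC in the example above; both are illustrative stubs.
var errStaleDescriptor = errors.New("stale range descriptor")

func lookupDescriptor(ctx context.Context) (string, error) { return "r42 [a,c)", nil }

func sendOnce(ctx context.Context, desc string, attempt int) error {
	if attempt < 2 {
		return errStaleDescriptor // pretend the first attempts hit a stale cache
	}
	return nil
}

// sendWithRetry mirrors the shape of sendPartialBatch's loop: on a send error
// it drops the cached descriptor, backs off, re-fetches it, and tries again.
func sendWithRetry(ctx context.Context) error {
	var desc string
	backoff := 50 * time.Millisecond
	for attempt := 0; attempt < 5; attempt++ {
		if desc == "" {
			var err error
			if desc, err = lookupDescriptor(ctx); err != nil {
				return err
			}
		}
		if err := sendOnce(ctx, desc, attempt); err != nil {
			fmt.Printf("attempt %d failed: %v; evicting descriptor and retrying\n", attempt, err)
			desc = "" // evict, forcing a re-lookup on the next iteration
			select {
			case <-time.After(backoff):
				backoff *= 2
			case <-ctx.Done():
				return ctx.Err()
			}
			continue
		}
		return nil
	}
	return errors.New("retries exhausted")
}

func main() {
	if err := sendWithRetry(context.Background()); err != nil {
		fmt.Println("error:", err)
	} else {
		fmt.Println("sent successfully")
	}
}
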
Example No. 7
// execStmtInOpenTxn executes one statement in the context
// of the planner's transaction (which is assumed to exist).
// It handles statements that affect the transaction state (BEGIN, COMMIT)
// and delegates everything else to `execStmt`.
// It binds placeholders.
//
// The current transaction might be committed/rolled back when this returns.
// It might also have transitioned to the aborted or RestartWait state.
//
// Args:
// implicitTxn: set if the current transaction was implicitly
//  created by the system (i.e. the client sent the statement outside of
//  a transaction).
//  COMMIT/ROLLBACK statements are rejected if set. Also, the transaction
//  might be auto-committed in this function.
// firstInTxn: set for the first statement in a transaction. Used
//  so that nested BEGIN statements are caught.
// stmtTimestamp: Used as the statement_timestamp().
//
// Returns:
// - a Result
// - an error, if any. In case of error, the result returned also reflects this error.
func (e *Executor) execStmtInOpenTxn(
	stmt parser.Statement, planMaker *planner, implicitTxn bool, firstInTxn bool, txnState *txnState,
) (Result, error) {
	if txnState.State != Open {
		panic("execStmtInOpenTxn called outside of an open txn")
	}
	if planMaker.txn == nil {
		panic("execStmtInOpenTxn called with a txn not set on the planner")
	}

	planMaker.evalCtx.SetTxnTimestamp(txnState.sqlTimestamp)
	planMaker.evalCtx.SetStmtTimestamp(e.cfg.Clock.PhysicalTime())

	session := planMaker.session
	log.Eventf(session.context, "%s", stmt)

	// TODO(cdo): Figure out how to not double count on retries.
	e.updateStmtCounts(stmt)
	switch s := stmt.(type) {
	case *parser.BeginTransaction:
		if !firstInTxn {
			txnState.updateStateAndCleanupOnErr(errTransactionInProgress, e)
			return Result{Err: errTransactionInProgress}, errTransactionInProgress
		}
	case *parser.CommitTransaction:
		if implicitTxn {
			return e.noTransactionHelper(txnState)
		}
		// CommitTransaction is executed fully here; there's no planNode for it
		// and the planner is not involved at all.
		res, err := commitSQLTransaction(txnState, planMaker, commit, e)
		return res, err
	case *parser.ReleaseSavepoint:
		if implicitTxn {
			return e.noTransactionHelper(txnState)
		}
		if err := parser.ValidateRestartCheckpoint(s.Savepoint); err != nil {
			return Result{Err: err}, err
		}
		// ReleaseSavepoint is executed fully here; there's no planNode for it
		// and the planner is not involved at all.
		res, err := commitSQLTransaction(txnState, planMaker, release, e)
		return res, err
	case *parser.RollbackTransaction:
		if implicitTxn {
			return e.noTransactionHelper(txnState)
		}
		// RollbackTransaction is executed fully here; there's no planNode for it
		// and the planner is not involved at all.
		// Notice that we don't return any errors on rollback.
		return rollbackSQLTransaction(txnState, planMaker), nil
	case *parser.SetTransaction:
		if implicitTxn {
			return e.noTransactionHelper(txnState)
		}
	case *parser.Savepoint:
		if implicitTxn {
			return e.noTransactionHelper(txnState)
		}
		if err := parser.ValidateRestartCheckpoint(s.Name); err != nil {
			return Result{Err: err}, err
		}
		// We want to disallow SAVEPOINTs to be issued after a transaction has
		// started running, but such enforcement is problematic in the
		// presence of transaction retries (since the transaction proto is
		// necessarily reused). To work around this, we keep track of the
		// transaction's retrying state and special-case SAVEPOINT when it is
		// set.
		//
		// TODO(andrei): the check for retrying is a hack - we erroneously
		// allow SAVEPOINT to be issued at any time during a retry, not just
		// in the beginning. We should figure out how to track whether we
		// started using the transaction during a retry.
		if txnState.txn.Proto.IsInitialized() && !txnState.retrying {
			err := fmt.Errorf("SAVEPOINT %s needs to be the first statement in a transaction",
				parser.RestartSavepointName)
			txnState.updateStateAndCleanupOnErr(err, e)
			return Result{Err: err}, err
		}
		// Note that Savepoint doesn't have a corresponding plan node.
		// This here is all the execution there is.
		txnState.retryIntent = true
		return Result{}, nil
	case *parser.RollbackToSavepoint:
		err := parser.ValidateRestartCheckpoint(s.Savepoint)
		if err == nil {
			// Can't restart if we didn't get an error first, which would've put the
			// txn in a different state.
			err = errNotRetriable
		}
		txnState.updateStateAndCleanupOnErr(err, e)
		return Result{Err: err}, err
	case *parser.Prepare:
		err := util.UnimplementedWithIssueErrorf(7568,
			"Prepared statements are supported only via the Postgres wire protocol")
		txnState.updateStateAndCleanupOnErr(err, e)
		return Result{Err: err}, err
	case *parser.Execute:
		err := util.UnimplementedWithIssueErrorf(7568,
			"Executing prepared statements is supported only via the Postgres wire protocol")
		txnState.updateStateAndCleanupOnErr(err, e)
		return Result{Err: err}, err
	case *parser.Deallocate:
		if s.Name == "" {
			planMaker.session.PreparedStatements.DeleteAll()
		} else {
			if found := planMaker.session.PreparedStatements.Delete(string(s.Name)); !found {
				err := fmt.Errorf("prepared statement %s does not exist", s.Name)
				txnState.updateStateAndCleanupOnErr(err, e)
				return Result{Err: err}, err
			}
		}
		return Result{PGTag: s.StatementTag()}, nil
	}

	autoCommit := implicitTxn && !e.cfg.TestingKnobs.DisableAutoCommit
	result, err := e.execStmt(stmt, planMaker, autoCommit)
	if err != nil {
		if result.Rows != nil {
			result.Rows.Close()
			result.Rows = nil
		}
		if traceSQL {
			log.ErrEventf(txnState.txn.Context, "ERROR: %v", err)
		}
		log.ErrEventf(session.context, "ERROR: %v", err)
		txnState.updateStateAndCleanupOnErr(err, e)
		return Result{Err: err}, err
	}

	tResult := &traceResult{tag: result.PGTag, count: -1}
	switch result.Type {
	case parser.RowsAffected:
		tResult.count = result.RowsAffected
	case parser.Rows:
		tResult.count = result.Rows.Len()
	}
	if traceSQL {
		log.Eventf(txnState.txn.Context, "%s done", tResult)
	}
	log.Eventf(session.context, "%s done", tResult)
	return result, nil
}
Example No. 8
func (n *Node) batchInternal(
	ctx context.Context, args *roachpb.BatchRequest,
) (*roachpb.BatchResponse, error) {
	// TODO(marc): grpc's authentication model (which gives credential access in
	// the request handler) doesn't really fit with the current design of the
	// security package (which assumes that TLS state is only given at connection
	// time) - that should be fixed.
	if peer, ok := peer.FromContext(ctx); ok {
		if tlsInfo, ok := peer.AuthInfo.(credentials.TLSInfo); ok {
			certUser, err := security.GetCertificateUser(&tlsInfo.State)
			if err != nil {
				return nil, err
			}
			if certUser != security.NodeUser {
				return nil, errors.Errorf("user %s is not allowed", certUser)
			}
		}
	}

	var br *roachpb.BatchResponse

	type snowballInfo struct {
		syncutil.Mutex
		collectedSpans [][]byte
		done           bool
	}
	var snowball *snowballInfo

	if err := n.stopper.RunTaskWithErr(func() error {
		const opName = "node.Batch"
		sp, err := tracing.JoinOrNew(n.storeCfg.AmbientCtx.Tracer, args.TraceContext, opName)
		if err != nil {
			return err
		}
		// If this is a snowball span, it gets special treatment: It skips the
		// regular tracing machinery, and we instead send the collected spans
		// back with the response. This is more expensive, but then again,
		// those are individual requests traced by users, so they can be.
		if sp.BaggageItem(tracing.Snowball) != "" {
			sp.LogEvent("delegating to snowball tracing")
			sp.Finish()

			snowball = new(snowballInfo)
			recorder := func(rawSpan basictracer.RawSpan) {
				snowball.Lock()
				defer snowball.Unlock()
				if snowball.done {
					// This is a late span that we must discard because the request was
					// already completed.
					return
				}
				encSp, err := tracing.EncodeRawSpan(&rawSpan, nil)
				if err != nil {
					log.Warning(ctx, err)
				}
				snowball.collectedSpans = append(snowball.collectedSpans, encSp)
			}

			if sp, err = tracing.JoinOrNewSnowball(opName, args.TraceContext, recorder); err != nil {
				return err
			}
		}
		defer sp.Finish()
		traceCtx := opentracing.ContextWithSpan(ctx, sp)
		log.Event(traceCtx, args.Summary())

		tStart := timeutil.Now()
		var pErr *roachpb.Error
		br, pErr = n.stores.Send(traceCtx, *args)
		if pErr != nil {
			br = &roachpb.BatchResponse{}
			log.ErrEventf(traceCtx, "%T", pErr.GetDetail())
		}
		if br.Error != nil {
			panic(roachpb.ErrorUnexpectedlySet(n.stores, br))
		}
		n.metrics.callComplete(timeutil.Since(tStart), pErr)
		br.Error = pErr
		return nil
	}); err != nil {
		return nil, err
	}

	if snowball != nil {
		snowball.Lock()
		br.CollectedSpans = snowball.collectedSpans
		snowball.done = true
		snowball.Unlock()
	}

	return br, nil
}
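
The authentication block at the top of batchInternal pulls the TLS identity of the gRPC peer out of the request context and rejects callers whose certificate does not identify the node user. Below is a stripped-down sketch of that check using the public grpc peer and credentials packages; allowedUser and checkPeer are illustrative, and comparing against the certificate's common name only approximates what security.GetCertificateUser does in the example.

package main

import (
	"context"
	"errors"
	"fmt"

	"google.golang.org/grpc/credentials"
	"google.golang.org/grpc/peer"
)

// allowedUser is an illustrative identity; the upstream code compares against
// security.NodeUser.
const allowedUser = "node"

// checkPeer extracts the TLS client certificate's common name from a gRPC
// handler context and rejects any identity other than allowedUser.
func checkPeer(ctx context.Context) error {
	p, ok := peer.FromContext(ctx)
	if !ok {
		return nil // no peer info (e.g. in-process call); skip the check
	}
	tlsInfo, ok := p.AuthInfo.(credentials.TLSInfo)
	if !ok {
		return nil // not a TLS connection; skip the check
	}
	certs := tlsInfo.State.PeerCertificates
	if len(certs) == 0 {
		return errors.New("no client certificate presented")
	}
	if user := certs[0].Subject.CommonName; user != allowedUser {
		return fmt.Errorf("user %s is not allowed", user)
	}
	return nil
}

func main() {
	// In a real server this context comes from a gRPC handler; with a bare
	// context there is no peer and the check is simply skipped.
	fmt.Println(checkPeer(context.Background()))
}
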
Example No. 9
func (rq *replicateQueue) shouldQueue(
	ctx context.Context, now hlc.Timestamp, repl *Replica, sysCfg config.SystemConfig,
) (shouldQ bool, priority float64) {
	if !repl.store.splitQueue.Disabled() && repl.needsSplitBySize() {
		// If the range exceeds the split threshold, let that finish first.
		// Ranges must fit in memory on both sender and receiver nodes while
		// being replicated. This supplements the check provided by
		// acceptsUnsplitRanges, which looks at zone config boundaries rather
		// than data size.
		//
		// This check is ignored if the split queue is disabled, since in that
		// case, the split will never come.
		return
	}

	// Find the zone config for this range.
	desc := repl.Desc()
	zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey)
	if err != nil {
		log.Error(ctx, err)
		return
	}

	action, priority := rq.allocator.ComputeAction(zone, desc)
	if action != AllocatorNoop {
		if log.V(2) {
			log.Infof(ctx, "%s repair needed (%s), enqueuing", repl, action)
		}
		return true, priority
	}

	// If we hold the lease, check to see if we should transfer it.
	var leaseStoreID roachpb.StoreID
	if lease, _ := repl.getLease(); lease != nil && lease.Covers(now) {
		leaseStoreID = lease.Replica.StoreID
		if rq.canTransferLease() &&
			rq.allocator.ShouldTransferLease(zone.Constraints, leaseStoreID, desc.RangeID) {
			if log.V(2) {
				log.Infof(ctx, "%s lease transfer needed, enqueuing", repl)
			}
			return true, 0
		}
	}

	// Check for a rebalancing opportunity. Note that leaseStoreID will be 0 if
	// the range doesn't currently have a lease which will allow the current
	// replica to be considered a rebalancing source.
	target, err := rq.allocator.RebalanceTarget(
		zone.Constraints,
		desc.Replicas,
		leaseStoreID,
		desc.RangeID,
	)
	if err != nil {
		log.ErrEventf(ctx, "rebalance target failed: %s", err)
		return false, 0
	}
	if log.V(2) {
		if target != nil {
			log.Infof(ctx, "%s rebalance target found, enqueuing", repl)
		} else {
			log.Infof(ctx, "%s no rebalance target found, not enqueuing", repl)
		}
	}
	return target != nil, 0
}
Example No. 10
func (rq *replicateQueue) processOneChange(
	ctx context.Context, now hlc.Timestamp, repl *Replica, sysCfg config.SystemConfig,
) error {
	desc := repl.Desc()
	// Find the zone config for this range.
	zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey)
	if err != nil {
		return err
	}
	action, _ := rq.allocator.ComputeAction(zone, desc)

	// Avoid taking action if the range has too many dead replicas to make
	// quorum.
	deadReplicas := rq.allocator.storePool.deadReplicas(desc.RangeID, desc.Replicas)
	quorum := computeQuorum(len(desc.Replicas))
	liveReplicaCount := len(desc.Replicas) - len(deadReplicas)
	if liveReplicaCount < quorum {
		return errors.Errorf("range requires a replication change, but lacks a quorum of live nodes.")
	}

	switch action {
	case AllocatorAdd:
		log.Event(ctx, "adding a new replica")
		newStore, err := rq.allocator.AllocateTarget(
			zone.Constraints,
			desc.Replicas,
			desc.RangeID,
			true,
		)
		if err != nil {
			return err
		}
		newReplica := roachpb.ReplicaDescriptor{
			NodeID:  newStore.Node.NodeID,
			StoreID: newStore.StoreID,
		}

		log.VEventf(ctx, 1, "adding replica to %+v due to under-replication", newReplica)
		if err := rq.addReplica(ctx, repl, newReplica, desc); err != nil {
			return err
		}
	case AllocatorRemove:
		log.Event(ctx, "removing a replica")
		// If the lease holder (our local store) is an overfull store (in terms of
		// leases) allow transferring the lease away.
		leaseHolderStoreID := repl.store.StoreID()
		if rq.allocator.ShouldTransferLease(zone.Constraints, leaseHolderStoreID, desc.RangeID) {
			leaseHolderStoreID = 0
		}
		removeReplica, err := rq.allocator.RemoveTarget(
			zone.Constraints,
			desc.Replicas,
			leaseHolderStoreID,
		)
		if err != nil {
			return err
		}
		if removeReplica.StoreID == repl.store.StoreID() {
			// The local replica was selected as the removal target, but that replica
			// is the leaseholder, so transfer the lease instead. We don't check that
			// the current store has too many leases in this case under the
			// assumption that replica balance is a greater concern. Also note that
			// AllocatorRemove action takes preference over AllocatorNoop
			// (rebalancing) which is where lease transfer would otherwise occur. We
			// need to be able to transfer leases in AllocatorRemove in order to get
			// out of situations where this store is overfull and yet holds all the
			// leases.
			candidates := filterBehindReplicas(repl.RaftStatus(), desc.Replicas)
			target := rq.allocator.TransferLeaseTarget(
				zone.Constraints, candidates, repl.store.StoreID(), desc.RangeID,
				false /* checkTransferLeaseSource */)
			if target != (roachpb.ReplicaDescriptor{}) {
				log.VEventf(ctx, 1, "transferring lease to s%d", target.StoreID)
				if err := repl.AdminTransferLease(target.StoreID); err != nil {
					return errors.Wrapf(err, "%s: unable to transfer lease to s%d", repl, target.StoreID)
				}
				rq.lastLeaseTransfer.Store(timeutil.Now())
				// Do not requeue as we transferred our lease away.
				return nil
			}
		} else {
			log.VEventf(ctx, 1, "removing replica %+v due to over-replication", removeReplica)
			if err := rq.removeReplica(ctx, repl, removeReplica, desc); err != nil {
				return err
			}
		}
	case AllocatorRemoveDead:
		log.Event(ctx, "removing a dead replica")
		if len(deadReplicas) == 0 {
			if log.V(1) {
				log.Warningf(ctx, "Range of replica %s was identified as having dead replicas, but no dead replicas were found.", repl)
			}
			break
		}
		deadReplica := deadReplicas[0]
		log.VEventf(ctx, 1, "removing dead replica %+v from store", deadReplica)
		if err := repl.ChangeReplicas(ctx, roachpb.REMOVE_REPLICA, deadReplica, desc); err != nil {
			return err
		}
	case AllocatorNoop:
		// The Noop case will result if this replica was queued in order to
		// rebalance. Attempt to find a rebalancing target.
		log.Event(ctx, "considering a rebalance")

		if rq.canTransferLease() {
			// We require the lease in order to process replicas, so
			// repl.store.StoreID() corresponds to the lease-holder's store ID.
			candidates := filterBehindReplicas(repl.RaftStatus(), desc.Replicas)
			target := rq.allocator.TransferLeaseTarget(
				zone.Constraints, candidates, repl.store.StoreID(), desc.RangeID,
				true /* checkTransferLeaseSource */)
			if target != (roachpb.ReplicaDescriptor{}) {
				log.VEventf(ctx, 1, "transferring lease to s%d", target.StoreID)
				if err := repl.AdminTransferLease(target.StoreID); err != nil {
					return errors.Wrapf(err, "%s: unable to transfer lease to s%d", repl, target.StoreID)
				}
				rq.lastLeaseTransfer.Store(timeutil.Now())
				// Do not requeue as we transferred our lease away.
				return nil
			}
		}

		rebalanceStore, err := rq.allocator.RebalanceTarget(
			zone.Constraints,
			desc.Replicas,
			repl.store.StoreID(),
			desc.RangeID,
		)
		if err != nil {
			log.ErrEventf(ctx, "rebalance target failed %s", err)
			return nil
		}
		if rebalanceStore == nil {
			log.VEventf(ctx, 1, "no suitable rebalance target")
			// No action was necessary and no rebalance target was found. Return
			// without re-queuing this replica.
			return nil
		}
		rebalanceReplica := roachpb.ReplicaDescriptor{
			NodeID:  rebalanceStore.Node.NodeID,
			StoreID: rebalanceStore.StoreID,
		}
		log.VEventf(ctx, 1, "rebalancing to %+v", rebalanceReplica)
		if err := rq.addReplica(ctx, repl, rebalanceReplica, desc); err != nil {
			return err
		}
	}

	return nil
}
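
Before acting, processOneChange refuses to make a replication change when so many replicas are dead that the range cannot reach quorum. A small worked sketch of that guard, assuming the usual majority quorum of n/2+1 (the upstream computeQuorum helper is not shown above) and illustrative replica counts:

package main

import "fmt"

// computeQuorum here is the plain majority rule; this is an assumption about
// the helper used above, which is not shown in the example.
func computeQuorum(replicas int) int { return replicas/2 + 1 }

func main() {
	total, dead := 5, 3
	quorum := computeQuorum(total)
	live := total - dead
	if live < quorum {
		fmt.Printf("only %d of %d replicas live, need %d for quorum: skip the change\n", live, total, quorum)
		return
	}
	fmt.Println("quorum available, safe to make a replication change")
}
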
Example No. 11
// sendToReplicas sends one or more RPCs to clients specified by the
// slice of replicas. On success, Send returns the first successful
// reply. If an error occurs which is not specific to a single
// replica, it's returned immediately. Otherwise, when all replicas
// have been tried and failed, returns a send error.
func (ds *DistSender) sendToReplicas(
	ctx context.Context,
	opts SendOptions,
	rangeID roachpb.RangeID,
	replicas ReplicaSlice,
	args roachpb.BatchRequest,
	rpcContext *rpc.Context,
) (*roachpb.BatchResponse, error) {
	if len(replicas) < 1 {
		return nil, roachpb.NewSendError(
			fmt.Sprintf("insufficient replicas (%d) to satisfy send request of %d",
				len(replicas), 1))
	}

	var ambiguousResult bool
	var haveCommit bool
	// We only check for committed txns, not aborts because aborts may
	// be retried without any risk of inconsistencies.
	if etArg, ok := args.GetArg(roachpb.EndTransaction); ok &&
		etArg.(*roachpb.EndTransactionRequest).Commit {
		haveCommit = true
	}
	done := make(chan BatchCall, len(replicas))

	transportFactory := opts.transportFactory
	if transportFactory == nil {
		transportFactory = grpcTransportFactory
	}
	transport, err := transportFactory(opts, rpcContext, replicas, args)
	if err != nil {
		return nil, err
	}
	defer transport.Close()
	if transport.IsExhausted() {
		return nil, roachpb.NewSendError(
			fmt.Sprintf("sending to all %d replicas failed", len(replicas)))
	}

	// Send the first request.
	pending := 1
	log.VEventf(ctx, 2, "sending batch %s to range %d", args.Summary(), rangeID)
	transport.SendNext(ctx, done)

	// Wait for completions. This loop will retry operations that fail
	// with errors that reflect per-replica state and may succeed on
	// other replicas.
	var sendNextTimer timeutil.Timer
	defer sendNextTimer.Stop()
	slowTimer := time.After(base.SlowRequestThreshold)
	for {
		sendNextTimer.Reset(opts.SendNextTimeout)
		select {
		case <-sendNextTimer.C:
			sendNextTimer.Read = true
			// On successive RPC timeouts, send to additional replicas if available.
			if !transport.IsExhausted() {
				ds.metrics.SendNextTimeoutCount.Inc(1)
				log.VEventf(ctx, 2, "timeout, trying next peer")
				pending++
				transport.SendNext(ctx, done)
			}

		case <-slowTimer:
			slowTimer = nil
			log.Warningf(ctx, "have been waiting %s sending RPC for batch: %s",
				base.SlowRequestThreshold, args.Summary())
			ds.metrics.SlowRequestsCount.Inc(1)
			defer ds.metrics.SlowRequestsCount.Dec(1)

		case call := <-done:
			pending--
			err := call.Err
			if err == nil {
				if log.V(2) {
					log.Infof(ctx, "RPC reply: %s", call.Reply)
				}

				if call.Reply.Error == nil {
					return call.Reply, nil
				} else if !ds.handlePerReplicaError(ctx, transport, rangeID, call.Reply.Error) {
					// The error received is not specific to this replica, so we
					// should return it instead of trying other replicas. However,
					// if we're trying to commit a transaction and there are
					// still other RPCs outstanding or an ambiguous RPC error
					// was already received, we must return an ambiguous commit
				// error instead of the returned error.
					log.ErrEventf(ctx, "application error: %s", call.Reply.Error)
					if haveCommit && (pending > 0 || ambiguousResult) {
						log.ErrEventf(ctx, "returning ambiguous result (pending=%d)", pending)
						return nil, roachpb.NewAmbiguousResultError(
							fmt.Sprintf("error=%s, pending RPCs=%d", call.Reply.Error, pending))
					}
					return call.Reply, nil
				}

				// Extract the detail so it can be included in the error
				// message if this is our last replica.
				//
				// TODO(bdarnell): The last error is not necessarily the best
				// one to return; we may want to remember the "best" error
				// we've seen (for example, a NotLeaseHolderError conveys more
				// information than a RangeNotFound).
				log.ErrEventf(ctx, "application error: %s", call.Reply.Error)
				err = call.Reply.Error.GoError()
			} else {
				// All connection errors except for an unavailable node (this
				// is GRPC's fail-fast error), may mean that the request
				// succeeded on the remote server, but we were unable to
				// receive the reply. Set the ambiguous commit flag.
				//
				// We retry ambiguous commit batches to avoid returning the
				// unrecoverable AmbiguousResultError. This is safe because
				// repeating an already-successfully applied batch is
				// guaranteed to return either a TransactionReplayError (in
				// case the replay happens at the original leader), or a
				// TransactionRetryError (in case the replay happens at a new
				// leader). If the original attempt merely timed out or was
				// lost, then the batch will succeed and we can be assured the
				// commit was applied just once.
				//
				// The Unavailable code is used by GRPC to indicate that a
				// request fails fast and is not sent, so we can be sure there
				// is no ambiguity on these errors. Note that these are common
				// if a node is down.
				// See https://github.com/grpc/grpc-go/blob/52f6504dc290bd928a8139ba94e3ab32ed9a6273/call.go#L182
				// See https://github.com/grpc/grpc-go/blob/52f6504dc290bd928a8139ba94e3ab32ed9a6273/stream.go#L158
				if haveCommit && grpc.Code(err) != codes.Unavailable {
					log.ErrEventf(ctx, "txn may have committed despite RPC error: %s", err)
					ambiguousResult = true
				} else {
					log.ErrEventf(ctx, "RPC error: %s", err)
				}
			}

			// Send to additional replicas if available.
			if !transport.IsExhausted() {
				ds.metrics.NextReplicaErrCount.Inc(1)
				log.VEventf(ctx, 2, "error, trying next peer")
				pending++
				transport.SendNext(ctx, done)
			}
			if pending == 0 {
				if ambiguousResult {
					err = roachpb.NewAmbiguousResultError(
						fmt.Sprintf("sending to all %d replicas failed; last error: %v, "+
							"but RPC failure may have masked txn commit", len(replicas), err))
				} else {
					err = roachpb.NewSendError(
						fmt.Sprintf("sending to all %d replicas failed; last error: %v", len(replicas), err),
					)
				}
				log.ErrEvent(ctx, err.Error())
				return nil, err
			}
		}
	}
}
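
sendToReplicas fans requests out over a buffered channel, keeps a count of pending RPCs, escalates to the next replica on error or timeout, and returns the first successful reply. The sketch below reproduces that loop shape with stub types; result, send, and firstSuccess are illustrative and omit the ambiguous-commit handling discussed in the comments above.

package main

import (
	"context"
	"errors"
	"fmt"
	"time"
)

// result stands in for BatchCall in the example above: one reply per attempted
// replica. The send function is an illustrative stub.
type result struct {
	replica int
	err     error
}

func send(ctx context.Context, replica int, done chan<- result) {
	go func() {
		// Pretend replicas 0 and 1 are down and replica 2 answers.
		if replica < 2 {
			done <- result{replica: replica, err: errors.New("connection refused")}
			return
		}
		done <- result{replica: replica}
	}()
}

// firstSuccess mirrors the shape of sendToReplicas' loop: try one replica,
// escalate to the next on error or timeout, and return the first success.
func firstSuccess(ctx context.Context, numReplicas int, perReplicaTimeout time.Duration) (int, error) {
	done := make(chan result, numReplicas)
	next, pending := 0, 0
	trySend := func() {
		if next < numReplicas {
			send(ctx, next, done)
			next++
			pending++
		}
	}
	trySend()
	var lastErr error
	for pending > 0 {
		select {
		case <-time.After(perReplicaTimeout):
			trySend() // slow replica: also try the next one in parallel
		case r := <-done:
			pending--
			if r.err == nil {
				return r.replica, nil
			}
			lastErr = r.err
			trySend()
		case <-ctx.Done():
			return -1, ctx.Err()
		}
	}
	return -1, fmt.Errorf("all %d replicas failed; last error: %v", numReplicas, lastErr)
}

func main() {
	r, err := firstSuccess(context.Background(), 3, 100*time.Millisecond)
	fmt.Println(r, err)
}
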