// executeCmd interprets the given message as a *roachpb.BatchRequest and sends it
// via the local sender.
func (n *Node) executeCmd(argsI proto.Message) (proto.Message, error) {
	ba := argsI.(*roachpb.BatchRequest)
	var br *roachpb.BatchResponse

	f := func() {
		// TODO(tschottdorf) get a hold of the client's ID, add it to the
		// context before dispatching, and create an ID for tracing the request.
		sp := n.ctx.Tracer.StartSpan("node")
		defer sp.Finish()
		ctx, _ := opentracing.ContextWithSpan((*Node)(n).context(), sp)

		tStart := time.Now()
		var pErr *roachpb.Error
		br, pErr = n.stores.Send(ctx, *ba)
		if pErr != nil {
			br = &roachpb.BatchResponse{}
			sp.LogEvent(fmt.Sprintf("error: %T", pErr.GetDetail()))
		}
		if br.Error != nil {
			panic(roachpb.ErrorUnexpectedlySet(n.stores, br))
		}
		n.feed.CallComplete(*ba, time.Now().Sub(tStart), pErr)
		br.Error = pErr
	}

	if !n.stopper.RunTask(f) {
		return nil, util.Errorf("node %d stopped", n.Descriptor.NodeID)
	}
	return br, nil
}
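// A minimal sketch (not from the original sources) of the stopper guard used
// above: the work only runs if the stopper still accepts tasks, otherwise the
// caller gets an error instead of racing with shutdown. The *stop.Stopper type
// and util.Errorf are assumed to behave as in executeCmd.
func runGuarded(stopper *stop.Stopper, nodeID roachpb.NodeID, work func()) error {
	if !stopper.RunTask(work) {
		// Mirror executeCmd: refuse the request once the node is draining.
		return util.Errorf("node %d stopped", nodeID)
	}
	return nil
}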
func convertBatchError(tableDesc *TableDescriptor, b client.Batch, origPErr *roachpb.Error) *roachpb.Error {
	if origPErr.Index == nil {
		return origPErr
	}
	index := origPErr.Index.Index
	if index >= int32(len(b.Results)) {
		panic(fmt.Sprintf("index %d outside of results: %+v", index, b.Results))
	}
	result := b.Results[index]
	if _, ok := origPErr.GoError().(*roachpb.ConditionFailedError); ok {
		for _, row := range result.Rows {
			indexID, key, pErr := decodeIndexKeyPrefix(tableDesc, row.Key)
			if pErr != nil {
				return pErr
			}
			index, pErr := tableDesc.FindIndexByID(indexID)
			if pErr != nil {
				return pErr
			}
			valTypes, pErr := makeKeyVals(tableDesc, index.ColumnIDs)
			if pErr != nil {
				return pErr
			}
			vals := make([]parser.Datum, len(valTypes))
			if _, pErr := decodeKeyVals(valTypes, vals, key); pErr != nil {
				return pErr
			}
			return roachpb.NewError(errUniquenessConstraintViolation{index: index, vals: vals})
		}
	}
	return origPErr
}
// If we hit an error and there is a pending transaction, roll back
// the transaction before returning. The client does not have to
// deal with cleaning up transaction state.
func makeResultFromError(planMaker *planner, pErr *roachpb.Error) Result {
	if planMaker.txn != nil {
		if _, ok := pErr.GoError().(*roachpb.SqlTransactionAbortedError); !ok {
			planMaker.txn.Cleanup(pErr)
		}
	}
	return Result{Err: pErr.GoError()}
}
// TestTxnCoordSenderEndTxn verifies that ending a transaction // sends resolve write intent requests and removes the transaction // from the txns map. func TestTxnCoordSenderEndTxn(t *testing.T) { defer leaktest.AfterTest(t) s := createTestDB(t) defer s.Stop() // 4 cases: no deadline, past deadline, equal deadline, future deadline. for i := 0; i < 4; i++ { key := roachpb.Key("key: " + strconv.Itoa(i)) txn := client.NewTxn(*s.DB) // Initialize the transaction if pErr := txn.Put(key, []byte("value")); pErr != nil { t.Fatal(pErr) } { var pErr *roachpb.Error switch i { case 0: // No deadline. pErr = txn.Commit() case 1: // Past deadline. pErr = txn.CommitBy(txn.Proto.Timestamp.Prev()) case 2: // Equal deadline. pErr = txn.CommitBy(txn.Proto.Timestamp) case 3: // Future deadline. pErr = txn.CommitBy(txn.Proto.Timestamp.Next()) } switch i { case 0: // No deadline. if pErr != nil { t.Error(pErr) } case 1: // Past deadline. if _, ok := pErr.GoError().(*roachpb.TransactionAbortedError); !ok { t.Errorf("expected TransactionAbortedError but got %T: %s", pErr, pErr) } case 2: // Equal deadline. if pErr != nil { t.Error(pErr) } case 3: // Future deadline. if pErr != nil { t.Error(pErr) } } } verifyCleanup(key, s.Sender, s.Eng, t) } }
func (c *v3Conn) sendPError(pErr *roachpb.Error) error {
	var errCode string
	if sqlErr, ok := pErr.GetDetail().(*roachpb.ErrorWithPGCode); ok {
		errCode = sqlErr.ErrorCode
	} else {
		errCode = sql.CodeInternalError
	}
	return c.sendError(errCode, pErr.String())
}
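// A small sketch (assumption, not part of the original code) that factors the
// error-code selection out of sendPError so it could be reused or tested in
// isolation. roachpb.ErrorWithPGCode and sql.CodeInternalError are taken from
// the snippet above.
func pgErrorCode(pErr *roachpb.Error) string {
	if sqlErr, ok := pErr.GetDetail().(*roachpb.ErrorWithPGCode); ok {
		return sqlErr.ErrorCode
	}
	return sql.CodeInternalError
}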
// processWriteIntentError tries to push the conflicting // transaction(s) responsible for the given WriteIntentError, and to // resolve those intents if possible. Returns a new error to be used // in place of the original. // // The returned error may be a copy of the original WriteIntentError, // with or without the Resolved flag set, which governs the client's // retry behavior (if the transaction is pushed, the Resolved flag is // set to tell the client to retry immediately; otherwise it is false // to cause the client to back off). func (ir *intentResolver) processWriteIntentError(ctx context.Context, wiPErr *roachpb.Error, args roachpb.Request, h roachpb.Header, pushType roachpb.PushTxnType) *roachpb.Error { wiErr, ok := wiPErr.GetDetail().(*roachpb.WriteIntentError) if !ok { return roachpb.NewErrorf("not a WriteIntentError: %v", wiPErr) } if log.V(6) { log.Infof(ctx, "resolving write intent %s", wiErr) } method := args.Method() readOnly := roachpb.IsReadOnly(args) // TODO(tschottdorf): pass as param resolveIntents, pushErr := ir.maybePushTransactions(ctx, wiErr.Intents, h, pushType, false) if resErr := ir.resolveIntents(ctx, resolveIntents, false /* !wait */, pushType == roachpb.PUSH_ABORT /* poison */); resErr != nil { // When resolving without waiting, errors should not // usually be returned here, although there are some cases // when they may be (especially when a test cluster is in // the process of shutting down). log.Warningf(ctx, "asynchronous resolveIntents failed: %s", resErr) } if pushErr != nil { if log.V(1) { log.Infof(ctx, "on %s: %s", method, pushErr) } if _, isExpected := pushErr.GetDetail().(*roachpb.TransactionPushError); !isExpected { // If an unexpected error occurred, make sure it bubbles up to the // client. Examples are timeouts and logic errors. return pushErr } // For write/write conflicts within a transaction, propagate the // push failure, not the original write intent error. The push // failure will instruct the client to restart the transaction // with a backoff. if h.Txn != nil && h.Txn.ID != nil && !readOnly { return pushErr } // For read/write conflicts, and non-transactional write/write // conflicts, return the write intent error which engages // backoff/retry (with !Resolved). We don't need to restart the // txn, only resend the read with a backoff. return wiPErr } // We pushed all transactions, so tell the client everything's // resolved and it can retry immediately. wiErr.Resolved = true return wiPErr // references wiErr }
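// A hedged sketch (hypothetical helper, not from the original sources) of how
// a client-side retry loop might interpret the Resolved flag that
// processWriteIntentError sets: if the conflicting transactions were pushed
// (Resolved == true), the request can be retried immediately; otherwise the
// caller should back off before retrying, as described in the comment above.
func retryImmediately(pErr *roachpb.Error) bool {
	if wiErr, ok := pErr.GetDetail().(*roachpb.WriteIntentError); ok {
		return wiErr.Resolved
	}
	return false
}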
// If we hit an error and there is a pending transaction, roll back
// the transaction before returning. The client does not have to
// deal with cleaning up transaction state.
func makeResultFromError(planMaker *planner, pErr *roachpb.Error) driver.Response_Result {
	if planMaker.txn != nil {
		if _, ok := pErr.GoError().(*roachpb.SqlTransactionAbortedError); !ok {
			planMaker.txn.Cleanup(pErr)
		}
	}
	errString := pErr.GoError().Error()
	return driver.Response_Result{Error: &errString}
}
// Exec executes fn in the context of a distributed transaction. // Execution is controlled by opt (see comments in TxnExecOptions). // // opt is passed to fn, and it's valid for fn to modify opt as it sees // fit during each execution attempt. // // It's valid for txn to be nil (meaning the txn has already aborted) if fn // can handle that. This is useful for continuing transactions that have been // aborted because of an error in a previous batch of statements in the hope // that a ROLLBACK will reset the state. Neither opt.AutoRetry not opt.AutoCommit // can be set in this case. // // If an error is returned, the txn has been aborted. func (txn *Txn) Exec( opt TxnExecOptions, fn func(txn *Txn, opt *TxnExecOptions) *roachpb.Error) *roachpb.Error { // Run fn in a retry loop until we encounter a success or // error condition this loop isn't capable of handling. var pErr *roachpb.Error var retryOptions retry.Options if txn == nil && (opt.AutoRetry || opt.AutoCommit) { panic("asked to retry or commit a txn that is already aborted") } if opt.AutoRetry { retryOptions = txn.db.txnRetryOptions } RetryLoop: for r := retry.Start(retryOptions); r.Next(); { pErr = fn(txn, &opt) if (pErr == nil) && opt.AutoCommit && (txn.Proto.Status == roachpb.PENDING) { // fn succeeded, but didn't commit. pErr = txn.commit(nil) } if pErr == nil { break } // Make sure the txn record that pErr carries is for this txn. // We check only when txn.Proto.ID has been initialized after an initial successful send. if pErr.GetTxn() != nil && txn.Proto.ID != nil { if errTxn := pErr.GetTxn(); !errTxn.Equal(&txn.Proto) { return roachpb.NewErrorf("mismatching transaction record in the error:\n%s\nv.s.\n%s", errTxn, txn.Proto) } } if !opt.AutoRetry { break RetryLoop } switch pErr.TransactionRestart { case roachpb.TransactionRestart_IMMEDIATE: r.Reset() case roachpb.TransactionRestart_BACKOFF: default: break RetryLoop } if log.V(2) { log.Infof("automatically retrying transaction: %s because of error: %s", txn.DebugName(), pErr) } } if txn != nil { // TODO(andrei): don't do Cleanup() on retriable errors here. // Let the sql executor do it. txn.Cleanup(pErr) } return pErr }
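// An illustrative usage sketch (assumed caller, not from the original sources)
// of Exec with automatic retry and commit. The closure may run several times;
// with AutoCommit set, Exec commits on the closure's success. Put is assumed to
// return *roachpb.Error, as in the tests elsewhere in this collection.
func putWithRetry(txn *Txn, key roachpb.Key, value []byte) *roachpb.Error {
	opt := TxnExecOptions{AutoRetry: true, AutoCommit: true}
	return txn.Exec(opt, func(txn *Txn, opt *TxnExecOptions) *roachpb.Error {
		// Any retriable error returned here causes Exec to re-run the closure,
		// subject to the TransactionRestart_IMMEDIATE/_BACKOFF handling above.
		return txn.Put(key, value)
	})
}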
func encodeDTuple(b []byte, d parser.DTuple) ([]byte, error) {
	for _, val := range d {
		var pErr *roachpb.Error
		b, pErr = encodeDatum(b, val)
		if pErr != nil {
			return nil, pErr.GoError()
		}
	}
	return b, nil
}
// executeCmd interprets the given message as a *roachpb.BatchRequest and sends it // via the local sender. func (n *Node) executeCmd(argsI proto.Message) (proto.Message, error) { ba := argsI.(*roachpb.BatchRequest) var br *roachpb.BatchResponse opName := "node " + strconv.Itoa(int(n.Descriptor.NodeID)) // could save allocs here fail := func(err error) { br = &roachpb.BatchResponse{} br.Error = roachpb.NewError(err) } f := func() { sp, err := tracing.JoinOrNew(n.ctx.Tracer, ba.Trace, opName) if err != nil { fail(err) return } // If this is a snowball span, it gets special treatment: It skips the // regular tracing machinery, and we instead send the collected spans // back with the response. This is more expensive, but then again, // those are individual requests traced by users, so they can be. if sp.BaggageItem(tracing.Snowball) != "" { if sp, err = tracing.JoinOrNewSnowball(opName, ba.Trace, func(rawSpan basictracer.RawSpan) { encSp, err := tracing.EncodeRawSpan(&rawSpan, nil) if err != nil { log.Warning(err) } br.CollectedSpans = append(br.CollectedSpans, encSp) }); err != nil { fail(err) return } } defer sp.Finish() ctx := opentracing.ContextWithSpan((*Node)(n).context(), sp) tStart := time.Now() var pErr *roachpb.Error br, pErr = n.stores.Send(ctx, *ba) if pErr != nil { br = &roachpb.BatchResponse{} sp.LogEvent(fmt.Sprintf("error: %T", pErr.GetDetail())) } if br.Error != nil { panic(roachpb.ErrorUnexpectedlySet(n.stores, br)) } n.metrics.callComplete(time.Now().Sub(tStart), pErr) br.Error = pErr } if !n.stopper.RunTask(f) { return nil, util.Errorf("node %d stopped", n.Descriptor.NodeID) } return br, nil }
// Responses with write-too-old, write-intent, not-leader and range-key-mismatch
// errors are retried on the server, and so are not recorded in the sequence
// cache in the hopes of retrying to a successful outcome.
func (sc *SequenceCache) shouldCacheError(pErr *roachpb.Error) bool {
	switch pErr.GoError().(type) {
	case *roachpb.WriteTooOldError, *roachpb.WriteIntentError, *roachpb.NotLeaderError, *roachpb.RangeKeyMismatchError:
		return false
	}
	return true
}
func (txn *Txn) exec(retryable func(txn *Txn) *roachpb.Error) *roachpb.Error {
	// Run retryable in a retry loop until we encounter a success or
	// error condition this loop isn't capable of handling.
	var pErr *roachpb.Error
	for r := retry.Start(txn.db.txnRetryOptions); r.Next(); {
		pErr = retryable(txn)
		if pErr == nil && txn.Proto.Status == roachpb.PENDING {
			// retryable succeeded, but didn't commit.
			pErr = txn.commit(nil)
		}

		if pErr != nil {
			// Make sure the txn record that pErr carries is for this txn.
			// We check only when txn.Proto.ID has been initialized after an initial successful send.
			if pErr.GetTxn() != nil && txn.Proto.ID != nil {
				if errTxn := pErr.GetTxn(); !errTxn.Equal(&txn.Proto) {
					return roachpb.NewErrorf("mismatching transaction record in the error:\n%s\nv.s.\n%s", errTxn, txn.Proto)
				}
			}

			switch pErr.TransactionRestart {
			case roachpb.TransactionRestart_IMMEDIATE:
				if log.V(2) {
					log.Warning(pErr)
				}
				r.Reset()
				continue
			case roachpb.TransactionRestart_BACKOFF:
				if log.V(2) {
					log.Warning(pErr)
				}
				continue
			}
			// By default, fall through and break.
		}
		break
	}
	txn.Cleanup(pErr)
	return pErr
}
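// A sketch (hypothetical helper) of the bare restart classification used by
// (*Txn).exec above, separated from the transaction plumbing:
// TransactionRestart_IMMEDIATE resets the backoff, _BACKOFF keeps backing off,
// and anything else ends the loop. The retry.Options value is an assumption.
func runWithRestartClassification(opts retry.Options, work func() *roachpb.Error) *roachpb.Error {
	var pErr *roachpb.Error
	for r := retry.Start(opts); r.Next(); {
		if pErr = work(); pErr == nil {
			return nil
		}
		switch pErr.TransactionRestart {
		case roachpb.TransactionRestart_IMMEDIATE:
			r.Reset() // retry right away, without additional backoff
		case roachpb.TransactionRestart_BACKOFF:
			// Loop around; r.Next() sleeps before the next attempt.
		default:
			return pErr // not retriable at this level
		}
	}
	return pErr
}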
func convertBatchError(tableDesc *TableDescriptor, b client.Batch, origPErr *roachpb.Error) *roachpb.Error { if origPErr.Index == nil { return origPErr } index := origPErr.Index.Index if index >= int32(len(b.Results)) { panic(fmt.Sprintf("index %d outside of results: %+v", index, b.Results)) } result := b.Results[index] if _, ok := origPErr.GetDetail().(*roachpb.ConditionFailedError); ok { for _, row := range result.Rows { indexID, key, err := decodeIndexKeyPrefix(tableDesc, row.Key) if err != nil { return roachpb.NewError(err) } index, err := tableDesc.FindIndexByID(indexID) if err != nil { return roachpb.NewError(err) } valTypes, err := makeKeyVals(tableDesc, index.ColumnIDs) if err != nil { return roachpb.NewError(err) } dirs := make([]encoding.Direction, 0, len(index.ColumnIDs)) for _, dir := range index.ColumnDirections { convertedDir, err := dir.toEncodingDirection() if err != nil { return roachpb.NewError(err) } dirs = append(dirs, convertedDir) } vals := make([]parser.Datum, len(valTypes)) if _, err := decodeKeyVals(valTypes, vals, dirs, key); err != nil { return roachpb.NewError(err) } return sqlErrToPErr(&errUniquenessConstraintViolation{index: index, vals: vals}) } } return origPErr }
// CallComplete is called by a node whenever it completes a request. This will
// publish appropriate events to the feed:
// - For a successful request, a corresponding event for each request in the batch,
// - on error without index information, a failure of the Batch, and
// - on an indexed error a failure of the individual request.
func (nef NodeEventFeed) CallComplete(ba roachpb.BatchRequest, pErr *roachpb.Error) {
	if pErr != nil && pErr.TransactionRestart == roachpb.TransactionRestart_ABORT {
		method := roachpb.Batch
		if iErr, ok := pErr.GoError().(roachpb.IndexedError); ok {
			if index, ok := iErr.ErrorIndex(); ok {
				method = ba.Requests[index].GetInner().Method()
			}
		}
		nef.f.Publish(&CallErrorEvent{
			NodeID: nef.id,
			Method: method,
		})
		return
	}
	for _, union := range ba.Requests {
		nef.f.Publish(&CallSuccessEvent{
			NodeID: nef.id,
			Method: union.GetInner().Method(),
		})
	}
}
// handlePerReplicaError returns true if the given error is likely to
// be unique to the replica that reported it, and retrying on other
// replicas is likely to produce different results. This method should
// be called only once for each error as it may have side effects such
// as updating caches.
func (ds *DistSender) handlePerReplicaError(rangeID roachpb.RangeID, pErr *roachpb.Error) bool {
	switch tErr := pErr.GetDetail().(type) {
	case *roachpb.RangeNotFoundError:
		return true
	case *roachpb.NodeUnavailableError:
		return true
	case *roachpb.NotLeaseHolderError:
		if tErr.LeaseHolder != nil {
			// If the replica we contacted knows the new lease holder, update the cache.
			ds.updateLeaseHolderCache(rangeID, *tErr.LeaseHolder)
			// TODO(bdarnell): Move the new lease holder to the head of the queue
			// for the next retry.
		}
		return true
	}
	return false
}
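// A hedged sketch (assumed caller, not the actual DistSender send path) of how
// the boolean result can drive replica iteration: a per-replica error means the
// next replica is worth trying, while any other error is returned unchanged.
// The replicas slice and sendToReplica callback are hypothetical.
func (ds *DistSender) trySendToReplicas(rangeID roachpb.RangeID, replicas []roachpb.ReplicaDescriptor,
	sendToReplica func(roachpb.ReplicaDescriptor) *roachpb.Error) *roachpb.Error {
	var pErr *roachpb.Error
	for _, replica := range replicas {
		if pErr = sendToReplica(replica); pErr == nil {
			return nil
		}
		if !ds.handlePerReplicaError(rangeID, pErr) {
			// Not a per-replica problem; retrying elsewhere won't help.
			return pErr
		}
	}
	return pErr
}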
// Query returns datapoints for the named time series during the supplied time // span. Data is returned as a series of consecutive data points. // // Data is queried only at the Resolution supplied: if data for the named time // series is not stored at the given resolution, an empty result will be // returned. // // All data stored on the server is downsampled to some degree; the data points // returned represent the average value within a sample period. Each datapoint's // timestamp falls in the middle of the sample period it represents. // // If data for the named time series was collected from multiple sources, each // returned datapoint will represent the sum of datapoints from all sources at // the same time. The returned string slices contains a list of all sources for // the metric which were aggregated to produce the result. func (db *DB) Query(query Query, r Resolution, startNanos, endNanos int64) ([]TimeSeriesDatapoint, []string, error) { // Normalize startNanos and endNanos the nearest SampleDuration boundary. startNanos -= startNanos % r.SampleDuration() var rows []client.KeyValue if len(query.Sources) == 0 { // Based on the supplied timestamps and resolution, construct start and end // keys for a scan that will return every key with data relevant to the // query. startKey := MakeDataKey(query.Name, "" /* source */, r, startNanos) endKey := MakeDataKey(query.Name, "" /* source */, r, endNanos).PrefixEnd() var pErr *roachpb.Error rows, pErr = db.db.ScanInconsistent(startKey, endKey, 0) if pErr != nil { return nil, nil, pErr.GoError() } } else { b := db.db.NewBatch() b.ReadConsistency = roachpb.INCONSISTENT // Iterate over all key timestamps which may contain data for the given // sources, based on the given start/end time and the resolution. kd := r.KeyDuration() startKeyNanos := startNanos - (startNanos % kd) endKeyNanos := endNanos - (endNanos % kd) for currentTimestamp := startKeyNanos; currentTimestamp <= endKeyNanos; currentTimestamp += kd { for _, source := range query.Sources { key := MakeDataKey(query.Name, source, r, currentTimestamp) b.Get(key) } } pErr := db.db.Run(b) if pErr != nil { return nil, nil, pErr.GoError() } for _, result := range b.Results { row := result.Rows[0] if row.Value == nil { continue } rows = append(rows, row) } } // Convert the queried source data into a set of data spans, one for each // source. sourceSpans, err := makeDataSpans(rows, startNanos) if err != nil { return nil, nil, err } // Compute a downsample function which will be used to return values from // each source for each sample period. downsampler, err := getDownsampleFunction(query.GetDownsampler()) if err != nil { return nil, nil, err } // If we are returning a derivative, iteration needs to start at offset -1 // (in order to correctly compute the rate of change at offset 0). var startOffset int32 isDerivative := query.GetDerivative() != TimeSeriesQueryDerivative_NONE if isDerivative { startOffset = -1 } // Create an interpolatingIterator for each dataSpan, adding each iterator // into a unionIterator collection. This is also where we compute a list of // all sources with data present in the query. sources := make([]string, 0, len(sourceSpans)) iters := make(unionIterator, 0, len(sourceSpans)) for name, span := range sourceSpans { sources = append(sources, name) iters = append(iters, span.newIterator(startOffset, downsampler)) } // Choose an aggregation function to use when taking values from the // unionIterator. 
var valueFn func() float64 switch query.GetSourceAggregator() { case TimeSeriesQueryAggregator_SUM: valueFn = iters.sum case TimeSeriesQueryAggregator_AVG: valueFn = iters.avg case TimeSeriesQueryAggregator_MAX: valueFn = iters.max case TimeSeriesQueryAggregator_MIN: valueFn = iters.min } // Iterate over all requested offsets, recording a value from the // unionIterator at each offset encountered. If the query is requesting a // derivative, a rate of change is recorded instead of the actual values. iters.init() var last TimeSeriesDatapoint if isDerivative { last = TimeSeriesDatapoint{ TimestampNanos: iters.timestamp(), Value: valueFn(), } // For derivatives, the iterator was initialized at offset -1 in order // to calculate the rate of change at offset zero. However, in some // cases (such as the very first value recorded) offset -1 is not // available. In this case, we treat the rate-of-change at the first // offset as zero. if iters.offset() < 0 { iters.advance() } } var responseData []TimeSeriesDatapoint for iters.isValid() && iters.timestamp() <= endNanos { current := TimeSeriesDatapoint{ TimestampNanos: iters.timestamp(), Value: valueFn(), } response := current if isDerivative { dTime := (current.TimestampNanos - last.TimestampNanos) / time.Second.Nanoseconds() if dTime == 0 { response.Value = 0 } else { response.Value = (current.Value - last.Value) / float64(dTime) } if response.Value < 0 && query.GetDerivative() == TimeSeriesQueryDerivative_NON_NEGATIVE_DERIVATIVE { response.Value = 0 } } responseData = append(responseData, response) last = current iters.advance() } return responseData, sources, nil }
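// A small sketch (not part of the original code) of the per-second rate of
// change computed in the derivative branch above, as a standalone helper. With
// nonNegative set (the NON_NEGATIVE_DERIVATIVE case), negative rates are
// clamped to zero.
func rateOfChangePerSecond(last, current TimeSeriesDatapoint, nonNegative bool) float64 {
	dTime := (current.TimestampNanos - last.TimestampNanos) / time.Second.Nanoseconds()
	if dTime == 0 {
		return 0
	}
	rate := (current.Value - last.Value) / float64(dTime)
	if nonNegative && rate < 0 {
		return 0
	}
	return rate
}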
// updateState updates the transaction state in both the success and // error cases, applying those updates to the corresponding txnMeta // object when adequate. It also updates certain errors with the // updated transaction for use by client restarts. func (tc *TxnCoordSender) updateState(ctx context.Context, ba roachpb.BatchRequest, br *roachpb.BatchResponse, pErr *roachpb.Error) *roachpb.Error { trace := tracer.FromCtx(ctx) newTxn := &roachpb.Transaction{} newTxn.Update(ba.GetTxn()) // TODO(tamird): remove this clone. It's currently needed to avoid race conditions. pErr = proto.Clone(pErr).(*roachpb.Error) err := pErr.GoError() // TODO(bdarnell): We're writing to errors here (and where using ErrorWithIndex); // since there's no concept of ownership copy-on-write is always preferable. switch t := err.(type) { case nil: newTxn.Update(br.Txn) // Move txn timestamp forward to response timestamp if applicable. // TODO(tschottdorf): see (*Replica).executeBatch and comments within. // Looks like this isn't necessary any more, nor did it prevent a bug // referenced in a TODO there. newTxn.Timestamp.Forward(br.Timestamp) case *roachpb.TransactionStatusError: // Likely already committed or more obscure errors such as epoch or // timestamp regressions; consider txn dead. defer tc.cleanupTxn(trace, t.Txn) case *roachpb.OpRequiresTxnError: panic("OpRequiresTxnError must not happen at this level") case *roachpb.ReadWithinUncertaintyIntervalError: // Mark the host as certain. See the protobuf comment for // Transaction.CertainNodes for details. if t.NodeID == 0 { panic("no replica set in header on uncertainty restart") } newTxn.Update(&t.Txn) newTxn.CertainNodes.Add(t.NodeID) // If the reader encountered a newer write within the uncertainty // interval, move the timestamp forward, just past that write or // up to MaxTimestamp, whichever comes first. candidateTS := newTxn.MaxTimestamp candidateTS.Backward(t.ExistingTimestamp.Add(0, 1)) newTxn.Timestamp.Forward(candidateTS) newTxn.Restart(ba.GetUserPriority(), newTxn.Priority, newTxn.Timestamp) t.Txn = *newTxn case *roachpb.TransactionAbortedError: trace.SetError() newTxn.Update(&t.Txn) // Increase timestamp if applicable. newTxn.Timestamp.Forward(t.Txn.Timestamp) newTxn.Priority = t.Txn.Priority t.Txn = *newTxn // Clean up the freshly aborted transaction in defer(), avoiding a // race with the state update below. defer tc.cleanupTxn(trace, t.Txn) case *roachpb.TransactionPushError: newTxn.Update(t.Txn) // Increase timestamp if applicable, ensuring that we're // just ahead of the pushee. newTxn.Timestamp.Forward(t.PusheeTxn.Timestamp.Add(0, 1)) newTxn.Restart(ba.GetUserPriority(), t.PusheeTxn.Priority-1, newTxn.Timestamp) t.Txn = newTxn case *roachpb.TransactionRetryError: newTxn.Update(&t.Txn) newTxn.Restart(ba.GetUserPriority(), t.Txn.Priority, newTxn.Timestamp) t.Txn = *newTxn case roachpb.TransactionRestartError: // Assertion: The above cases should exhaust all ErrorDetails which // carry a Transaction. if pErr.Detail != nil { panic(fmt.Sprintf("unhandled TransactionRestartError %T", err)) } default: trace.SetError() } return func() *roachpb.Error { if len(newTxn.ID) <= 0 { return pErr } id := string(newTxn.ID) tc.Lock() defer tc.Unlock() txnMeta := tc.txns[id] // For successful transactional requests, keep the written intents and // the updated transaction record to be sent along with the reply. // The transaction metadata is created with the first writing operation. 
// A tricky edge case is that of a transaction which "fails" on the // first writing request, but actually manages to write some intents // (for example, due to being multi-range). In this case, there will // be an error, but the transaction will be marked as Writing and the // coordinator must track the state, for the client's retry will be // performed with a Writing transaction which the coordinator rejects // unless it is tracking it (on top of it making sense to track it; // after all, it **has** laid down intents and only the coordinator // can augment a potential EndTransaction call). // consider re-using those. if intents := ba.GetIntents(); len(intents) > 0 && (err == nil || newTxn.Writing) { if txnMeta == nil { if !newTxn.Writing { panic("txn with intents marked as non-writing") } txnMeta = &txnMetadata{ txn: *newTxn, keys: cache.NewIntervalCache(cache.Config{Policy: cache.CacheNone}), firstUpdateNanos: tc.clock.PhysicalNow(), lastUpdateNanos: tc.clock.PhysicalNow(), timeoutDuration: tc.clientTimeout, txnEnd: make(chan struct{}), } tc.txns[id] = txnMeta // If the transaction is already over, there's no point in // launching a one-off coordinator which will shut down right // away. If we ended up here with an error, we'll always start // the coordinator - the transaction has laid down intents, so // we expect it to be committed/aborted at some point in the // future. if _, isEnding := ba.GetArg(roachpb.EndTransaction); err != nil || !isEnding { trace.Event("coordinator spawns") if !tc.stopper.RunAsyncTask(func() { tc.heartbeatLoop(id) }) { // The system is already draining and we can't start the // heartbeat. We refuse new transactions for now because // they're likely not going to have all intents committed. // In principle, we can relax this as needed though. tc.unregisterTxnLocked(id) return roachpb.NewError(&roachpb.NodeUnavailableError{}) } } } for _, intent := range intents { txnMeta.addKeyRange(intent.Key, intent.EndKey) } } // Update our record of this transaction, even on error. if txnMeta != nil { txnMeta.txn = *newTxn if !txnMeta.txn.Writing { panic("tracking a non-writing txn") } txnMeta.setLastUpdate(tc.clock.PhysicalNow()) } if err == nil { // For successful transactional requests, always send the updated txn // record back. br.Txn = newTxn } return pErr }() }
// Send implements the batch.Sender interface. If the request is part of a // transaction, the TxnCoordSender adds the transaction to a map of active // transactions and begins heartbeating it. Every subsequent request for the // same transaction updates the lastUpdate timestamp to prevent live // transactions from being considered abandoned and garbage collected. // Read/write mutating requests have their key or key range added to the // transaction's interval tree of key ranges for eventual cleanup via resolved // write intents; they're tagged to an outgoing EndTransaction request, with // the receiving replica in charge of resolving them. func (tc *TxnCoordSender) Send(ctx context.Context, ba roachpb.BatchRequest) (*roachpb.BatchResponse, *roachpb.Error) { if err := tc.maybeBeginTxn(&ba); err != nil { return nil, roachpb.NewError(err) } ba.CmdID = ba.GetOrCreateCmdID(tc.clock.PhysicalNow()) var startNS int64 // This is the earliest point at which the request has a ClientCmdID and/or // TxnID (if applicable). Begin a Trace which follows this request. trace := tc.tracer.NewTrace(tracer.Coord, &ba) defer trace.Finalize() defer trace.Epoch("sending batch")() ctx = tracer.ToCtx(ctx, trace) var id string // optional transaction ID if ba.Txn != nil { // If this request is part of a transaction... id = string(ba.Txn.ID) // Verify that if this Transaction is not read-only, we have it on // file. If not, refuse writes - the client must have issued a write on // another coordinator previously. if ba.Txn.Writing && ba.IsTransactionWrite() { tc.Lock() _, ok := tc.txns[id] tc.Unlock() if !ok { return nil, roachpb.NewError(util.Errorf("transaction must not write on multiple coordinators")) } } // Set the timestamp to the original timestamp for read-only // commands and to the transaction timestamp for read/write // commands. if ba.IsReadOnly() { ba.Timestamp = ba.Txn.OrigTimestamp } else { ba.Timestamp = ba.Txn.Timestamp } if rArgs, ok := ba.GetArg(roachpb.EndTransaction); ok { et := rArgs.(*roachpb.EndTransactionRequest) if len(et.Key) != 0 { return nil, roachpb.NewError(util.Errorf("EndTransaction must not have a Key set")) } et.Key = ba.Txn.Key // Remember when EndTransaction started in case we want to // be linearizable. startNS = tc.clock.PhysicalNow() if len(et.Intents) > 0 { // TODO(tschottdorf): it may be useful to allow this later. // That would be part of a possible plan to allow txns which // write on multiple coordinators. return nil, roachpb.NewError(util.Errorf("client must not pass intents to EndTransaction")) } tc.Lock() txnMeta, metaOK := tc.txns[id] if id != "" && metaOK { et.Intents = txnMeta.intents() } tc.Unlock() if intents := ba.GetIntents(); len(intents) > 0 { // Writes in Batch, so EndTransaction is fine. Should add // outstanding intents to EndTransaction, though. // TODO(tschottdorf): possible issues when the batch fails, // but the intents have been added anyways. // TODO(tschottdorf): some of these intents may be covered // by others, for example {[a,b), a}). This can lead to // some extra requests when those are non-local to the txn // record. But it doesn't seem worth optimizing now. et.Intents = append(et.Intents, intents...) } else if !metaOK { // If we don't have the transaction, then this must be a retry // by the client. We can no longer reconstruct a correct // request so we must fail. 
// // TODO(bdarnell): if we had a GetTransactionStatus API then // we could lookup the transaction and return either nil or // TransactionAbortedError instead of this ambivalent error. return nil, roachpb.NewError(util.Errorf("transaction is already committed or aborted")) } if len(et.Intents) == 0 { // If there aren't any intents, then there's factually no // transaction to end. Read-only txns have all of their state in // the client. return nil, roachpb.NewError(util.Errorf("cannot commit a read-only transaction")) } if log.V(1) { for _, intent := range et.Intents { trace.Event(fmt.Sprintf("intent: [%s,%s)", intent.Key, intent.EndKey)) } } } } // Send the command through wrapped sender, taking appropriate measures // on error. var br *roachpb.BatchResponse { var pErr *roachpb.Error br, pErr = tc.wrapped.Send(ctx, ba) if _, ok := pErr.GoError().(*roachpb.OpRequiresTxnError); ok { br, pErr = tc.resendWithTxn(ba) } if pErr := tc.updateState(ctx, ba, br, pErr); pErr != nil { return nil, pErr } } if br.Txn == nil { return br, nil } if _, ok := ba.GetArg(roachpb.EndTransaction); !ok { return br, nil } // If the --linearizable flag is set, we want to make sure that // all the clocks in the system are past the commit timestamp // of the transaction. This is guaranteed if either // - the commit timestamp is MaxOffset behind startNS // - MaxOffset ns were spent in this function // when returning to the client. Below we choose the option // that involves less waiting, which is likely the first one // unless a transaction commits with an odd timestamp. if tsNS := br.Txn.Timestamp.WallTime; startNS > tsNS { startNS = tsNS } sleepNS := tc.clock.MaxOffset() - time.Duration(tc.clock.PhysicalNow()-startNS) if tc.linearizable && sleepNS > 0 { defer func() { if log.V(1) { log.Infof("%v: waiting %s on EndTransaction for linearizability", br.Txn.Short(), util.TruncateDuration(sleepNS, time.Millisecond)) } time.Sleep(sleepNS) }() } if br.Txn.Status != roachpb.PENDING { tc.cleanupTxn(trace, *br.Txn) } return br, nil }
// sendChunk is in charge of sending an "admissible" piece of batch, i.e. one // which doesn't need to be subdivided further before going to a range (so no // mixing of forward and reverse scans, etc). The parameters and return values // correspond to client.Sender with the exception of the returned boolean, // which is true when indicating that the caller should retry but needs to send // EndTransaction in a separate request. func (ds *DistSender) sendChunk(ctx context.Context, ba roachpb.BatchRequest) (*roachpb.BatchResponse, *roachpb.Error, bool) { isReverse := ba.IsReverse() // TODO(radu): when contexts are properly plumbed, we should be able to get // the tracer from ctx, not from the DistSender. ctx, cleanup := tracing.EnsureContext(ctx, tracing.TracerFromCtx(ds.Ctx)) defer cleanup() // The minimal key range encompassing all requests contained within. // Local addressing has already been resolved. // TODO(tschottdorf): consider rudimentary validation of the batch here // (for example, non-range requests with EndKey, or empty key ranges). rs, err := keys.Range(ba) if err != nil { return nil, roachpb.NewError(err), false } var br *roachpb.BatchResponse // Send the request to one range per iteration. for { // Increase the sequence counter only once before sending RPCs to // the ranges involved in this chunk of the batch (as opposed to for // each RPC individually). On RPC errors, there's no guarantee that // the request hasn't made its way to the target regardless of the // error; we'd like the second execution to be caught by the sequence // cache if that happens. There is a small chance that that we address // a range twice in this chunk (stale/suboptimal descriptors due to // splits/merges) which leads to a transaction retry. // TODO(tschottdorf): it's possible that if we don't evict from the // cache we could be in for a busy loop. ba.SetNewRequest() var curReply *roachpb.BatchResponse var desc *roachpb.RangeDescriptor var evictToken *evictionToken var needAnother bool var pErr *roachpb.Error var finished bool var numAttempts int for r := retry.StartWithCtx(ctx, ds.rpcRetryOptions); r.Next(); { numAttempts++ { const magicLogCurAttempt = 20 var seq int32 if ba.Txn != nil { seq = ba.Txn.Sequence } if numAttempts%magicLogCurAttempt == 0 || seq%magicLogCurAttempt == 0 { // Log a message if a request appears to get stuck for a long // time or, potentially, forever. See #8975. // The local counter captures this loop here; the Sequence number // should capture anything higher up (as it needs to be // incremented every time this method is called). log.Warningf( ctx, "%d retries for an RPC at sequence %d, last error was: %s, remaining key ranges %s: %s", numAttempts, seq, pErr, rs, ba, ) } } // Get range descriptor (or, when spanning range, descriptors). Our // error handling below may clear them on certain errors, so we // refresh (likely from the cache) on every retry. log.Trace(ctx, "meta descriptor lookup") var err error desc, needAnother, evictToken, err = ds.getDescriptors(ctx, rs, evictToken, isReverse) // getDescriptors may fail retryably if, for example, the first // range isn't available via Gossip. Assume that all errors at // this level are retryable. Non-retryable errors would be for // things like malformed requests which we should have checked // for before reaching this point. 
if err != nil { log.Trace(ctx, "range descriptor lookup failed: "+err.Error()) if log.V(1) { log.Warning(ctx, err) } pErr = roachpb.NewError(err) continue } if needAnother && br == nil { // TODO(tschottdorf): we should have a mechanism for discovering // range merges (descriptor staleness will mostly go unnoticed), // or we'll be turning single-range queries into multi-range // queries for no good reason. // If there's no transaction and op spans ranges, possibly // re-run as part of a transaction for consistency. The // case where we don't need to re-run is if the read // consistency is not required. if ba.Txn == nil && ba.IsPossibleTransaction() && ba.ReadConsistency != roachpb.INCONSISTENT { return nil, roachpb.NewError(&roachpb.OpRequiresTxnError{}), false } // If the request is more than but ends with EndTransaction, we // want the caller to come again with the EndTransaction in an // extra call. if l := len(ba.Requests) - 1; l > 0 && ba.Requests[l].GetInner().Method() == roachpb.EndTransaction { return nil, roachpb.NewError(errors.New("cannot send 1PC txn to multiple ranges")), true /* shouldSplitET */ } } // It's possible that the returned descriptor misses parts of the // keys it's supposed to scan after it's truncated to match the // descriptor. Example revscan [a,g), first desc lookup for "g" // returns descriptor [c,d) -> [d,g) is never scanned. // We evict and retry in such a case. includesFrontOfCurSpan := func(rd *roachpb.RangeDescriptor) bool { if isReverse { return desc.ContainsExclusiveEndKey(rs.EndKey) } return desc.ContainsKey(rs.Key) } if !includesFrontOfCurSpan(desc) { if err := evictToken.Evict(ctx); err != nil { return nil, roachpb.NewError(err), false } // On addressing errors, don't backoff; retry immediately. r.Reset() continue } curReply, pErr = func() (*roachpb.BatchResponse, *roachpb.Error) { // Truncate the request to our current key range. intersected, iErr := rs.Intersect(desc) if iErr != nil { return nil, roachpb.NewError(iErr) } truncBA, numActive, trErr := truncate(ba, intersected) if numActive == 0 && trErr == nil { // This shouldn't happen in the wild, but some tests // exercise it. return nil, roachpb.NewErrorf("truncation resulted in empty batch on [%s,%s): %s", rs.Key, rs.EndKey, ba) } if trErr != nil { return nil, roachpb.NewError(trErr) } return ds.sendSingleRange(ctx, truncBA, desc) }() // If sending succeeded, break this loop. if pErr == nil { finished = true break } log.VTracef(1, ctx, "reply error %s: %s", ba, pErr) // Error handling: If the error indicates that our range // descriptor is out of date, evict it from the cache and try // again. Errors that apply only to a single replica were // handled in send(). // // TODO(bdarnell): Don't retry endlessly. If we fail twice in a // row and the range descriptor hasn't changed, return the error // to our caller. switch tErr := pErr.GetDetail().(type) { case *roachpb.SendError: // We've tried all the replicas without success. Either // they're all down, or we're using an out-of-date range // descriptor. Invalidate the cache and try again with the new // metadata. if err := evictToken.Evict(ctx); err != nil { return nil, roachpb.NewError(err), false } continue case *roachpb.RangeKeyMismatchError: // Range descriptor might be out of date - evict it. This is // likely the result of a range split. If we have new range // descriptors, insert them instead as long as they are different // from the last descriptor to avoid endless loops. 
var replacements []roachpb.RangeDescriptor different := func(rd *roachpb.RangeDescriptor) bool { return !desc.RSpan().Equal(rd.RSpan()) } if tErr.MismatchedRange != nil && different(tErr.MismatchedRange) { replacements = append(replacements, *tErr.MismatchedRange) } if tErr.SuggestedRange != nil && different(tErr.SuggestedRange) { if includesFrontOfCurSpan(tErr.SuggestedRange) { replacements = append(replacements, *tErr.SuggestedRange) } } // Same as Evict() if replacements is empty. if err := evictToken.EvictAndReplace(ctx, replacements...); err != nil { return nil, roachpb.NewError(err), false } // On addressing errors, don't backoff; retry immediately. r.Reset() if log.V(1) { log.Warning(ctx, tErr) } continue } break } // Immediately return if querying a range failed non-retryably. if pErr != nil { return nil, pErr, false } else if !finished { select { case <-ds.rpcRetryOptions.Closer: return nil, roachpb.NewError(&roachpb.NodeUnavailableError{}), false case <-ctx.Done(): return nil, roachpb.NewError(ctx.Err()), false default: log.Fatal(ctx, "exited retry loop with nil error but finished=false") } } ba.UpdateTxn(curReply.Txn) if br == nil { // First response from a Range. br = curReply } else { // This was the second or later call in a cross-Range request. // Combine the new response with the existing one. if err := br.Combine(curReply); err != nil { return nil, roachpb.NewError(err), false } } if isReverse { // In next iteration, query previous range. // We use the StartKey of the current descriptor as opposed to the // EndKey of the previous one since that doesn't have bugs when // stale descriptors come into play. rs.EndKey, err = prev(ba, desc.StartKey) } else { // In next iteration, query next range. // It's important that we use the EndKey of the current descriptor // as opposed to the StartKey of the next one: if the former is stale, // it's possible that the next range has since merged the subsequent // one, and unless both descriptors are stale, the next descriptor's // StartKey would move us to the beginning of the current range, // resulting in a duplicate scan. rs.Key, err = next(ba, desc.EndKey) } if err != nil { return nil, roachpb.NewError(err), false } if ba.MaxSpanRequestKeys > 0 { // Count how many results we received. var numResults int64 for _, resp := range curReply.Responses { numResults += resp.GetInner().Header().NumKeys } if numResults > ba.MaxSpanRequestKeys { panic(fmt.Sprintf("received %d results, limit was %d", numResults, ba.MaxSpanRequestKeys)) } ba.MaxSpanRequestKeys -= numResults if ba.MaxSpanRequestKeys == 0 { // prepare the batch response after meeting the max key limit. fillSkippedResponses(ba, br, rs) // done, exit loop. return br, nil, false } } // If this was the last range accessed by this call, exit loop. if !needAnother { return br, nil, false } // key cannot be less that the end key. if !rs.Key.Less(rs.EndKey) { panic(fmt.Sprintf("start key %s is less than %s", rs.Key, rs.EndKey)) } log.Trace(ctx, "querying next range") } }
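// A sketch (hypothetical helper, mirroring the accounting at the end of
// sendChunk above) of the MaxSpanRequestKeys bookkeeping: it returns the
// remaining key budget after a per-range reply and whether the limit has been
// exhausted.
func remainingKeyBudget(curReply *roachpb.BatchResponse, maxKeys int64) (int64, bool) {
	var numResults int64
	for _, resp := range curReply.Responses {
		numResults += resp.GetInner().Header().NumKeys
	}
	if numResults > maxKeys {
		panic(fmt.Sprintf("received %d results, limit was %d", numResults, maxKeys))
	}
	maxKeys -= numResults
	return maxKeys, maxKeys == 0
}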
// Exec executes fn in the context of a distributed transaction. // Execution is controlled by opt (see comments in TxnExecOptions). // // opt is passed to fn, and it's valid for fn to modify opt as it sees // fit during each execution attempt. // // It's valid for txn to be nil (meaning the txn has already aborted) if fn // can handle that. This is useful for continuing transactions that have been // aborted because of an error in a previous batch of statements in the hope // that a ROLLBACK will reset the state. Neither opt.AutoRetry not opt.AutoCommit // can be set in this case. // // When this method returns, txn might be in any state; Exec does not attempt // to clean up the transaction before returning an error. In case of // TransactionAbortedError, txn is reset to a fresh transaction, ready to be // used. // // TODO(andrei): Make Exec() return error; make fn return an error + a retriable // bit. There's no reason to propagate roachpb.Error (protos) above this point. func (txn *Txn) Exec( opt TxnExecOptions, fn func(txn *Txn, opt *TxnExecOptions) *roachpb.Error) *roachpb.Error { // Run fn in a retry loop until we encounter a success or // error condition this loop isn't capable of handling. var pErr *roachpb.Error var retryOptions retry.Options if txn == nil && (opt.AutoRetry || opt.AutoCommit) { panic("asked to retry or commit a txn that is already aborted") } if opt.AutoRetry { retryOptions = txn.db.txnRetryOptions } RetryLoop: for r := retry.Start(retryOptions); r.Next(); { if txn != nil { // If we're looking at a brand new transaction, then communicate // what should be used as initial timestamp for the KV txn created // by TxnCoordSender. if txn.Proto.OrigTimestamp == roachpb.ZeroTimestamp { txn.Proto.OrigTimestamp = opt.MinInitialTimestamp } } pErr = fn(txn, &opt) if txn != nil { txn.retrying = true defer func() { txn.retrying = false }() } if (pErr == nil) && opt.AutoCommit && (txn.Proto.Status == roachpb.PENDING) { // fn succeeded, but didn't commit. pErr = txn.Commit() } if pErr == nil { break } // Make sure the txn record that pErr carries is for this txn. // We check only when txn.Proto.ID has been initialized after an initial successful send. if pErr.GetTxn() != nil && txn.Proto.ID != nil { if errTxn := pErr.GetTxn(); !errTxn.Equal(&txn.Proto) { return roachpb.NewErrorf("mismatching transaction record in the error:\n%s\nv.s.\n%s", errTxn, txn.Proto) } } if !opt.AutoRetry { break RetryLoop } switch pErr.TransactionRestart { case roachpb.TransactionRestart_IMMEDIATE: r.Reset() case roachpb.TransactionRestart_BACKOFF: default: break RetryLoop } if log.V(2) { log.Infof("automatically retrying transaction: %s because of error: %s", txn.DebugName(), pErr) } } if pErr != nil { pErr.StripErrorTransaction() } return pErr }
func (b *Batch) fillResults(br *roachpb.BatchResponse, pErr *roachpb.Error) error { offset := 0 for i := range b.Results { result := &b.Results[i] for k := 0; k < result.calls; k++ { args := b.reqs[offset+k] var reply roachpb.Response if result.Err == nil { result.Err = pErr.GoError() if result.Err == nil { if br != nil && offset+k < len(br.Responses) { reply = br.Responses[offset+k].GetInner() } else if args.Method() != roachpb.EndTransaction { // TODO(tschottdorf): EndTransaction is excepted here // because it may be elided (r/o txns). Might prefer to // simulate an EndTransaction response instead; this // effectively just leaks here. panic("not enough responses for calls") } } } switch req := args.(type) { case *roachpb.GetRequest: row := &result.Rows[k] row.Key = []byte(req.Key) if result.Err == nil { row.Value = reply.(*roachpb.GetResponse).Value } case *roachpb.PutRequest: row := &result.Rows[k] row.Key = []byte(req.Key) if result.Err == nil { row.Value = &req.Value row.setTimestamp(reply.(*roachpb.PutResponse).Timestamp) } case *roachpb.ConditionalPutRequest: row := &result.Rows[k] row.Key = []byte(req.Key) if result.Err == nil { row.Value = &req.Value row.setTimestamp(reply.(*roachpb.ConditionalPutResponse).Timestamp) } case *roachpb.IncrementRequest: row := &result.Rows[k] row.Key = []byte(req.Key) if result.Err == nil { t := reply.(*roachpb.IncrementResponse) row.Value = &roachpb.Value{} row.Value.SetInt(t.NewValue) row.setTimestamp(t.Timestamp) } case *roachpb.ScanRequest: if result.Err == nil { t := reply.(*roachpb.ScanResponse) result.Rows = make([]KeyValue, len(t.Rows)) for j := range t.Rows { src := &t.Rows[j] dst := &result.Rows[j] dst.Key = src.Key dst.Value = &src.Value } } case *roachpb.ReverseScanRequest: if result.Err == nil { t := reply.(*roachpb.ReverseScanResponse) result.Rows = make([]KeyValue, len(t.Rows)) for j := range t.Rows { src := &t.Rows[j] dst := &result.Rows[j] dst.Key = src.Key dst.Value = &src.Value } } case *roachpb.DeleteRequest: row := &result.Rows[k] row.Key = []byte(args.(*roachpb.DeleteRequest).Key) case *roachpb.DeleteRangeRequest: case *roachpb.BeginTransactionRequest: case *roachpb.EndTransactionRequest: case *roachpb.AdminMergeRequest: case *roachpb.AdminSplitRequest: case *roachpb.HeartbeatTxnRequest: case *roachpb.GCRequest: case *roachpb.PushTxnRequest: case *roachpb.RangeLookupRequest: case *roachpb.ResolveIntentRequest: case *roachpb.ResolveIntentRangeRequest: case *roachpb.MergeRequest: case *roachpb.TruncateLogRequest: case *roachpb.LeaderLeaseRequest: // Nothing to do for these methods as they do not generate any // rows. default: if result.Err == nil { result.Err = fmt.Errorf("unsupported reply: %T", reply) } } } offset += result.calls } for i := range b.Results { result := &b.Results[i] if result.Err != nil { return result.Err } } return nil }
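// An illustrative usage sketch (assumed caller) of the Batch API whose results
// fillResults populates above: issue a few Gets, run the batch, and read the
// rows. NewBatch and Run returning *roachpb.Error match their use elsewhere in
// this collection; readKeys itself is hypothetical.
func readKeys(db *DB, keys []roachpb.Key) ([]KeyValue, *roachpb.Error) {
	b := db.NewBatch()
	for _, key := range keys {
		b.Get(key) // one Result with a single Row per Get
	}
	if pErr := db.Run(b); pErr != nil {
		return nil, pErr
	}
	var rows []KeyValue
	for _, result := range b.Results {
		rows = append(rows, result.Rows...)
	}
	return rows, nil
}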
// updateState updates the transaction state in both the success and // error cases, applying those updates to the corresponding txnMeta // object when adequate. It also updates certain errors with the // updated transaction for use by client restarts. func (tc *TxnCoordSender) updateState(ctx context.Context, ba roachpb.BatchRequest, br *roachpb.BatchResponse, pErr *roachpb.Error) *roachpb.Error { trace := tracer.FromCtx(ctx) newTxn := &roachpb.Transaction{} newTxn.Update(ba.GetTxn()) err := pErr.GoError() switch t := err.(type) { case nil: newTxn.Update(br.GetTxn()) // Move txn timestamp forward to response timestamp if applicable. // TODO(tschottdorf): see (*Replica).executeBatch and comments within. // Looks like this isn't necessary any more, nor did it prevent a bug // referenced in a TODO there. newTxn.Timestamp.Forward(br.Timestamp) case *roachpb.TransactionStatusError: // Likely already committed or more obscure errors such as epoch or // timestamp regressions; consider txn dead. defer tc.cleanupTxn(trace, t.Txn) case *roachpb.OpRequiresTxnError: // TODO(tschottdorf): range-spanning autowrap currently broken. panic("TODO(tschottdorf): disabled") case *roachpb.ReadWithinUncertaintyIntervalError: // Mark the host as certain. See the protobuf comment for // Transaction.CertainNodes for details. if t.NodeID == 0 { panic("no replica set in header on uncertainty restart") } newTxn.CertainNodes.Add(t.NodeID) // If the reader encountered a newer write within the uncertainty // interval, move the timestamp forward, just past that write or // up to MaxTimestamp, whichever comes first. candidateTS := newTxn.MaxTimestamp candidateTS.Backward(t.ExistingTimestamp.Add(0, 1)) newTxn.Timestamp.Forward(candidateTS) newTxn.Restart(ba.GetUserPriority(), newTxn.Priority, newTxn.Timestamp) t.Txn = *newTxn case *roachpb.TransactionAbortedError: // Increase timestamp if applicable. newTxn.Timestamp.Forward(t.Txn.Timestamp) newTxn.Priority = t.Txn.Priority t.Txn = *newTxn // Clean up the freshly aborted transaction in defer(), avoiding a // race with the state update below. defer tc.cleanupTxn(trace, t.Txn) case *roachpb.TransactionPushError: // Increase timestamp if applicable, ensuring that we're // just ahead of the pushee. newTxn.Timestamp.Forward(t.PusheeTxn.Timestamp.Add(0, 1)) newTxn.Restart(ba.GetUserPriority(), t.PusheeTxn.Priority-1, newTxn.Timestamp) t.Txn = newTxn case *roachpb.TransactionRetryError: // Increase timestamp if applicable. newTxn.Timestamp.Forward(t.Txn.Timestamp) newTxn.Restart(ba.GetUserPriority(), t.Txn.Priority, newTxn.Timestamp) t.Txn = *newTxn case roachpb.TransactionRestartError: // Assertion: The above cases should exhaust all ErrorDetails which // carry a Transaction. if pErr.Detail != nil { panic(fmt.Sprintf("unhandled TransactionRestartError %T", err)) } } return func() *roachpb.Error { if len(newTxn.ID) <= 0 { return pErr } id := string(newTxn.ID) tc.Lock() defer tc.Unlock() txnMeta := tc.txns[id] // For successful transactional requests, keep the written intents and // the updated transaction record to be sent along with the reply. // The transaction metadata is created with the first writing operation // TODO(tschottdorf): already computed the intents prior to sending, // consider re-using those. 
if intents := ba.GetIntents(); len(intents) > 0 && err == nil { if txnMeta == nil { newTxn.Writing = true txnMeta = &txnMetadata{ txn: *newTxn, keys: cache.NewIntervalCache(cache.Config{Policy: cache.CacheNone}), firstUpdateNanos: tc.clock.PhysicalNow(), lastUpdateNanos: tc.clock.PhysicalNow(), timeoutDuration: tc.clientTimeout, txnEnd: make(chan struct{}), } tc.txns[id] = txnMeta // If the transaction is already over, there's no point in // launching a one-off coordinator which will shut down right // away. if _, isEnding := ba.GetArg(roachpb.EndTransaction); !isEnding { trace.Event("coordinator spawns") if !tc.stopper.RunAsyncTask(func() { tc.heartbeatLoop(id) }) { // The system is already draining and we can't start the // heartbeat. We refuse new transactions for now because // they're likely not going to have all intents committed. // In principle, we can relax this as needed though. tc.unregisterTxnLocked(id) return roachpb.NewError(&roachpb.NodeUnavailableError{}) } } } for _, intent := range intents { txnMeta.addKeyRange(intent.Key, intent.EndKey) } } // Update our record of this transaction, even on error. if txnMeta != nil { txnMeta.txn.Update(newTxn) // better to replace after #2300 if !txnMeta.txn.Writing { panic("tracking a non-writing txn") } txnMeta.setLastUpdate(tc.clock.PhysicalNow()) } if err == nil { // For successful transactional requests, always send the updated txn // record back. if br.Txn == nil { br.Txn = &roachpb.Transaction{} } *br.Txn = *newTxn } return pErr }() }
// updateState updates the transaction state in both the success and // error cases, applying those updates to the corresponding txnMeta // object when adequate. It also updates certain errors with the // updated transaction for use by client restarts. func (tc *TxnCoordSender) updateState( startNS int64, ctx context.Context, ba roachpb.BatchRequest, br *roachpb.BatchResponse, pErr *roachpb.Error) *roachpb.Error { newTxn := &roachpb.Transaction{} newTxn.Update(ba.Txn) if pErr == nil { newTxn.Update(br.Txn) } else { newTxn.Update(pErr.GetTxn()) } switch t := pErr.GetDetail().(type) { case *roachpb.TransactionStatusError: // Likely already committed or more obscure errors such as epoch or // timestamp regressions; consider txn dead. defer tc.cleanupTxn(ctx, *pErr.GetTxn()) case *roachpb.OpRequiresTxnError: panic("OpRequiresTxnError must not happen at this level") case *roachpb.ReadWithinUncertaintyIntervalError: // If the reader encountered a newer write within the uncertainty // interval, we advance the txn's timestamp just past the last observed // timestamp from the node. restartTS, ok := newTxn.GetObservedTimestamp(pErr.OriginNode) if !ok { pErr = roachpb.NewError(util.Errorf("no observed timestamp for node %d found on uncertainty restart", pErr.OriginNode)) } else { newTxn.Timestamp.Forward(restartTS) newTxn.Restart(ba.UserPriority, newTxn.Priority, newTxn.Timestamp) } case *roachpb.TransactionAbortedError: // Increase timestamp if applicable. newTxn.Timestamp.Forward(pErr.GetTxn().Timestamp) newTxn.Priority = pErr.GetTxn().Priority // Clean up the freshly aborted transaction in defer(), avoiding a // race with the state update below. defer tc.cleanupTxn(ctx, *newTxn) case *roachpb.TransactionPushError: // Increase timestamp if applicable, ensuring that we're // just ahead of the pushee. newTxn.Timestamp.Forward(t.PusheeTxn.Timestamp) newTxn.Restart(ba.UserPriority, t.PusheeTxn.Priority-1, newTxn.Timestamp) case *roachpb.TransactionRetryError: // Increase timestamp so on restart, we're ahead of any timestamp // cache entries or newer versions which caused the restart. newTxn.Restart(ba.UserPriority, pErr.GetTxn().Priority, newTxn.Timestamp) case *roachpb.WriteTooOldError: newTxn.Restart(ba.UserPriority, newTxn.Priority, t.ActualTimestamp) case nil: // Nothing to do here, avoid the default case. default: if pErr.GetTxn() != nil { if pErr.CanRetry() { panic("Retryable internal error must not happen at this level") } else { // Do not clean up the transaction here since the client might still // want to continue the transaction. For example, a client might // continue its transaction after receiving ConditionFailedError, which // can come from a unique index violation. } } } if pErr != nil && pErr.GetTxn() != nil { // Avoid changing existing errors because sometimes they escape into // goroutines and then there are races. Fairly sure there isn't one // here, but better safe than sorry. pErrShallow := *pErr pErrShallow.SetTxn(newTxn) pErr = &pErrShallow } if newTxn.ID == nil { return pErr } txnID := *newTxn.ID tc.Lock() defer tc.Unlock() txnMeta := tc.txns[txnID] // For successful transactional requests, keep the written intents and // the updated transaction record to be sent along with the reply. // The transaction metadata is created with the first writing operation. // A tricky edge case is that of a transaction which "fails" on the // first writing request, but actually manages to write some intents // (for example, due to being multi-range). 
In this case, there will // be an error, but the transaction will be marked as Writing and the // coordinator must track the state, for the client's retry will be // performed with a Writing transaction which the coordinator rejects // unless it is tracking it (on top of it making sense to track it; // after all, it **has** laid down intents and only the coordinator // can augment a potential EndTransaction call). See #3303. var intentGroup interval.RangeGroup if txnMeta != nil { intentGroup = txnMeta.keys } else if pErr == nil || newTxn.Writing { intentGroup = interval.NewRangeTree() } if intentGroup != nil { // Adding the intents even on error reduces the likelihood of dangling // intents blocking concurrent writers for extended periods of time. // See #3346. ba.IntentSpanIterate(func(key, endKey roachpb.Key) { addKeyRange(intentGroup, key, endKey) }) if txnMeta == nil && intentGroup.Len() > 0 { if !newTxn.Writing { panic("txn with intents marked as non-writing") } // If the transaction is already over, there's no point in // launching a one-off coordinator which will shut down right // away. If we ended up here with an error, we'll always start // the coordinator - the transaction has laid down intents, so // we expect it to be committed/aborted at some point in the // future. if _, isEnding := ba.GetArg(roachpb.EndTransaction); pErr != nil || !isEnding { log.Trace(ctx, "coordinator spawns") txnMeta = &txnMetadata{ txn: *newTxn, keys: intentGroup, firstUpdateNanos: startNS, lastUpdateNanos: tc.clock.PhysicalNow(), timeoutDuration: tc.clientTimeout, txnEnd: make(chan struct{}), } tc.txns[txnID] = txnMeta if !tc.stopper.RunAsyncTask(func() { tc.heartbeatLoop(ctx, txnID) }) { // The system is already draining and we can't start the // heartbeat. We refuse new transactions for now because // they're likely not going to have all intents committed. // In principle, we can relax this as needed though. tc.unregisterTxnLocked(txnID) return roachpb.NewError(&roachpb.NodeUnavailableError{}) } } else { // If this was a successful one phase commit, update stats // directly as they won't otherwise be updated on heartbeat // loop shutdown. etArgs, ok := br.Responses[len(br.Responses)-1].GetInner().(*roachpb.EndTransactionResponse) tc.updateStats(tc.clock.PhysicalNow()-startNS, 0, newTxn.Status, ok && etArgs.OnePhaseCommit) } } } // Update our record of this transaction, even on error. if txnMeta != nil { txnMeta.txn = *newTxn if !txnMeta.txn.Writing { panic("tracking a non-writing txn") } txnMeta.setLastUpdate(tc.clock.PhysicalNow()) } if pErr == nil { // For successful transactional requests, always send the updated txn // record back. br.Txn = newTxn } return pErr }
// Send implements the batch.Sender interface. If the request is part of a // transaction, the TxnCoordSender adds the transaction to a map of active // transactions and begins heartbeating it. Every subsequent request for the // same transaction updates the lastUpdate timestamp to prevent live // transactions from being considered abandoned and garbage collected. // Read/write mutating requests have their key or key range added to the // transaction's interval tree of key ranges for eventual cleanup via resolved // write intents; they're tagged to an outgoing EndTransaction request, with // the receiving replica in charge of resolving them. func (tc *TxnCoordSender) Send(ctx context.Context, ba roachpb.BatchRequest) (*roachpb.BatchResponse, *roachpb.Error) { { // Start new or pick up active trace and embed its trace metadata into // header for use by RPC recipients. From here on, there's always an active // Trace, though its overhead is small unless it's sampled. sp := opentracing.SpanFromContext(ctx) if sp == nil { sp = tc.tracer.StartSpan(opTxnCoordSender) defer sp.Finish() ctx = opentracing.ContextWithSpan(ctx, sp) } // TODO(tschottdorf): To get rid of the spurious alloc below we need to // implement the carrier interface on ba.Header or make Span non-nullable, // both of which force all of ba on the Heap. It's already there, so may // not be a big deal, but ba should live on the stack. Also not easy to use // a buffer pool here since anything that goes into the RPC layer could be // used by goroutines we didn't wait for. if ba.Header.Trace == nil { ba.Header.Trace = &tracing.Span{} } if err := tc.tracer.Inject(sp, basictracer.Delegator, ba.Trace); err != nil { return nil, roachpb.NewError(err) } } startNS := tc.clock.PhysicalNow() if ba.Txn != nil { // If this request is part of a transaction... if err := tc.maybeBeginTxn(&ba); err != nil { return nil, roachpb.NewError(err) } txnID := *ba.Txn.ID // Verify that if this Transaction is not read-only, we have it on file. // If not, refuse further operations - the transaction was aborted due // to a timeout or the client must have issued a write on another // coordinator previously. if ba.Txn.Writing { tc.Lock() _, ok := tc.txns[txnID] tc.Unlock() if !ok { pErr := roachpb.NewErrorf("writing transaction timed out, was aborted, " + "or ran on multiple coordinators") return nil, pErr } } if rArgs, ok := ba.GetArg(roachpb.EndTransaction); ok { et := rArgs.(*roachpb.EndTransactionRequest) if len(et.Key) != 0 { return nil, roachpb.NewErrorf("EndTransaction must not have a Key set") } et.Key = ba.Txn.Key if len(et.IntentSpans) > 0 { // TODO(tschottdorf): it may be useful to allow this later. // That would be part of a possible plan to allow txns which // write on multiple coordinators. return nil, roachpb.NewErrorf("client must not pass intents to EndTransaction") } tc.Lock() txnMeta, metaOK := tc.txns[txnID] { // Populate et.IntentSpans, taking into account both existing // writes (if any) and new writes in this batch, and taking // care to perform proper deduplication. var keys interval.RangeGroup if metaOK { keys = txnMeta.keys } else { keys = interval.NewRangeTree() } ba.IntentSpanIterate(func(key, endKey roachpb.Key) { addKeyRange(keys, key, endKey) }) et.IntentSpans = collectIntentSpans(keys) } tc.Unlock() if len(et.IntentSpans) > 0 { // All good, proceed. } else if !metaOK { // If we don't have the transaction, then this must be a retry // by the client. We can no longer reconstruct a correct // request so we must fail. 
// // TODO(bdarnell): if we had a GetTransactionStatus API then // we could lookup the transaction and return either nil or // TransactionAbortedError instead of this ambivalent error. return nil, roachpb.NewErrorf("transaction is already committed or aborted") } if len(et.IntentSpans) == 0 { // If there aren't any intents, then there's factually no // transaction to end. Read-only txns have all of their state in // the client. return nil, roachpb.NewErrorf("cannot commit a read-only transaction") } if log.V(1) { for _, intent := range et.IntentSpans { log.Trace(ctx, fmt.Sprintf("intent: [%s,%s)", intent.Key, intent.EndKey)) } } } } // Send the command through wrapped sender, taking appropriate measures // on error. var br *roachpb.BatchResponse { var pErr *roachpb.Error br, pErr = tc.wrapped.Send(ctx, ba) if _, ok := pErr.GetDetail().(*roachpb.OpRequiresTxnError); ok { // TODO(tschottdorf): needs to keep the trace. br, pErr = tc.resendWithTxn(ba) } if pErr = tc.updateState(startNS, ctx, ba, br, pErr); pErr != nil { log.Trace(ctx, fmt.Sprintf("error: %s", pErr)) return nil, pErr } } if br.Txn == nil { return br, nil } if _, ok := ba.GetArg(roachpb.EndTransaction); !ok { return br, nil } // If the --linearizable flag is set, we want to make sure that // all the clocks in the system are past the commit timestamp // of the transaction. This is guaranteed if either // - the commit timestamp is MaxOffset behind startNS // - MaxOffset ns were spent in this function // when returning to the client. Below we choose the option // that involves less waiting, which is likely the first one // unless a transaction commits with an odd timestamp. if tsNS := br.Txn.Timestamp.WallTime; startNS > tsNS { startNS = tsNS } sleepNS := tc.clock.MaxOffset() - time.Duration(tc.clock.PhysicalNow()-startNS) if tc.linearizable && sleepNS > 0 { defer func() { if log.V(1) { log.Infof("%v: waiting %s on EndTransaction for linearizability", br.Txn.ID.Short(), util.TruncateDuration(sleepNS, time.Millisecond)) } time.Sleep(sleepNS) }() } if br.Txn.Status != roachpb.PENDING { tc.cleanupTxn(ctx, *br.Txn) } return br, nil }
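The linearizability wait at the end of Send boils down to simple arithmetic over the commit timestamp, the request start time, and the cluster's maximum clock offset. A minimal sketch of that arithmetic, assuming a hypothetical linearizableWait helper (not part of TxnCoordSender):

package main

import (
    "fmt"
    "time"
)

// linearizableWait returns how much longer to block before acknowledging a
// commit, given the commit timestamp (wall time), the time the request
// started, and the maximum clock offset across the cluster. Waiting until
// maxOffset has elapsed past min(startNS, commitWallNS) ensures every clock
// in the system has passed the commit timestamp.
func linearizableWait(commitWallNS, startNS, nowNS int64, maxOffset time.Duration) time.Duration {
    if commitWallNS < startNS {
        startNS = commitWallNS
    }
    return maxOffset - time.Duration(nowNS-startNS)
}

func main() {
    start := time.Now().UnixNano()
    commit := start + int64(2*time.Millisecond) // commit timestamp slightly ahead
    now := start + int64(30*time.Millisecond)   // time spent executing the batch
    fmt.Println(linearizableWait(commit, start, now, 250*time.Millisecond)) // 220ms
}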
// TestClientRetryNonTxn verifies that non-transactional client will // succeed despite write/write and read/write conflicts. In the case // where the non-transactional put can push the txn, we expect the // transaction's value to be written after all retries are complete. func TestClientRetryNonTxn(t *testing.T) { defer leaktest.AfterTest(t) s := server.StartTestServer(t) defer s.Stop() s.SetRangeRetryOptions(retry.Options{ InitialBackoff: 1 * time.Millisecond, MaxBackoff: 5 * time.Millisecond, Multiplier: 2, MaxRetries: 1, }) testCases := []struct { args roachpb.Request isolation roachpb.IsolationType canPush bool expAttempts int }{ // Write/write conflicts. {&roachpb.PutRequest{}, roachpb.SNAPSHOT, true, 2}, {&roachpb.PutRequest{}, roachpb.SERIALIZABLE, true, 2}, {&roachpb.PutRequest{}, roachpb.SNAPSHOT, false, 1}, {&roachpb.PutRequest{}, roachpb.SERIALIZABLE, false, 1}, // Read/write conflicts. {&roachpb.GetRequest{}, roachpb.SNAPSHOT, true, 1}, {&roachpb.GetRequest{}, roachpb.SERIALIZABLE, true, 2}, {&roachpb.GetRequest{}, roachpb.SNAPSHOT, false, 1}, {&roachpb.GetRequest{}, roachpb.SERIALIZABLE, false, 1}, } // Lay down a write intent using a txn and attempt to write to same // key. Try this twice--once with priorities which will allow the // intent to be pushed and once with priorities which will not. for i, test := range testCases { key := roachpb.Key(fmt.Sprintf("key-%d", i)) var txnPri int32 = 1 var clientPri roachpb.UserPriority = 1 if test.canPush { clientPri = 2 } else { txnPri = 2 } db, sender := createTestNotifyClient(s.Stopper(), s.ServingAddr(), -clientPri) // doneCall signals when the non-txn read or write has completed. doneCall := make(chan struct{}) count := 0 // keeps track of retries pErr := db.Txn(func(txn *client.Txn) *roachpb.Error { if test.isolation == roachpb.SNAPSHOT { if pErr := txn.SetIsolation(roachpb.SNAPSHOT); pErr != nil { return pErr } } txn.InternalSetPriority(txnPri) count++ // Lay down the intent. if pErr := txn.Put(key, "txn-value"); pErr != nil { return pErr } // The wait group lets us pause txn until after the non-txn method has run once. wg := sync.WaitGroup{} // On the first true, send the non-txn put or get. if count == 1 { // We use a "notifying" sender here, which allows us to know exactly when the // call has been processed; otherwise, we'd be dependent on timing. sender.reset(&wg) // We must try the non-txn put or get in a goroutine because // it might have to retry and will only succeed immediately in // the event we can push. go func() { var pErr *roachpb.Error for i := 0; ; i++ { if _, ok := test.args.(*roachpb.GetRequest); ok { _, pErr = db.Get(key) } else { pErr = db.Put(key, "value") } if _, ok := pErr.GetDetail().(*roachpb.WriteIntentError); !ok { break } } close(doneCall) if pErr != nil { t.Fatalf("%d: expected success on non-txn call to %s; got %s", i, test.args.Method(), pErr) } }() sender.wait() } return nil }) if pErr != nil { t.Fatalf("%d: expected success writing transactionally; got %s", i, pErr) } // Make sure non-txn put or get has finished. <-doneCall // Get the current value to verify whether the txn happened first. 
gr, pErr := db.Get(key) if pErr != nil { t.Fatalf("%d: expected success getting %q: %s", i, key, pErr) } if _, isGet := test.args.(*roachpb.GetRequest); isGet || test.canPush { if !bytes.Equal(gr.ValueBytes(), []byte("txn-value")) { t.Errorf("%d: expected \"txn-value\"; got %q", i, gr.ValueBytes()) } } else { if !bytes.Equal(gr.ValueBytes(), []byte("value")) { t.Errorf("%d: expected \"value\"; got %q", i, gr.ValueBytes()) } } if count != test.expAttempts { t.Errorf("%d: expected %d attempt(s); got %d", i, test.expAttempts, count) } } }
// sendChunk is in charge of sending an "admissible" piece of batch, i.e. one // which doesn't need to be subdivided further before going to a range (so no // mixing of forward and reverse scans, etc). The parameters and return values // correspond to client.Sender with the exception of the returned boolean, // which is true when indicating that the caller should retry but needs to send // EndTransaction in a separate request. func (ds *DistSender) sendChunk(ctx context.Context, ba roachpb.BatchRequest) (*roachpb.BatchResponse, *roachpb.Error, bool) { isReverse := ba.IsReverse() ctx, cleanup := tracing.EnsureContext(ctx, ds.Tracer) defer cleanup() // The minimal key range encompassing all requests contained within. // Local addressing has already been resolved. // TODO(tschottdorf): consider rudimentary validation of the batch here // (for example, non-range requests with EndKey, or empty key ranges). rs, err := keys.Range(ba) if err != nil { return nil, roachpb.NewError(err), false } var br *roachpb.BatchResponse // Send the request to one range per iteration. for { // Increase the sequence counter only once before sending RPCs to // the ranges involved in this chunk of the batch (as opposed to for // each RPC individually). On RPC errors, there's no guarantee that // the request hasn't made its way to the target regardless of the // error; we'd like the second execution to be caught by the sequence // cache if that happens. There is a small chance that that we address // a range twice in this chunk (stale/suboptimal descriptors due to // splits/merges) which leads to a transaction retry. // TODO(tschottdorf): it's possible that if we don't evict from the // cache we could be in for a busy loop. ba.SetNewRequest() var curReply *roachpb.BatchResponse var desc *roachpb.RangeDescriptor var evictToken evictionToken var needAnother bool var pErr *roachpb.Error var finished bool for r := retry.Start(ds.rpcRetryOptions); r.Next(); { // Get range descriptor (or, when spanning range, descriptors). Our // error handling below may clear them on certain errors, so we // refresh (likely from the cache) on every retry. log.Trace(ctx, "meta descriptor lookup") desc, needAnother, evictToken, pErr = ds.getDescriptors(rs, evictToken, isReverse) // getDescriptors may fail retryably if the first range isn't // available via Gossip. if pErr != nil { log.Trace(ctx, "range descriptor lookup failed: "+pErr.String()) if pErr.Retryable { if log.V(1) { log.Warning(pErr) } continue } break } else { log.Trace(ctx, "looked up range descriptor") } if needAnother && br == nil { // TODO(tschottdorf): we should have a mechanism for discovering // range merges (descriptor staleness will mostly go unnoticed), // or we'll be turning single-range queries into multi-range // queries for no good reason. // If there's no transaction and op spans ranges, possibly // re-run as part of a transaction for consistency. The // case where we don't need to re-run is if the read // consistency is not required. if ba.Txn == nil && ba.IsPossibleTransaction() && ba.ReadConsistency != roachpb.INCONSISTENT { return nil, roachpb.NewError(&roachpb.OpRequiresTxnError{}), false } // If the request is more than but ends with EndTransaction, we // want the caller to come again with the EndTransaction in an // extra call. 
if l := len(ba.Requests) - 1; l > 0 && ba.Requests[l].GetInner().Method() == roachpb.EndTransaction { return nil, roachpb.NewError(errors.New("cannot send 1PC txn to multiple ranges")), true /* shouldSplitET */ } } // It's possible that the returned descriptor misses parts of the // keys it's supposed to scan after it's truncated to match the // descriptor. Example revscan [a,g), first desc lookup for "g" // returns descriptor [c,d) -> [d,g) is never scanned. // We evict and retry in such a case. includesFrontOfCurSpan := func(rd *roachpb.RangeDescriptor) bool { if isReverse { // This approach is needed because rs.EndKey is exclusive. return desc.ContainsKeyRange(desc.StartKey, rs.EndKey) } return desc.ContainsKey(rs.Key) } if !includesFrontOfCurSpan(desc) { if err := evictToken.Evict(); err != nil { return nil, roachpb.NewError(err), false } // On addressing errors, don't backoff; retry immediately. r.Reset() continue } curReply, pErr = func() (*roachpb.BatchResponse, *roachpb.Error) { // Truncate the request to our current key range. intersected, iErr := rs.Intersect(desc) if iErr != nil { return nil, roachpb.NewError(iErr) } truncBA, numActive, trErr := truncate(ba, intersected) if numActive == 0 && trErr == nil { // This shouldn't happen in the wild, but some tests // exercise it. return nil, roachpb.NewErrorf("truncation resulted in empty batch on [%s,%s): %s", rs.Key, rs.EndKey, ba) } if trErr != nil { return nil, roachpb.NewError(trErr) } return ds.sendSingleRange(ctx, truncBA, desc) }() // If sending succeeded, break this loop. if pErr == nil { finished = true break } if log.V(1) { log.Warningf("failed to invoke %s: %s", ba, pErr) } log.Trace(ctx, fmt.Sprintf("reply error: %T", pErr.GetDetail())) // Error handling below. // If retryable, allow retry. For range not found or range // key mismatch errors, we don't backoff on the retry, // but reset the backoff loop so we can retry immediately. switch tErr := pErr.GetDetail().(type) { case *roachpb.SendError: // For an RPC error to occur, we must've been unable to contact // any replicas. In this case, likely all nodes are down (or // not getting back to us within a reasonable amount of time). // We may simply not be trying to talk to the up-to-date // replicas, so clearing the descriptor here should be a good // idea. if err := evictToken.Evict(); err != nil { return nil, roachpb.NewError(err), false } if tErr.CanRetry() { continue } case *roachpb.RangeNotFoundError: // Range descriptor might be out of date - evict it. This is // likely the result of a rebalance. if err := evictToken.Evict(); err != nil { return nil, roachpb.NewError(err), false } // On addressing errors, don't backoff; retry immediately. r.Reset() if log.V(1) { log.Warning(tErr) } continue case *roachpb.RangeKeyMismatchError: // Range descriptor might be out of date - evict it. This is // likely the result of a range split. If we have new range // descriptors, insert them instead as long as they are different // from the last descriptor to avoid endless loops. var replacements []roachpb.RangeDescriptor different := func(rd *roachpb.RangeDescriptor) bool { return !desc.RSpan().Equal(rd.RSpan()) } if tErr.MismatchedRange != nil && different(tErr.MismatchedRange) { replacements = append(replacements, *tErr.MismatchedRange) } if tErr.SuggestedRange != nil && different(tErr.SuggestedRange) { if includesFrontOfCurSpan(tErr.SuggestedRange) { replacements = append(replacements, *tErr.SuggestedRange) } } // Same as Evict() if replacements is empty. 
if err := evictToken.EvictAndReplace(replacements...); err != nil { return nil, roachpb.NewError(err), false } // On addressing errors, don't backoff; retry immediately. r.Reset() if log.V(1) { log.Warning(tErr) } continue case *roachpb.NotLeaderError: newLeader := tErr.Leader if newLeader != nil { // Verify that leader is a known replica according to the // descriptor. If not, we've got a stale range descriptor; // evict cache. if i, _ := desc.FindReplica(newLeader.StoreID); i == -1 { if log.V(1) { log.Infof("error indicates unknown leader %s, expunging descriptor %s", newLeader, desc) } if err := evictToken.Evict(); err != nil { return nil, roachpb.NewError(err), false } } } else { // If the new leader is unknown, we were talking to a // replica that is partitioned away from the majority. Our // range descriptor may be stale, so clear the cache. // // TODO(bdarnell): An unknown-leader error doesn't // necessarily mean our descriptor is stale. Ideally we // would treat these errors more like SendError: retry on // another node (at a lower level), and then if it reaches // this level then we know we've exhausted our options and // must clear the cache. if err := evictToken.Evict(); err != nil { return nil, roachpb.NewError(err), false } newLeader = &roachpb.ReplicaDescriptor{} } // Next, cache the new leader. ds.updateLeaderCache(roachpb.RangeID(desc.RangeID), *newLeader) if log.V(1) { log.Warning(tErr) } r.Reset() continue case retry.Retryable: if tErr.CanRetry() { if log.V(1) { log.Warning(tErr) } continue } } break } // Immediately return if querying a range failed non-retryably. if pErr != nil { return nil, pErr, false } else if !finished { select { case <-ds.rpcRetryOptions.Closer: return nil, roachpb.NewError(&roachpb.NodeUnavailableError{}), false default: log.Fatal("exited retry loop with nil error but finished=false") } } ba.Txn.Update(curReply.Txn) if br == nil { // First response from a Range. br = curReply } else { // This was the second or later call in a cross-Range request. // Combine the new response with the existing one. if err := br.Combine(curReply); err != nil { return nil, roachpb.NewError(err), false } } if ba.MaxScanResults > 0 { // Count how many results we received. var numResults int64 for _, resp := range curReply.Responses { if cResp, ok := resp.GetInner().(roachpb.Countable); ok { numResults += cResp.Count() } } if numResults > ba.MaxScanResults { panic(fmt.Sprintf("received %d results, limit was %d", numResults, ba.MaxScanResults)) } ba.MaxScanResults -= numResults if ba.MaxScanResults == 0 { // We are done with this batch. Some requests might have NoopResponses; we must // replace them with empty responses of the proper type. for i, req := range ba.Requests { if _, ok := br.Responses[i].GetInner().(*roachpb.NoopResponse); !ok { continue } union := roachpb.ResponseUnion{} var reply roachpb.Response if _, ok := req.GetInner().(*roachpb.ScanRequest); ok { reply = &roachpb.ScanResponse{} } else { _ = req.GetInner().(*roachpb.ReverseScanRequest) reply = &roachpb.ReverseScanResponse{} } union.MustSetInner(reply) br.Responses[i] = union } return br, nil, false } } // If this request has a bound (such as MaxResults in // ScanRequest) and we are going to query at least one more range, // check whether enough rows have been retrieved. // TODO(tschottdorf): need tests for executing a multi-range batch // with various bounded requests which saturate at different times. if needAnother { // Start with the assumption that all requests are saturated. 
// Below, we look at each and decide whether that's true. // Everything that is indeed saturated is "masked out" from the // batch request; only if that's all requests does needAnother // remain false. needAnother = false if br == nil { // Clone ba.Requests. This is because we're multi-range, and // some requests may be bounded, which could lead to them being // masked out once they're saturated. We don't want to risk // removing requests that way in the "master copy" since that // could lead to omitting requests in certain retry scenarios. ba.Requests = append([]roachpb.RequestUnion(nil), ba.Requests...) } for i, union := range ba.Requests { args := union.GetInner() if _, ok := args.(*roachpb.NoopRequest); ok { // NoopRequests are skipped. continue } boundedArg, ok := args.(roachpb.Bounded) if !ok { // Non-bounded request. We will have to query all ranges. needAnother = true continue } prevBound := boundedArg.GetBound() cReply, ok := curReply.Responses[i].GetInner().(roachpb.Countable) if !ok || prevBound <= 0 { // Request bounded, but without max results. Again, will // need to query everything we can. The case in which the reply // isn't countable occurs when the request wasn't active for // that range (since it didn't apply to it), so the response // is a NoopResponse. needAnother = true continue } nextBound := prevBound - cReply.Count() if nextBound <= 0 { // We've hit max results for this piece of the batch. Mask // it out (we've copied the requests slice above, so this // is kosher). union := &ba.Requests[i] // avoid working on copy union.MustSetInner(&noopRequest) continue } // The request isn't saturated yet. needAnother = true boundedArg.SetBound(nextBound) } } // If this was the last range accessed by this call, exit loop. if !needAnother { return br, nil, false } if isReverse { // In next iteration, query previous range. // We use the StartKey of the current descriptor as opposed to the // EndKey of the previous one since that doesn't have bugs when // stale descriptors come into play. rs.EndKey, err = prev(ba, desc.StartKey) } else { // In next iteration, query next range. // It's important that we use the EndKey of the current descriptor // as opposed to the StartKey of the next one: if the former is stale, // it's possible that the next range has since merged the subsequent // one, and unless both descriptors are stale, the next descriptor's // StartKey would move us to the beginning of the current range, // resulting in a duplicate scan. rs.Key, err = next(ba, desc.EndKey) } if err != nil { return nil, roachpb.NewError(err), false } log.Trace(ctx, "querying next range") } }
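The bounded-request handling above (MaxResults-style bounds, masking saturated requests with a NoopRequest) can be summarized with a small sketch; req and applyChunk are hypothetical stand-ins for roachpb.Bounded/Countable and the masking loop, not the actual types:

package main

import "fmt"

type req struct {
    noop  bool
    bound int64 // 0 means unbounded
}

// applyChunk applies the per-range result counts to the remaining bounds and
// reports whether another range must still be queried.
func applyChunk(reqs []req, counts []int64) bool {
    needAnother := false
    for i := range reqs {
        switch {
        case reqs[i].noop:
            // Already masked out; skip.
        case reqs[i].bound <= 0:
            // Unbounded request: must visit every remaining range.
            needAnother = true
        case reqs[i].bound-counts[i] <= 0:
            // Saturated: mask it out for subsequent ranges.
            reqs[i].noop = true
        default:
            reqs[i].bound -= counts[i]
            needAnother = true
        }
    }
    return needAnother
}

func main() {
    reqs := []req{{bound: 10}, {bound: 3}}
    fmt.Println(applyChunk(reqs, []int64{4, 3})) // true: first scan still needs 6 rows
    fmt.Println(applyChunk(reqs, []int64{6, 0})) // false: everything saturated
    fmt.Println(reqs)                            // both requests are now masked out
}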
// Send implements the batch.Sender interface. If the request is part of a // transaction, the TxnCoordSender adds the transaction to a map of active // transactions and begins heartbeating it. Every subsequent request for the // same transaction updates the lastUpdate timestamp to prevent live // transactions from being considered abandoned and garbage collected. // Read/write mutating requests have their key or key range added to the // transaction's interval tree of key ranges for eventual cleanup via resolved // write intents; they're tagged to an outgoing EndTransaction request, with // the receiving replica in charge of resolving them. func (tc *TxnCoordSender) Send(ctx context.Context, ba roachpb.BatchRequest) (*roachpb.BatchResponse, *roachpb.Error) { { // Start new or pick up active trace and embed its trace metadata into // header for use by RPC recipients. From here on, there's always an active // Trace, though its overhead is small unless it's sampled. sp := opentracing.SpanFromContext(ctx) // TODO(radu): once contexts are plumbed correctly, we should use the Tracer // from ctx. tracer := tracing.TracerFromCtx(tc.ctx) if sp == nil { sp = tracer.StartSpan(opTxnCoordSender) defer sp.Finish() ctx = opentracing.ContextWithSpan(ctx, sp) } // TODO(tschottdorf): To get rid of the spurious alloc below we need to // implement the carrier interface on ba.Header or make Span non-nullable, // both of which force all of ba on the Heap. It's already there, so may // not be a big deal, but ba should live on the stack. Also not easy to use // a buffer pool here since anything that goes into the RPC layer could be // used by goroutines we didn't wait for. if ba.Header.Trace == nil { ba.Header.Trace = &tracing.Span{} } else { // We didn't make this object but are about to mutate it, so we // have to take a copy - the original might already have been // passed to the RPC layer. ba.Header.Trace = protoutil.Clone(ba.Header.Trace).(*tracing.Span) } if err := tracer.Inject(sp.Context(), basictracer.Delegator, ba.Trace); err != nil { return nil, roachpb.NewError(err) } } startNS := tc.clock.PhysicalNow() if ba.Txn != nil { // If this request is part of a transaction... if err := tc.maybeBeginTxn(&ba); err != nil { return nil, roachpb.NewError(err) } var et *roachpb.EndTransactionRequest var hasET bool { var rArgs roachpb.Request rArgs, hasET = ba.GetArg(roachpb.EndTransaction) if hasET { et = rArgs.(*roachpb.EndTransactionRequest) if len(et.Key) != 0 { return nil, roachpb.NewErrorf("EndTransaction must not have a Key set") } et.Key = ba.Txn.Key if len(et.IntentSpans) > 0 { // TODO(tschottdorf): it may be useful to allow this later. // That would be part of a possible plan to allow txns which // write on multiple coordinators. return nil, roachpb.NewErrorf("client must not pass intents to EndTransaction") } } } if pErr := func() *roachpb.Error { tc.Lock() defer tc.Unlock() if pErr := tc.maybeRejectClientLocked(ctx, *ba.Txn); pErr != nil { return pErr } if !hasET { return nil } // Everything below is carried out only when trying to commit. // Populate et.IntentSpans, taking into account both any existing // and new writes, and taking care to perform proper deduplication. txnMeta := tc.txns[*ba.Txn.ID] distinctSpans := true if txnMeta != nil { et.IntentSpans = txnMeta.keys // Defensively set distinctSpans to false if we had any previous // requests in this transaction. This effectively limits the distinct // spans optimization to 1pc transactions. 
distinctSpans = len(txnMeta.keys) == 0 } ba.IntentSpanIterate(func(key, endKey roachpb.Key) { et.IntentSpans = append(et.IntentSpans, roachpb.Span{ Key: key, EndKey: endKey, }) }) // TODO(peter): Populate DistinctSpans on all batches, not just batches // which contain an EndTransactionRequest. var distinct bool // The request might already be used by an outgoing goroutine, so // we can't safely mutate anything in-place (as MergeSpans does). et.IntentSpans = append([]roachpb.Span(nil), et.IntentSpans...) et.IntentSpans, distinct = roachpb.MergeSpans(et.IntentSpans) ba.Header.DistinctSpans = distinct && distinctSpans if len(et.IntentSpans) == 0 { // If there aren't any intents, then there's factually no // transaction to end. Read-only txns have all of their state // in the client. return roachpb.NewErrorf("cannot commit a read-only transaction") } if txnMeta != nil { txnMeta.keys = et.IntentSpans } return nil }(); pErr != nil { return nil, pErr } if hasET && log.V(1) { for _, intent := range et.IntentSpans { log.Tracef(ctx, "intent: [%s,%s)", intent.Key, intent.EndKey) } } } // Send the command through wrapped sender, taking appropriate measures // on error. var br *roachpb.BatchResponse { var pErr *roachpb.Error br, pErr = tc.wrapped.Send(ctx, ba) if _, ok := pErr.GetDetail().(*roachpb.OpRequiresTxnError); ok { // TODO(tschottdorf): needs to keep the trace. br, pErr = tc.resendWithTxn(ba) } if pErr = tc.updateState(startNS, ctx, ba, br, pErr); pErr != nil { log.Tracef(ctx, "error: %s", pErr) return nil, pErr } } if br.Txn == nil { return br, nil } if _, ok := ba.GetArg(roachpb.EndTransaction); !ok { return br, nil } // If the --linearizable flag is set, we want to make sure that // all the clocks in the system are past the commit timestamp // of the transaction. This is guaranteed if either // - the commit timestamp is MaxOffset behind startNS // - MaxOffset ns were spent in this function // when returning to the client. Below we choose the option // that involves less waiting, which is likely the first one // unless a transaction commits with an odd timestamp. if tsNS := br.Txn.Timestamp.WallTime; startNS > tsNS { startNS = tsNS } sleepNS := tc.clock.MaxOffset() - time.Duration(tc.clock.PhysicalNow()-startNS) if tc.linearizable && sleepNS > 0 { defer func() { if log.V(1) { log.Infof(ctx, "%v: waiting %s on EndTransaction for linearizability", br.Txn.ID.Short(), util.TruncateDuration(sleepNS, time.Millisecond)) } time.Sleep(sleepNS) }() } if br.Txn.Status != roachpb.PENDING { tc.Lock() tc.cleanupTxnLocked(ctx, *br.Txn) tc.Unlock() } return br, nil }
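The commit path above relies on merging possibly overlapping intent spans into a deduplicated copy and on a flag indicating whether the input was already distinct. The following is only a stand-in sketch of that contract (it is not roachpb.MergeSpans, just the behavior the surrounding code depends on):

package main

import (
    "bytes"
    "fmt"
    "sort"
)

type span struct{ Key, EndKey []byte }

// mergeSpans returns a sorted, de-overlapped copy of in, plus a flag that is
// false if any spans had to be merged. The input slice is never mutated.
func mergeSpans(in []span) ([]span, bool) {
    spans := append([]span(nil), in...)
    sort.Slice(spans, func(i, j int) bool { return bytes.Compare(spans[i].Key, spans[j].Key) < 0 })
    distinct := true
    var out []span
    for _, s := range spans {
        if n := len(out); n > 0 && bytes.Compare(s.Key, out[n-1].EndKey) <= 0 {
            distinct = false
            if bytes.Compare(s.EndKey, out[n-1].EndKey) > 0 {
                out[n-1].EndKey = s.EndKey
            }
            continue
        }
        out = append(out, s)
    }
    return out, distinct
}

func main() {
    merged, distinct := mergeSpans([]span{
        {[]byte("c"), []byte("e")},
        {[]byte("a"), []byte("d")},
    })
    fmt.Println(len(merged), distinct) // 1 false
}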
// sendChunk is in charge of sending an "admissible" piece of batch, i.e. one // which doesn't need to be subdivided further before going to a range (so no // mixing of forward and reverse scans, etc). The parameters and return values // correspond to client.Sender with the exception of the returned boolean, // which is true when indicating that the caller should retry but needs to send // EndTransaction in a separate request. func (ds *DistSender) sendChunk(ctx context.Context, ba roachpb.BatchRequest) (*roachpb.BatchResponse, *roachpb.Error, bool) { isReverse := ba.IsReverse() trace := tracer.FromCtx(ctx) // The minimal key range encompassing all requests contained within. // Local addressing has already been resolved. // TODO(tschottdorf): consider rudimentary validation of the batch here // (for example, non-range requests with EndKey, or empty key ranges). rs := keys.Range(ba) var br *roachpb.BatchResponse // Send the request to one range per iteration. for { considerIntents := false var curReply *roachpb.BatchResponse var desc *roachpb.RangeDescriptor var needAnother bool var pErr *roachpb.Error for r := retry.Start(ds.rpcRetryOptions); r.Next(); { // Get range descriptor (or, when spanning range, descriptors). Our // error handling below may clear them on certain errors, so we // refresh (likely from the cache) on every retry. descDone := trace.Epoch("meta descriptor lookup") var evictDesc func() desc, needAnother, evictDesc, pErr = ds.getDescriptors(rs, considerIntents, isReverse) descDone() // getDescriptors may fail retryably if the first range isn't // available via Gossip. if pErr != nil { if pErr.Retryable { if log.V(1) { log.Warning(pErr) } continue } break } if needAnother && br == nil { // TODO(tschottdorf): we should have a mechanism for discovering // range merges (descriptor staleness will mostly go unnoticed), // or we'll be turning single-range queries into multi-range // queries for no good reason. // If there's no transaction and op spans ranges, possibly // re-run as part of a transaction for consistency. The // case where we don't need to re-run is if the read // consistency is not required. if ba.Txn == nil && ba.IsPossibleTransaction() && ba.ReadConsistency != roachpb.INCONSISTENT { return nil, roachpb.NewError(&roachpb.OpRequiresTxnError{}), false } // If the request is more than but ends with EndTransaction, we // want the caller to come again with the EndTransaction in an // extra call. if l := len(ba.Requests) - 1; l > 0 && ba.Requests[l].GetInner().Method() == roachpb.EndTransaction { return nil, roachpb.NewError(errors.New("cannot send 1PC txn to multiple ranges")), true /* shouldSplitET */ } } // It's possible that the returned descriptor misses parts of the // keys it's supposed to scan after it's truncated to match the // descriptor. Example revscan [a,g), first desc lookup for "g" // returns descriptor [c,d) -> [d,g) is never scanned. // We evict and retry in such a case. if (isReverse && !desc.ContainsKeyRange(desc.StartKey, rs.EndKey)) || (!isReverse && !desc.ContainsKeyRange(rs.Key, desc.EndKey)) { evictDesc() continue } curReply, pErr = func() (*roachpb.BatchResponse, *roachpb.Error) { // Truncate the request to our current key range. intersected, iErr := rs.Intersect(desc) if iErr != nil { return nil, roachpb.NewError(iErr) } truncBA, numActive, trErr := truncate(ba, intersected) if numActive == 0 && trErr == nil { // This shouldn't happen in the wild, but some tests // exercise it. 
return nil, roachpb.NewErrorf("truncation resulted in empty batch on [%s,%s): %s", rs.Key, rs.EndKey, ba) } if trErr != nil { return nil, roachpb.NewError(trErr) } return ds.sendSingleRange(trace, truncBA, desc) }() // If sending succeeded, break this loop. if pErr == nil { break } if log.V(1) { log.Warningf("failed to invoke %s: %s", ba, pErr) } trace.Event(fmt.Sprintf("reply error: %T", pErr.GoError())) // Error handling below. // If retryable, allow retry. For range not found or range // key mismatch errors, we don't backoff on the retry, // but reset the backoff loop so we can retry immediately. switch tErr := pErr.GoError().(type) { case *roachpb.SendError: // For an RPC error to occur, we must've been unable to contact // any replicas. In this case, likely all nodes are down (or // not getting back to us within a reasonable amount of time). // We may simply not be trying to talk to the up-to-date // replicas, so clearing the descriptor here should be a good // idea. // TODO(tschottdorf): If a replica group goes dead, this // will cause clients to put high read pressure on the first // range, so there should be some rate limiting here. evictDesc() if tErr.CanRetry() { continue } case *roachpb.RangeNotFoundError, *roachpb.RangeKeyMismatchError: // Range descriptor might be out of date - evict it. evictDesc() // On addressing errors, don't backoff; retry immediately. r.Reset() if log.V(1) { log.Warning(tErr) } // On retries, allow [uncommitted] intents on range descriptor // lookups to be returned 50% of the time in order to succeed // at finding the transaction record pointed to by the intent // itself. The 50% probability of returning either the current // intent or the previously committed value balances between // the two cases where the intent's txn hasn't yet been // committed (the previous value is correct), or the intent's // txn has been committed (the intent value is correct). considerIntents = true continue case *roachpb.NotLeaderError: newLeader := tErr.Leader // Verify that leader is a known replica according to the // descriptor. If not, we've got a stale replica; evict cache. // Next, cache the new leader. if newLeader != nil { if i, _ := desc.FindReplica(newLeader.StoreID); i == -1 { if log.V(1) { log.Infof("error indicates unknown leader %s, expunging descriptor %s", newLeader, desc) } evictDesc() } } else { newLeader = &roachpb.ReplicaDescriptor{} } ds.updateLeaderCache(roachpb.RangeID(desc.RangeID), *newLeader) if log.V(1) { log.Warning(tErr) } r.Reset() continue case retry.Retryable: if tErr.CanRetry() { if log.V(1) { log.Warning(tErr) } continue } } break } // Immediately return if querying a range failed non-retryably. if pErr != nil { return nil, pErr, false } ba.Txn.Update(curReply.Txn) if br == nil { // First response from a Range. br = curReply } else { // This was the second or later call in a cross-Range request. // Combine the new response with the existing one. if err := br.Combine(curReply); err != nil { return nil, roachpb.NewError(err), false } } // If this request has a bound (such as MaxResults in // ScanRequest) and we are going to query at least one more range, // check whether enough rows have been retrieved. // TODO(tschottdorf): need tests for executing a multi-range batch // with various bounded requests which saturate at different times. if needAnother { // Start with the assumption that all requests are saturated. // Below, we look at each and decide whether that's true. 
// Everything that is indeed saturated is "masked out" from the // batch request; only if that's all requests does needAnother // remain false. needAnother = false if br == nil { // Clone ba.Requests. This is because we're multi-range, and // some requests may be bounded, which could lead to them being // masked out once they're saturated. We don't want to risk // removing requests that way in the "master copy" since that // could lead to omitting requests in certain retry scenarios. ba.Requests = append([]roachpb.RequestUnion(nil), ba.Requests...) } for i, union := range ba.Requests { args := union.GetInner() if _, ok := args.(*roachpb.NoopRequest); ok { // NoopRequests are skipped. continue } boundedArg, ok := args.(roachpb.Bounded) if !ok { // Non-bounded request. We will have to query all ranges. needAnother = true continue } prevBound := boundedArg.GetBound() cReply, ok := curReply.Responses[i].GetInner().(roachpb.Countable) if !ok || prevBound <= 0 { // Request bounded, but without max results. Again, will // need to query everything we can. The case in which the reply // isn't countable occurs when the request wasn't active for // that range (since it didn't apply to it), so the response // is a NoopResponse. needAnother = true continue } nextBound := prevBound - cReply.Count() if nextBound <= 0 { // We've hit max results for this piece of the batch. Mask // it out (we've copied the requests slice above, so this // is kosher). ba.Requests[i].Reset() // necessary (no one-of?) if !ba.Requests[i].SetValue(&roachpb.NoopRequest{}) { panic("RequestUnion excludes NoopRequest") } continue } // The request isn't saturated yet. needAnother = true boundedArg.SetBound(nextBound) } } // If this was the last range accessed by this call, exit loop. if !needAnother { return br, nil, false } if isReverse { // In next iteration, query previous range. // We use the StartKey of the current descriptor as opposed to the // EndKey of the previous one since that doesn't have bugs when // stale descriptors come into play. rs.EndKey = prev(ba, desc.StartKey) } else { // In next iteration, query next range. // It's important that we use the EndKey of the current descriptor // as opposed to the StartKey of the next one: if the former is stale, // it's possible that the next range has since merged the subsequent // one, and unless both descriptors are stale, the next descriptor's // StartKey would move us to the beginning of the current range, // resulting in a duplicate scan. rs.Key = next(ba, desc.EndKey) } trace.Event("querying next range") } }
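Both sendChunk variants follow the same retry discipline: addressing errors such as stale descriptors evict the cache and retry immediately with the backoff reset, ordinary retryable errors back off, and anything else breaks out of the loop. A schematic, stdlib-only sketch of that discipline (errStaleDescriptor, errTryAgain and sendWithRetry are hypothetical names):

package main

import (
    "errors"
    "fmt"
    "time"
)

var (
    errStaleDescriptor = errors.New("stale range descriptor") // addressing error
    errTryAgain        = errors.New("try again")              // plain retryable error
)

func sendWithRetry(attempt func(int) error) error {
    backoff := time.Millisecond
    for i := 0; ; i++ {
        err := attempt(i)
        switch {
        case err == nil:
            return nil
        case errors.Is(err, errStaleDescriptor):
            // Evict the cached descriptor and retry right away; waiting
            // would not make the refreshed lookup any more likely to work.
            backoff = time.Millisecond
            continue
        case errors.Is(err, errTryAgain):
            time.Sleep(backoff)
            if backoff *= 2; backoff > 8*time.Millisecond {
                backoff = 8 * time.Millisecond
            }
            continue
        default:
            return err // non-retryable
        }
    }
}

func main() {
    err := sendWithRetry(func(i int) error {
        if i < 2 {
            return errStaleDescriptor
        }
        return nil
    })
    fmt.Println(err) // <nil>
}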
// Batch implements the roachpb.KVServer interface.
func (n *Node) Batch(ctx context.Context, args *roachpb.BatchRequest) (*roachpb.BatchResponse, error) {
    // TODO(marc): this code is duplicated in kv/db.go, which should be fixed.
    // Also, grpc's authentication model (which gives credential access in the
    // request handler) doesn't really fit with the current design of the
    // security package (which assumes that TLS state is only given at connection
    // time) - that should be fixed.
    if peer, ok := peer.FromContext(ctx); ok {
        if tlsInfo, ok := peer.AuthInfo.(credentials.TLSInfo); ok {
            certUser, err := security.GetCertificateUser(&tlsInfo.State)
            if err != nil {
                return nil, err
            }
            if certUser != security.NodeUser {
                return nil, util.Errorf("user %s is not allowed", certUser)
            }
        }
    }

    var br *roachpb.BatchResponse

    opName := "node " + strconv.Itoa(int(n.Descriptor.NodeID)) // could save allocs here

    fail := func(err error) {
        br = &roachpb.BatchResponse{}
        br.Error = roachpb.NewError(err)
    }

    f := func() {
        sp, err := tracing.JoinOrNew(n.ctx.Tracer, args.Trace, opName)
        if err != nil {
            fail(err)
            return
        }
        // If this is a snowball span, it gets special treatment: It skips the
        // regular tracing machinery, and we instead send the collected spans
        // back with the response. This is more expensive, but then again,
        // those are individual requests traced by users, so they can be.
        if sp.BaggageItem(tracing.Snowball) != "" {
            sp.LogEvent("delegating to snowball tracing")
            sp.Finish()
            if sp, err = tracing.JoinOrNewSnowball(opName, args.Trace, func(rawSpan basictracer.RawSpan) {
                encSp, err := tracing.EncodeRawSpan(&rawSpan, nil)
                if err != nil {
                    log.Warning(err)
                }
                br.CollectedSpans = append(br.CollectedSpans, encSp)
            }); err != nil {
                fail(err)
                return
            }
        }
        defer sp.Finish()
        traceCtx := opentracing.ContextWithSpan(n.context(ctx), sp)

        tStart := timeutil.Now()
        var pErr *roachpb.Error
        br, pErr = n.stores.Send(traceCtx, *args)
        if pErr != nil {
            br = &roachpb.BatchResponse{}
            log.Trace(traceCtx, fmt.Sprintf("error: %T", pErr.GetDetail()))
        }
        if br.Error != nil {
            panic(roachpb.ErrorUnexpectedlySet(n.stores, br))
        }
        n.metrics.callComplete(timeutil.Since(tStart), pErr)
        br.Error = pErr
    }

    if !n.stopper.RunTask(f) {
        return nil, util.Errorf("node %d stopped", n.Descriptor.NodeID)
    }
    return br, nil
}
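Batch only does work when the stopper accepts the task; a draining node returns an error instead of racing with shutdown. A bare-bones sketch of that pattern (hypothetical stopper type, not util/stop):

package main

import (
    "fmt"
    "sync"
)

type stopper struct {
    mu       sync.Mutex
    draining bool
    wg       sync.WaitGroup
}

// RunTask runs f synchronously and reports false if the stopper is draining.
func (s *stopper) RunTask(f func()) bool {
    s.mu.Lock()
    if s.draining {
        s.mu.Unlock()
        return false
    }
    s.wg.Add(1)
    s.mu.Unlock()
    defer s.wg.Done()
    f()
    return true
}

// Stop marks the stopper as draining and waits for in-flight tasks.
func (s *stopper) Stop() {
    s.mu.Lock()
    s.draining = true
    s.mu.Unlock()
    s.wg.Wait()
}

func main() {
    s := &stopper{}
    fmt.Println(s.RunTask(func() { fmt.Println("batch executed") })) // true
    s.Stop()
    fmt.Println(s.RunTask(func() {})) // false: node stopped
}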
// Query returns datapoints for the named time series during the supplied time
// span. Data is returned as a series of consecutive data points.
//
// Data is queried only at the Resolution supplied: if data for the named time
// series is not stored at the given resolution, an empty result will be
// returned.
//
// All data stored on the server is downsampled to some degree; the data points
// returned represent the average value within a sample period. Each datapoint's
// timestamp falls in the middle of the sample period it represents.
//
// If data for the named time series was collected from multiple sources, each
// returned datapoint will represent the sum of datapoints from all sources at
// the same time. The returned string slice contains a list of all sources for
// the metric which were aggregated to produce the result.
func (db *DB) Query(query TimeSeriesQueryRequest_Query, r Resolution, startNanos, endNanos int64) ([]*TimeSeriesDatapoint, []string, error) {
    // Normalize startNanos to the nearest SampleDuration boundary.
    startNanos -= startNanos % r.SampleDuration()

    var rows []client.KeyValue
    if len(query.Sources) == 0 {
        // Based on the supplied timestamps and resolution, construct start and end
        // keys for a scan that will return every key with data relevant to the
        // query.
        startKey := MakeDataKey(query.Name, "" /* source */, r, startNanos)
        endKey := MakeDataKey(query.Name, "" /* source */, r, endNanos).PrefixEnd()
        var pErr *roachpb.Error
        rows, pErr = db.db.Scan(startKey, endKey, 0)
        if pErr != nil {
            return nil, nil, pErr.GoError()
        }
    } else {
        b := db.db.NewBatch()
        // Iterate over all key timestamps which may contain data for the given
        // sources, based on the given start/end time and the resolution.
        for currentTimestamp := startNanos; currentTimestamp <= endNanos; currentTimestamp += r.KeyDuration() {
            for _, source := range query.Sources {
                key := MakeDataKey(query.Name, source, r, currentTimestamp)
                b.Get(key)
            }
        }
        pErr := db.db.Run(b)
        if pErr != nil {
            return nil, nil, pErr.GoError()
        }
        for _, result := range b.Results {
            row := result.Rows[0]
            if row.Value == nil {
                continue
            }
            rows = append(rows, row)
        }
    }

    // Construct a new dataSpan for each distinct source encountered in the
    // query. Each dataspan will contain all data queried from the same source.
    sourceSpans := make(map[string]*dataSpan)
    for _, row := range rows {
        data := &roachpb.InternalTimeSeriesData{}
        if err := row.ValueProto(data); err != nil {
            return nil, nil, err
        }
        _, source, _, _, err := DecodeDataKey(row.Key)
        if err != nil {
            return nil, nil, err
        }
        if _, ok := sourceSpans[source]; !ok {
            sourceSpans[source] = &dataSpan{
                startNanos:  startNanos,
                sampleNanos: data.SampleDurationNanos,
                datas:       make([]calibratedData, 0, 1),
            }
        }
        if err := sourceSpans[source].addData(data); err != nil {
            return nil, nil, err
        }
    }

    var responseData []*TimeSeriesDatapoint
    sources := make([]string, 0, len(sourceSpans))

    // Create an interpolatingIterator for each dataSpan.
    iters := make(unionIterator, 0, len(sourceSpans))
    for name, span := range sourceSpans {
        sources = append(sources, name)
        iters = append(iters, span.newIterator())
    }

    // Iterate through all values in the iteratorSet, adding a datapoint to
    // the response for each value.
    var valueFn func() float64
    switch query.GetAggregator() {
    case TimeSeriesQueryAggregator_AVG:
        valueFn = iters.avg
    case TimeSeriesQueryAggregator_AVG_RATE:
        valueFn = iters.dAvg
    }

    iters.init()
    for iters.isValid() && iters.timestamp() <= endNanos {
        responseData = append(responseData, &TimeSeriesDatapoint{
            TimestampNanos: iters.timestamp(),
            Value:          valueFn(),
        })
        iters.advance()
    }

    return responseData, sources, nil
}
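Query normalizes timestamps to sample-period boundaries and, per the doc comment, sums datapoints from different sources that fall into the same period. A small, self-contained illustration of that model with hypothetical types (datapoint, normalize, sumSources are not part of the ts package):

package main

import (
    "fmt"
    "sort"
)

type datapoint struct {
    tsNanos int64
    value   float64
}

// normalize floors ts to the start of its sample period.
func normalize(ts, sampleNanos int64) int64 { return ts - ts%sampleNanos }

// sumSources merges per-source series into one series, summing values that
// fall into the same sample period and returning the result in time order.
func sumSources(sampleNanos int64, sources ...[]datapoint) []datapoint {
    byTS := map[int64]float64{}
    for _, src := range sources {
        for _, dp := range src {
            byTS[normalize(dp.tsNanos, sampleNanos)] += dp.value
        }
    }
    out := make([]datapoint, 0, len(byTS))
    for ts, v := range byTS {
        out = append(out, datapoint{ts, v})
    }
    sort.Slice(out, func(i, j int) bool { return out[i].tsNanos < out[j].tsNanos })
    return out
}

func main() {
    const sample = int64(10e9) // 10s sample period
    a := []datapoint{{tsNanos: 12e9, value: 1}, {tsNanos: 21e9, value: 2}}
    b := []datapoint{{tsNanos: 13e9, value: 4}}
    fmt.Println(sumSources(sample, a, b)) // [{10000000000 5} {20000000000 2}]
}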