func (tc *TxnCoordSender) heartbeat(id string, trace *tracer.Trace, ctx context.Context) bool {
	tc.Lock()
	proceed := true
	txnMeta := tc.txns[id]
	// Before we send a heartbeat, determine whether this transaction
	// should be considered abandoned. If so, exit heartbeat.
	if txnMeta.hasClientAbandonedCoord(tc.clock.PhysicalNow()) {
		// TODO(tschottdorf): should we be more proactive here?
		// The client might be continuing the transaction
		// through another coordinator, but in the most likely
		// case it's just gone and the open transaction record
		// could block concurrent operations.
		if log.V(1) {
			log.Infof("transaction %s abandoned; stopping heartbeat", txnMeta.txn)
		}
		proceed = false
	}
	// txnMeta.txn is possibly replaced concurrently,
	// so grab a copy before unlocking.
	txn := txnMeta.txn
	tc.Unlock()
	if !proceed {
		return false
	}

	hb := &roachpb.HeartbeatTxnRequest{}
	hb.Key = txn.Key
	ba := roachpb.BatchRequest{}
	ba.Timestamp = tc.clock.Now()
	ba.CmdID = ba.GetOrCreateCmdID(ba.Timestamp.WallTime)
	ba.Txn = txn.Clone()
	ba.Add(hb)

	epochEnds := trace.Epoch("heartbeat")
	_, err := tc.wrapped.Send(ctx, ba)
	epochEnds()
	// If the transaction is not in pending state, then we can stop
	// the heartbeat. It's either aborted or committed, and we resolve
	// write intents accordingly.
	if err != nil {
		log.Warningf("heartbeat to %s failed: %s", txn, err)
	}
	// TODO(bdarnell): once we have gotten a heartbeat response with
	// Status != PENDING, future heartbeats are useless. However, we
	// need to continue the heartbeatLoop until the client either
	// commits or abandons the transaction. We could save a little
	// pointless work by restructuring this loop to stop sending
	// heartbeats between the time that the transaction is aborted and
	// the client finds out. Furthermore, we could use this information
	// to send TransactionAbortedErrors to the client so it can restart
	// immediately instead of running until its EndTransaction.
	return true
}
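
// What follows is a hypothetical sketch, not part of the original source: the
// abandonment check above (txnMeta.hasClientAbandonedCoord) amounts to
// comparing the wall time of the last client activity, plus a timeout, against
// the current wall time. The function and parameter names below are
// assumptions for illustration only.
func exampleClientAbandoned(lastUpdateNanos, timeoutNanos, nowNanos int64) bool {
	// A non-positive timeout means the coordinator never considers the client
	// abandoned (an assumption of this sketch).
	if timeoutNanos <= 0 {
		return false
	}
	// Abandoned once the last activity is more than the timeout in the past.
	return lastUpdateNanos+timeoutNanos < nowNanos
}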
// Send implements Sender.
// TODO(tschottdorf): We actually don't want to chop EndTransaction off for
// single-range requests (but that happens now since EndTransaction has the
// isAlone flag). Whether a request is single-range is unknown right now (you
// can only find out after you've sent to the Range or looked up a descriptor
// that suggests that you're multi-range). In those cases, the wrapped sender
// should return an error so that we split and retry only the chunk which
// contains EndTransaction (i.e. the last one).
func (cs *chunkingSender) Send(ctx context.Context, ba roachpb.BatchRequest) (*roachpb.BatchResponse, *roachpb.Error) {
	if len(ba.Requests) < 1 {
		panic("empty batch")
	}

	// Deterministically create ClientCmdIDs for all parts of the batch if
	// a CmdID is already set (otherwise, leave them empty).
	var nextID func() roachpb.ClientCmdID
	empty := roachpb.ClientCmdID{}
	if empty == ba.CmdID {
		nextID = func() roachpb.ClientCmdID {
			return empty
		}
	} else {
		rng := rand.New(rand.NewSource(ba.CmdID.Random))
		id := ba.CmdID
		nextID = func() roachpb.ClientCmdID {
			curID := id             // copy
			id.Random = rng.Int63() // adjust for next call
			return curID
		}
	}

	parts := ba.Split()
	var rplChunks []*roachpb.BatchResponse
	for _, part := range parts {
		ba.Requests = part
		ba.CmdID = nextID()
		rpl, err := cs.f(ctx, ba)
		if err != nil {
			return nil, err
		}
		// Propagate transaction from last reply to next request. The final
		// update is taken and put into the response's main header.
		ba.Txn.Update(rpl.Header().Txn)
		rplChunks = append(rplChunks, rpl)
	}

	reply := rplChunks[0]
	for _, rpl := range rplChunks[1:] {
		reply.Responses = append(reply.Responses, rpl.Responses...)
	}
	lastHeader := rplChunks[len(rplChunks)-1].BatchResponse_Header
	reply.Error = lastHeader.Error
	reply.Timestamp = lastHeader.Timestamp
	reply.Txn = ba.Txn
	return reply, nil
}
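
// Illustrative sketch, not part of the original source: the nextID closure
// above derives per-chunk ClientCmdIDs deterministically by seeding a RNG with
// the batch's CmdID.Random. This helper (a hypothetical name) shows that
// rerunning the derivation from the same base CmdID reproduces the exact same
// sequence of IDs, so a re-sent chunked batch presents the same command IDs to
// the server.
func exampleDeriveChunkIDs(base roachpb.ClientCmdID, n int) []roachpb.ClientCmdID {
	rng := rand.New(rand.NewSource(base.Random))
	ids := make([]roachpb.ClientCmdID, 0, n)
	id := base
	for i := 0; i < n; i++ {
		ids = append(ids, id)   // the first chunk keeps the original CmdID
		id.Random = rng.Int63() // subsequent chunks get derived Random values
	}
	return ids
}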
// SendWrappedWith is a convenience function which wraps the request in a batch
// and sends it via the provided Sender using the given header. It returns the
// unwrapped response or an error. It's valid to pass a `nil` context;
// context.Background() is used in that case.
func SendWrappedWith(sender Sender, ctx context.Context, h roachpb.Header, args roachpb.Request) (roachpb.Response, error) {
	if ctx == nil {
		ctx = context.Background()
	}
	ba := roachpb.BatchRequest{}
	ba.Header = h
	ba.CmdID = ba.GetOrCreateCmdID(0)
	ba.Add(args)

	br, pErr := sender.Send(ctx, ba)
	if err := pErr.GoError(); err != nil {
		return nil, err
	}
	unwrappedReply := br.Responses[0].GetInner()
	unwrappedReply.Header().Txn = br.Txn
	return unwrappedReply, nil
}
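
// Usage sketch, not part of the original source: a caller that wants to issue
// a single request without assembling a BatchRequest by hand might wrap it
// like this. The function name and key are illustrative, and the use of
// roachpb.NewGet assumes the same family of request constructors as
// roachpb.NewPut seen elsewhere in this codebase.
func exampleSendSingleGet(sender Sender, key roachpb.Key) (*roachpb.GetResponse, error) {
	// A nil context is permitted; SendWrappedWith falls back to
	// context.Background().
	reply, err := SendWrappedWith(sender, nil, roachpb.Header{}, roachpb.NewGet(key))
	if err != nil {
		return nil, err
	}
	// SendWrappedWith returns the unwrapped response, so a type assertion to
	// the concrete response type is all that's left to do.
	return reply.(*roachpb.GetResponse), nil
}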
// Send implements the batch.Sender interface. If the request is part of a
// transaction, the TxnCoordSender adds the transaction to a map of active
// transactions and begins heartbeating it. Every subsequent request for the
// same transaction updates the lastUpdate timestamp to prevent live
// transactions from being considered abandoned and garbage collected.
// Read/write mutating requests have their key or key range added to the
// transaction's interval tree of key ranges for eventual cleanup via resolved
// write intents; they're tagged to an outgoing EndTransaction request, with
// the receiving replica in charge of resolving them.
func (tc *TxnCoordSender) Send(ctx context.Context, ba roachpb.BatchRequest) (*roachpb.BatchResponse, *roachpb.Error) {
	if err := tc.maybeBeginTxn(&ba); err != nil {
		return nil, roachpb.NewError(err)
	}
	ba.CmdID = ba.GetOrCreateCmdID(tc.clock.PhysicalNow())
	var startNS int64

	// This is the earliest point at which the request has a ClientCmdID and/or
	// TxnID (if applicable). Begin a Trace which follows this request.
	trace := tc.tracer.NewTrace(tracer.Coord, &ba)
	defer trace.Finalize()
	defer trace.Epoch("sending batch")()
	ctx = tracer.ToCtx(ctx, trace)

	var id string // optional transaction ID
	if ba.Txn != nil {
		// If this request is part of a transaction...
		id = string(ba.Txn.ID)
		// Verify that if this Transaction is not read-only, we have it on
		// file. If not, refuse writes - the client must have issued a write on
		// another coordinator previously.
		if ba.Txn.Writing && ba.IsTransactionWrite() {
			tc.Lock()
			_, ok := tc.txns[id]
			tc.Unlock()
			if !ok {
				return nil, roachpb.NewError(util.Errorf("transaction must not write on multiple coordinators"))
			}
		}

		// Set the timestamp to the original timestamp for read-only
		// commands and to the transaction timestamp for read/write
		// commands.
		if ba.IsReadOnly() {
			ba.Timestamp = ba.Txn.OrigTimestamp
		} else {
			ba.Timestamp = ba.Txn.Timestamp
		}

		if rArgs, ok := ba.GetArg(roachpb.EndTransaction); ok {
			et := rArgs.(*roachpb.EndTransactionRequest)
			if len(et.Key) != 0 {
				return nil, roachpb.NewError(util.Errorf("EndTransaction must not have a Key set"))
			}
			et.Key = ba.Txn.Key
			// Remember when EndTransaction started in case we want to
			// be linearizable.
			startNS = tc.clock.PhysicalNow()
			if len(et.Intents) > 0 {
				// TODO(tschottdorf): it may be useful to allow this later.
				// That would be part of a possible plan to allow txns which
				// write on multiple coordinators.
				return nil, roachpb.NewError(util.Errorf("client must not pass intents to EndTransaction"))
			}
			tc.Lock()
			txnMeta, metaOK := tc.txns[id]
			if id != "" && metaOK {
				et.Intents = txnMeta.intents()
			}
			tc.Unlock()

			if intents := ba.GetIntents(); len(intents) > 0 {
				// Writes in Batch, so EndTransaction is fine. Should add
				// outstanding intents to EndTransaction, though.
				// TODO(tschottdorf): possible issues when the batch fails,
				// but the intents have been added anyways.
				// TODO(tschottdorf): some of these intents may be covered
				// by others, for example {[a,b), a}. This can lead to
				// some extra requests when those are non-local to the txn
				// record. But it doesn't seem worth optimizing now.
				et.Intents = append(et.Intents, intents...)
			} else if !metaOK {
				// If we don't have the transaction, then this must be a retry
				// by the client. We can no longer reconstruct a correct
				// request so we must fail.
				//
				// TODO(bdarnell): if we had a GetTransactionStatus API then
				// we could lookup the transaction and return either nil or
				// TransactionAbortedError instead of this ambivalent error.
				return nil, roachpb.NewError(util.Errorf("transaction is already committed or aborted"))
			}
			if len(et.Intents) == 0 {
				// If there aren't any intents, then there's factually no
				// transaction to end. Read-only txns have all of their state in
				// the client.
				return nil, roachpb.NewError(util.Errorf("cannot commit a read-only transaction"))
			}
			if log.V(1) {
				for _, intent := range et.Intents {
					trace.Event(fmt.Sprintf("intent: [%s,%s)", intent.Key, intent.EndKey))
				}
			}
		}
	}

	// Send the command through wrapped sender, taking appropriate measures
	// on error.
	var br *roachpb.BatchResponse
	{
		var pErr *roachpb.Error
		br, pErr = tc.wrapped.Send(ctx, ba)

		if _, ok := pErr.GoError().(*roachpb.OpRequiresTxnError); ok {
			br, pErr = tc.resendWithTxn(ba)
		}

		if pErr := tc.updateState(ctx, ba, br, pErr); pErr != nil {
			return nil, pErr
		}
	}

	if br.Txn == nil {
		return br, nil
	}

	if _, ok := ba.GetArg(roachpb.EndTransaction); !ok {
		return br, nil
	}
	// If the --linearizable flag is set, we want to make sure that
	// all the clocks in the system are past the commit timestamp
	// of the transaction. This is guaranteed if either
	// - the commit timestamp is MaxOffset behind startNS
	// - MaxOffset ns were spent in this function
	// when returning to the client. Below we choose the option
	// that involves less waiting, which is likely the first one
	// unless a transaction commits with an odd timestamp.
	if tsNS := br.Txn.Timestamp.WallTime; startNS > tsNS {
		startNS = tsNS
	}
	sleepNS := tc.clock.MaxOffset() - time.Duration(tc.clock.PhysicalNow()-startNS)
	if tc.linearizable && sleepNS > 0 {
		defer func() {
			if log.V(1) {
				log.Infof("%v: waiting %s on EndTransaction for linearizability", br.Txn.Short(), util.TruncateDuration(sleepNS, time.Millisecond))
			}
			time.Sleep(sleepNS)
		}()
	}
	if br.Txn.Status != roachpb.PENDING {
		tc.cleanupTxn(trace, *br.Txn)
	}
	return br, nil
}
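
// Hypothetical sketch, not part of the original source: the linearizability
// wait at the end of Send above reduces to the arithmetic below. The
// coordinator sleeps until MaxOffset has elapsed past the earlier of the
// commit timestamp and the start of EndTransaction processing, i.e. it picks
// whichever reference point requires less waiting. Parameter names are
// assumptions for illustration.
func exampleLinearizableWait(maxOffset time.Duration, nowNanos, startNanos, commitWallTimeNanos int64) time.Duration {
	// Use the commit timestamp as the reference point if it is earlier than
	// when EndTransaction processing began.
	if startNanos > commitWallTimeNanos {
		startNanos = commitWallTimeNanos
	}
	sleep := maxOffset - time.Duration(nowNanos-startNanos)
	if sleep < 0 {
		return 0
	}
	return sleep
}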
// resetClientCmdID sets the client command ID if the call is for a
// read-write method. The client command ID provides idempotency
// protection in conjunction with the server.
func resetClientCmdID(ba *roachpb.BatchRequest) {
	ba.CmdID = roachpb.ClientCmdID{
		WallTime: time.Now().UnixNano(),
		Random:   rand.Int63(),
	}
}
// TestTruncateWithSpanAndDescriptor verifies that a batch request is truncated with a
// range span and the range of a descriptor found in cache.
func TestTruncateWithSpanAndDescriptor(t *testing.T) {
	defer leaktest.AfterTest(t)
	g, s := makeTestGossip(t)
	defer s()
	if err := g.SetNodeDescriptor(&roachpb.NodeDescriptor{NodeID: 1}); err != nil {
		t.Fatal(err)
	}
	nd := &roachpb.NodeDescriptor{
		NodeID:  roachpb.NodeID(1),
		Address: util.MakeUnresolvedAddr(testAddress.Network(), testAddress.String()),
	}
	if err := g.AddInfoProto(gossip.MakeNodeIDKey(roachpb.NodeID(1)), nd, time.Hour); err != nil {
		t.Fatal(err)
	}

	// Fill mockRangeDescriptorDB with two descriptors. When a
	// range descriptor is looked up by key "b", return the second
	// descriptor whose range is ["a", "c") and partially overlaps
	// with the first descriptor's range.
	var descriptor1 = roachpb.RangeDescriptor{
		RangeID:  1,
		StartKey: roachpb.RKeyMin,
		EndKey:   roachpb.RKey("b"),
		Replicas: []roachpb.ReplicaDescriptor{
			{
				NodeID:  1,
				StoreID: 1,
			},
		},
	}
	var descriptor2 = roachpb.RangeDescriptor{
		RangeID:  2,
		StartKey: roachpb.RKey("a"),
		EndKey:   roachpb.RKey("c"),
		Replicas: []roachpb.ReplicaDescriptor{
			{
				NodeID:  1,
				StoreID: 1,
			},
		},
	}
	descDB := mockRangeDescriptorDB(func(key roachpb.RKey, _ lookupOptions) ([]roachpb.RangeDescriptor, error) {
		desc := descriptor1
		if key.Equal(roachpb.RKey("b")) {
			desc = descriptor2
		}
		return []roachpb.RangeDescriptor{desc}, nil
	})

	// Define our rpcSend stub which checks the span of the batch
	// requests. The first request should be the point request on
	// "a". The second request should be on "b".
	first := true
	var testFn rpcSendFn = func(_ rpc.Options, method string, addrs []net.Addr, getArgs func(addr net.Addr) proto.Message,
		getReply func() proto.Message, _ *rpc.Context) ([]proto.Message, error) {
		if method != "Node.Batch" {
			return nil, util.Errorf("unexpected method %v", method)
		}
		ba := getArgs(testAddress).(*roachpb.BatchRequest)
		rs := keys.Range(*ba)
		if first {
			if !(rs.Key.Equal(roachpb.RKey("a")) && rs.EndKey.Equal(roachpb.RKey("a").Next())) {
				t.Errorf("Unexpected span [%s,%s)", rs.Key, rs.EndKey)
			}
			first = false
		} else {
			if !(rs.Key.Equal(roachpb.RKey("b")) && rs.EndKey.Equal(roachpb.RKey("b").Next())) {
				t.Errorf("Unexpected span [%s,%s)", rs.Key, rs.EndKey)
			}
		}

		batchReply := getReply().(*roachpb.BatchResponse)
		reply := &roachpb.PutResponse{}
		batchReply.Add(reply)
		return []proto.Message{batchReply}, nil
	}

	ctx := &DistSenderContext{
		RPCSend:           testFn,
		RangeDescriptorDB: descDB,
	}
	ds := NewDistSender(ctx, g)

	// Send a batch request containing two puts. In the first
	// attempt, the range of the descriptor found in the cache is
	// ["a", "b"). The request is truncated to contain only the put
	// on "a".
	//
	// In the second attempt, the range of the descriptor found in
	// the cache is ["a", "c"), but the put on "a" will not be
	// resent. The request is truncated to contain only the put on "b".
	ba := roachpb.BatchRequest{}
	ba.CmdID = ba.GetOrCreateCmdID(0)
	ba.Txn = &roachpb.Transaction{Name: "test"}
	val := roachpb.MakeValueFromString("val")
	ba.Add(roachpb.NewPut(keys.RangeTreeNodeKey(roachpb.RKey("a")), val).(*roachpb.PutRequest))
	ba.Add(roachpb.NewPut(keys.RangeTreeNodeKey(roachpb.RKey("b")), val).(*roachpb.PutRequest))

	_, pErr := ds.Send(context.Background(), ba)
	if err := pErr.GoError(); err != nil {
		t.Fatal(err)
	}
}
// process iterates through all keys in a replica's range, calling the garbage
// collector for each key and associated set of values. GC'd keys are batched
// into GC calls. Extant intents are resolved if they are older than
// intentAgeThreshold.
func (gcq *gcQueue) process(now roachpb.Timestamp, repl *Replica, sysCfg *config.SystemConfig) error {
	snap := repl.store.Engine().NewSnapshot()
	desc := repl.Desc()
	iter := newReplicaDataIterator(desc, snap)
	defer iter.Close()
	defer snap.Close()

	// Lookup the GC policy for the zone containing this key range.
	zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey)
	if err != nil {
		return fmt.Errorf("could not find GC policy for range %s: %s", repl, err)
	}
	policy := zone.GC

	gcMeta := roachpb.NewGCMetadata(now.WallTime)
	gc := engine.NewGarbageCollector(now, *policy)

	// Compute intent expiration (intent age at which we attempt to resolve).
	intentExp := now
	intentExp.WallTime -= intentAgeThreshold.Nanoseconds()

	// TODO(tschottdorf): execution will use a leader-assigned local
	// timestamp to compute intent age. While this should be fine, could
	// consider adding a Now timestamp to GCRequest which would be used
	// instead.
	gcArgs := &roachpb.GCRequest{}
	var mu sync.Mutex
	var oldestIntentNanos int64 = math.MaxInt64
	var expBaseKey roachpb.Key
	var keys []roachpb.EncodedKey
	var vals [][]byte

	// Maps from txn ID to txn and intent key slice.
	txnMap := map[string]*roachpb.Transaction{}
	intentMap := map[string][]roachpb.Intent{}

	// updateOldestIntent atomically updates the oldest intent.
	updateOldestIntent := func(intentNanos int64) {
		mu.Lock()
		defer mu.Unlock()
		if intentNanos < oldestIntentNanos {
			oldestIntentNanos = intentNanos
		}
	}

	// processKeysAndValues is invoked with each key and its set of
	// values. Intents older than the intent age threshold are sent for
	// resolution; the values following the MVCC metadata (and the intent
	// value, if any) are sent for garbage collection.
	processKeysAndValues := func() {
		// If there's more than a single value for the key, possibly send for GC.
		if len(keys) > 1 {
			meta := &engine.MVCCMetadata{}
			if err := proto.Unmarshal(vals[0], meta); err != nil {
				log.Errorf("unable to unmarshal MVCC metadata for key %q: %s", keys[0], err)
			} else {
				// In the event that there's an active intent, send for
				// intent resolution if older than the threshold.
				startIdx := 1
				if meta.Txn != nil {
					// Keep track of intent to resolve if older than the intent
					// expiration threshold.
					if meta.Timestamp.Less(intentExp) {
						id := string(meta.Txn.ID)
						txnMap[id] = meta.Txn
						intentMap[id] = append(intentMap[id], roachpb.Intent{Key: expBaseKey})
					} else {
						updateOldestIntent(meta.Txn.OrigTimestamp.WallTime)
					}
					// With an active intent, GC ignores MVCC metadata & intent value.
					startIdx = 2
				}
				// See if any values may be GC'd.
				if gcTS := gc.Filter(keys[startIdx:], vals[startIdx:]); !gcTS.Equal(roachpb.ZeroTimestamp) {
					// TODO(spencer): need to split the requests up into
					// multiple requests in the event that more than X keys
					// are added to the request.
					gcArgs.Keys = append(gcArgs.Keys, roachpb.GCRequest_GCKey{Key: expBaseKey, Timestamp: gcTS})
				}
			}
		}
	}

	// Iterate through the keys and values of this replica's range.
	for ; iter.Valid(); iter.Next() {
		baseKey, ts, isValue, err := engine.MVCCDecodeKey(iter.Key())
		if err != nil {
			log.Errorf("unable to decode MVCC key: %q: %v", iter.Key(), err)
			continue
		}
		if !isValue {
			// Moving to the next key (& values).
			processKeysAndValues()
			expBaseKey = baseKey
			keys = []roachpb.EncodedKey{iter.Key()}
			vals = [][]byte{iter.Value()}
		} else {
			if !baseKey.Equal(expBaseKey) {
				log.Errorf("unexpectedly found a value for %q with ts=%s; expected key %q", baseKey, ts, expBaseKey)
				continue
			}
			keys = append(keys, iter.Key())
			vals = append(vals, iter.Value())
		}
	}
	if iter.Error() != nil {
		return iter.Error()
	}
	// Handle last collected set of keys/vals.
	processKeysAndValues()

	// Process push transactions in parallel.
	var wg sync.WaitGroup
	for _, txn := range txnMap {
		wg.Add(1)
		go gcq.pushTxn(repl, now, txn, updateOldestIntent, &wg)
	}
	wg.Wait()

	// Resolve all intents.
	var intents []roachpb.Intent
	for id, txn := range txnMap {
		if txn.Status != roachpb.PENDING {
			for _, intent := range intentMap[id] {
				intent.Txn = *txn
				intents = append(intents, intent)
			}
		}
	}

	done := true
	if len(intents) > 0 {
		done = false
		repl.resolveIntents(repl.context(), intents)
	}

	// Set start and end keys.
	if len(gcArgs.Keys) > 0 {
		done = false
		gcArgs.Key = gcArgs.Keys[0].Key
		gcArgs.EndKey = gcArgs.Keys[len(gcArgs.Keys)-1].Key.Next()
	}

	if done {
		return nil
	}

	// Send GC request through range.
	gcMeta.OldestIntentNanos = proto.Int64(oldestIntentNanos)
	gcArgs.GCMeta = *gcMeta

	var ba roachpb.BatchRequest
	ba.CmdID = ba.GetOrCreateCmdID(now.WallTime)
	// Technically not needed since we're talking directly to the Range.
	ba.RangeID = desc.RangeID
	ba.Add(gcArgs)
	if _, pErr := repl.Send(repl.context(), ba); pErr != nil {
		return pErr.GoError()
	}

	// Store current timestamp as last verification for this replica, as
	// we've just successfully scanned.
	if err := repl.SetLastVerificationTimestamp(now); err != nil {
		log.Errorf("failed to set last verification timestamp for replica %s: %s", repl, err)
	}
	return nil
}
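
// Illustrative sketch, not part of the original source: for each user key, the
// slices handed to processKeysAndValues above are laid out as [0] the MVCC
// metadata, then (if an intent is open) [1] the intent's provisional value,
// then the older versioned values. The hypothetical helper below mirrors the
// startIdx choice in that closure: only entries from startIdx on are offered
// to gc.Filter for collection.
func exampleGCStartIndex(hasOpenIntent bool) int {
	if hasOpenIntent {
		// Skip the metadata and the intent's provisional value.
		return 2
	}
	// Skip only the metadata; the versioned values start at index 1.
	return 1
}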