func (mvcc *MVCC) putInternal(key Key, timestamp hlc.Timestamp, value []byte, txnID string) error {
	keyMeta := &keyMetadata{}
	ok, err := GetI(mvcc.engine, key, keyMeta)
	if err != nil {
		return err
	}
	// In case the key metadata exists.
	if ok {
		// There is an uncommitted write intent and the current Put
		// operation does not come from the same transaction.
		// This should not happen since the range should check the existing
		// write intent before executing any Put action at the MVCC level.
		if len(keyMeta.TxnID) > 0 && (len(txnID) == 0 || keyMeta.TxnID != txnID) {
			return &writeIntentError{TxnID: keyMeta.TxnID}
		}

		if keyMeta.Timestamp.Less(timestamp) ||
			(timestamp.Equal(keyMeta.Timestamp) && txnID == keyMeta.TxnID) {
			// Update key metadata.
			if err := PutI(mvcc.engine, key, &keyMetadata{TxnID: txnID, Timestamp: timestamp}); err != nil {
				return err
			}
		} else {
			// In case we receive a Put request to update an old version,
			// it must be an error since raft should handle any client
			// retry from timeout.
			return &writeTimestampTooOldError{Timestamp: keyMeta.Timestamp}
		}
	} else {
		// In case the key metadata does not exist yet, create it.
		if err := PutI(mvcc.engine, key, &keyMetadata{TxnID: txnID, Timestamp: timestamp}); err != nil {
			return err
		}
	}

	// Save the value with the given version (Key + Timestamp).
	return mvcc.engine.Put(mvccEncodeKey(key, timestamp), value)
}
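// Usage sketch (hypothetical, not from the surrounding source; assumes Key is a
// byte-slice-like type): successive calls illustrate the three outcomes of
// putInternal above.
func exampleMVCCPut(mvcc *MVCC) {
	key := Key("a")
	// First write creates the key metadata and version 1.
	_ = mvcc.putInternal(key, hlc.Timestamp{WallTime: 1}, []byte("v1"), "")
	// A newer timestamp supersedes the metadata and adds version 2.
	_ = mvcc.putInternal(key, hlc.Timestamp{WallTime: 2}, []byte("v2"), "")
	// Writing below the latest version fails with writeTimestampTooOldError.
	_ = mvcc.putInternal(key, hlc.Timestamp{WallTime: 1}, []byte("stale"), "")
	// A transactional write leaves an intent; a write from a different
	// transaction then fails with writeIntentError.
	_ = mvcc.putInternal(key, hlc.Timestamp{WallTime: 3}, []byte("v3"), "txn1")
	_ = mvcc.putInternal(key, hlc.Timestamp{WallTime: 4}, []byte("v4"), "txn2")
}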
// getInternal implements the actual logic of the get function.
// The values of multiple versions for the given key should
// be organized as follows:
// ...
// keyA : keyMetadata of keyA
// keyA_Timestamp_n : value of version_n
// keyA_Timestamp_n-1 : value of version_n-1
// ...
// keyA_Timestamp_0 : value of version_0
// keyB : keyMetadata of keyB
// ...
func (mvcc *MVCC) getInternal(key Key, timestamp hlc.Timestamp, txnID string) ([]byte, hlc.Timestamp, string, error) {
	keyMetadata := &keyMetadata{}
	ok, err := GetI(mvcc.engine, key, keyMetadata)
	if err != nil || !ok {
		return nil, hlc.Timestamp{}, "", err
	}

	// If the read timestamp is not older than the latest write, we can just
	// fetch the value without a scan.
	if !timestamp.Less(keyMetadata.Timestamp) {
		if len(keyMetadata.TxnID) > 0 && (len(txnID) == 0 || keyMetadata.TxnID != txnID) {
			return nil, hlc.Timestamp{}, "", &writeIntentError{TxnID: keyMetadata.TxnID}
		}

		latestKey := mvccEncodeKey(key, keyMetadata.Timestamp)
		val, err := mvcc.engine.Get(latestKey)
		return val, keyMetadata.Timestamp, keyMetadata.TxnID, err
	}

	nextKey := mvccEncodeKey(key, timestamp)
	// We use PrefixEndKey(key) as the upper bound for the scan.
	// If there is no other version after nextKey, the scan won't return
	// the value of the next key.
	kvs, err := mvcc.engine.Scan(nextKey, PrefixEndKey(key), 1)
	if len(kvs) > 0 {
		_, ts := mvccDecodeKey(kvs[0].Key)
		return kvs[0].Value, ts, "", err
	}
	return nil, hlc.Timestamp{}, "", err
}
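// Illustrative sketch (hypothetical, not from the surrounding source): one way
// an encoding such as mvccEncodeKey could make newer versions sort immediately
// after the key metadata, matching the layout described above. Appending the
// bitwise complement of the big-endian wall time is an assumption for
// illustration only; the real encoding may differ. Uses only "bytes" and
// "encoding/binary" from the standard library.
func exampleVersionedKeyOrdering() bool {
	encode := func(key []byte, wallTime int64) []byte {
		var ts [8]byte
		// Invert the bits so that larger (newer) timestamps sort earlier.
		binary.BigEndian.PutUint64(ts[:], ^uint64(wallTime))
		return append(append([]byte(nil), key...), ts[:]...)
	}
	newer := encode([]byte("keyA"), 3)
	older := encode([]byte("keyA"), 2)
	// The newer version sorts before the older one, so a forward scan starting
	// at the read timestamp finds the most recent visible version first.
	return bytes.Compare(newer, older) < 0 // true
}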
// UpdateDeadlineMaybe sets the transaction's deadline to the lower of the
// current one (if any) and the passed value.
func (txn *Txn) UpdateDeadlineMaybe(deadline hlc.Timestamp) bool {
	if txn.deadline == nil || deadline.Less(*txn.deadline) {
		txn.deadline = &deadline
		return true
	}
	return false
}
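// Usage sketch (hypothetical, not from the surrounding source): repeated calls
// only ever lower the deadline, so the earliest requested deadline wins.
func exampleUpdateDeadline(txn *Txn) {
	_ = txn.UpdateDeadlineMaybe(hlc.Timestamp{WallTime: 200}) // no deadline yet: sets it, returns true
	_ = txn.UpdateDeadlineMaybe(hlc.Timestamp{WallTime: 100}) // lower: replaces it, returns true
	_ = txn.UpdateDeadlineMaybe(hlc.Timestamp{WallTime: 300}) // higher: ignored, returns false
}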
func TestBatchBuilderStress(t *testing.T) {
	defer leaktest.AfterTest(t)()
	stopper := stop.NewStopper()
	defer stopper.Stop()
	e := NewInMem(roachpb.Attributes{}, 1<<20, stopper)

	rng, _ := randutil.NewPseudoRand()

	for i := 0; i < 1000; i++ {
		count := 1 + rng.Intn(1000)
		func() {
			batch := e.NewBatch().(*rocksDBBatch)
			defer batch.Close()
			builder := &rocksDBBatchBuilder{}

			for j := 0; j < count; j++ {
				var ts hlc.Timestamp
				if rng.Float32() <= 0.9 {
					// Give 90% of keys timestamps.
					ts.WallTime = rng.Int63()
					if rng.Float32() <= 0.1 {
						// Give 10% of timestamps a non-zero logical component.
						ts.Logical = rng.Int31()
					}
				}
				key := MVCCKey{
					Key:       []byte(fmt.Sprintf("%d", rng.Intn(10000))),
					Timestamp: ts,
				}
				// Generate a random mixture of puts, deletes and merges.
				switch rng.Intn(3) {
				case 0:
					if err := dbPut(batch.batch, key, []byte("value")); err != nil {
						t.Fatal(err)
					}
					builder.Put(key, []byte("value"))
				case 1:
					if err := dbClear(batch.batch, key); err != nil {
						t.Fatal(err)
					}
					builder.Clear(key)
				case 2:
					if err := dbMerge(batch.batch, key, appender("bar")); err != nil {
						t.Fatal(err)
					}
					builder.Merge(key, appender("bar"))
				}
			}

			batchRepr := batch.Repr()
			builderRepr := builder.Finish()
			if !bytes.Equal(batchRepr, builderRepr) {
				t.Fatalf("expected [% x], but got [% x]", batchRepr, builderRepr)
			}
		}()
	}
}
// UpdateObservedTimestamp stores a timestamp off a node's clock for future
// operations in the transaction. When multiple calls are made for a single
// nodeID, the lowest timestamp prevails.
func (t *Transaction) UpdateObservedTimestamp(nodeID NodeID, maxTS hlc.Timestamp) {
	if t.ObservedTimestamps == nil {
		t.ObservedTimestamps = make(map[NodeID]hlc.Timestamp)
	}
	if ts, ok := t.ObservedTimestamps[nodeID]; !ok || maxTS.Less(ts) {
		t.ObservedTimestamps[nodeID] = maxTS
	}
}
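// Usage sketch (hypothetical, not from the surrounding source): for a given
// node, only the lowest observed timestamp survives across calls.
func exampleObservedTimestamps(txn *Transaction) {
	txn.UpdateObservedTimestamp(NodeID(1), hlc.Timestamp{WallTime: 50}) // first observation: stored
	txn.UpdateObservedTimestamp(NodeID(1), hlc.Timestamp{WallTime: 40}) // lower: replaces the entry
	txn.UpdateObservedTimestamp(NodeID(1), hlc.Timestamp{WallTime: 60}) // higher: ignored
	// txn.ObservedTimestamps[NodeID(1)] is now {WallTime: 40}.
}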
// isAsOf analyzes a select statement to bypass the logic in newPlan(),
// since that requires the transaction to be started already. If the returned
// timestamp is not nil, it is the timestamp to which a transaction should
// be set.
func isAsOf(planMaker *planner, stmt parser.Statement, max hlc.Timestamp) (*hlc.Timestamp, error) {
	s, ok := stmt.(*parser.Select)
	if !ok {
		return nil, nil
	}
	sc, ok := s.Select.(*parser.SelectClause)
	if !ok {
		return nil, nil
	}
	if len(sc.From) != 1 {
		return nil, nil
	}
	ate, ok := sc.From[0].(*parser.AliasedTableExpr)
	if !ok {
		return nil, nil
	}
	if ate.AsOf.Expr == nil {
		return nil, nil
	}

	te, err := ate.AsOf.Expr.TypeCheck(nil, parser.TypeString)
	if err != nil {
		return nil, err
	}
	d, err := te.Eval(&planMaker.evalCtx)
	if err != nil {
		return nil, err
	}
	ds, ok := d.(*parser.DString)
	if !ok {
		return nil, fmt.Errorf("AS OF SYSTEM TIME expected string, got %s", d.Type())
	}
	// Allow nanosecond precision because the timestamp is only used by the
	// system and won't be returned to the user over pgwire.
	dt, err := parser.ParseDTimestamp(string(*ds), planMaker.session.Location, time.Nanosecond)
	if err != nil {
		return nil, err
	}

	ts := hlc.Timestamp{
		WallTime: dt.Time.UnixNano(),
	}
	if max.Less(ts) {
		return nil, fmt.Errorf("cannot specify timestamp in the future")
	}
	return &ts, nil
}
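// Illustrative sketch (hypothetical, not from the surrounding source): the core
// timestamp conversion and future-check performed by isAsOf, shown with the
// standard library's time.Parse in place of parser.ParseDTimestamp. The layout
// string is an assumption for illustration only.
func exampleAsOfTimestamp(s string, max hlc.Timestamp) (*hlc.Timestamp, error) {
	t, err := time.Parse("2006-01-02 15:04:05.999999999", s)
	if err != nil {
		return nil, err
	}
	ts := hlc.Timestamp{WallTime: t.UnixNano()}
	if max.Less(ts) {
		return nil, fmt.Errorf("cannot specify timestamp in the future")
	}
	return &ts, nil
}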
func replicaGCShouldQueueImpl(
	now, lastCheck, lastActivity hlc.Timestamp, isCandidate bool,
) (bool, float64) {
	timeout := ReplicaGCQueueInactivityThreshold
	var priority float64

	if isCandidate {
		// If the range is a candidate (which happens if its former replica set
		// ignores it), let it expire much earlier.
		timeout = ReplicaGCQueueCandidateTimeout
		priority++
	} else if now.Less(lastCheck.Add(ReplicaGCQueueInactivityThreshold.Nanoseconds(), 0)) {
		// Return false immediately if the previous check was less than the
		// check interval in the past. Note that we don't do this if the
		// replica is in candidate state, in which case we want to be more
		// aggressive - a failed rebalance attempt could have checked this
		// range, and candidate state suggests that a retry succeeded. See
		// #7489.
		return false, 0
	}

	shouldQ := lastActivity.Add(timeout.Nanoseconds(), 0).Less(now)
	if !shouldQ {
		return false, 0
	}
	return shouldQ, priority
}
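// Usage sketch (hypothetical, not from the surrounding source), assuming, as in
// practice, that ReplicaGCQueueCandidateTimeout is far shorter than
// ReplicaGCQueueInactivityThreshold: with the same idle time, a candidate
// replica queues with boosted priority while a non-candidate replica does not.
func exampleReplicaGCShouldQueue(now hlc.Timestamp) {
	lastCheck := now.Add(-2*ReplicaGCQueueInactivityThreshold.Nanoseconds(), 0)
	lastActivity := now.Add(-2*ReplicaGCQueueCandidateTimeout.Nanoseconds(), 0)

	// Candidate: idle longer than the candidate timeout, so it queues with
	// priority 1.
	shouldQ, prio := replicaGCShouldQueueImpl(now, lastCheck, lastActivity, true)
	_, _ = shouldQ, prio // true, 1

	// Non-candidate: the same idle time is well within the inactivity
	// threshold, so it is not queued.
	shouldQ, _ = replicaGCShouldQueueImpl(now, lastCheck, lastActivity, false)
	_ = shouldQ // false
}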
// selectEventTimestamp selects a timestamp for this log message. If the
// transaction this event is being written in has a non-zero timestamp, then that
// timestamp should be used; otherwise, the store's physical clock is used.
// This helps with testing; in normal usage, the logging of an event will never
// be the first action in the transaction, and thus the transaction will have an
// assigned database timestamp. However, in the case of our tests log events
// *are* the first action in a transaction, and we must elect to use the store's
// physical time instead.
func (ev EventLogger) selectEventTimestamp(input hlc.Timestamp) time.Time {
	if input == hlc.ZeroTimestamp {
		return ev.LeaseManager.clock.PhysicalTime()
	}
	return input.GoTime()
}
// selectEventTimestamp selects a timestamp for this log message. If the
// transaction this event is being written in has a non-zero timestamp, then that
// timestamp should be used; otherwise, the store's physical clock is used.
// This helps with testing; in normal usage, the logging of an event will never
// be the first action in the transaction, and thus the transaction will have an
// assigned database timestamp. However, in the case of our tests log events
// *are* the first action in a transaction, and we must elect to use the store's
// physical time instead.
func selectEventTimestamp(s *Store, input hlc.Timestamp) time.Time {
	if input == hlc.ZeroTimestamp {
		return s.Clock().PhysicalTime()
	}
	return input.GoTime()
}
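// Usage sketch (hypothetical, not from the surrounding source): a zero input
// timestamp falls back to the store's physical clock, while a non-zero input is
// converted directly to wall-clock time via GoTime().
func exampleEventTimestamps(s *Store) {
	t1 := selectEventTimestamp(s, hlc.ZeroTimestamp)            // physical clock time
	t2 := selectEventTimestamp(s, hlc.Timestamp{WallTime: 1e9}) // one second after the Unix epoch
	_, _ = t1, t2
}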
// InitOrJoinRequest executes a RequestLease command asynchronously and returns a
// channel on which the result will be posted. If there's already a request in
// progress, we join in waiting for the results of that request.
// It is an error to call InitOrJoinRequest() while a request is in progress
// naming another replica as lease holder.
//
// replica is used to schedule and execute async work (proposing a RequestLease
// command). replica.mu is locked when delivering results, so calls from the
// replica happen either before or after a result for a pending request has
// been delivered.
//
// transfer needs to be set if the request represents a lease transfer (as
// opposed to an extension, or acquiring the lease when none is held).
//
// Note: Once this function gets a context to be used for cancellation, instead
// of replica.store.Stopper().ShouldQuiesce(), care will be needed for cancelling
// the Raft command, similar to replica.addWriteCmd.
func (p *pendingLeaseRequest) InitOrJoinRequest(
	replica *Replica,
	nextLeaseHolder roachpb.ReplicaDescriptor,
	timestamp hlc.Timestamp,
	startKey roachpb.Key,
	transfer bool,
) <-chan *roachpb.Error {
	if nextLease := p.RequestPending(); nextLease != nil {
		if nextLease.Replica.ReplicaID == nextLeaseHolder.ReplicaID {
			// Join a pending request asking for the same replica to become lease
			// holder.
			return p.JoinRequest()
		}
		llChan := make(chan *roachpb.Error, 1)
		// We can't join the request in progress.
		llChan <- roachpb.NewErrorf("request for different replica in progress "+
			"(requesting: %+v, in progress: %+v)",
			nextLeaseHolder.ReplicaID, nextLease.Replica.ReplicaID)
		return llChan
	}
	llChan := make(chan *roachpb.Error, 1)
	// No request in progress. Let's propose a Lease command asynchronously.
	// TODO(tschottdorf): get duration from configuration, either as a
	// config flag or, later, dynamically adjusted.
	startStasis := timestamp.Add(int64(replica.store.ctx.rangeLeaseActiveDuration), 0)
	expiration := startStasis.Add(int64(replica.store.Clock().MaxOffset()), 0)
	reqSpan := roachpb.Span{
		Key: startKey,
	}
	var leaseReq roachpb.Request
	reqLease := roachpb.Lease{
		Start:       timestamp,
		StartStasis: startStasis,
		Expiration:  expiration,
		Replica:     nextLeaseHolder,
	}
	if transfer {
		leaseReq = &roachpb.TransferLeaseRequest{
			Span:  reqSpan,
			Lease: reqLease,
		}
	} else {
		leaseReq = &roachpb.RequestLeaseRequest{
			Span:  reqSpan,
			Lease: reqLease,
		}
	}
	if replica.store.Stopper().RunAsyncTask(func() {
		// Propose a RequestLease command and wait for it to apply.
		var execPErr *roachpb.Error
		ba := roachpb.BatchRequest{}
		ba.Timestamp = replica.store.Clock().Now()
		ba.RangeID = replica.RangeID
		ba.Add(leaseReq)
		// Send the lease request directly to raft in order to skip unnecessary
		// checks from the normal request machinery (e.g. the command queue).
		// Note that the command itself isn't traced, but usually the caller
		// waiting for the result has an active Trace.
		ch, _, err := replica.proposeRaftCommand(
			replica.context(context.Background()), ba)
		if err != nil {
			execPErr = roachpb.NewError(err)
		} else {
			// If the command was committed, wait for the range to apply it.
			select {
			case c := <-ch:
				if c.Err != nil {
					if log.V(1) {
						log.Infof("failed to acquire lease for replica %s: %s",
							replica.store, c.Err)
					}
					execPErr = c.Err
				}
			case <-replica.store.Stopper().ShouldQuiesce():
				execPErr = roachpb.NewError(
					replica.newNotLeaseHolderError(nil, replica.store.StoreID(), replica.Desc()))
			}
		}

		// Send the result of the lease request to all waiter channels.
		replica.mu.Lock()
		defer replica.mu.Unlock()
		for i, llChan := range p.llChans {
			// Don't send the same pErr object twice; this can lead to races. We could
			// clone every time but it's more efficient to send pErr itself to one of
			// the channels (the last one; if we send it earlier the race can still
			// happen).
			if i == len(p.llChans)-1 {
				llChan <- execPErr
			} else {
				llChan <- protoutil.Clone(execPErr).(*roachpb.Error) // works with `nil`
			}
		}
		p.llChans = p.llChans[:0]
		p.nextLease = roachpb.Lease{}
	}) != nil {
		// We failed to start the asynchronous task. Send a blank NotLeaseHolderError
		// back to indicate that we have no idea who the range lease holder might
		// be; we've withdrawn from active duty.
		llChan <- roachpb.NewError(
			replica.newNotLeaseHolderError(nil, replica.store.StoreID(), replica.mu.state.Desc))
		return llChan
	}
	p.llChans = append(p.llChans, llChan)
	p.nextLease = reqLease
	return llChan
}
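// Usage sketch (hypothetical, not from the surrounding source): callers block
// on the returned channel for the outcome of the lease proposal; a concurrent
// call naming the same next lease holder joins the pending request instead of
// proposing a second one.
func exampleRequestLease(
	p *pendingLeaseRequest, r *Replica, next roachpb.ReplicaDescriptor, startKey roachpb.Key,
) *roachpb.Error {
	llChan := p.InitOrJoinRequest(r, next, r.store.Clock().Now(), startKey, false /* transfer */)
	// A nil result means the lease was acquired (or extended); otherwise the
	// error explains why the proposal failed.
	return <-llChan
}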
// Covers returns true if the given timestamp can be served by the Lease.
// This is the case if the timestamp precedes the Lease's stasis period.
// Note that the fact that a lease covers a timestamp is not enough for the
// holder of the lease to be able to serve a read with that timestamp;
// pendingLeaseRequest.TransferInProgress() should also be consulted to
// account for possible lease transfers.
func (l Lease) Covers(timestamp hlc.Timestamp) bool {
	return timestamp.Less(l.StartStasis)
}
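// Usage sketch (hypothetical, not from the surrounding source): timestamps
// strictly before StartStasis are covered; the stasis point itself and anything
// later are not.
func exampleLeaseCovers() {
	l := Lease{
		Start:       hlc.Timestamp{WallTime: 100},
		StartStasis: hlc.Timestamp{WallTime: 200},
		Expiration:  hlc.Timestamp{WallTime: 300},
	}
	_ = l.Covers(hlc.Timestamp{WallTime: 150}) // true
	_ = l.Covers(hlc.Timestamp{WallTime: 200}) // false: not less than StartStasis
	_ = l.Covers(hlc.Timestamp{WallTime: 250}) // false: inside the stasis period
}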
// add the specified timestamp to the cache as covering the range of
// keys from start to end. If end is nil, the range covers the start
// key only. txnID is nil for no transaction. readTSCache specifies
// whether the command adding this timestamp should update the read
// timestamp cache; false to update the write timestamp cache.
func (tc *timestampCache) add(
	start, end roachpb.Key, timestamp hlc.Timestamp, txnID *uuid.UUID, readTSCache bool,
) {
	// This gives us a memory-efficient end key if end is empty.
	if len(end) == 0 {
		end = start.Next()
		start = end[:len(start)]
	}
	tc.latest.Forward(timestamp)
	// Only add to the cache if the timestamp is more recent than the
	// low water mark.
	if tc.lowWater.Less(timestamp) {
		tcache := tc.wCache
		if readTSCache {
			tcache = tc.rCache
		}

		addRange := func(r interval.Range) {
			value := cacheValue{timestamp: timestamp, txnID: txnID}
			key := tcache.MakeKey(r.Start, r.End)
			entry := makeCacheEntry(key, value)
			tcache.AddEntry(entry)
		}
		r := interval.Range{
			Start: interval.Comparable(start),
			End:   interval.Comparable(end),
		}

		// Check existing, overlapping entries and truncate/split/remove if
		// superseded and in the past. If existing entries are in the future,
		// subtract from the range/ranges that need to be added to the cache.
		for _, entry := range tcache.GetOverlaps(r.Start, r.End) {
			cv := entry.Value.(*cacheValue)
			key := entry.Key.(*cache.IntervalKey)
			sCmp := r.Start.Compare(key.Start)
			eCmp := r.End.Compare(key.End)
			if !timestamp.Less(cv.timestamp) {
				// The existing interval has a timestamp less than or equal to the new
				// interval. Compare interval ranges to determine how to modify the
				// existing interval.
				switch {
				case sCmp == 0 && eCmp == 0:
					// New and old are equal; replace old with new and avoid the need to
					// insert new.
					//
					// New: ------------
					// Old: ------------
					//
					// New: ------------
					*cv = cacheValue{timestamp: timestamp, txnID: txnID}
					tcache.MoveToEnd(entry)
					return
				case sCmp <= 0 && eCmp >= 0:
					// New contains or is equal to old; delete old.
					//
					// New: ------------      ------------      ------------
					// Old:   --------    or    ----------  or  ----------
					//
					// Old:
					tcache.DelEntry(entry)
				case sCmp > 0 && eCmp < 0:
					// Old contains new; split up old into two.
					//
					// New:     ----
					// Old: ------------
					//
					// Old: ----    ----
					oldEnd := key.End
					key.End = r.Start

					key := tcache.MakeKey(r.End, oldEnd)
					newEntry := makeCacheEntry(key, *cv)
					tcache.AddEntryAfter(newEntry, entry)
				case eCmp >= 0:
					// Left partial overlap; truncate old end.
					//
					// New:     --------          --------
					// Old: --------      or  ------------
					//
					// Old: ----              ----
					key.End = r.Start
				case sCmp <= 0:
					// Right partial overlap; truncate old start.
					//
					// New: --------          --------
					// Old:     --------  or  ------------
					//
					// Old:         ----          ----
					key.Start = r.End
				default:
					panic(fmt.Sprintf("no overlap between %v and %v", key.Range, r))
				}
			} else {
				// The existing interval has a timestamp greater than the new interval.
				// Compare interval ranges to determine how to modify the new interval
				// before adding it to the timestamp cache.
				switch {
				case sCmp >= 0 && eCmp <= 0:
					// Old contains or is equal to new; no need to add.
					//
					// Old: -----------      -----------      -----------      -----------
					// New:    -----     or  -----------  or  --------     or     --------
					//
					// New:
					return
				case sCmp < 0 && eCmp > 0:
					// New contains old; split up new into two. We can add the left piece
					// immediately because it is guaranteed to be before the rest of the
					// overlaps.
					//
					// Old:    ------
					// New: ------------
					//
					// New: ---      ---
					lr := interval.Range{Start: r.Start, End: key.Start}
					addRange(lr)

					r.Start = key.End
				case eCmp > 0:
					// Left partial overlap; truncate new start.
					//
					// Old: --------          --------
					// New:     --------  or  ------------
					//
					// New:         ----          ----
					r.Start = key.End
				case sCmp < 0:
					// Right partial overlap; truncate new end.
					//
					// Old:     --------          --------
					// New: --------      or  ------------
					//
					// New: ----              ----
					r.End = key.Start
				default:
					panic(fmt.Sprintf("no overlap between %v and %v", key.Range, r))
				}
			}
		}
		addRange(r)
	}
}
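// Illustrative sketch (hypothetical, not from the surrounding source): the same
// overlap arithmetic on plain integer intervals, showing how a single
// overlapping entry is resolved. When the new interval's timestamp is at least
// as recent, the old entry is deleted, split, or truncated; otherwise the new
// interval is trimmed (or dropped) so the newer existing entry keeps its span.
// The equal-range fast path of the real code is folded into the general cases
// here for brevity.
type tsSpan struct{ start, end, ts int }

func resolveOverlap(newSpan, old tsSpan) []tsSpan {
	sCmp := newSpan.start - old.start
	eCmp := newSpan.end - old.end
	if old.ts <= newSpan.ts {
		// The new write supersedes the old entry over the overlapping span.
		switch {
		case sCmp <= 0 && eCmp >= 0: // new contains old: drop old entirely
			return []tsSpan{newSpan}
		case sCmp > 0 && eCmp < 0: // old contains new: split old around new
			return []tsSpan{{old.start, newSpan.start, old.ts}, newSpan, {newSpan.end, old.end, old.ts}}
		case eCmp >= 0: // new overlaps old's right side: truncate old's end
			return []tsSpan{{old.start, newSpan.start, old.ts}, newSpan}
		default: // new overlaps old's left side: truncate old's start
			return []tsSpan{newSpan, {newSpan.end, old.end, old.ts}}
		}
	}
	// The old entry is newer and must be preserved intact.
	switch {
	case sCmp >= 0 && eCmp <= 0: // old contains new: nothing new to add
		return []tsSpan{old}
	case sCmp < 0 && eCmp > 0: // new contains old: keep the pieces of new on both sides
		return []tsSpan{{newSpan.start, old.start, newSpan.ts}, old, {old.end, newSpan.end, newSpan.ts}}
	case eCmp > 0: // new extends past old's end: keep only the right remainder of new
		return []tsSpan{old, {old.end, newSpan.end, newSpan.ts}}
	default: // new extends past old's start: keep only the left remainder of new
		return []tsSpan{{newSpan.start, old.start, newSpan.ts}, old}
	}
}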
// TestTxnCoordSenderHeartbeat verifies periodic heartbeat of the
// transaction record.
func TestTxnCoordSenderHeartbeat(t *testing.T) {
	defer leaktest.AfterTest(t)()
	s, sender := createTestDB(t)
	defer s.Stop()
	defer teardownHeartbeats(sender)

	// Set heartbeat interval to 1ms for testing.
	sender.heartbeatInterval = 1 * time.Millisecond

	initialTxn := client.NewTxn(context.Background(), *s.DB)
	if err := initialTxn.Put(roachpb.Key("a"), []byte("value")); err != nil {
		t.Fatal(err)
	}

	// Verify 3 heartbeats.
	var heartbeatTS hlc.Timestamp
	for i := 0; i < 3; i++ {
		util.SucceedsSoon(t, func() error {
			txn, pErr := getTxn(sender, &initialTxn.Proto)
			if pErr != nil {
				t.Fatal(pErr)
			}
			// Advance clock by 1ns.
			// Locking the TxnCoordSender to prevent a data race.
			sender.Lock()
			s.Manual.Increment(1)
			sender.Unlock()
			if txn.LastHeartbeat != nil && heartbeatTS.Less(*txn.LastHeartbeat) {
				heartbeatTS = *txn.LastHeartbeat
				return nil
			}
			return errors.Errorf("expected heartbeat")
		})
	}

	// Sneakily send an ABORT right to DistSender (bypassing TxnCoordSender).
	{
		var ba roachpb.BatchRequest
		ba.Add(&roachpb.EndTransactionRequest{
			Commit: false,
			Span:   roachpb.Span{Key: initialTxn.Proto.Key},
		})
		ba.Txn = &initialTxn.Proto
		if _, pErr := sender.wrapped.Send(context.Background(), ba); pErr != nil {
			t.Fatal(pErr)
		}
	}

	util.SucceedsSoon(t, func() error {
		sender.Lock()
		defer sender.Unlock()
		if txnMeta, ok := sender.txns[*initialTxn.Proto.ID]; !ok {
			t.Fatal("transaction unregistered prematurely")
		} else if txnMeta.txn.Status != roachpb.ABORTED {
			return fmt.Errorf("transaction is not aborted")
		}
		return nil
	})

	// Trying to do something else should give us a TransactionAbortedError.
	_, err := initialTxn.Get("a")
	assertTransactionAbortedError(t, err)
}