// InternalHeartbeatTxn updates the transaction status and heartbeat // timestamp after receiving transaction heartbeat messages from // coordinator. Returns the udpated transaction. func (r *Range) InternalHeartbeatTxn(args *proto.InternalHeartbeatTxnRequest, reply *proto.InternalHeartbeatTxnResponse) { // Create the actual key to the system-local transaction table. key := engine.MakeKey(engine.KeyLocalTransactionPrefix, args.Key) var txn proto.Transaction ok, err := engine.GetProto(r.engine, key, &txn) if err != nil { reply.SetGoError(err) return } // If no existing transaction record was found, initialize // to the transaction in the request header. if !ok { gogoproto.Merge(&txn, args.Txn) } if txn.Status == proto.PENDING { if txn.LastHeartbeat == nil { txn.LastHeartbeat = &proto.Timestamp{} } if txn.LastHeartbeat.Less(args.Header().Timestamp) { *txn.LastHeartbeat = args.Header().Timestamp } if err := engine.PutProto(r.engine, key, &txn); err != nil { reply.SetGoError(err) return } } reply.Txn = &txn }
// CreateRange allocates a new range ID and stores range metadata. // On success, returns the new range. func (s *Store) CreateRange(startKey, endKey engine.Key, replicas []proto.Replica) (*Range, error) { rangeID, err := engine.Increment(s.engine, engine.KeyLocalRangeIDGenerator, 1) if err != nil { return nil, err } if ok, _ := engine.GetProto(s.engine, makeRangeKey(rangeID), nil); ok { return nil, util.Error("newly allocated range ID already in use") } // RangeMetadata is stored local to this store only. It is neither // replicated via raft nor available via the global kv store. meta := &proto.RangeMetadata{ ClusterID: s.Ident.ClusterID, RangeID: rangeID, RangeDescriptor: proto.RangeDescriptor{ StartKey: startKey, EndKey: endKey, Replicas: replicas, }, } err = engine.PutProto(s.engine, makeRangeKey(rangeID), meta) if err != nil { return nil, err } rng := NewRange(meta, s.clock, s.engine, s.allocator, s.gossip, s) rng.Start() s.mu.Lock() defer s.mu.Unlock() s.ranges[rangeID] = rng return rng, nil }
// Init starts the engine, sets the GC and reads the StoreIdent. func (s *Store) Init() error { // Start engine and set garbage collector. if err := s.engine.Start(); err != nil { return err } s.engine.SetGCTimeouts(func() (minTxnTS, minRCacheTS int64) { now := s.clock.Now() minTxnTS = 0 // disable GC of transactions until we know minimum write intent age minRCacheTS = now.WallTime - GCResponseCacheExpiration.Nanoseconds() return }) // Read store ident and return a not-bootstrapped error if necessary. ok, err := engine.GetProto(s.engine, engine.KeyLocalIdent, &s.Ident) if err != nil { return err } else if !ok { return &NotBootstrappedError{} } // TODO(spencer): scan through all range metadata and instantiate // ranges. Right now we just get range ID hardcoded as 1. var meta proto.RangeMetadata ok, err = engine.GetProto(s.engine, makeRangeKey(1), &meta) if err != nil || !ok { return err } rng := NewRange(&meta, s.clock, s.engine, s.allocator, s.gossip, s) rng.Start() s.mu.Lock() defer s.mu.Unlock() s.ranges[meta.RangeID] = rng return nil }
// InternalHeartbeatTxn updates the transaction status and heartbeat // timestamp after receiving transaction heartbeat messages from // coordinator. The range will return the current status for this // transaction to the coordinator. func (r *Range) InternalHeartbeatTxn(args *proto.InternalHeartbeatTxnRequest, reply *proto.InternalHeartbeatTxnResponse) { // Create the actual key to the system-local transaction table. key := engine.MakeKey(engine.KeyLocalTransactionPrefix, args.Key) var txn proto.Transaction if _, err := engine.GetProto(r.engine, key, &txn); err != nil { reply.SetGoError(err) return } if txn.Status == proto.PENDING { if !args.Header().Timestamp.Less(txn.LastHeartbeat) { txn.LastHeartbeat = args.Header().Timestamp } if err := engine.PutProto(r.engine, key, &txn); err != nil { reply.SetGoError(err) return } } reply.Status = txn.Status }
// GetResponse looks up a response matching the specified cmdID and // returns true if found. The response is deserialized into the // supplied reply parameter. If no response is found, returns // false. If a command is pending already for the cmdID, then this // method will block until the the command is completed or the // response cache is cleared. func (rc *ResponseCache) GetResponse(cmdID proto.ClientCmdID, reply interface{}) (bool, error) { // Do nothing if command ID is empty. if cmdID.IsEmpty() { return false, nil } // If the command is inflight, wait for it to complete. rc.Lock() for { if cond, ok := rc.inflight[makeCmdIDKey(cmdID)]; ok { cond.Wait() } else { break } } // Adding inflight here is preemptive; we don't want to hold lock // while fetching from the on-disk cache. The vast, vast majority of // calls to GetResponse will be cache misses, so this saves us // from acquiring the lock twice: once here and once below in the // event we experience a cache miss. rc.addInflightLocked(cmdID) rc.Unlock() // If the response is in the cache or we experienced an error, return. rwResp := proto.ReadWriteCmdResponse{} if ok, err := engine.GetProto(rc.engine, rc.makeKey(cmdID), &rwResp); ok || err != nil { rc.Lock() // Take lock after fetching response from cache. defer rc.Unlock() rc.removeInflightLocked(cmdID) if err == nil && rwResp.GetValue() != nil { gogoproto.Merge(reply.(gogoproto.Message), rwResp.GetValue().(gogoproto.Message)) } return ok, err } // There's no command result cached for this ID; but inflight was added above. return false, nil }
// Init starts the engine, sets the GC and reads the StoreIdent. func (s *Store) Init() error { // Close store for idempotency. s.Close() // Start engine and set garbage collector. if err := s.engine.Start(); err != nil { return err } // Create ID allocators. s.raftIDAlloc = NewIDAllocator(engine.KeyRaftIDGenerator, s.db, 2, raftIDAllocCount) s.rangeIDAlloc = NewIDAllocator(engine.KeyRangeIDGenerator, s.db, 2, rangeIDAllocCount) // GCTimeouts method is called each time an engine compaction is // underway. It sets minimum timeouts for transaction records and // response cache entries. s.engine.SetGCTimeouts(func() (minTxnTS, minRCacheTS int64) { now := s.clock.Now() minTxnTS = 0 // disable GC of transactions until we know minimum write intent age minRCacheTS = now.WallTime - GCResponseCacheExpiration.Nanoseconds() return }) // Read store ident and return a not-bootstrapped error if necessary. ok, err := engine.GetProto(s.engine, engine.KeyLocalIdent, &s.Ident) if err != nil { return err } else if !ok { return &NotBootstrappedError{} } s.mu.Lock() defer s.mu.Unlock() start := engine.KeyLocalRangeMetadataPrefix end := engine.PrefixEndKey(start) const rows = 64 for { kvs, err := s.engine.Scan(start, end, rows) if err != nil { return err } for _, kv := range kvs { var meta proto.RangeMetadata if err := gogoproto.Unmarshal(kv.Value, &meta); err != nil { return err } rng := NewRange(&meta, s.clock, s.engine, s.allocator, s.gossip, s) rng.Start() s.ranges[meta.RangeID] = rng s.rangesByKey = append(s.rangesByKey, rng) } if len(kvs) < rows { break } start = engine.NextKey(kvs[rows-1].Key) } // Ensure that ranges are sorted. sort.Sort(s.rangesByKey) return nil }
// EndTransaction either commits or aborts (rolls back) an extant // transaction according to the args.Commit parameter. func (r *Range) EndTransaction(args *proto.EndTransactionRequest, reply *proto.EndTransactionResponse) { // Create the actual key to the system-local transaction table. key := engine.MakeKey(engine.KeyLocalTransactionPrefix, args.Key) // Start with supplied transaction, then possibly load from txn record. reply.Txn = gogoproto.Clone(args.Txn).(*proto.Transaction) // Fetch existing transaction if possible. existTxn := &proto.Transaction{} ok, err := engine.GetProto(r.engine, key, existTxn) if err != nil { reply.SetGoError(err) return } // If the transaction record already exists, verify that we can either // commit it or abort it (according to args.Commit), and also that the // Timestamp and Epoch have not suffered regression. if ok { if existTxn.Status == proto.COMMITTED { reply.SetGoError(proto.NewTransactionStatusError(existTxn, "already committed")) return } else if existTxn.Status == proto.ABORTED { reply.SetGoError(proto.NewTransactionStatusError(existTxn, "already aborted")) return } else if args.Txn.Epoch < existTxn.Epoch { reply.SetGoError(proto.NewTransactionStatusError(existTxn, fmt.Sprintf("epoch regression: %d", args.Txn.Epoch))) return } else if existTxn.Timestamp.Less(args.Txn.Timestamp) { // The transaction record can only ever be pushed forward, so it's an // error if somehow the transaction record has an earlier timestamp // than the transaction timestamp. reply.SetGoError(proto.NewTransactionStatusError(existTxn, fmt.Sprintf("timestamp regression: %+v", args.Txn.Timestamp))) return } // Use the persisted transaction record as final transaction. gogoproto.Merge(reply.Txn, existTxn) } // Take max of requested timestamp and possibly "pushed" txn // record timestamp as the final commit timestamp. if reply.Txn.Timestamp.Less(args.Timestamp) { reply.Txn.Timestamp = args.Timestamp } // Set transaction status to COMMITTED or ABORTED as per the // args.Commit parameter. if args.Commit { // If the isolation level is SERIALIZABLE, return a transaction // retry error if the commit timestamp isn't equal to the txn // timestamp. if args.Txn.Isolation == proto.SERIALIZABLE && !reply.Txn.Timestamp.Equal(args.Txn.Timestamp) { reply.SetGoError(proto.NewTransactionRetryError(reply.Txn)) return } reply.Txn.Status = proto.COMMITTED } else { reply.Txn.Status = proto.ABORTED } // Persist the transaction record with updated status (& possibly timestmap). if err := engine.PutProto(r.engine, key, reply.Txn); err != nil { reply.SetGoError(err) return } }
// InternalPushTxn resolves conflicts between concurrent txns (or // between a non-transactional reader or writer and a txn) in several // ways depending on the statuses and priorities of the conflicting // transactions. The InternalPushTxn operation is invoked by a // "pusher" (the writer trying to abort a conflicting txn or the // reader trying to push a conflicting txn's commit timestamp // forward), who attempts to resolve a conflict with a "pushee" // (args.PushTxn -- the pushee txn whose intent(s) caused the // conflict). // // Txn already committed/aborted: If pushee txn is committed or // aborted return success. // // Txn Timeout: If pushee txn entry isn't present or its LastHeartbeat // timestamp isn't set, use PushTxn.Timestamp as LastHeartbeat. If // current time - LastHeartbeat > 2 * DefaultHeartbeatInterval, then // the pushee txn should be either pushed forward or aborted, // depending on value of Request.Abort. // // Old Txn Epoch: If persisted pushee txn entry has a newer Epoch than // PushTxn.Epoch, return success, as older epoch may be removed. // // Lower Txn Priority: If pushee txn has a lower priority than pusher, // adjust pushee's persisted txn depending on value of args.Abort. If // args.Abort is true, set txn.Status to ABORTED, and priority to one // less than the pusher's priority and return success. If args.Abort // is false, set txn.Timestamp to pusher's txn.Timestamp + 1. // // Higher Txn Priority: If pushee txn has a higher priority than // pusher, return TransactionRetryError. Transaction will be retried // with priority one less than the pushee's higher priority. func (r *Range) InternalPushTxn(args *proto.InternalPushTxnRequest, reply *proto.InternalPushTxnResponse) { if !bytes.Equal(args.Key, args.PusheeTxn.ID) { reply.SetGoError(util.Errorf("request key %q should match pushee's txn ID %q", args.Key, args.PusheeTxn.ID)) return } // Create the actual key to the system-local transaction table. key := engine.MakeKey(engine.KeyLocalTransactionPrefix, args.Key) // Fetch existing transaction if possible. existTxn := &proto.Transaction{} ok, err := engine.GetProto(r.engine, key, existTxn) if err != nil { reply.SetGoError(err) return } if ok { // Start with the persisted transaction record as final transaction. reply.PusheeTxn = gogoproto.Clone(existTxn).(*proto.Transaction) // Upgrade the epoch and timestamp as necessary. if reply.PusheeTxn.Epoch < args.PusheeTxn.Epoch { reply.PusheeTxn.Epoch = args.PusheeTxn.Epoch } if reply.PusheeTxn.Timestamp.Less(args.PusheeTxn.Timestamp) { reply.PusheeTxn.Timestamp = args.PusheeTxn.Timestamp } } else { // Some sanity checks for case where we don't find a transaction record. if args.PusheeTxn.LastHeartbeat != nil { reply.SetGoError(proto.NewTransactionStatusError(&args.PusheeTxn, "no txn persisted, yet intent has heartbeat")) return } else if args.PusheeTxn.Status != proto.PENDING { reply.SetGoError(proto.NewTransactionStatusError(&args.PusheeTxn, fmt.Sprintf("no txn persisted, yet intent has status %s", args.PusheeTxn.Status))) return } // The transaction doesn't exist yet on disk; use the supplied version. reply.PusheeTxn = gogoproto.Clone(&args.PusheeTxn).(*proto.Transaction) } // If already committed or aborted, return success. if reply.PusheeTxn.Status != proto.PENDING { // Trivial noop. return } // If we're trying to move the timestamp forward, and it's already // far enough forward, return success. if !args.Abort && args.Timestamp.Less(reply.PusheeTxn.Timestamp) { // Trivial noop. return } // pusherWins bool is true in the event the pusher prevails. var pusherWins bool // Check for txn timeout. if reply.PusheeTxn.LastHeartbeat == nil { reply.PusheeTxn.LastHeartbeat = &reply.PusheeTxn.Timestamp } // Compute heartbeat expiration. expiry := r.clock.Now() expiry.WallTime -= 2 * DefaultHeartbeatInterval.Nanoseconds() if reply.PusheeTxn.LastHeartbeat.Less(expiry) { log.V(1).Infof("pushing expired txn %+v", reply.PusheeTxn) pusherWins = true } else if args.PusheeTxn.Epoch < reply.PusheeTxn.Epoch { // Check for an intent from a prior epoch. log.V(1).Infof("pushing intent from previous epoch for txn %+v", reply.PusheeTxn) pusherWins = true } else if reply.PusheeTxn.Priority < args.Txn.Priority || (reply.PusheeTxn.Priority == args.Txn.Priority && args.Txn.Timestamp.Less(reply.PusheeTxn.Timestamp)) { // Finally, choose based on priority; if priorities are equal, order by lower txn timestamp. log.V(1).Infof("pushing intent from txn with lower priority %+v vs %+v", reply.PusheeTxn, args.Txn) pusherWins = true } if !pusherWins { log.V(1).Infof("failed to push intent %+v vs %+v", reply.PusheeTxn, args.Txn) reply.SetGoError(proto.NewTransactionRetryError(reply.PusheeTxn)) return } // If aborting transaction, set new status and return success. if args.Abort { reply.PusheeTxn.Status = proto.ABORTED } else { // Otherwise, update timestamp to be one greater than the request's timestamp. reply.PusheeTxn.Timestamp = args.Timestamp reply.PusheeTxn.Timestamp.Logical++ } // Persist the pushed transaction. if err := engine.PutProto(r.engine, key, reply.PusheeTxn); err != nil { reply.SetGoError(err) return } }