Example #1
// InternalHeartbeatTxn updates the transaction status and heartbeat
// timestamp after receiving transaction heartbeat messages from the
// coordinator. Returns the updated transaction.
func (r *Range) InternalHeartbeatTxn(args *proto.InternalHeartbeatTxnRequest, reply *proto.InternalHeartbeatTxnResponse) {
	// Create the actual key to the system-local transaction table.
	key := engine.MakeKey(engine.KeyLocalTransactionPrefix, args.Key)
	var txn proto.Transaction
	ok, err := engine.GetProto(r.engine, key, &txn)
	if err != nil {
		reply.SetGoError(err)
		return
	}
	// If no existing transaction record was found, initialize
	// to the transaction in the request header.
	if !ok {
		gogoproto.Merge(&txn, args.Txn)
	}
	if txn.Status == proto.PENDING {
		if txn.LastHeartbeat == nil {
			txn.LastHeartbeat = &proto.Timestamp{}
		}
		if txn.LastHeartbeat.Less(args.Header().Timestamp) {
			*txn.LastHeartbeat = args.Header().Timestamp
		}
		if err := engine.PutProto(r.engine, key, &txn); err != nil {
			reply.SetGoError(err)
			return
		}
	}
	reply.Txn = &txn
}
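
The invariant worth calling out is that the recorded heartbeat only moves forward. A standalone sketch of that update, assuming the proto.Timestamp ordering via Less used above (the helper name is ours, not from the source):

// forwardHeartbeat is a hypothetical helper distilling the monotonic update
// above: a stale or reordered heartbeat message can never rewind the
// recorded LastHeartbeat timestamp.
func forwardHeartbeat(last *proto.Timestamp, ts proto.Timestamp) *proto.Timestamp {
	if last == nil {
		last = &proto.Timestamp{}
	}
	if last.Less(ts) {
		*last = ts
	}
	return last
}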
Example #2
// CreateRange allocates a new range ID and stores range metadata.
// On success, returns the new range.
func (s *Store) CreateRange(startKey, endKey engine.Key, replicas []proto.Replica) (*Range, error) {
	rangeID, err := engine.Increment(s.engine, engine.KeyLocalRangeIDGenerator, 1)
	if err != nil {
		return nil, err
	}
	if ok, _ := engine.GetProto(s.engine, makeRangeKey(rangeID), nil); ok {
		return nil, util.Error("newly allocated range ID already in use")
	}
	// RangeMetadata is local to this store only; it is neither
	// replicated via Raft nor available via the global kv store.
	meta := &proto.RangeMetadata{
		ClusterID: s.Ident.ClusterID,
		RangeID:   rangeID,
		RangeDescriptor: proto.RangeDescriptor{
			StartKey: startKey,
			EndKey:   endKey,
			Replicas: replicas,
		},
	}
	err = engine.PutProto(s.engine, makeRangeKey(rangeID), meta)
	if err != nil {
		return nil, err
	}
	rng := NewRange(meta, s.clock, s.engine, s.allocator, s.gossip, s)
	rng.Start()
	s.mu.Lock()
	defer s.mu.Unlock()
	s.ranges[rangeID] = rng
	return rng, nil
}
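
A hypothetical call sketch, for illustration only; the proto.Replica field names are assumptions, since its definition isn't shown above:

// Hypothetical usage: carve out a new range covering ["a", "z").
// NodeID and StoreID are assumed field names on proto.Replica.
replicas := []proto.Replica{{NodeID: 1, StoreID: 1}}
rng, err := store.CreateRange(engine.Key("a"), engine.Key("z"), replicas)
if err != nil {
	log.Fatalf("range creation failed: %v", err)
}
_ = rng // on success, the range is already started and registered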
Example #3
// Init starts the engine, sets up garbage collection, and reads the StoreIdent.
func (s *Store) Init() error {
	// Start engine and set garbage collector.
	if err := s.engine.Start(); err != nil {
		return err
	}
	s.engine.SetGCTimeouts(func() (minTxnTS, minRCacheTS int64) {
		now := s.clock.Now()
		minTxnTS = 0 // disable GC of transactions until we know minimum write intent age
		minRCacheTS = now.WallTime - GCResponseCacheExpiration.Nanoseconds()
		return
	})

	// Read store ident and return a not-bootstrapped error if necessary.
	ok, err := engine.GetProto(s.engine, engine.KeyLocalIdent, &s.Ident)
	if err != nil {
		return err
	} else if !ok {
		return &NotBootstrappedError{}
	}

	// TODO(spencer): scan through all range metadata and instantiate
	// ranges. For now, the range ID is hardcoded as 1.
	var meta proto.RangeMetadata
	ok, err = engine.GetProto(s.engine, makeRangeKey(1), &meta)
	if err != nil {
		return err
	} else if !ok {
		// No range has been bootstrapped yet; nothing to instantiate.
		return nil
	}

	rng := NewRange(&meta, s.clock, s.engine, s.allocator, s.gossip, s)
	rng.Start()

	s.mu.Lock()
	defer s.mu.Unlock()
	s.ranges[meta.RangeID] = rng
	return nil
}
Example #4
// InternalHeartbeatTxn updates the transaction status and heartbeat
// timestamp after receiving transaction heartbeat messages from the
// coordinator. The range will return the current status for this
// transaction to the coordinator.
func (r *Range) InternalHeartbeatTxn(args *proto.InternalHeartbeatTxnRequest, reply *proto.InternalHeartbeatTxnResponse) {
	// Create the actual key to the system-local transaction table.
	key := engine.MakeKey(engine.KeyLocalTransactionPrefix, args.Key)
	var txn proto.Transaction
	if _, err := engine.GetProto(r.engine, key, &txn); err != nil {
		reply.SetGoError(err)
		return
	}
	if txn.Status == proto.PENDING {
		if !args.Header().Timestamp.Less(txn.LastHeartbeat) {
			txn.LastHeartbeat = args.Header().Timestamp
		}
		if err := engine.PutProto(r.engine, key, &txn); err != nil {
			reply.SetGoError(err)
			return
		}
	}
	reply.Status = txn.Status
}
Example #5
// GetResponse looks up a response matching the specified cmdID and
// returns true if found. The response is deserialized into the
// supplied reply parameter. If no response is found, returns
// false. If a command is pending already for the cmdID, then this
// method will block until the command is completed or the
// response cache is cleared.
func (rc *ResponseCache) GetResponse(cmdID proto.ClientCmdID, reply interface{}) (bool, error) {
	// Do nothing if command ID is empty.
	if cmdID.IsEmpty() {
		return false, nil
	}
	// If the command is inflight, wait for it to complete.
	rc.Lock()
	for {
		if cond, ok := rc.inflight[makeCmdIDKey(cmdID)]; ok {
			cond.Wait()
		} else {
			break
		}
	}
	// Adding inflight here is preemptive; we don't want to hold lock
	// while fetching from the on-disk cache. The vast, vast majority of
	// calls to GetResponse will be cache misses, so this saves us
	// from acquiring the lock twice: once here and once below in the
	// event we experience a cache miss.
	rc.addInflightLocked(cmdID)
	rc.Unlock()

	// If the response is in the cache or we experienced an error, return.
	rwResp := proto.ReadWriteCmdResponse{}
	if ok, err := engine.GetProto(rc.engine, rc.makeKey(cmdID), &rwResp); ok || err != nil {
		rc.Lock() // Take lock after fetching response from cache.
		defer rc.Unlock()
		rc.removeInflightLocked(cmdID)
		if err == nil && rwResp.GetValue() != nil {
			gogoproto.Merge(reply.(gogoproto.Message), rwResp.GetValue().(gogoproto.Message))
		}
		return ok, err
	}
	// There's no command result cached for this ID, but the inflight marker was added above.
	return false, nil
}
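
The inflight bookkeeping is the subtle part: concurrent callers for the same command ID must block until the pending command finishes, and the winner must mark itself inflight before releasing the lock. A stripped-down sketch of the pattern, assuming one sync.Cond per command ID sharing the cache's mutex (type and method names here are ours):

// inflightCache is an illustrative reduction of the inflight machinery above.
type inflightCache struct {
	sync.Mutex
	inflight map[string]*sync.Cond
}

func (c *inflightCache) begin(id string) {
	c.Lock()
	// Wait while another goroutine is executing the same command; Wait
	// atomically releases the mutex and reacquires it before returning.
	for cond, ok := c.inflight[id]; ok; cond, ok = c.inflight[id] {
		cond.Wait()
	}
	// Mark the command inflight before unlocking so that concurrent callers
	// with the same ID block in the loop above.
	c.inflight[id] = sync.NewCond(&c.Mutex)
	c.Unlock()
}

func (c *inflightCache) done(id string) {
	c.Lock()
	if cond, ok := c.inflight[id]; ok {
		delete(c.inflight, id)
		cond.Broadcast()
	}
	c.Unlock()
}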
Example #6
// Init starts the engine, sets up garbage collection, and reads the StoreIdent.
func (s *Store) Init() error {
	// Close store for idempotency.
	s.Close()

	// Start engine and set garbage collector.
	if err := s.engine.Start(); err != nil {
		return err
	}

	// Create ID allocators.
	s.raftIDAlloc = NewIDAllocator(engine.KeyRaftIDGenerator, s.db, 2, raftIDAllocCount)
	s.rangeIDAlloc = NewIDAllocator(engine.KeyRangeIDGenerator, s.db, 2, rangeIDAllocCount)

	// GCTimeouts method is called each time an engine compaction is
	// underway. It sets minimum timeouts for transaction records and
	// response cache entries.
	s.engine.SetGCTimeouts(func() (minTxnTS, minRCacheTS int64) {
		now := s.clock.Now()
		minTxnTS = 0 // disable GC of transactions until we know minimum write intent age
		minRCacheTS = now.WallTime - GCResponseCacheExpiration.Nanoseconds()
		return
	})

	// Read store ident and return a not-bootstrapped error if necessary.
	ok, err := engine.GetProto(s.engine, engine.KeyLocalIdent, &s.Ident)
	if err != nil {
		return err
	} else if !ok {
		return &NotBootstrappedError{}
	}

	s.mu.Lock()
	defer s.mu.Unlock()
	start := engine.KeyLocalRangeMetadataPrefix
	end := engine.PrefixEndKey(start)
	const rows = 64
	for {
		kvs, err := s.engine.Scan(start, end, rows)
		if err != nil {
			return err
		}
		for _, kv := range kvs {
			var meta proto.RangeMetadata
			if err := gogoproto.Unmarshal(kv.Value, &meta); err != nil {
				return err
			}
			rng := NewRange(&meta, s.clock, s.engine, s.allocator, s.gossip, s)
			rng.Start()
			s.ranges[meta.RangeID] = rng
			s.rangesByKey = append(s.rangesByKey, rng)
		}
		if len(kvs) < rows {
			break
		}
		start = engine.NextKey(kvs[rows-1].Key)
	}

	// Ensure that ranges are sorted.
	sort.Sort(s.rangesByKey)

	return nil
}
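
The scan loop is a generic pagination pattern: fetch a bounded batch, process it, and resume just past the last key returned until a short batch signals exhaustion. A sketch of it in isolation, assuming Scan returns at most the requested number of rows in ascending key order (the engine.Engine interface name and the proto.RawKeyValue element type are assumptions based on the usage above):

// scanAll is an illustrative helper, not from the source: it visits every
// KV pair in [start, end) in fixed-size batches.
func scanAll(eng engine.Engine, start, end engine.Key, fn func(kv proto.RawKeyValue) error) error {
	const rows = 64
	for {
		kvs, err := eng.Scan(start, end, rows)
		if err != nil {
			return err
		}
		for _, kv := range kvs {
			if err := fn(kv); err != nil {
				return err
			}
		}
		if len(kvs) < rows {
			return nil // short batch: nothing left to scan
		}
		// Resume immediately after the last key returned.
		start = engine.NextKey(kvs[rows-1].Key)
	}
}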
Example #7
// EndTransaction either commits or aborts (rolls back) an extant
// transaction according to the args.Commit parameter.
func (r *Range) EndTransaction(args *proto.EndTransactionRequest, reply *proto.EndTransactionResponse) {
	// Create the actual key to the system-local transaction table.
	key := engine.MakeKey(engine.KeyLocalTransactionPrefix, args.Key)
	// Start with supplied transaction, then possibly load from txn record.
	reply.Txn = gogoproto.Clone(args.Txn).(*proto.Transaction)

	// Fetch existing transaction if possible.
	existTxn := &proto.Transaction{}
	ok, err := engine.GetProto(r.engine, key, existTxn)
	if err != nil {
		reply.SetGoError(err)
		return
	}
	// If the transaction record already exists, verify that we can either
	// commit it or abort it (according to args.Commit), and also that the
	// Timestamp and Epoch have not suffered regression.
	if ok {
		if existTxn.Status == proto.COMMITTED {
			reply.SetGoError(proto.NewTransactionStatusError(existTxn, "already committed"))
			return
		} else if existTxn.Status == proto.ABORTED {
			reply.SetGoError(proto.NewTransactionStatusError(existTxn, "already aborted"))
			return
		} else if args.Txn.Epoch < existTxn.Epoch {
			reply.SetGoError(proto.NewTransactionStatusError(existTxn, fmt.Sprintf("epoch regression: %d", args.Txn.Epoch)))
			return
		} else if existTxn.Timestamp.Less(args.Txn.Timestamp) {
			// The transaction record can only ever be pushed forward, so it's an
			// error if somehow the transaction record has an earlier timestamp
			// than the transaction timestamp.
			reply.SetGoError(proto.NewTransactionStatusError(existTxn, fmt.Sprintf("timestamp regression: %+v", args.Txn.Timestamp)))
			return
		}
		// Use the persisted transaction record as final transaction.
		gogoproto.Merge(reply.Txn, existTxn)
	}

	// Take max of requested timestamp and possibly "pushed" txn
	// record timestamp as the final commit timestamp.
	if reply.Txn.Timestamp.Less(args.Timestamp) {
		reply.Txn.Timestamp = args.Timestamp
	}

	// Set transaction status to COMMITTED or ABORTED as per the
	// args.Commit parameter.
	if args.Commit {
		// If the isolation level is SERIALIZABLE, return a transaction
		// retry error if the commit timestamp isn't equal to the txn
		// timestamp.
		if args.Txn.Isolation == proto.SERIALIZABLE && !reply.Txn.Timestamp.Equal(args.Txn.Timestamp) {
			reply.SetGoError(proto.NewTransactionRetryError(reply.Txn))
			return
		}
		reply.Txn.Status = proto.COMMITTED
	} else {
		reply.Txn.Status = proto.ABORTED
	}

	// Persist the transaction record with updated status (& possibly timestamp).
	if err := engine.PutProto(r.engine, key, reply.Txn); err != nil {
		reply.SetGoError(err)
		return
	}
}
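
The SERIALIZABLE check encodes the core commit rule: a serializable transaction may only commit at the timestamp it started with, so a pushed timestamp forces a retry, while weaker isolation levels tolerate the push. Distilled as a predicate (the function and the isolation parameter's type name are ours):

// canCommit is an illustrative restatement of the commit check above.
func canCommit(isolation proto.IsolationType, origTS, commitTS proto.Timestamp) bool {
	// Only SERIALIZABLE transactions are invalidated by a pushed timestamp.
	return isolation != proto.SERIALIZABLE || commitTS.Equal(origTS)
}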
Example #8
// InternalPushTxn resolves conflicts between concurrent txns (or
// between a non-transactional reader or writer and a txn) in several
// ways depending on the statuses and priorities of the conflicting
// transactions. The InternalPushTxn operation is invoked by a
// "pusher" (the writer trying to abort a conflicting txn or the
// reader trying to push a conflicting txn's commit timestamp
// forward), who attempts to resolve a conflict with a "pushee"
// (args.PusheeTxn -- the pushee txn whose intent(s) caused the
// conflict).
//
// Txn already committed/aborted: If pushee txn is committed or
// aborted return success.
//
// Txn Timeout: If the pushee txn record isn't present or its LastHeartbeat
// timestamp isn't set, use PusheeTxn.Timestamp as LastHeartbeat. If
// current time - LastHeartbeat > 2 * DefaultHeartbeatInterval, then
// the pushee txn should be either pushed forward or aborted,
// depending on the value of args.Abort.
//
// Old Txn Epoch: If the persisted pushee txn record has a newer Epoch than
// PusheeTxn.Epoch, return success, as the intent from the older epoch may
// be removed.
//
// Lower Txn Priority: If the pushee txn has a lower priority than the
// pusher, adjust the pushee's persisted txn depending on the value of
// args.Abort. If args.Abort is true, set txn.Status to ABORTED and
// priority to one less than the pusher's priority, and return success.
// If args.Abort is false, set txn.Timestamp to the pusher's
// txn.Timestamp + 1.
//
// Higher Txn Priority: If the pushee txn has a higher priority than the
// pusher, return TransactionRetryError. The transaction will be retried
// with a priority one less than the pushee's.
func (r *Range) InternalPushTxn(args *proto.InternalPushTxnRequest, reply *proto.InternalPushTxnResponse) {
	if !bytes.Equal(args.Key, args.PusheeTxn.ID) {
		reply.SetGoError(util.Errorf("request key %q should match pushee's txn ID %q", args.Key, args.PusheeTxn.ID))
		return
	}
	// Create the actual key to the system-local transaction table.
	key := engine.MakeKey(engine.KeyLocalTransactionPrefix, args.Key)

	// Fetch existing transaction if possible.
	existTxn := &proto.Transaction{}
	ok, err := engine.GetProto(r.engine, key, existTxn)
	if err != nil {
		reply.SetGoError(err)
		return
	}
	if ok {
		// Start with the persisted transaction record as final transaction.
		reply.PusheeTxn = gogoproto.Clone(existTxn).(*proto.Transaction)
		// Upgrade the epoch and timestamp as necessary.
		if reply.PusheeTxn.Epoch < args.PusheeTxn.Epoch {
			reply.PusheeTxn.Epoch = args.PusheeTxn.Epoch
		}
		if reply.PusheeTxn.Timestamp.Less(args.PusheeTxn.Timestamp) {
			reply.PusheeTxn.Timestamp = args.PusheeTxn.Timestamp
		}
	} else {
		// Some sanity checks for the case where we don't find a transaction record.
		if args.PusheeTxn.LastHeartbeat != nil {
			reply.SetGoError(proto.NewTransactionStatusError(&args.PusheeTxn,
				"no txn persisted, yet intent has heartbeat"))
			return
		} else if args.PusheeTxn.Status != proto.PENDING {
			reply.SetGoError(proto.NewTransactionStatusError(&args.PusheeTxn,
				fmt.Sprintf("no txn persisted, yet intent has status %s", args.PusheeTxn.Status)))
			return
		}
		// The transaction doesn't exist yet on disk; use the supplied version.
		reply.PusheeTxn = gogoproto.Clone(&args.PusheeTxn).(*proto.Transaction)
	}

	// If already committed or aborted, return success.
	if reply.PusheeTxn.Status != proto.PENDING {
		// Trivial noop.
		return
	}
	// If we're trying to move the timestamp forward, and it's already
	// far enough forward, return success.
	if !args.Abort && args.Timestamp.Less(reply.PusheeTxn.Timestamp) {
		// Trivial noop.
		return
	}

	// pusherWins is true if the pusher prevails.
	var pusherWins bool

	// Check for txn timeout.
	if reply.PusheeTxn.LastHeartbeat == nil {
		reply.PusheeTxn.LastHeartbeat = &reply.PusheeTxn.Timestamp
	}
	// Compute heartbeat expiration.
	expiry := r.clock.Now()
	expiry.WallTime -= 2 * DefaultHeartbeatInterval.Nanoseconds()
	if reply.PusheeTxn.LastHeartbeat.Less(expiry) {
		log.V(1).Infof("pushing expired txn %+v", reply.PusheeTxn)
		pusherWins = true
	} else if args.PusheeTxn.Epoch < reply.PusheeTxn.Epoch {
		// Check for an intent from a prior epoch.
		log.V(1).Infof("pushing intent from previous epoch for txn %+v", reply.PusheeTxn)
		pusherWins = true
	} else if reply.PusheeTxn.Priority < args.Txn.Priority ||
		(reply.PusheeTxn.Priority == args.Txn.Priority && args.Txn.Timestamp.Less(reply.PusheeTxn.Timestamp)) {
		// Finally, choose based on priority; if priorities are equal, order by lower txn timestamp.
		log.V(1).Infof("pushing intent from txn with lower priority %+v vs %+v", reply.PusheeTxn, args.Txn)
		pusherWins = true
	}

	if !pusherWins {
		log.V(1).Infof("failed to push intent %+v vs %+v", reply.PusheeTxn, args.Txn)
		reply.SetGoError(proto.NewTransactionRetryError(reply.PusheeTxn))
		return
	}

	// If aborting transaction, set new status and return success.
	if args.Abort {
		reply.PusheeTxn.Status = proto.ABORTED
	} else {
		// Otherwise, update timestamp to be one greater than the request's timestamp.
		reply.PusheeTxn.Timestamp = args.Timestamp
		reply.PusheeTxn.Timestamp.Logical++
	}
	// Persist the pushed transaction.
	if err := engine.PutProto(r.engine, key, reply.PusheeTxn); err != nil {
		reply.SetGoError(err)
		return
	}
}
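
The middle of the function reduces to a pure predicate over the two transactions. A standalone restatement (the function, its signature, and the int32 epoch parameter are ours; the fields follow the usage above, with LastHeartbeat assumed already defaulted as in the code):

// pusherWins distills the conflict rule above: the pusher prevails if the
// pushee looks abandoned, if the intent is from a prior epoch, or if the
// pushee has strictly lower priority; ties go to the earlier timestamp.
func pusherWins(now proto.Timestamp, intentEpoch int32, pushee, pusher *proto.Transaction) bool {
	expiry := now
	expiry.WallTime -= 2 * DefaultHeartbeatInterval.Nanoseconds()
	switch {
	case pushee.LastHeartbeat.Less(expiry):
		return true // heartbeat expired: pushee presumed abandoned
	case intentEpoch < pushee.Epoch:
		return true // intent was written in an earlier epoch of the pushee
	case pushee.Priority < pusher.Priority:
		return true // pusher outranks pushee
	case pushee.Priority == pusher.Priority:
		return pusher.Timestamp.Less(pushee.Timestamp) // tie: earlier txn wins
	default:
		return false
	}
}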