// GetRange fetches a range by ID. Returns an error if no range is found.
func (s *Store) GetRange(rangeID int64) (*Range, error) {
	s.mu.RLock()
	defer s.mu.RUnlock()
	if rng, ok := s.ranges[rangeID]; ok {
		return rng, nil
	}
	return nil, proto.NewRangeNotFoundError(rangeID)
}
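// A hypothetical caller sketch (not part of the original source): it assumes
// a populated *Store and shows that a missing range surfaces as a
// *proto.RangeNotFoundError which callers can detect and handle.
func exampleGetRange(s *Store, rangeID int64) (*Range, error) {
	rng, err := s.GetRange(rangeID)
	if err != nil {
		if _, ok := err.(*proto.RangeNotFoundError); ok {
			// The range isn't replicated on this store; a caller would
			// typically evict its range descriptor cache entry and retry.
		}
		return nil, err
	}
	return rng, nil
}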
// addWriteCmd first consults the response cache to determine whether
// this command has already been sent to the range. If a response is
// found, it's returned immediately and not submitted to Raft. Next,
// the timestamp cache is checked to determine if any newer accesses to
// this command's affected keys have been made. If so, this command's
// timestamp is moved forward. Finally, the keys affected by this
// command are added as pending writes to the command queue and the
// command is submitted to Raft. Upon completion, the write is removed
// from the command queue and the reply is added to the response cache.
// If wait is true, addWriteCmd blocks until the command is complete.
func (r *Range) addWriteCmd(ctx context.Context, args proto.Request, reply proto.Response, wait bool) error {
	// Check the response cache in case this is a replay. This call
	// may block if the same command is already underway.
	header := args.Header()

	// Add the write to the command queue to gate subsequent overlapping
	// commands until this command completes. Note that this must be
	// done before getting the max timestamp for the key(s), as the
	// timestamp cache is only updated after preceding commands have
	// been run to successful completion.
	cmdKey := r.beginCmd(header, false)

	// This replica must have the leader lease to process a write.
	if err := r.redirectOnOrAcquireLeaderLease(header.Timestamp); err != nil {
		r.endCmd(cmdKey, args, err, false /* !readOnly */)
		reply.Header().SetGoError(err)
		return err
	}

	// Two important invariants of Cockroach: 1) encountering a more
	// recently written value means transaction restart. 2) values must
	// be written with a greater timestamp than the most recent read to
	// the same key. Check the timestamp cache for reads/writes which
	// are at least as recent as the timestamp of this write. For
	// writes, send WriteTooOldError; for reads, update the write's
	// timestamp. When the write returns, the updated timestamp will
	// inform the final commit timestamp.
	if usesTimestampCache(args) {
		r.Lock()
		rTS, wTS := r.tsCache.GetMax(header.Key, header.EndKey, header.Txn.GetID())
		r.Unlock()

		// Always push the timestamp forward if there's been a read which
		// occurred after our txn timestamp.
		if !rTS.Less(header.Timestamp) {
			header.Timestamp = rTS.Next()
		}
		// If there's a newer write timestamp...
		if !wTS.Less(header.Timestamp) {
			// If we're in a txn, set a write too old error in reply. We
			// still go ahead and try the write because we want to avoid
			// restarting the transaction in the event that there isn't an
			// intent or the intent can be pushed by us.
			if header.Txn != nil {
				err := &proto.WriteTooOldError{Timestamp: header.Timestamp, ExistingTimestamp: wTS}
				reply.Header().SetGoError(err)
			} else {
				// Otherwise, make sure we advance the request's timestamp.
				header.Timestamp = wTS.Next()
			}
		}
	}

	errChan, pendingCmd := r.proposeRaftCommand(ctx, args, reply)

	// Create a completion func for mandatory cleanups which we either
	// run synchronously if we're waiting or in a goroutine otherwise.
	completionFunc := func() error {
		// First wait for raft to commit or abort the command.
		var err error
		if err = <-errChan; err == nil {
			// Next, if the command was committed, wait for the range to apply it.
			err = <-pendingCmd.done
		} else if err == multiraft.ErrGroupDeleted {
			// This error needs to be converted appropriately so that
			// clients will retry.
			err = proto.NewRangeNotFoundError(r.Desc().RaftID)
		}
		// As for reads, update the timestamp cache with the timestamp
		// of this write on success. This ensures a strictly higher
		// timestamp for successive writes to the same key or key range.
		r.endCmd(cmdKey, args, err, false /* !readOnly */)
		return err
	}

	if wait {
		return completionFunc()
	}
	go func() {
		// If the original client didn't wait (e.g. resolve write intent),
		// log execution errors so they're surfaced somewhere.
		if err := completionFunc(); err != nil {
			// TODO(tschottdorf): possible security risk to log args.
			log.Warningc(ctx, "async execution of %v failed: %s", args, err)
		}
	}()
	return nil
}
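// A hypothetical caller sketch (not part of the original source) illustrating
// the wait flag's semantics; the function name and sync parameter are
// assumptions for illustration only.
func exampleWrite(ctx context.Context, r *Range, args proto.Request, reply proto.Response, sync bool) error {
	if sync {
		// Synchronous: block until Raft has committed and the range has
		// applied the command; reply is fully populated on return.
		return r.addWriteCmd(ctx, args, reply, true /* wait */)
	}
	// Asynchronous (e.g. resolving a write intent): a nil return only
	// means the command was accepted; execution errors are logged
	// internally by addWriteCmd's goroutine rather than returned.
	return r.addWriteCmd(ctx, args, reply, false /* !wait */)
}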
// addWriteCmd first adds the keys affected by this command as pending writes
// to the command queue. Next, the timestamp cache is checked to determine if
// any newer accesses to this command's affected keys have been made. If so,
// the command's timestamp is moved forward. Finally, the command is submitted
// to Raft. Upon completion, the write is removed from the command queue and
// any error is returned. If a WaitGroup is supplied, it is signaled when the
// command enters Raft or the function returns with a preprocessing error,
// whichever happens earlier.
func (r *Range) addWriteCmd(ctx context.Context, args proto.Request, wg *sync.WaitGroup) (proto.Response, error) {
	signal := func() {
		if wg != nil {
			wg.Done()
			wg = nil
		}
	}

	// This happens more eagerly below, but it's important to guarantee that
	// early returns do not skip this.
	defer signal()

	header := args.Header()

	if err := r.checkCmdHeader(args.Header()); err != nil {
		return nil, err
	}

	trace := tracer.FromCtx(ctx)

	// Add the write to the command queue to gate subsequent overlapping
	// commands until this command completes. Note that this must be
	// done before getting the max timestamp for the key(s), as the
	// timestamp cache is only updated after preceding commands have
	// been run to successful completion.
	qDone := trace.Epoch("command queue")
	cmdKey := r.beginCmd(header, false)
	qDone()

	// This replica must have the leader lease to process a write.
	if err := r.redirectOnOrAcquireLeaderLease(trace, header.Timestamp); err != nil {
		r.endCmd(cmdKey, args, err, false /* !readOnly */)
		return nil, err
	}

	// Two important invariants of Cockroach: 1) encountering a more
	// recently written value means transaction restart. 2) values must
	// be written with a greater timestamp than the most recent read to
	// the same key. Check the timestamp cache for reads/writes which
	// are at least as recent as the timestamp of this write. For
	// writes, send WriteTooOldError; for reads, update the write's
	// timestamp. When the write returns, the updated timestamp will
	// inform the final commit timestamp.
	if usesTimestampCache(args) {
		r.Lock()
		rTS, wTS := r.tsCache.GetMax(header.Key, header.EndKey, header.Txn.GetID())
		r.Unlock()

		// Always push the timestamp forward if there's been a read which
		// occurred after our txn timestamp.
		if !rTS.Less(header.Timestamp) {
			header.Timestamp = rTS.Next()
		}
		// If there's a newer write timestamp...
		if !wTS.Less(header.Timestamp) {
			// If we're in a txn, we still go ahead and try the write since
			// we want to avoid restarting the transaction in the event that
			// there isn't an intent or the intent can be pushed by us.
			//
			// If we're not in a txn, it's trivial to just advance our timestamp.
			if header.Txn == nil {
				header.Timestamp = wTS.Next()
			}
		}
	}

	defer trace.Epoch("raft")()

	errChan, pendingCmd := r.proposeRaftCommand(ctx, args)

	signal()

	// First wait for raft to commit or abort the command.
	var err error
	var reply proto.Response
	if err = <-errChan; err == nil {
		// Next, if the command was committed, wait for the range to apply it.
		respWithErr := <-pendingCmd.done
		reply, err = respWithErr.reply, respWithErr.err
	} else if err == multiraft.ErrGroupDeleted {
		// This error needs to be converted appropriately so that
		// clients will retry.
		err = proto.NewRangeNotFoundError(r.Desc().RaftID)
	}

	// As for reads, update the timestamp cache with the timestamp
	// of this write on success. This ensures a strictly higher
	// timestamp for successive writes to the same key or key range.
	r.endCmd(cmdKey, args, err, false /* !readOnly */)
	return reply, err
}
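// A hypothetical caller sketch (not part of the original source) showing the
// WaitGroup contract: wg.Wait() returns once every command has either entered
// Raft or failed preprocessing, without waiting for the commands to apply.
// The batch loop and error handling are assumptions for illustration only.
func exampleProposeBatch(ctx context.Context, r *Range, batch []proto.Request) {
	var wg sync.WaitGroup
	for _, args := range batch {
		wg.Add(1)
		go func(args proto.Request) {
			// addWriteCmd calls wg.Done() as soon as the command enters
			// Raft (or on an early preprocessing error), so this goroutine
			// may still be blocked waiting on the reply afterwards.
			if _, err := r.addWriteCmd(ctx, args, &wg); err != nil {
				log.Warningc(ctx, "write %v failed: %s", args, err)
			}
		}(args)
	}
	wg.Wait() // all commands are now in Raft (or have failed early)
}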