// updateForBatch updates the first argument (the header of a request contained
// in a batch) from the second one (the batch header), returning an error when
// inconsistencies are found.
// It is checked that the individual call does not have a User, UserPriority
// or Txn set that differs from the batch's.
func updateForBatch(args proto.Request, bHeader proto.RequestHeader) error {
	// Disallow transaction, user and priority on individual calls, unless
	// equal.
	aHeader := args.Header()
	if aHeader.User != "" && aHeader.User != bHeader.User {
		return util.Error("conflicting user on call in batch")
	}
	if aPrio := aHeader.GetUserPriority(); aPrio != proto.Default_RequestHeader_UserPriority && aPrio != bHeader.GetUserPriority() {
		return util.Error("conflicting user priority on call in batch")
	}
	aHeader.User = bHeader.User
	aHeader.UserPriority = bHeader.UserPriority
	// Only allow individual transactions on the requests of a batch if
	// - the batch is non-transactional,
	// - the individual transaction does not write intents, and
	// - the individual transaction is initialized.
	// The main usage of this is to allow mass-resolution of intents, which
	// entails sending a non-txn batch of transactional InternalResolveIntent.
	if aHeader.Txn != nil && !aHeader.Txn.Equal(bHeader.Txn) {
		if len(aHeader.Txn.ID) == 0 || proto.IsTransactionWrite(args) || bHeader.Txn != nil {
			return util.Error("conflicting transaction in transactional batch")
		}
	} else {
		aHeader.Txn = bHeader.Txn
	}
	return nil
}
// Validate returns an error if any required elements of the Config are missing or invalid.
// Called automatically by NewMultiRaft.
func (c *Config) Validate() error {
	if c.Transport == nil {
		return util.Error("Transport is required")
	}
	if c.ElectionTimeoutMin == 0 || c.ElectionTimeoutMax == 0 {
		return util.Error("ElectionTimeout{Min,Max} must be non-zero")
	}
	if c.ElectionTimeoutMin > c.ElectionTimeoutMax {
		return util.Error("ElectionTimeoutMin must be <= ElectionTimeoutMax")
	}
	return nil
}
// addReadOnlyCmd updates the read timestamp cache and waits for any
// overlapping writes currently processing through Raft ahead of us to
// clear via the read queue.
func (r *Range) addReadOnlyCmd(ctx context.Context, args proto.Request, reply proto.Response) error {
	header := args.Header()
	if err := r.checkCmdHeader(header); err != nil {
		reply.Header().SetGoError(err)
		return err
	}

	// If read-consistency is set to INCONSISTENT, run directly.
	if header.ReadConsistency == proto.INCONSISTENT {
		// But disallow any inconsistent reads within txns.
		if header.Txn != nil {
			reply.Header().SetGoError(util.Error("cannot allow inconsistent reads within a transaction"))
			return reply.Header().GoError()
		}
		if header.Timestamp.Equal(proto.ZeroTimestamp) {
			header.Timestamp = r.rm.Clock().Now()
		}
		intents, err := r.executeCmd(r.rm.Engine(), nil, args, reply)
		if err == nil {
			r.handleSkippedIntents(args, intents)
		}
		return err
	} else if header.ReadConsistency == proto.CONSENSUS {
		reply.Header().SetGoError(util.Error("consensus reads not implemented"))
		return reply.Header().GoError()
	}

	// Add the read to the command queue to gate subsequent
	// overlapping commands until this command completes.
	cmdKey := r.beginCmd(header, true)

	// This replica must have leader lease to process a consistent read.
	if err := r.redirectOnOrAcquireLeaderLease(tracer.FromCtx(ctx), header.Timestamp); err != nil {
		r.endCmd(cmdKey, args, err, true /* readOnly */)
		reply.Header().SetGoError(err)
		return err
	}

	// Execute read-only command.
	intents, err := r.executeCmd(r.rm.Engine(), nil, args, reply)

	// Only update the timestamp cache if the command succeeded.
	r.endCmd(cmdKey, args, err, true /* readOnly */)

	if err == nil {
		r.handleSkippedIntents(args, intents)
	}
	return err
}
// validate returns an error if any required elements of the Config are missing or invalid.
// Called automatically by NewMultiRaft.
func (c *Config) validate() error {
	if c.Transport == nil {
		return util.Error("Transport is required")
	}
	if c.ElectionTimeoutTicks == 0 {
		return util.Error("ElectionTimeoutTicks must be non-zero")
	}
	if c.HeartbeatIntervalTicks == 0 {
		return util.Error("HeartbeatIntervalTicks must be non-zero")
	}
	if c.TickInterval == 0 {
		return util.Error("TickInterval must be non-zero")
	}
	return nil
}
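// exampleRaftConfig is a hedged usage sketch, not part of the original sources:
// it shows how a caller might populate the tick-based fields checked by
// validate above. It assumes TickInterval is a time.Duration (with the time
// package imported), that Transport is the package's transport interface, and
// the concrete values are illustrative only.
func exampleRaftConfig(transport Transport) error {
	cfg := &Config{
		Transport:              transport,
		TickInterval:           100 * time.Millisecond,
		ElectionTimeoutTicks:   15,
		HeartbeatIntervalTicks: 3,
	}
	// All required fields are set, so validate returns nil; zeroing any of
	// the tick fields above would produce an error instead.
	return cfg.validate()
}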
// initStores initializes the Stores map from id to Store. Stores are
// added to the local sender if already bootstrapped. A bootstrapped
// Store has a valid ident with cluster, node and Store IDs set. If
// the Store doesn't yet have a valid ident, it's added to the
// bootstraps list for initialization once the cluster and node IDs
// have been determined.
func (n *Node) initStores(clock *hlc.Clock, engines []engine.Engine) error {
	bootstraps := list.New()

	if len(engines) == 0 {
		return util.Error("no engines")
	}
	for _, e := range engines {
		// TODO(bdarnell): use a real transport here instead of NewLocalRPCTransport.
		// TODO(bdarnell): arrange to have the transport closed.
		s := storage.NewStore(clock, e, n.db, n.gossip, multiraft.NewLocalRPCTransport())
		// Initialize each store in turn, handling un-bootstrapped errors by
		// adding the store to the bootstraps list.
		if err := s.Start(); err != nil {
			if _, ok := err.(*storage.NotBootstrappedError); ok {
				bootstraps.PushBack(s)
				continue
			}
			return err
		}
		if s.Ident.ClusterID != "" {
			if s.Ident.StoreID == 0 {
				return util.Error("cluster id set for node ident but missing store id")
			}
			capacity, err := s.Capacity()
			if err != nil {
				return err
			}
			log.Infof("initialized store %s: %+v", s, capacity)
			n.lSender.AddStore(s)
		}
	}

	// Verify all initialized stores agree on cluster and node IDs.
	if err := n.validateStores(); err != nil {
		return err
	}

	// Connect gossip before starting bootstrap. For new nodes, connecting
	// to the gossip network is necessary to get the cluster ID.
	n.connectGossip()

	// Bootstrap any uninitialized stores asynchronously.
	if bootstraps.Len() > 0 {
		go n.bootstrapStores(bootstraps)
	}
	return nil
}
// CreateRange allocates a new range ID and stores range metadata.
// On success, returns the new range.
func (s *Store) CreateRange(startKey, endKey engine.Key, replicas []proto.Replica) (*Range, error) {
	rangeID, err := engine.Increment(s.engine, engine.KeyLocalRangeIDGenerator, 1)
	if err != nil {
		return nil, err
	}
	if ok, _ := engine.GetProto(s.engine, makeRangeKey(rangeID), nil); ok {
		return nil, util.Error("newly allocated range ID already in use")
	}
	// RangeMetadata is stored local to this store only. It is neither
	// replicated via raft nor available via the global kv store.
	meta := &proto.RangeMetadata{
		ClusterID: s.Ident.ClusterID,
		RangeID:   rangeID,
		RangeDescriptor: proto.RangeDescriptor{
			StartKey: startKey,
			EndKey:   endKey,
			Replicas: replicas,
		},
	}
	err = engine.PutProto(s.engine, makeRangeKey(rangeID), meta)
	if err != nil {
		return nil, err
	}
	rng := NewRange(meta, s.clock, s.engine, s.allocator, s.gossip, s)
	rng.Start()
	s.mu.Lock()
	defer s.mu.Unlock()
	s.ranges[rangeID] = rng
	return rng, nil
}
// newServerTLSConfig creates a server TLSConfig from the supplied byte strings containing
// - the certificate of this node (should be signed by the CA),
// - the private key of this node,
// - the certificate of the cluster CA.
func newServerTLSConfig(certPEM, keyPEM, caPEM []byte) (*tls.Config, error) {
	cert, err := tls.X509KeyPair(certPEM, keyPEM)
	if err != nil {
		return nil, err
	}

	certPool := x509.NewCertPool()
	if ok := certPool.AppendCertsFromPEM(caPEM); !ok {
		err = util.Error("failed to parse PEM data to pool")
		return nil, err
	}

	return &tls.Config{
		Certificates: []tls.Certificate{cert},
		// Verify client certs if passed.
		ClientAuth: tls.VerifyClientCertIfGiven,
		RootCAs:    certPool,
		ClientCAs:  certPool,

		// Use the default cipher suite from golang (RC4 is going away in 1.5).
		// Prefer the server-specified suite.
		PreferServerCipherSuites: true,

		// TLS 1.1 and 1.2 support is crappy out there. Let's use 1.0.
		MinVersion: tls.VersionTLS10,

		// Should we disable session resumption? This may break forward secrecy.
		// SessionTicketsDisabled: true,
	}, nil
}
// CreateRange allocates a new range ID and stores range metadata.
// On success, returns the new range.
func (s *Store) CreateRange(startKey, endKey Key, replicas []Replica) (*Range, error) {
	rangeID, err := increment(s.engine, storeKeyRangeIDGenerator, 1, time.Now().UnixNano())
	if err != nil {
		return nil, err
	}
	if ok, _, _ := getI(s.engine, rangeKey(rangeID), nil); ok {
		return nil, util.Error("newly allocated range ID already in use")
	}
	// RangeMetadata is stored local to this store only. It is neither
	// replicated via raft nor available via the global kv store.
	meta := RangeMetadata{
		ClusterID: s.Ident.ClusterID,
		RangeID:   rangeID,
		StartKey:  startKey,
		EndKey:    endKey,
		Desc: RangeDescriptor{
			StartKey: startKey,
			Replicas: replicas,
		},
	}
	err = putI(s.engine, rangeKey(rangeID), meta)
	if err != nil {
		return nil, err
	}
	rng := NewRange(meta, s.engine, s.allocator, s.gossip)
	rng.Start()
	s.mu.Lock()
	defer s.mu.Unlock()
	s.ranges[rangeID] = rng
	return rng, nil
}
// ParseTimestamp parses the timestamp.
func ParseTimestamp(s DString) (DTimestamp, error) {
	str := string(s)
	t, err := time.Parse(timestampFormat, str)
	if err == nil {
		t = t.UTC()
		return DTimestamp{Time: t}, nil
	}
	t, err = time.Parse(timestampWithOffsetZoneFormat, str)
	if err == nil {
		t = t.UTC()
		return DTimestamp{Time: t}, nil
	}
	t, err = time.Parse(timestampWithNamedZoneFormat, str)
	if err == nil {
		// Parsing using a named time zone is imperfect for two reasons:
		// 1. Some named time zones are ambiguous (PST can be US PST and
		// Philippines PST), and 2. The code needs access to the entire
		// database of named time zones in order to get some time offset,
		// and it's not clear what the memory requirements for that are.
		// TODO(vivek): Implement SET TIME ZONE to set a time zone and use
		// time.ParseInLocation()
		return DummyTimestamp, util.Error("TODO(vivek): named time zone input not supported")
	}
	// Parse other formats in the future.
	return DummyTimestamp, err
}
// Merge appends the input value to the string and returns the result.
func (s Appender) Merge(t Mergable) (Mergable, error) {
	m, ok := t.(Appender)
	if !ok {
		return s, util.Error("parameter is of wrong type")
	}
	return append(s, m...), nil
}
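// exampleAppenderMerge is a hedged usage sketch, not part of the original
// sources: it assumes Appender is a byte-slice based Mergable (as the append
// call above implies), so merging two Appenders concatenates them. The values
// are illustrative only.
func exampleAppenderMerge() (Mergable, error) {
	// The result is expected to hold the concatenation "cockroachdb".
	return Appender("cockroach").Merge(Appender("db"))
}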
// NewMultiRaft creates a MultiRaft object.
func NewMultiRaft(nodeID NodeID, config *Config) (*MultiRaft, error) {
	if !nodeID.isSet() {
		return nil, util.Error("Invalid NodeID")
	}
	err := config.Validate()
	if err != nil {
		return nil, err
	}
	if config.Clock == nil {
		config.Clock = RealClock
	}
	m := &MultiRaft{
		Config:   *config,
		nodeID:   nodeID,
		Events:   make(chan interface{}, 1000),
		ops:      make(chan interface{}, 100),
		requests: make(chan *rpc.Call, 100),
		stopped:  make(chan struct{}),
	}
	err = m.Transport.Listen(nodeID, m)
	if err != nil {
		return nil, err
	}
	return m, nil
}
// replaceNode cuts a node away from its parent, substituting a new node or
// nil. The updated new node is returned. Note that this does not in fact alter
// the old node in any way, but only the old node's parent and the new node.
func (tc *treeContext) replaceNode(oldNode, newNode *proto.RangeTreeNode) (*proto.RangeTreeNode, error) {
	if oldNode.ParentKey == nil {
		if newNode == nil {
			return nil, util.Error("cannot replace the root node with nil")
		}
		// Update the root key if this was the root.
		tc.setRootKey(newNode.Key)
	} else {
		oldParent, err := tc.getNode(oldNode.ParentKey)
		if err != nil {
			return nil, err
		}
		if oldParent.LeftKey != nil && oldNode.Key.Equal(oldParent.LeftKey) {
			if newNode == nil {
				oldParent.LeftKey = nil
			} else {
				oldParent.LeftKey = newNode.Key
			}
		} else {
			if newNode == nil {
				oldParent.RightKey = nil
			} else {
				oldParent.RightKey = newNode.Key
			}
		}
		tc.setNode(oldParent)
	}
	if newNode != nil {
		newNode.ParentKey = oldNode.ParentKey
		tc.setNode(newNode)
	}
	return newNode, nil
}
// sendBatch unrolls a batched command and sends each constituent
// command in parallel.
func (tc *TxnCoordSender) sendBatch(batchArgs *proto.InternalBatchRequest, batchReply *proto.InternalBatchResponse) {
	// Prepare the calls by unrolling the batch. If the batchReply is
	// pre-initialized with replies, use those; otherwise create replies
	// as needed.
	// TODO(spencer): send calls in parallel.
	batchReply.Txn = batchArgs.Txn
	for i := range batchArgs.Requests {
		args := batchArgs.Requests[i].GetValue().(proto.Request)
		call := proto.Call{Args: args}

		// Disallow transaction, user and priority on individual calls, unless
		// equal.
		if args.Header().User != "" && args.Header().User != batchArgs.User {
			batchReply.Header().SetGoError(util.Error("cannot have individual user on call in batch"))
			return
		}
		args.Header().User = batchArgs.User
		if args.Header().UserPriority != nil && args.Header().GetUserPriority() != batchArgs.GetUserPriority() {
			batchReply.Header().SetGoError(util.Error("cannot have individual user priority on call in batch"))
			return
		}
		args.Header().UserPriority = batchArgs.UserPriority
		if txn := args.Header().Txn; txn != nil && !txn.Equal(batchArgs.Txn) {
			batchReply.Header().SetGoError(util.Error("cannot have individual transactional call in batch"))
			return
		}
		// Propagate batch Txn to each call.
		args.Header().Txn = batchArgs.Txn

		// Create a reply from the method type and add to batch response.
		if i >= len(batchReply.Responses) {
			call.Reply = args.CreateReply()
			batchReply.Add(call.Reply)
		} else {
			call.Reply = batchReply.Responses[i].GetValue().(proto.Response)
		}
		tc.sendOne(call)
		// Amalgamate transaction updates and propagate first error, if applicable.
		if batchReply.Txn != nil {
			batchReply.Txn.Update(call.Reply.Header().Txn)
		}
		if call.Reply.Header().Error != nil {
			batchReply.Error = call.Reply.Header().Error
			return
		}
	}
}
// ResolveWriteIntentRange commits or aborts (rolls back)
// the range of write intents specified by start and end
// keys for a given txnID according to commit parameter.
// ResolveWriteIntentRange will skip write intents of
// other txnIDs. Specify max=0 for unbounded resolves.
func (mvcc *MVCC) ResolveWriteIntentRange(key Key, endKey Key, max int64, txnID string, commit bool) (int64, error) {
	if len(txnID) == 0 {
		return 0, util.Error("missing txnID in request")
	}

	nextKey := key
	// TODO(Jiang-Ming): remove this after we put everything via MVCC.
	// Currently, we need to skip the series of reserved system
	// key / value pairs covering accounting, range metadata, node
	// accounting and permissions before the actual key / value pairs
	// since they don't have keyMetadata.
	if nextKey.Less(PrefixEndKey(Key("\x00"))) {
		nextKey = PrefixEndKey(Key("\x00"))
	}

	num := int64(0)
	for {
		kvs, err := mvcc.engine.Scan(nextKey, endKey, 1)
		if err != nil {
			return num, err
		}
		// No more keys exist in the given range.
		if len(kvs) == 0 {
			break
		}

		currentKey := kvs[0].Key
		_, existingTxnID, err := mvcc.Get(currentKey, hlc.MaxHLTimestamp, txnID)
		// Return the error unless it's a writeIntentError, which
		// will occur in the event we scan a key with a write
		// intent belonging to a different transaction.
		if _, ok := err.(*writeIntentError); err != nil && !ok {
			return num, err
		}
		// ResolveWriteIntentRange only needs to deal with the write
		// intents for the given txnID.
		if err == nil && existingTxnID == txnID {
			// Commit or abort (roll back) the write intent of
			// the given txnID.
			err = mvcc.ResolveWriteIntent(currentKey, txnID, commit)
			if err != nil {
				return num, err
			}
			num++
		}

		if max != 0 && max == num {
			break
		}

		// In order to efficiently skip the possibly long list of
		// old versions for this key, refer to the Scan function
		// for details.
		nextKey = NextKey(mvccEncodeKey(currentKey, hlc.MinHLTimestamp))
	}
	return num, nil
}
// updateForBatch updates the first argument (the header of a request contained
// in a batch) from the second one (the batch header), returning an error when
// inconsistencies are found.
// It is checked that the individual call does not have a User, UserPriority
// or Txn set that differs from the batch's.
func updateForBatch(aHeader *proto.RequestHeader, bHeader proto.RequestHeader) error {
	// Disallow transaction, user and priority on individual calls, unless
	// equal.
	if aHeader.User != "" && aHeader.User != bHeader.User {
		return util.Error("conflicting user on call in batch")
	}
	if aPrio := aHeader.GetUserPriority(); aPrio != proto.Default_RequestHeader_UserPriority && aPrio != bHeader.GetUserPriority() {
		return util.Error("conflicting user priority on call in batch")
	}
	if aHeader.Txn != nil && !aHeader.Txn.Equal(bHeader.Txn) {
		return util.Error("conflicting transaction in transactional batch")
	}
	aHeader.User = bHeader.User
	aHeader.UserPriority = bHeader.UserPriority
	aHeader.Txn = bHeader.Txn
	return nil
}
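// propagateBatchHeader is a hedged usage sketch, not part of the original
// sources: it mirrors the batch-unrolling shape of sendBatch and reconciles
// each embedded request's header with the batch header via updateForBatch.
// The embedded RequestHeader field and the Requests/GetValue accessors on
// InternalBatchRequest are assumptions inferred from the surrounding code.
func propagateBatchHeader(batchArgs *proto.InternalBatchRequest) error {
	for i := range batchArgs.Requests {
		args := batchArgs.Requests[i].GetValue().(proto.Request)
		// Copy the batch's User, UserPriority and Txn onto the call,
		// erroring out on any conflicting per-call values.
		if err := updateForBatch(args.Header(), batchArgs.RequestHeader); err != nil {
			return err
		}
	}
	return nil
}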
// getDescriptors takes a call and looks up the corresponding range
// descriptors associated with it. First, the range descriptor for
// call.Args.Key is looked up. If call.Args.EndKey exceeds that of the
// returned descriptor, the next descriptor is obtained as well.
func (ds *DistSender) getDescriptors(call proto.Call) (*proto.RangeDescriptor, *proto.RangeDescriptor, error) {
	// If this is a PushTxn, set ignoreIntents option as
	// necessary. This prevents a potential infinite loop; see the
	// comments in proto.RangeLookupRequest.
	options := lookupOptions{}
	if pushArgs, ok := call.Args.(*proto.PushTxnRequest); ok {
		options.ignoreIntents = pushArgs.RangeLookup
	}

	var desc *proto.RangeDescriptor
	var err error
	_, isReverseScan := call.Args.(*proto.ReverseScanRequest)
	if !isReverseScan {
		desc, err = ds.rangeCache.LookupRangeDescriptor(call.Args.Header().Key, options)
	} else {
		options.useReverseScan = true
		desc, err = ds.rangeCache.LookupRangeDescriptor(call.Args.Header().EndKey, options)
	}
	if err != nil {
		return nil, nil, err
	}

	// Checks whether we need to get the next range descriptor. If so, returns
	// true and the key to look up, depending on whether we're in reverse mode.
	needAnother := func(desc *proto.RangeDescriptor, isReverseScan bool) (proto.Key, bool) {
		if isReverseScan {
			return desc.StartKey, call.Args.Header().Key.Less(desc.StartKey)
		}
		return desc.EndKey, desc.EndKey.Less(call.Args.Header().EndKey)
	}

	var descNext *proto.RangeDescriptor
	// If the request accesses keys beyond the end of this range,
	// get the descriptor of the adjacent range to address next.
	if nextKey, ok := needAnother(desc, isReverseScan); ok {
		if _, ok := call.Reply.(proto.Combinable); !ok {
			return nil, nil, util.Error("illegal cross-range operation")
		}
		// If there's no transaction and the op spans ranges, possibly
		// re-run as part of a transaction for consistency. The
		// case where we don't need to re-run is if the read
		// consistency is not required.
		if call.Args.Header().Txn == nil && call.Args.Header().ReadConsistency != proto.INCONSISTENT {
			return nil, nil, &proto.OpRequiresTxnError{}
		}
		// This next lookup is likely for free since we've read the
		// previous descriptor and range lookups use cache
		// prefetching.
		descNext, err = ds.rangeCache.LookupRangeDescriptor(nextKey, options)
		if err != nil {
			return nil, nil, err
		}
	}
	return desc, descNext, nil
}
// CreateGroup creates a new consensus group and joins it. The application should
// arrange to call CreateGroup on all nodes named in initialMembers.
func (m *MultiRaft) CreateGroup(groupID GroupID, initialMembers []NodeID) error {
	for _, id := range initialMembers {
		if !id.isSet() {
			return util.Error("Invalid NodeID")
		}
	}
	op := &createGroupOp{newGroup(groupID, initialMembers), make(chan error)}
	m.ops <- op
	return <-op.ch
}
// initStores initializes the Stores map from id to Store. Stores are
// added to the local sender if already bootstrapped. A bootstrapped
// Store has a valid ident with cluster, node and Store IDs set. If
// the Store doesn't yet have a valid ident, it's added to the
// bootstraps list for initialization once the cluster and node IDs
// have been determined.
func (n *Node) initStores(engines []engine.Engine, stopper *util.Stopper) error {
	bootstraps := list.New()

	if len(engines) == 0 {
		return util.Error("no engines")
	}
	for _, e := range engines {
		s := storage.NewStore(n.ctx, e, &n.Descriptor)
		// Initialize each store in turn, handling un-bootstrapped errors by
		// adding the store to the bootstraps list.
		if err := s.Start(stopper); err != nil {
			if _, ok := err.(*storage.NotBootstrappedError); ok {
				log.Infof("store %s not bootstrapped", s)
				bootstraps.PushBack(s)
				continue
			}
			return util.Errorf("failed to start store: %s", err)
		}
		if s.Ident.ClusterID == "" || s.Ident.NodeID == 0 {
			return util.Errorf("unidentified store: %s", s)
		}
		capacity, err := s.Capacity()
		if err != nil {
			return util.Errorf("could not query store capacity: %s", err)
		}
		log.Infof("initialized store %s: %+v", s, capacity)
		n.lSender.AddStore(s)
	}

	// Verify all initialized stores agree on cluster and node IDs.
	if err := n.validateStores(); err != nil {
		return err
	}

	// Connect gossip before starting bootstrap. For new nodes, connecting
	// to the gossip network is necessary to get the cluster ID.
	n.connectGossip()

	// If no NodeID has been assigned yet, allocate a new node ID by
	// supplying 0 to initNodeID.
	if n.Descriptor.NodeID == 0 {
		n.initNodeID(0)
	}

	// Bootstrap any uninitialized stores asynchronously.
	if bootstraps.Len() > 0 && stopper.StartTask() {
		go func() {
			n.bootstrapStores(bootstraps, stopper)
			stopper.FinishTask()
		}()
	}
	return nil
}
// Merge updates the value of this Counter with the supplied
// update and returns an error in case of an integer overflow.
func (n Counter) Merge(o Mergable) (Mergable, error) {
	m, ok := o.(Counter)
	if !ok {
		return n, util.Error("parameter is of wrong type")
	}
	if encoding.WillOverflow(int64(n), int64(m)) {
		return n, util.Errorf("merge error: %d + %d overflows", n, m)
	}
	result := Counter(n + m)
	return result, nil
}
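// exampleCounterMerge is a hedged usage sketch, not part of the original
// sources: it assumes Counter is an int64-based Mergable, so merging adds the
// two values and surfaces an error on int64 overflow instead of silently
// wrapping. It assumes the standard library math package is imported for
// math.MaxInt64.
func exampleCounterMerge() {
	sum, err := Counter(40).Merge(Counter(2))
	// sum is Counter(42) and err is nil.
	_, _ = sum, err

	_, err = Counter(math.MaxInt64).Merge(Counter(1))
	// err is non-nil because the merge would overflow int64.
	_ = err
}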
// ResolveWriteIntentRange commits or aborts (rolls back) the range of
// write intents specified by start and end keys for a given txnID
// according to commit parameter. ResolveWriteIntentRange will skip
// write intents of other txnIDs. Specify max=0 for unbounded
// resolves.
func (mvcc *MVCC) ResolveWriteIntentRange(key Key, endKey Key, max int64, txnID []byte, commit bool) (int64, error) {
	if len(txnID) == 0 {
		return 0, util.Error("missing txnID in request")
	}

	binKey := encoding.EncodeBinary(nil, key)
	binEndKey := encoding.EncodeBinary(nil, endKey)
	nextKey := binKey

	num := int64(0)
	for {
		kvs, err := mvcc.engine.Scan(nextKey, binEndKey, 1)
		if err != nil {
			return num, err
		}
		// No more keys exist in the given range.
		if len(kvs) == 0 {
			break
		}

		remainder, currentKey := encoding.DecodeBinary(kvs[0].Key)
		if len(remainder) != 0 {
			return 0, util.Errorf("expected an MVCC metadata key: %s", kvs[0].Key)
		}
		_, _, existingTxnID, err := mvcc.getInternal(kvs[0].Key, proto.MaxTimestamp, txnID)
		// Return the error unless it's a writeIntentError, which
		// will occur in the event we scan a key with a write
		// intent belonging to a different transaction.
		if _, ok := err.(*writeIntentError); err != nil && !ok {
			return num, err
		}
		// ResolveWriteIntent only needs to deal with the write
		// intents for the given txnID.
		if err == nil && bytes.Equal(existingTxnID, txnID) {
			// Commit or abort (roll back) the write intent of
			// the given txnID.
			err = mvcc.ResolveWriteIntent(currentKey, txnID, commit)
			if err != nil {
				return num, err
			}
			num++
		}

		if max != 0 && max == num {
			break
		}

		// In order to efficiently skip the possibly long list of
		// old versions for this key, refer to Scan for details.
		nextKey = encoding.EncodeBinary(nil, NextKey(currentKey))
	}
	return num, nil
}
// goMerge takes existing and update byte slices. It first attempts
// to gob-unmarshal the update string, returning an error on failure.
// The unmarshaled value must satisfy the Mergable interface.
// Next, it unmarshals the existing string, falling back to the init
// value supplied by the update value's Init() method if necessary.
// The two values obtained in this way are merged and the result, or
// an error, returned.
func goMerge(existing, update []byte) ([]byte, error) {
	u, err := encoding.GobDecode(update)
	if err != nil {
		return nil, util.Errorf("merge: %v", err)
	}
	if _, ok := u.(Mergable); !ok {
		return nil, util.Error("update is not Mergable")
	}
	e, err := encoding.GobDecode(existing)
	if err != nil {
		e = u.(Mergable).Init(existing)
	}
	if reflect.TypeOf(u) != reflect.TypeOf(e) {
		return nil, util.Error("cannot merge values of different type")
	}
	newValue, err := e.(Mergable).Merge(u.(Mergable))
	if err != nil {
		return nil, err
	}
	return encoding.GobEncode(newValue)
}
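// exampleGoMerge is a hedged usage sketch, not part of the original sources:
// goMerge operates on gob-encoded Mergable values, so a counter increment can
// be expressed as two encoded counters. It assumes encoding.GobEncode returns
// ([]byte, error), as implied by goMerge's final return statement, and that
// Counter is the Mergable defined elsewhere in this package.
func exampleGoMerge() ([]byte, error) {
	existing, err := encoding.GobEncode(Counter(10))
	if err != nil {
		return nil, err
	}
	update, err := encoding.GobEncode(Counter(5))
	if err != nil {
		return nil, err
	}
	// The returned bytes are expected to gob-decode to Counter(15).
	return goMerge(existing, update)
}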
// ResolveWriteIntent either commits or aborts (rolls back) an extant
// write intent for a given txnID according to commit parameter.
// ResolveWriteIntent will skip write intents of other txnIDs.
func (mvcc *MVCC) ResolveWriteIntent(key Key, txnID []byte, commit bool) error {
	if len(txnID) == 0 {
		return util.Error("missing txnID in request")
	}

	binKey := encoding.EncodeBinary(nil, key)
	meta := &proto.MVCCMetadata{}
	ok, err := GetProto(mvcc.engine, binKey, meta)
	if err != nil {
		return err
	}
	if !ok {
		return util.Errorf("key %q does not exist", key)
	}

	if len(meta.TxnID) == 0 {
		return util.Errorf("write intent %q does not exist", key)
	}
	if !bytes.Equal(meta.TxnID, txnID) {
		return util.Errorf("cannot commit another TxnID %s from TxnID %s", meta.TxnID, txnID)
	}

	if !commit {
		latestKey := mvccEncodeKey(binKey, meta.Timestamp)
		err = mvcc.engine.Clear(latestKey)
		if err != nil {
			return err
		}

		// Compute the next possible mvcc value for this key.
		nextKey := NextKey(latestKey)
		// Compute the last possible mvcc value for this key.
		endScanKey := encoding.EncodeBinary(nil, NextKey(key))
		kvs, err := mvcc.engine.Scan(nextKey, endScanKey, 1)
		if err != nil {
			return err
		}
		// If there is no other version, we should just clean up the key entirely.
		if len(kvs) == 0 {
			return mvcc.engine.Clear(binKey)
		}
		_, ts, isValue := mvccDecodeKey(kvs[0].Key)
		if !isValue {
			return util.Errorf("expected an MVCC value key: %s", kvs[0].Key)
		}
		// Update the keyMetadata with the next version.
		return PutProto(mvcc.engine, binKey, &proto.MVCCMetadata{TxnID: nil, Timestamp: ts})
	}

	return PutProto(mvcc.engine, binKey, &proto.MVCCMetadata{TxnID: nil, Timestamp: meta.Timestamp})
}
// NewMultiRaft creates a MultiRaft object.
func NewMultiRaft(nodeID proto.RaftNodeID, config *Config, stopper *util.Stopper) (*MultiRaft, error) {
	if nodeID == 0 {
		return nil, util.Error("Invalid RaftNodeID")
	}
	if err := config.validate(); err != nil {
		return nil, err
	}

	if config.Ticker == nil {
		config.Ticker = newTicker(config.TickInterval)
		stopper.AddCloser(config.Ticker)
	}

	if config.EntryFormatter != nil {
		// Wrap the EntryFormatter to strip off the command id.
		ef := config.EntryFormatter
		config.EntryFormatter = func(data []byte) string {
			if len(data) == 0 {
				return "[empty]"
			}
			id, cmd := decodeCommand(data)
			formatted := ef(cmd)
			return fmt.Sprintf("%x: %s", id, formatted)
		}
	}

	m := &MultiRaft{
		Config:    *config,
		stopper:   stopper,
		multiNode: raft.StartMultiNode(uint64(nodeID)),
		nodeID:    nodeID,

		// Output channel.
		Events: make(chan interface{}, config.EventBufferSize),

		// Input channels.
		reqChan:         make(chan *RaftMessageRequest),
		createGroupChan: make(chan *createGroupOp),
		removeGroupChan: make(chan *removeGroupOp),
		proposalChan:    make(chan *proposal),
		callbackChan:    make(chan func()),
	}

	if err := m.Transport.Listen(nodeID, (*multiraftServer)(m)); err != nil {
		return nil, err
	}

	return m, nil
}
// rotateLeft performs a left rotation around the node.
func (tc *treeContext) rotateLeft(node *proto.RangeTreeNode) (*proto.RangeTreeNode, error) {
	right, err := tc.getNode(node.RightKey)
	if err != nil {
		return nil, err
	}
	if right.Black {
		return nil, util.Error("rotating a black node")
	}
	node.RightKey = right.LeftKey
	right.LeftKey = &node.Key
	right.Black = node.Black
	node.Black = false
	tc.setNode(node)
	tc.setNode(right)
	return right, nil
}
// rotateRight performs a right rotation around the node.
func (tc *treeContext) rotateRight(node *proto.RangeTreeNode) (*proto.RangeTreeNode, error) {
	left, err := tc.getNode(node.LeftKey)
	if err != nil {
		return nil, err
	}
	if left.Black {
		return nil, util.Error("rotating a black node")
	}
	node.LeftKey = left.RightKey
	left.RightKey = &node.Key
	left.Black = node.Black
	node.Black = false
	tc.setNode(node)
	tc.setNode(left)
	return left, nil
}
// NewMultiRaft creates a MultiRaft object.
func NewMultiRaft(nodeID NodeID, config *Config) (*MultiRaft, error) {
	if nodeID == 0 {
		return nil, util.Error("Invalid NodeID")
	}
	err := config.validate()
	if err != nil {
		return nil, err
	}

	if config.Ticker == nil {
		config.Ticker = newTicker(config.TickInterval)
	}

	if config.EntryFormatter != nil {
		// Wrap the EntryFormatter to strip off the command id.
		ef := config.EntryFormatter
		config.EntryFormatter = func(data []byte) string {
			if len(data) == 0 {
				return "[empty]"
			}
			id, cmd := decodeCommand(data)
			formatted := ef(cmd)
			return fmt.Sprintf("%x: %s", id, formatted)
		}
	}

	m := &MultiRaft{
		Config: *config,
		multiNode: raft.StartMultiNode(uint64(nodeID), config.ElectionTimeoutTicks,
			config.HeartbeatIntervalTicks),
		nodeID:          nodeID,
		Events:          make(chan interface{}, 1000),
		reqChan:         make(chan *RaftMessageRequest, 100),
		createGroupChan: make(chan *createGroupOp, 100),
		removeGroupChan: make(chan *removeGroupOp, 100),
		proposalChan:    make(chan *proposal, 100),
		callbackChan:    make(chan func(), 100),
		stopper:         util.NewStopper(1),
	}
	err = m.Transport.Listen(nodeID, (*multiraftServer)(m))
	if err != nil {
		return nil, err
	}
	return m, nil
}
// RemoveTarget returns a suitable replica to remove from the provided replica
// set. It attempts to consider which of the provided replicas would be the best
// candidate for removal.
//
// TODO(mrtracy): RemoveTarget eventually needs to accept the attributes from
// the zone config associated with the provided replicas. This will allow it to
// make correct decisions in the case of ranges with heterogeneous replica
// requirements (i.e. multiple data centers).
func (a *allocator) RemoveTarget(existing []proto.Replica) (proto.Replica, error) {
	if len(existing) == 0 {
		return proto.Replica{}, util.Error("must supply at least one replica to allocator.RemoveTarget()")
	}
	a.Lock()
	defer a.Unlock()

	// Retrieve store descriptors for the provided replicas from gossip.
	type replStore struct {
		repl  proto.Replica
		store *proto.StoreDescriptor
	}
	replStores := make([]replStore, len(existing))
	usedStat := stat{}
	for i := range existing {
		desc, err := storeDescFromGossip(gossip.MakeStoreKey(existing[i].StoreID), a.gossip)
		if err != nil {
			return proto.Replica{}, err
		}
		replStores[i] = replStore{
			repl:  existing[i],
			store: desc,
		}
		usedStat.Update(desc.Capacity.FractionUsed())
	}

	// Based on store statistics, determine which replica is the "worst" and
	// thus should be removed.
	var worst replStore
	for i, rs := range replStores {
		if i == 0 {
			worst = rs
			continue
		}
		if usedStat.mean < minFractionUsedThreshold {
			if rs.store.Capacity.RangeCount > worst.store.Capacity.RangeCount {
				worst = rs
			}
			continue
		}
		if rs.store.Capacity.FractionUsed() > worst.store.Capacity.FractionUsed() {
			worst = rs
		}
	}

	return worst.repl, nil
}
// addAdminCmd executes the command directly. There is no interaction
// with the command queue or the timestamp cache, as admin commands
// are not meant to consistently access or modify the underlying data.
// Admin commands must run on the leader replica.
func (r *Range) addAdminCmd(ctx context.Context, args proto.Request, reply proto.Response) error {
	// Admin commands always require the leader lease.
	if err := r.redirectOnOrAcquireLeaderLease(args.Header().Timestamp); err != nil {
		reply.Header().SetGoError(err)
		return err
	}

	switch args.(type) {
	case *proto.AdminSplitRequest:
		r.AdminSplit(args.(*proto.AdminSplitRequest), reply.(*proto.AdminSplitResponse))
	case *proto.AdminMergeRequest:
		r.AdminMerge(args.(*proto.AdminMergeRequest), reply.(*proto.AdminMergeResponse))
	default:
		return util.Error("unrecognized admin command")
	}
	return reply.Header().GoError()
}
// addLogEntry appends a new pending log entry for the given group, using the
// group's current term and next log index. It currently errors out if this
// node is not the group's leader.
func (s *state) addLogEntry(groupID GroupID, entryType LogEntryType, payload []byte) error {
	g := s.groups[groupID]
	if g.role != RoleLeader {
		return util.Error("TODO(bdarnell): forward commands to leader")
	}

	g.lastLogIndex++
	entry := &LogEntry{
		Term:    g.electionState.CurrentTerm,
		Index:   g.lastLogIndex,
		Type:    entryType,
		Payload: payload,
	}
	g.pendingEntries = append(g.pendingEntries, entry)
	s.updateDirtyStatus(g)
	return nil
}
// ResolveWriteIntent either commits or aborts (rolls back) an extant
// write intent for a given txnID according to commit parameter.
// ResolveWriteIntent will skip write intents of other txnIDs.
func (mvcc *MVCC) ResolveWriteIntent(key Key, txnID string, commit bool) error {
	if len(txnID) == 0 {
		return util.Error("missing txnID in request")
	}

	keyMeta := &keyMetadata{}
	ok, err := GetI(mvcc.engine, key, keyMeta)
	if err != nil {
		return err
	}
	if !ok {
		return util.Errorf("key %q does not exist", key)
	}

	if len(keyMeta.TxnID) == 0 {
		return util.Errorf("write intent %q does not exist", key)
	}
	if keyMeta.TxnID != txnID {
		return util.Errorf("cannot commit another TxnID %s from TxnID %s", keyMeta.TxnID, txnID)
	}

	if !commit {
		latestKey := mvccEncodeKey(key, keyMeta.Timestamp)
		err = mvcc.engine.Clear(latestKey)
		if err != nil {
			return err
		}

		nextKey := NextKey(latestKey)
		kvs, err := mvcc.engine.Scan(nextKey, PrefixEndKey(key), 1)
		if err != nil {
			return err
		}
		// If there is no other version, we should just clean up the key entirely.
		if len(kvs) == 0 {
			return mvcc.engine.Clear(key)
		}
		_, ts := mvccDecodeKey(kvs[0].Key)
		// Update the keyMetadata with the next version.
		return PutI(mvcc.engine, key, &keyMetadata{TxnID: "", Timestamp: ts})
	}

	return PutI(mvcc.engine, key, &keyMetadata{TxnID: "", Timestamp: keyMeta.Timestamp})
}