Example #1
// updateForBatch updates the first argument (the header of a request contained
// in a batch) from the second one (the batch header), returning an error when
// inconsistencies are found.
// It checks that the individual call does not have a User, UserPriority,
// or Txn set that differs from the batch's.
func updateForBatch(args proto.Request, bHeader proto.RequestHeader) error {
	// Disallow transaction, user and priority on individual calls, unless
	// equal.
	aHeader := args.Header()
	if aHeader.User != "" && aHeader.User != bHeader.User {
		return util.Error("conflicting user on call in batch")
	}
	if aPrio := aHeader.GetUserPriority(); aPrio != proto.Default_RequestHeader_UserPriority && aPrio != bHeader.GetUserPriority() {
		return util.Error("conflicting user priority on call in batch")
	}
	aHeader.User = bHeader.User
	aHeader.UserPriority = bHeader.UserPriority
	// Only allow individual transactions on the requests of a batch if
	// - the batch is non-transactional,
	// - the individual transaction does not write intents, and
	// - the individual transaction is initialized.
	// The main use of this is to allow mass resolution of intents, which
	// entails sending a non-txn batch of transactional InternalResolveIntent.
	if aHeader.Txn != nil && !aHeader.Txn.Equal(bHeader.Txn) {
		if len(aHeader.Txn.ID) == 0 || proto.IsTransactionWrite(args) || bHeader.Txn != nil {
			return util.Error("conflicting transaction in transactional batch")
		}
	} else {
		aHeader.Txn = bHeader.Txn
	}
	return nil
}
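A minimal, self-contained sketch of the same inherit-or-reject pattern, using a hypothetical stand-in struct instead of proto.RequestHeader:

package main

import (
	"errors"
	"fmt"
)

// header is a hypothetical stand-in for proto.RequestHeader.
type header struct {
	User string
}

// inherit copies the batch-level user onto the call header, rejecting a
// conflicting value that was already set on the call itself.
func inherit(call *header, batch header) error {
	if call.User != "" && call.User != batch.User {
		return errors.New("conflicting user on call in batch")
	}
	call.User = batch.User
	return nil
}

func main() {
	call := &header{}
	if err := inherit(call, header{User: "root"}); err != nil {
		fmt.Println(err)
		return
	}
	fmt.Println(call.User) // root
}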
Example #2
// Validate returns an error if any required elements of the Config are missing or invalid.
// Called automatically by NewMultiRaft.
func (c *Config) Validate() error {
	if c.Transport == nil {
		return util.Error("Transport is required")
	}
	if c.ElectionTimeoutMin == 0 || c.ElectionTimeoutMax == 0 {
		return util.Error("ElectionTimeout{Min,Max} must be non-zero")
	}
	if c.ElectionTimeoutMin > c.ElectionTimeoutMax {
		return util.Error("ElectionTimeoutMin must be <= ElectionTimeoutMax")
	}
	return nil
}
Example #3
// addReadOnlyCmd updates the read timestamp cache and waits for any
// overlapping writes currently being processed through Raft ahead of
// us to clear via the command queue.
func (r *Range) addReadOnlyCmd(ctx context.Context, args proto.Request, reply proto.Response) error {
	header := args.Header()

	if err := r.checkCmdHeader(header); err != nil {
		reply.Header().SetGoError(err)
		return err
	}

	// If read-consistency is set to INCONSISTENT, run directly.
	if header.ReadConsistency == proto.INCONSISTENT {
		// But disallow any inconsistent reads within txns.
		if header.Txn != nil {
			reply.Header().SetGoError(util.Error("cannot allow inconsistent reads within a transaction"))
			return reply.Header().GoError()
		}
		if header.Timestamp.Equal(proto.ZeroTimestamp) {
			header.Timestamp = r.rm.Clock().Now()
		}
		intents, err := r.executeCmd(r.rm.Engine(), nil, args, reply)
		if err == nil {
			r.handleSkippedIntents(args, intents)
		}
		return err
	} else if header.ReadConsistency == proto.CONSENSUS {
		reply.Header().SetGoError(util.Error("consensus reads not implemented"))
		return reply.Header().GoError()
	}

	// Add the read to the command queue to gate subsequent
	// overlapping commands until this command completes.
	cmdKey := r.beginCmd(header, true)

	// This replica must hold the leader lease to process a consistent read.
	if err := r.redirectOnOrAcquireLeaderLease(tracer.FromCtx(ctx), header.Timestamp); err != nil {
		r.endCmd(cmdKey, args, err, true /* readOnly */)
		reply.Header().SetGoError(err)
		return err
	}

	// Execute read-only command.
	intents, err := r.executeCmd(r.rm.Engine(), nil, args, reply)

	// Only update the timestamp cache if the command succeeded.
	r.endCmd(cmdKey, args, err, true /* readOnly */)

	if err == nil {
		r.handleSkippedIntents(args, intents)
	}
	return err
}
Example #4
// validate returns an error if any required elements of the Config are missing or invalid.
// Called automatically by NewMultiRaft.
func (c *Config) validate() error {
	if c.Transport == nil {
		return util.Error("Transport is required")
	}
	if c.ElectionTimeoutTicks == 0 {
		return util.Error("ElectionTimeoutTicks must be non-zero")
	}
	if c.HeartbeatIntervalTicks == 0 {
		return util.Error("HeartbeatIntervalTicks must be non-zero")
	}
	if c.TickInterval == 0 {
		return util.Error("TickInterval must be non-zero")
	}
	return nil
}
Example #5
// initStores initializes the Stores map from id to Store. Stores are
// added to the local sender if already bootstrapped. A bootstrapped
// Store has a valid ident with cluster, node and Store IDs set. If
// the Store doesn't yet have a valid ident, it's added to the
// bootstraps list for initialization once the cluster and node IDs
// have been determined.
func (n *Node) initStores(clock *hlc.Clock, engines []engine.Engine) error {
	bootstraps := list.New()

	if len(engines) == 0 {
		return util.Error("no engines")
	}
	for _, e := range engines {
		// TODO(bdarnell): use a real transport here instead of NewLocalRPCTransport.
		// TODO(bdarnell): arrange to have the transport closed.
		s := storage.NewStore(clock, e, n.db, n.gossip, multiraft.NewLocalRPCTransport())
		// Initialize each store in turn, handling un-bootstrapped errors by
		// adding the store to the bootstraps list.
		if err := s.Start(); err != nil {
			if _, ok := err.(*storage.NotBootstrappedError); ok {
				bootstraps.PushBack(s)
				continue
			}
			return err
		}
		if s.Ident.ClusterID != "" {
			if s.Ident.StoreID == 0 {
				return util.Error("cluster id set for node ident but missing store id")
			}
			capacity, err := s.Capacity()
			if err != nil {
				return err
			}
			log.Infof("initialized store %s: %+v", s, capacity)
			n.lSender.AddStore(s)
		}
	}

	// Verify all initialized stores agree on cluster and node IDs.
	if err := n.validateStores(); err != nil {
		return err
	}

	// Connect gossip before starting bootstrap. For new nodes, connecting
	// to the gossip network is necessary to get the cluster ID.
	n.connectGossip()

	// Bootstrap any uninitialized stores asynchronously.
	if bootstraps.Len() > 0 {
		go n.bootstrapStores(bootstraps)
	}

	return nil
}
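initStores queues un-bootstrapped stores on a container/list and drains the list later, once the cluster and node IDs are known. A self-contained sketch of that queue-then-drain pattern (the store names are stand-ins):

package main

import (
	"container/list"
	"fmt"
)

func main() {
	pending := list.New()
	for _, name := range []string{"store-1", "store-2"} {
		pending.PushBack(name) // queue for later bootstrap
	}
	// Drain once prerequisites (cluster and node IDs) are available.
	for e := pending.Front(); e != nil; e = e.Next() {
		fmt.Println("bootstrapping", e.Value.(string))
	}
}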
Example #6
// CreateRange allocates a new range ID and stores range metadata.
// On success, returns the new range.
func (s *Store) CreateRange(startKey, endKey engine.Key, replicas []proto.Replica) (*Range, error) {
	rangeID, err := engine.Increment(s.engine, engine.KeyLocalRangeIDGenerator, 1)
	if err != nil {
		return nil, err
	}
	if ok, _ := engine.GetProto(s.engine, makeRangeKey(rangeID), nil); ok {
		return nil, util.Error("newly allocated range ID already in use")
	}
	// RangeMetadata is stored local to this store only. It is neither
	// replicated via raft nor available via the global kv store.
	meta := &proto.RangeMetadata{
		ClusterID: s.Ident.ClusterID,
		RangeID:   rangeID,
		RangeDescriptor: proto.RangeDescriptor{
			StartKey: startKey,
			EndKey:   endKey,
			Replicas: replicas,
		},
	}
	err = engine.PutProto(s.engine, makeRangeKey(rangeID), meta)
	if err != nil {
		return nil, err
	}
	rng := NewRange(meta, s.clock, s.engine, s.allocator, s.gossip, s)
	rng.Start()
	s.mu.Lock()
	defer s.mu.Unlock()
	s.ranges[rangeID] = rng
	return rng, nil
}
Example #7
// newServerTLSConfig creates a server TLSConfig from the supplied byte strings containing
// - the certificate of this node (should be signed by the CA),
// - the private key of this node,
// - the certificate of the cluster CA.
func newServerTLSConfig(certPEM, keyPEM, caPEM []byte) (*tls.Config, error) {
	cert, err := tls.X509KeyPair(certPEM, keyPEM)
	if err != nil {
		return nil, err
	}

	certPool := x509.NewCertPool()

	if ok := certPool.AppendCertsFromPEM(caPEM); !ok {
		err = util.Error("failed to parse PEM data to pool")
		return nil, err
	}

	return &tls.Config{
		Certificates: []tls.Certificate{cert},
		// Verify client certs if passed.
		ClientAuth: tls.VerifyClientCertIfGiven,
		RootCAs:    certPool,
		ClientCAs:  certPool,

		// Use the default cipher suite from golang (RC4 is going away in 1.5).
		// Prefer the server-specified suite.
		PreferServerCipherSuites: true,

		// TLS 1.1 and 1.2 support is crappy out there. Let's use 1.0.
		MinVersion: tls.VersionTLS10,

		// Should we disable session resumption? This may break forward secrecy.
		// SessionTicketsDisabled: true,
	}, nil
}
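As a usage sketch, a *tls.Config built this way plugs directly into net/http. The file names below are hypothetical; any CA-signed server certificate and key pair works. This mirrors the function above with only standard-library calls:

package main

import (
	"crypto/tls"
	"crypto/x509"
	"log"
	"net/http"
	"os"
)

func main() {
	// Hypothetical paths; substitute the node's cert, key and CA cert.
	certPEM, err := os.ReadFile("node.crt")
	if err != nil {
		log.Fatal(err)
	}
	keyPEM, err := os.ReadFile("node.key")
	if err != nil {
		log.Fatal(err)
	}
	caPEM, err := os.ReadFile("ca.crt")
	if err != nil {
		log.Fatal(err)
	}

	cert, err := tls.X509KeyPair(certPEM, keyPEM)
	if err != nil {
		log.Fatal(err)
	}
	pool := x509.NewCertPool()
	if !pool.AppendCertsFromPEM(caPEM) {
		log.Fatal("failed to parse CA PEM")
	}

	srv := &http.Server{
		Addr: ":8443",
		TLSConfig: &tls.Config{
			Certificates: []tls.Certificate{cert},
			ClientAuth:   tls.VerifyClientCertIfGiven,
			ClientCAs:    pool,
		},
	}
	// Empty file arguments: the certificates come from TLSConfig.
	log.Fatal(srv.ListenAndServeTLS("", ""))
}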
Example #8
// CreateRange allocates a new range ID and stores range metadata.
// On success, returns the new range.
func (s *Store) CreateRange(startKey, endKey Key, replicas []Replica) (*Range, error) {
	rangeID, err := increment(s.engine, storeKeyRangeIDGenerator, 1, time.Now().UnixNano())
	if err != nil {
		return nil, err
	}
	if ok, _, _ := getI(s.engine, rangeKey(rangeID), nil); ok {
		return nil, util.Error("newly allocated range ID already in use")
	}
	// RangeMetadata is stored local to this store only. It is neither
	// replicated via raft nor available via the global kv store.
	meta := RangeMetadata{
		ClusterID: s.Ident.ClusterID,
		RangeID:   rangeID,
		StartKey:  startKey,
		EndKey:    endKey,
		Desc: RangeDescriptor{
			StartKey: startKey,
			Replicas: replicas,
		},
	}
	err = putI(s.engine, rangeKey(rangeID), meta)
	if err != nil {
		return nil, err
	}
	rng := NewRange(meta, s.engine, s.allocator, s.gossip)
	rng.Start()
	s.mu.Lock()
	defer s.mu.Unlock()
	s.ranges[rangeID] = rng
	return rng, nil
}
Example #9
// ParseTimestamp parses a timestamp string, trying each supported format in turn.
func ParseTimestamp(s DString) (DTimestamp, error) {
	str := string(s)
	t, err := time.Parse(timestampFormat, str)
	if err == nil {
		t = t.UTC()
		return DTimestamp{Time: t}, nil
	}
	t, err = time.Parse(timestampWithOffsetZoneFormat, str)
	if err == nil {
		t = t.UTC()
		return DTimestamp{Time: t}, nil
	}
	t, err = time.Parse(timestampWithNamedZoneFormat, str)
	if err == nil {
		// Parsing using a named time zone is imperfect for two reasons:
		// 1. Some named time zones are ambiguous (PST can be US PST or
		// Philippines PST), and 2. the code needs access to the entire
		// database of named time zones to compute a time offset, and
		// it's not clear what the memory requirements for that would be.
		// TODO(vivek): Implement SET TIME ZONE to set a time zone and use
		// time.ParseInLocation()
		return DummyTimestamp, util.Error("TODO(vivek): named time zone input not supported")
	}
	// Parse other formats in the future.
	return DummyTimestamp, err
}
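The fallback chain above is plain time.Parse against successive layouts, normalizing to UTC on success. A self-contained sketch of the technique; the layout strings are assumptions, since the timestampFormat constants are defined outside this snippet:

package main

import (
	"fmt"
	"time"
)

// Assumed layouts; the real timestampFormat constants live elsewhere.
var layouts = []string{
	"2006-01-02 15:04:05",       // no time zone
	"2006-01-02 15:04:05 -0700", // numeric offset zone
}

func parseTimestamp(s string) (time.Time, error) {
	var err error
	for _, layout := range layouts {
		var t time.Time
		if t, err = time.Parse(layout, s); err == nil {
			return t.UTC(), nil // normalize to UTC, as above
		}
	}
	return time.Time{}, err // last parse error
}

func main() {
	t, err := parseTimestamp("2015-06-01 12:00:00 -0700")
	fmt.Println(t, err) // 2015-06-01 19:00:00 +0000 UTC <nil>
}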
Example #10
// Merge appends the input value to the Appender and returns the result.
func (s Appender) Merge(t Mergable) (Mergable, error) {
	m, ok := t.(Appender)
	if !ok {
		return s, util.Error("parameter is of wrong type")
	}
	return append(s, m...), nil
}
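Appender is the simplest Mergable: concatenation. A self-contained sketch of the pattern; the Mergable interface is not shown in the snippet, so this one-method definition is an assumption:

package main

import (
	"errors"
	"fmt"
)

// Mergable is assumed to look roughly like this.
type Mergable interface {
	Merge(Mergable) (Mergable, error)
}

// Appender concatenates byte slices on merge.
type Appender []byte

func (s Appender) Merge(t Mergable) (Mergable, error) {
	m, ok := t.(Appender)
	if !ok {
		return s, errors.New("parameter is of wrong type")
	}
	return append(s, m...), nil
}

func main() {
	out, err := Appender("foo").Merge(Appender("bar"))
	fmt.Println(string(out.(Appender)), err) // foobar <nil>
}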
Example #11
// NewMultiRaft creates a MultiRaft object.
func NewMultiRaft(nodeID NodeID, config *Config) (*MultiRaft, error) {
	if !nodeID.isSet() {
		return nil, util.Error("Invalid NodeID")
	}
	err := config.Validate()
	if err != nil {
		return nil, err
	}

	if config.Clock == nil {
		config.Clock = RealClock
	}

	m := &MultiRaft{
		Config:   *config,
		nodeID:   nodeID,
		Events:   make(chan interface{}, 1000),
		ops:      make(chan interface{}, 100),
		requests: make(chan *rpc.Call, 100),
		stopped:  make(chan struct{}),
	}

	err = m.Transport.Listen(nodeID, m)
	if err != nil {
		return nil, err
	}

	return m, nil
}
Example #12
// replaceNode cuts a node away from its parent, substituting a new node or
// nil. The updated new node is returned. Note that this does not in fact alter
// the old node in any way, but only the old node's parent and the new node.
func (tc *treeContext) replaceNode(oldNode, newNode *proto.RangeTreeNode) (*proto.RangeTreeNode, error) {
	if oldNode.ParentKey == nil {
		if newNode == nil {
			return nil, util.Error("cannot replace the root node with nil")
		}
		// Update the root key if this was the root.
		tc.setRootKey(newNode.Key)
	} else {
		oldParent, err := tc.getNode(oldNode.ParentKey)
		if err != nil {
			return nil, err
		}
		if oldParent.LeftKey != nil && oldNode.Key.Equal(oldParent.LeftKey) {
			if newNode == nil {
				oldParent.LeftKey = nil
			} else {
				oldParent.LeftKey = newNode.Key
			}
		} else {
			if newNode == nil {
				oldParent.RightKey = nil
			} else {
				oldParent.RightKey = newNode.Key
			}
		}
		tc.setNode(oldParent)
	}
	if newNode != nil {
		newNode.ParentKey = oldNode.ParentKey
		tc.setNode(newNode)
	}
	return newNode, nil
}
Example #13
// sendBatch unrolls a batched command and sends each constituent
// command in order; sending them in parallel remains a TODO.
func (tc *TxnCoordSender) sendBatch(batchArgs *proto.InternalBatchRequest, batchReply *proto.InternalBatchResponse) {
	// Prepare the calls by unrolling the batch. If the batchReply is
	// pre-initialized with replies, use those; otherwise create replies
	// as needed.
	// TODO(spencer): send calls in parallel.
	batchReply.Txn = batchArgs.Txn
	for i := range batchArgs.Requests {
		args := batchArgs.Requests[i].GetValue().(proto.Request)
		call := proto.Call{Args: args}
		// Disallow transaction, user and priority on individual calls, unless
		// equal.
		if args.Header().User != "" && args.Header().User != batchArgs.User {
			batchReply.Header().SetGoError(util.Error("cannot have individual user on call in batch"))
			return
		}
		args.Header().User = batchArgs.User
		if args.Header().UserPriority != nil && args.Header().GetUserPriority() != batchArgs.GetUserPriority() {
			batchReply.Header().SetGoError(util.Error("cannot have individual user priority on call in batch"))
			return
		}
		args.Header().UserPriority = batchArgs.UserPriority
		if txn := args.Header().Txn; txn != nil && !txn.Equal(batchArgs.Txn) {
			batchReply.Header().SetGoError(util.Error("cannot have individual transactional call in batch"))
			return
		}
		// Propagate batch Txn to each call.
		args.Header().Txn = batchArgs.Txn

		// Create a reply from the method type and add to batch response.
		if i >= len(batchReply.Responses) {
			call.Reply = args.CreateReply()
			batchReply.Add(call.Reply)
		} else {
			call.Reply = batchReply.Responses[i].GetValue().(proto.Response)
		}
		tc.sendOne(call)
		// Amalgamate transaction updates and propagate first error, if applicable.
		if batchReply.Txn != nil {
			batchReply.Txn.Update(call.Reply.Header().Txn)
		}
		if call.Reply.Header().Error != nil {
			batchReply.Error = call.Reply.Header().Error
			return
		}
	}
}
Example #14
// ResolveWriteIntentRange commits or aborts (rolls back)
// the range of write intents specified by start and end
// keys for a given txnID according to commit parameter.
// ResolveWriteIntentRange will skip write intents of
// other txnIDs. Specify max=0 for unbounded resolves.
func (mvcc *MVCC) ResolveWriteIntentRange(key Key, endKey Key, max int64, txnID string, commit bool) (int64, error) {
	if len(txnID) == 0 {
		return 0, util.Error("missing txnID in request")
	}

	nextKey := key
	// TODO(Jiang-Ming): remove this after we put everything via MVCC.
	// Currently, we need to skip the series of reserved system
	// key / value pairs covering accounting, range metadata, node
	// accounting and permissions before the actual key / value pairs
	// since they don't have keyMetadata.
	if nextKey.Less(PrefixEndKey(Key("\x00"))) {
		nextKey = PrefixEndKey(Key("\x00"))
	}

	num := int64(0)
	for {
		kvs, err := mvcc.engine.Scan(nextKey, endKey, 1)
		if err != nil {
			return num, err
		}
		// No more keys exist in the given range.
		if len(kvs) == 0 {
			break
		}

		currentKey := kvs[0].Key
		_, existingTxnID, err := mvcc.Get(currentKey, hlc.MaxHLTimestamp, txnID)
		// Return the error unless it's a writeIntentError, which
		// will occur in the event we scan a key with a write
		// intent belonging to a different transaction.
		if _, ok := err.(*writeIntentError); err != nil && !ok {
			return num, err
		}
		// ResolveWriteIntentRange only needs to deal with the write
		// intents for the given txnID.
		if err == nil && existingTxnID == txnID {
			// commits or aborts (rolls back) the write intent of
			// the given txnID.
			err = mvcc.ResolveWriteIntent(currentKey, txnID, commit)
			if err != nil {
				return num, err
			}
			num++
		}

		if max != 0 && max == num {
			break
		}

		// In order to efficiently skip the possibly long list of
		// old versions for this key, refer to Scan for details.
		nextKey = NextKey(mvccEncodeKey(currentKey, hlc.MinHLTimestamp))
	}

	return num, nil
}
Example #15
// updateForBatch updates the first argument (the header of a request contained
// in a batch) from the second one (the batch header), returning an error when
// inconsistencies are found.
// It checks that the individual call does not have a User, UserPriority,
// or Txn set that differs from the batch's.
func updateForBatch(aHeader *proto.RequestHeader, bHeader proto.RequestHeader) error {
	// Disallow transaction, user and priority on individual calls, unless
	// equal.
	if aHeader.User != "" && aHeader.User != bHeader.User {
		return util.Error("conflicting user on call in batch")
	}
	if aPrio := aHeader.GetUserPriority(); aPrio != proto.Default_RequestHeader_UserPriority && aPrio != bHeader.GetUserPriority() {
		return util.Error("conflicting user priority on call in batch")
	}
	if aHeader.Txn != nil && !aHeader.Txn.Equal(bHeader.Txn) {
		return util.Error("conflicting transaction in transactional batch")
	}

	aHeader.User = bHeader.User
	aHeader.UserPriority = bHeader.UserPriority
	aHeader.Txn = bHeader.Txn
	return nil
}
Example #16
// getDescriptors takes a call and looks up the corresponding range
// descriptors associated with it. First, the range descriptor for
// call.Args.Key is looked up. If call.Args.EndKey exceeds that of the
// returned descriptor, the next descriptor is obtained as well.
func (ds *DistSender) getDescriptors(call proto.Call) (*proto.RangeDescriptor, *proto.RangeDescriptor, error) {
	// If this is a PushTxn, set ignoreIntents option as
	// necessary. This prevents a potential infinite loop; see the
	// comments in proto.RangeLookupRequest.
	options := lookupOptions{}
	if pushArgs, ok := call.Args.(*proto.PushTxnRequest); ok {
		options.ignoreIntents = pushArgs.RangeLookup
	}

	var desc *proto.RangeDescriptor
	var err error
	_, isReverseScan := call.Args.(*proto.ReverseScanRequest)
	if !isReverseScan {
		desc, err = ds.rangeCache.LookupRangeDescriptor(call.Args.Header().Key, options)
	} else {
		options.useReverseScan = true
		desc, err = ds.rangeCache.LookupRangeDescriptor(call.Args.Header().EndKey, options)
	}

	if err != nil {
		return nil, nil, err
	}

	// needAnother returns the key to look up next and whether another
	// range descriptor is needed, depending on whether we're in reverse
	// mode.
	needAnother := func(desc *proto.RangeDescriptor, isReverseScan bool) (proto.Key, bool) {
		if isReverseScan {
			return desc.StartKey, call.Args.Header().Key.Less(desc.StartKey)
		}
		return desc.EndKey, desc.EndKey.Less(call.Args.Header().EndKey)
	}

	var descNext *proto.RangeDescriptor
	// If the request accesses keys beyond the end of this range,
	// get the descriptor of the adjacent range to address next.
	if nextKey, ok := needAnother(desc, isReverseScan); ok {
		if _, ok := call.Reply.(proto.Combinable); !ok {
			return nil, nil, util.Error("illegal cross-range operation")
		}
		// If there's no transaction and the op spans ranges, possibly
		// re-run as part of a transaction for consistency. The only
		// case where we don't need to re-run is when inconsistent
		// reads are allowed.
		if call.Args.Header().Txn == nil &&
			call.Args.Header().ReadConsistency != proto.INCONSISTENT {
			return nil, nil, &proto.OpRequiresTxnError{}
		}
		// This next lookup is likely for free since we've read the
		// previous descriptor and range lookups use cache
		// prefetching.
		descNext, err = ds.rangeCache.LookupRangeDescriptor(nextKey, options)
		if err != nil {
			return nil, nil, err
		}
	}
	return desc, descNext, nil
}
Example #17
// CreateGroup creates a new consensus group and joins it.  The application should
// arrange to call CreateGroup on all nodes named in initialMembers.
func (m *MultiRaft) CreateGroup(groupID GroupID, initialMembers []NodeID) error {
	for _, id := range initialMembers {
		if !id.isSet() {
			return util.Error("Invalid NodeID")
		}
	}
	op := &createGroupOp{newGroup(groupID, initialMembers), make(chan error)}
	m.ops <- op
	return <-op.ch
}
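CreateGroup shows the request/reply-over-channel idiom used throughout multiraft: package the arguments with a reply channel, hand them to the event loop, and block on the answer. A self-contained sketch of the idiom:

package main

import (
	"errors"
	"fmt"
)

// op carries a request plus the channel the event loop replies on.
type op struct {
	name string
	ch   chan error
}

func main() {
	ops := make(chan *op)

	// Event loop: owns all state and serializes every operation.
	go func() {
		for o := range ops {
			if o.name == "" {
				o.ch <- errors.New("invalid name")
				continue
			}
			o.ch <- nil // success
		}
	}()

	o := &op{name: "group-1", ch: make(chan error)}
	ops <- o
	fmt.Println(<-o.ch) // <nil>
}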
Example #18
// initStores initializes the Stores map from id to Store. Stores are
// added to the local sender if already bootstrapped. A bootstrapped
// Store has a valid ident with cluster, node and Store IDs set. If
// the Store doesn't yet have a valid ident, it's added to the
// bootstraps list for initialization once the cluster and node IDs
// have been determined.
func (n *Node) initStores(engines []engine.Engine, stopper *util.Stopper) error {
	bootstraps := list.New()

	if len(engines) == 0 {
		return util.Error("no engines")
	}
	for _, e := range engines {
		s := storage.NewStore(n.ctx, e, &n.Descriptor)
		// Initialize each store in turn, handling un-bootstrapped errors by
		// adding the store to the bootstraps list.
		if err := s.Start(stopper); err != nil {
			if _, ok := err.(*storage.NotBootstrappedError); ok {
				log.Infof("store %s not bootstrapped", s)
				bootstraps.PushBack(s)
				continue
			}
			return util.Errorf("failed to start store: %s", err)
		}
		if s.Ident.ClusterID == "" || s.Ident.NodeID == 0 {
			return util.Errorf("unidentified store: %s", s)
		}
		capacity, err := s.Capacity()
		if err != nil {
			return util.Errorf("could not query store capacity: %s", err)
		}
		log.Infof("initialized store %s: %+v", s, capacity)
		n.lSender.AddStore(s)
	}

	// Verify all initialized stores agree on cluster and node IDs.
	if err := n.validateStores(); err != nil {
		return err
	}

	// Connect gossip before starting bootstrap. For new nodes, connecting
	// to the gossip network is necessary to get the cluster ID.
	n.connectGossip()

	// If no NodeID has been assigned yet, allocate a new node ID by
	// supplying 0 to initNodeID.
	if n.Descriptor.NodeID == 0 {
		n.initNodeID(0)
	}

	// Bootstrap any uninitialized stores asynchronously.
	if bootstraps.Len() > 0 && stopper.StartTask() {
		go func() {
			n.bootstrapStores(bootstraps, stopper)
			stopper.FinishTask()
		}()
	}

	return nil
}
Example #19
// Merge updates the value of this Counter with the supplied
// update and returns an error in case of an integer overflow.
func (n Counter) Merge(o Mergable) (Mergable, error) {
	m, ok := o.(Counter)
	if !ok {
		return n, util.Error("parameter is of wrong type")
	}
	if encoding.WillOverflow(int64(n), int64(m)) {
		return n, util.Errorf("merge error: %d + %d overflows", n, m)
	}
	result := Counter(n + m)
	return result, nil
}
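encoding.WillOverflow is not shown here; a plausible stdlib-only equivalent of the signed-addition overflow check looks like this (the helper below is an assumption, not the actual implementation):

package main

import (
	"fmt"
	"math"
)

// willOverflow reports whether a+b over- or underflows int64.
func willOverflow(a, b int64) bool {
	if b > 0 {
		return a > math.MaxInt64-b
	}
	return a < math.MinInt64-b
}

func main() {
	fmt.Println(willOverflow(math.MaxInt64, 1)) // true
	fmt.Println(willOverflow(1, 2))             // false
}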
Example #20
// ResolveWriteIntentRange commits or aborts (rolls back) the range of
// write intents specified by start and end keys for a given txnID
// according to commit parameter.  ResolveWriteIntentRange will skip
// write intents of other txnIDs. Specify max=0 for unbounded
// resolves.
func (mvcc *MVCC) ResolveWriteIntentRange(key Key, endKey Key, max int64, txnID []byte, commit bool) (int64, error) {
	if len(txnID) == 0 {
		return 0, util.Error("missing txnID in request")
	}

	binKey := encoding.EncodeBinary(nil, key)
	binEndKey := encoding.EncodeBinary(nil, endKey)
	nextKey := binKey

	num := int64(0)
	for {
		kvs, err := mvcc.engine.Scan(nextKey, binEndKey, 1)
		if err != nil {
			return num, err
		}
		// No more keys exist in the given range.
		if len(kvs) == 0 {
			break
		}

		remainder, currentKey := encoding.DecodeBinary(kvs[0].Key)
		if len(remainder) != 0 {
			return 0, util.Errorf("expected an MVCC metadata key: %s", kvs[0].Key)
		}
		_, _, existingTxnID, err := mvcc.getInternal(kvs[0].Key, proto.MaxTimestamp, txnID)
		// Return the error unless it's a writeIntentError, which
		// will occur in the event we scan a key with a write
		// intent belonging to a different transaction.
		if _, ok := err.(*writeIntentError); err != nil && !ok {
			return num, err
		}
		// ResolveWriteIntent only needs to deal with the write
		// intents for the given txnID.
		if err == nil && bytes.Equal(existingTxnID, txnID) {
			// commits or aborts (rolls back) the write intent of
			// the given txnID.
			err = mvcc.ResolveWriteIntent(currentKey, txnID, commit)
			if err != nil {
				return num, err
			}
			num++
		}

		if max != 0 && max == num {
			break
		}

		// In order to efficiently skip the possibly long list of
		// old versions for this key, refer to Scan for details.
		nextKey = encoding.EncodeBinary(nil, NextKey(currentKey))
	}

	return num, nil
}
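The loop above advances one key at a time: scan a single row, resolve it if it belongs to the transaction, then restart the scan just past the key (NextKey) to hop over its older versions. A self-contained sketch of that cursor pattern over a sorted in-memory key set:

package main

import (
	"fmt"
	"sort"
)

// scanOne returns the first key in keys that is >= from and < end,
// mimicking a one-row engine.Scan over sorted keys.
func scanOne(keys []string, from, end string) (string, bool) {
	i := sort.SearchStrings(keys, from)
	if i < len(keys) && keys[i] < end {
		return keys[i], true
	}
	return "", false
}

func main() {
	keys := []string{"a", "b", "c"}
	next := "a"
	for {
		k, ok := scanOne(keys, next, "z")
		if !ok {
			break
		}
		fmt.Println("resolved", k)
		next = k + "\x00" // NextKey: the smallest key greater than k
	}
}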
Example #21
// goMerge takes existing and update byte slices. It first attempts
// to gob-unmarshal the update slice, returning an error on failure.
// The unmarshaled value must satisfy the Mergable interface.
// Next, it unmarshals the existing slice, falling back to the init
// value supplied by the update value's Init() method if necessary.
// The two values obtained in this way are merged, and the result, or
// an error, is returned.
func goMerge(existing, update []byte) ([]byte, error) {
	u, err := encoding.GobDecode(update)
	if err != nil {
		return nil, util.Errorf("merge: %v", err)
	}
	if _, ok := u.(Mergable); !ok {
		return nil, util.Error("update is not Mergable")
	}
	e, err := encoding.GobDecode(existing)
	if err != nil {
		e = u.(Mergable).Init(existing)
	}
	if reflect.TypeOf(u) != reflect.TypeOf(e) {
		return nil, util.Error("cannot merge values of different type")
	}
	newValue, err := e.(Mergable).Merge(u.(Mergable))
	if err != nil {
		return nil, err
	}
	return encoding.GobEncode(newValue)
}
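The GobDecode/GobEncode helpers wrap encoding/gob; their exact signatures are outside this snippet. A standard-library sketch of the underlying round trip, with the same reflect.TypeOf check goMerge performs before merging:

package main

import (
	"bytes"
	"encoding/gob"
	"fmt"
	"log"
	"reflect"
)

type counter int64

func main() {
	// Encode a value.
	var buf bytes.Buffer
	if err := gob.NewEncoder(&buf).Encode(counter(7)); err != nil {
		log.Fatal(err)
	}

	// Decode it back and verify the dynamic type before merging.
	var c counter
	if err := gob.NewDecoder(&buf).Decode(&c); err != nil {
		log.Fatal(err)
	}
	fmt.Println(reflect.TypeOf(c) == reflect.TypeOf(counter(0)), c) // true 7
}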
Example #22
// ResolveWriteIntent either commits or aborts (rolls back) an extant
// write intent for a given txnID according to commit parameter.
// ResolveWriteIntent will skip write intents of other txnIDs.
func (mvcc *MVCC) ResolveWriteIntent(key Key, txnID []byte, commit bool) error {
	if len(txnID) == 0 {
		return util.Error("missing txnID in request")
	}

	binKey := encoding.EncodeBinary(nil, key)
	meta := &proto.MVCCMetadata{}
	ok, err := GetProto(mvcc.engine, binKey, meta)
	if err != nil {
		return err
	}
	if !ok {
		return util.Errorf("key %q does not exist", key)
	}

	if len(meta.TxnID) == 0 {
		return util.Errorf("write intent %q does not exist", key)
	}
	if !bytes.Equal(meta.TxnID, txnID) {
		return util.Errorf("cannot commit another TxnID %s from TxnID %s",
			meta.TxnID, txnID)
	}

	if !commit {
		latestKey := mvccEncodeKey(binKey, meta.Timestamp)
		err = mvcc.engine.Clear(latestKey)
		if err != nil {
			return err
		}

		// Compute the next possible mvcc value for this key.
		nextKey := NextKey(latestKey)
		// Compute the last possible mvcc value for this key.
		endScanKey := encoding.EncodeBinary(nil, NextKey(key))
		kvs, err := mvcc.engine.Scan(nextKey, endScanKey, 1)
		if err != nil {
			return err
		}
		// If there is no other version, we should just clean up the key entirely.
		if len(kvs) == 0 {
			return mvcc.engine.Clear(binKey)
		}
		_, ts, isValue := mvccDecodeKey(kvs[0].Key)
		if !isValue {
			return util.Errorf("expected an MVCC value key: %s", kvs[0].Key)
		}
		// Update the keyMetadata with the next version.
		return PutProto(mvcc.engine, binKey, &proto.MVCCMetadata{TxnID: nil, Timestamp: ts})
	}

	return PutProto(mvcc.engine, binKey, &proto.MVCCMetadata{TxnID: nil, Timestamp: meta.Timestamp})
}
Example #23
// NewMultiRaft creates a MultiRaft object.
func NewMultiRaft(nodeID proto.RaftNodeID, config *Config, stopper *util.Stopper) (*MultiRaft, error) {
	if nodeID == 0 {
		return nil, util.Error("Invalid RaftNodeID")
	}
	if err := config.validate(); err != nil {
		return nil, err
	}

	if config.Ticker == nil {
		config.Ticker = newTicker(config.TickInterval)
		stopper.AddCloser(config.Ticker)
	}

	if config.EntryFormatter != nil {
		// Wrap the EntryFormatter to strip off the command id.
		ef := config.EntryFormatter
		config.EntryFormatter = func(data []byte) string {
			if len(data) == 0 {
				return "[empty]"
			}
			id, cmd := decodeCommand(data)
			formatted := ef(cmd)
			return fmt.Sprintf("%x: %s", id, formatted)
		}
	}

	m := &MultiRaft{
		Config:    *config,
		stopper:   stopper,
		multiNode: raft.StartMultiNode(uint64(nodeID)),
		nodeID:    nodeID,

		// Output channel.
		Events: make(chan interface{}, config.EventBufferSize),

		// Input channels.
		reqChan:         make(chan *RaftMessageRequest),
		createGroupChan: make(chan *createGroupOp),
		removeGroupChan: make(chan *removeGroupOp),
		proposalChan:    make(chan *proposal),
		callbackChan:    make(chan func()),
	}

	if err := m.Transport.Listen(nodeID, (*multiraftServer)(m)); err != nil {
		return nil, err
	}

	return m, nil
}
Example #24
// rotateLeft performs a left rotation around the node.
func (tc *treeContext) rotateLeft(node *proto.RangeTreeNode) (*proto.RangeTreeNode, error) {
	right, err := tc.getNode(node.RightKey)
	if err != nil {
		return nil, err
	}
	if right.Black {
		return nil, util.Error("rotating a black node")
	}
	node.RightKey = right.LeftKey
	right.LeftKey = &node.Key
	right.Black = node.Black
	node.Black = false
	tc.setNode(node)
	tc.setNode(right)
	return right, nil
}
Example #25
// rotateRight performs a right rotation around the node.
func (tc *treeContext) rotateRight(node *proto.RangeTreeNode) (*proto.RangeTreeNode, error) {
	left, err := tc.getNode(node.LeftKey)
	if err != nil {
		return nil, err
	}
	if left.Black {
		return nil, util.Error("rotating a black node")
	}
	node.LeftKey = left.RightKey
	left.RightKey = &node.Key
	left.Black = node.Black
	node.Black = false
	tc.setNode(node)
	tc.setNode(left)
	return left, nil
}
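Both rotations are the textbook red-black tree operations, phrased in terms of keys because the nodes live in the KV store rather than in memory. For comparison, the same left rotation over ordinary pointers, as a self-contained sketch:

package main

import "fmt"

type node struct {
	key         string
	left, right *node
	black       bool
}

// rotateLeft lifts n.right into n's place and returns the new subtree root.
func rotateLeft(n *node) *node {
	r := n.right
	n.right = r.left
	r.left = n
	r.black, n.black = n.black, false // new root keeps the old color; n turns red
	return r
}

func main() {
	root := &node{key: "b", black: true, right: &node{key: "d"}}
	root = rotateLeft(root)
	fmt.Println(root.key, root.left.key) // d b
}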
Example #26
// NewMultiRaft creates a MultiRaft object.
func NewMultiRaft(nodeID NodeID, config *Config) (*MultiRaft, error) {
	if nodeID == 0 {
		return nil, util.Error("Invalid NodeID")
	}
	err := config.validate()
	if err != nil {
		return nil, err
	}

	if config.Ticker == nil {
		config.Ticker = newTicker(config.TickInterval)
	}

	if config.EntryFormatter != nil {
		// Wrap the EntryFormatter to strip off the command id.
		ef := config.EntryFormatter
		config.EntryFormatter = func(data []byte) string {
			if len(data) == 0 {
				return "[empty]"
			}
			id, cmd := decodeCommand(data)
			formatted := ef(cmd)
			return fmt.Sprintf("%x: %s", id, formatted)
		}
	}

	m := &MultiRaft{
		Config: *config,
		multiNode: raft.StartMultiNode(uint64(nodeID), config.ElectionTimeoutTicks,
			config.HeartbeatIntervalTicks),
		nodeID:          nodeID,
		Events:          make(chan interface{}, 1000),
		reqChan:         make(chan *RaftMessageRequest, 100),
		createGroupChan: make(chan *createGroupOp, 100),
		removeGroupChan: make(chan *removeGroupOp, 100),
		proposalChan:    make(chan *proposal, 100),
		callbackChan:    make(chan func(), 100),
		stopper:         util.NewStopper(1),
	}

	err = m.Transport.Listen(nodeID, (*multiraftServer)(m))
	if err != nil {
		return nil, err
	}

	return m, nil
}
Example #27
// RemoveTarget returns a suitable replica to remove from the provided replica
// set. It attempts to consider which of the provided replicas would be the best
// candidate for removal.
//
// TODO(mrtracy): RemoveTarget eventually needs to accept the attributes from
// the zone config associated with the provided replicas. This will allow it to
// make correct decisions in the case of ranges with heterogeneous replica
// requirements (i.e. multiple data centers).
func (a *allocator) RemoveTarget(existing []proto.Replica) (proto.Replica, error) {
	if len(existing) == 0 {
		return proto.Replica{}, util.Error("must supply at least one replica to allocator.RemoveTarget()")
	}
	a.Lock()
	defer a.Unlock()

	// Retrieve store descriptors for the provided replicas from gossip.
	type replStore struct {
		repl  proto.Replica
		store *proto.StoreDescriptor
	}
	replStores := make([]replStore, len(existing))
	usedStat := stat{}
	for i := range existing {
		desc, err := storeDescFromGossip(gossip.MakeStoreKey(existing[i].StoreID), a.gossip)
		if err != nil {
			return proto.Replica{}, err
		}
		replStores[i] = replStore{
			repl:  existing[i],
			store: desc,
		}
		usedStat.Update(desc.Capacity.FractionUsed())
	}

	// Based on store statistics, determine which replica is the "worst" and
	// thus should be removed.
	var worst replStore
	for i, rs := range replStores {
		if i == 0 {
			worst = rs
			continue
		}

		if usedStat.mean < minFractionUsedThreshold {
			if rs.store.Capacity.RangeCount > worst.store.Capacity.RangeCount {
				worst = rs
			}
			continue
		}
		if rs.store.Capacity.FractionUsed() > worst.store.Capacity.FractionUsed() {
			worst = rs
		}
	}
	return worst.repl, nil
}
Example #28
// addAdminCmd executes the command directly. There is no interaction
// with the command queue or the timestamp cache, as admin commands
// are not meant to consistently access or modify the underlying data.
// Admin commands must run on the leader replica.
func (r *Range) addAdminCmd(ctx context.Context, args proto.Request, reply proto.Response) error {
	// Admin commands always require the leader lease.
	if err := r.redirectOnOrAcquireLeaderLease(args.Header().Timestamp); err != nil {
		reply.Header().SetGoError(err)
		return err
	}

	switch tArgs := args.(type) {
	case *proto.AdminSplitRequest:
		r.AdminSplit(tArgs, reply.(*proto.AdminSplitResponse))
	case *proto.AdminMergeRequest:
		r.AdminMerge(tArgs, reply.(*proto.AdminMergeResponse))
	default:
		return util.Error("unrecognized admin command")
	}
	return reply.Header().GoError()
}
Example #29
func (s *state) addLogEntry(groupID GroupID, entryType LogEntryType, payload []byte) error {
	g := s.groups[groupID]
	if g.role != RoleLeader {
		return util.Error("TODO(bdarnell): forward commands to leader")
	}

	g.lastLogIndex++
	entry := &LogEntry{
		Term:    g.electionState.CurrentTerm,
		Index:   g.lastLogIndex,
		Type:    entryType,
		Payload: payload,
	}
	g.pendingEntries = append(g.pendingEntries, entry)
	s.updateDirtyStatus(g)
	return nil
}
Example #30
// ResolveWriteIntent either commits or aborts (rolls back) an extant
// write intent for a given txnID according to commit parameter.
// ResolveWriteIntent will skip write intents of other txnIDs.
func (mvcc *MVCC) ResolveWriteIntent(key Key, txnID string, commit bool) error {
	if len(txnID) == 0 {
		return util.Error("missing txnID in request")
	}

	keyMeta := &keyMetadata{}
	ok, err := GetI(mvcc.engine, key, keyMeta)
	if err != nil {
		return err
	}
	if !ok {
		return util.Errorf("key %q does not exist", key)
	}

	if len(keyMeta.TxnID) == 0 {
		return util.Errorf("write intent does not exist", key)
	}
	if keyMeta.TxnID != txnID {
		return util.Errorf("cannot commit another TxnID %s from TxnID %s",
			keyMeta.TxnID, txnID)
	}

	if !commit {
		latestKey := mvccEncodeKey(key, keyMeta.Timestamp)
		err = mvcc.engine.Clear(latestKey)
		if err != nil {
			return err
		}

		nextKey := NextKey(latestKey)
		kvs, err := mvcc.engine.Scan(nextKey, PrefixEndKey(key), 1)
		if err != nil {
			return err
		}
		// If there is no other version, we should just clean up the key entirely.
		if len(kvs) == 0 {
			return mvcc.engine.Clear(key)
		}

		_, ts := mvccDecodeKey(kvs[0].Key)
		// Update the keyMetadata with the next version.
		return PutI(mvcc.engine, key, &keyMetadata{TxnID: "", Timestamp: ts})
	}

	return PutI(mvcc.engine, key, &keyMetadata{TxnID: "", Timestamp: keyMeta.Timestamp})
}