Example #1
0
// promote is called when the remote predecessor of vnode died or left the ring.
//
// Being the first REMOTE node after the original master does not mean that we
// are the actual successor for all replicated data with depth 0. For every
// depth-0 key in vnode's replica table the real successor is looked up:
//   - if it is vnode itself, the key is moved from the replica table to the
//     primary table and replicated again;
//   - otherwise the key is removed from the replica table and handed over to
//     the real successor with remotePromoteKey.
//
// Keys whose lookup fails (or returns no successor) are left untouched.
func (dt *DTable) promote(vnode *dendrite.Vnode) {
	rtable := dt.rtable[vnode.String()]
	vn_table := dt.table[vnode.String()]
	for key_str, ritem := range rtable {
		// only replicas at depth 0 are considered for promotion
		if ritem.replicaInfo.depth != 0 {
			continue
		}
		// check if we're real successor for this key
		succs, err := dt.ring.Lookup(1, ritem.keyHash)
		if err != nil {
			dt.Logf(LogInfo, "Could not promote key, Lookup() failed: %s\n", err.Error())
			continue
		}
		if len(succs) == 0 {
			// nothing to compare against; keep the replica where it is
			dt.Logf(LogInfo, "Could not promote key %s, Lookup() returned no successors\n", key_str)
			continue
		}
		if bytes.Equal(succs[0].Id, vnode.Id) {
			// this key should be promoted locally
			new_ritem := ritem.dup()
			// clear the first replica slot before re-replicating; guarded so an
			// empty replica list cannot panic here
			// (presumably slot 0 referred to this vnode - TODO confirm)
			if len(new_ritem.replicaInfo.vnodes) > 0 {
				new_ritem.replicaInfo.vnodes[0] = nil
			}
			new_ritem.commited = true
			new_ritem.lock.Lock()
			vn_table.put(new_ritem)
			dt.Logf(LogDebug, "Promoted local key: %s - running replicator now replicas are %+v \n", key_str, new_ritem.replicaInfo.vnodes)
			// deleting the current key while ranging over a map is safe in Go
			delete(rtable, key_str)
			dt.Logf(LogDebug, "Promote calling replicateKey for key %s\n", key_str)
			dt.replicateKey(vnode, new_ritem, dt.ring.Replicas())
			new_ritem.lock.Unlock()
			dt.Logf(LogDebug, "Promote finishing key %s, replicaVnodes are: %+v\n", key_str, new_ritem.replicaInfo.vnodes)
		} else {
			// TODO promote remote vnode
			dt.Logf(LogDebug, "Promoting remote vnode %s for key %s\n", succs[0].String(), key_str)
			delete(rtable, key_str)
			// the replica has already been dropped locally, so at least make a
			// failed hand-over visible in the logs
			if err := dt.remotePromoteKey(vnode, succs[0], ritem); err != nil {
				dt.Logf(LogInfo, "Could not promote key %s on remote vnode %s: %s\n", key_str, succs[0].String(), err.Error())
			}
		}
	}
}
Example #2
0
// changeReplicas is called when the replica set of vnode changes.
// Every committed item in vnode's primary table is replicated again, holding
// the item's lock for the duration of the replicateKey call. Uncommitted items
// are skipped. The new_replicas argument is not used by this implementation.
func (dt *DTable) changeReplicas(vnode *dendrite.Vnode, new_replicas []*dendrite.Vnode) {
	primary := dt.table[vnode.String()]
	for _, kv := range primary {
		if kv.commited {
			kv.lock.Lock()
			dt.replicateKey(vnode, kv, dt.ring.Replicas())
			kv.lock.Unlock()
		}
	}
}
Example #3
0
// rollback is called on failed set().
// It asks every non-nil replica vnode recorded in item.replicaInfo to clear
// its copy of the item and then removes the item from vn's primary table.
// Clearing replicas is best effort: a failure is logged and the rollback
// carries on with the remaining replicas.
func (dt *DTable) rollback(vn *dendrite.Vnode, item *kvItem) {
	if item.replicaInfo != nil {
		for _, replica := range item.replicaInfo.vnodes {
			if replica == nil {
				continue
			}
			if err := dt.remoteClearReplica(replica, item, false); err != nil {
				dt.Logf(LogInfo, "rollback() - failed while clearing replica on %x for key %s: %s\n", replica.Id, item.keyHashString(), err)
			}
		}
	}
	delete(dt.table[vn.String()], item.keyHashString())
}
Example #4
0
// setReplica handles a remote replica request for vnode.
// An item that carries a value is marked committed and stored in vnode's
// replica table under its key hash; an item with a nil value removes the
// entry with that key hash from the replica table instead.
func (dt *DTable) setReplica(vnode *dendrite.Vnode, item *kvItem) {
	hashKey := item.keyHashString()
	replicas := dt.rtable[vnode.String()]
	if item.Val != nil {
		item.commited = true
		replicas[hashKey] = item
		return
	}
	// nil value means "remove this replica"
	delete(replicas, hashKey)
}
Example #5
0
// promoteKey is called when a remote vnode wants to promote a key to us.
// When the key already lives in vnode's primary table the item is simply
// replicated again (one replica could have been deleted). Otherwise the key
// is dropped from the replica table, stored in the primary table and
// replicated while the item's lock is held.
func (dt *DTable) promoteKey(vnode *dendrite.Vnode, reqItem *kvItem) {
	vnKey := vnode.String()
	replicaTbl, primaryTbl := dt.rtable[vnKey], dt.table[vnKey]
	hashKey := reqItem.keyHashString()

	_, alreadyPrimary := primaryTbl[hashKey]
	if !alreadyPrimary {
		// take the key over: replica table -> primary table
		delete(replicaTbl, hashKey)
		reqItem.lock.Lock()
		primaryTbl.put(reqItem)
		dt.replicateKey(vnode, reqItem, dt.ring.Replicas())
		reqItem.lock.Unlock()
		return
	}
	// we are primary for this key already, just refresh the replicas
	dt.replicateKey(vnode, reqItem, dt.ring.Replicas())
}
Example #6
0
// processDemoteKey is called when our successor is demoting a key to us.
// The key must already be present in vnode's primary table; if it is not,
// the failure is logged and nothing else happens. Otherwise the replicas for
// the key are fixed up with replicateKey and origin (the old primary for this
// key) is asked to clear its demoted item. A failure of that cleanup call is
// only logged. The old_master argument is not used by this implementation.
func (dt *DTable) processDemoteKey(vnode, origin, old_master *dendrite.Vnode, reqItem *kvItem) {
	hashKey := reqItem.keyHashString()

	// the key has to exist in our primary table
	if _, found := dt.table[vnode.String()][hashKey]; !found {
		dt.Logln(LogInfo, "processDemoteKey failed - key not found:", hashKey)
		return
	}

	dt.replicateKey(vnode, reqItem, dt.ring.Replicas())

	// replicas are in place, the demoted copy on origin can go away now
	if err := dt.remoteClearReplica(origin, reqItem, true); err != nil {
		dt.Logf(LogInfo, "processDemoteKey() - failed while removing demoted key from origin %x for key %s\n", origin.Id, hashKey)
	}
}
Example #7
0
/*
demote hands data over to new_pred, the new predecessor of vnode.

new_pred is treated as local when its Host equals the Host of one of this
node's vnodes.

If new_pred is local:
  - every committed key in vnode's replica table is moved to new_pred's
    replica table
  - the replica scheme of remote successors doesn't change here; the other
    replicas listed for the key only get their metadata updated through
    remoteSetReplicaInfo. After the first failed update the remaining
    replicas are sent state replicaIncomplete.

If new_pred is remote:
  - every committed key in vnode's primary table for which
    dendrite.Between(vnode.Id, new_pred.Id, keyHash, true) holds is
    1. copied to the demoted table (to_demoted) and removed from the primary table
    2. written to new_pred with remoteSet (demoting = true); errors are logged

NOTE(review): the original design notes also describe a cleanup callback from
the new master and handling of the replica-0 table; neither is implemented in
this function - confirm where (and whether) that happens.
*/
func (dt *DTable) demote(vnode, new_pred *dendrite.Vnode) {
	// determine if new_pred is on this node
	isLocal := false
	for _, lvn := range dt.ring.MyVnodes() {
		if lvn.Host == new_pred.Host {
			isLocal = true
			break
		}
	}

	if isLocal {
		// move all replica keys to new vnode
		vn_rtable := dt.rtable[vnode.String()]
		for rkey, ritem := range vn_rtable {
			if !ritem.commited {
				continue
			}

			// NOTE(review): the replica list is modified before ritem.lock is
			// taken - confirm whether this write should happen under the lock
			ritem.replicaInfo.vnodes[ritem.replicaInfo.depth] = new_pred
			ritem.lock.Lock()
			dt.rtable[new_pred.String()].put(ritem)
			delete(vn_rtable, rkey)

			// update metadata on all replicas
			new_state := ritem.replicaInfo.state
			for idx, replica := range ritem.replicaInfo.vnodes {
				// skip ourselves
				if idx == ritem.replicaInfo.depth {
					continue
				}
				new_ritem := ritem.dup()
				new_ritem.replicaInfo.depth = idx
				new_ritem.replicaInfo.state = new_state

				if err := dt.remoteSetReplicaInfo(replica, new_ritem); err != nil {
					// Logln, not Logf: the message carries no format verb
					dt.Logln(LogInfo, "Error updating replicaMeta on demote() -", err)
					new_state = replicaIncomplete
				}
			}
			ritem.lock.Unlock()
		}
		return
	}

	// new_pred is remote: loop over primary table to find keys that should
	// belong to new predecessor
	vn_table := dt.table[vnode.String()]
	for key_str, item := range vn_table {
		if !item.commited {
			continue
		}
		if !dendrite.Between(vnode.Id, new_pred.Id, item.keyHash, true) {
			continue
		}
		// copy the key to demoted table and remove it from primary one
		dt.demoted_table[vnode.String()][item.keyHashString()] = item.to_demoted(new_pred)
		delete(vn_table, key_str)

		// remoteSet reports its result on the channel, so it has to run in
		// its own goroutine while we wait for the outcome here
		done_c := make(chan error)
		go dt.remoteSet(vnode, new_pred, item, dt.ring.Replicas(), true, done_c)
		if err := <-done_c; err != nil {
			dt.Logln(LogInfo, "Error demoting key to new predecessor -", err)
		}
	}
}
Example #8
0
// remoteSet is a client request that sets the value for a key on a remote host.
//
// It sends a PBDTableSetItem (origin, remote, reqItem, minAcks, demoting) to
// remote.Host over a fresh ZMQ REQ socket with 5 second send/receive timeouts
// and waits for the answer. Exactly one value is written to done: nil when the
// remote acknowledged the write, otherwise an error describing what failed
// (including a timeout after the transport's ClientTimeout).
func (dt *DTable) remoteSet(origin, remote *dendrite.Vnode, reqItem *kvItem, minAcks int, demoting bool, done chan error) {
	// both channels are buffered so the worker goroutine can always hand over
	// its result and exit, even if the select below has already timed out
	error_c := make(chan error, 1)
	resp_c := make(chan bool, 1)
	zmq_transport := dt.transport.(*dendrite.ZMQTransport)

	go func() {
		req_sock, err := zmq_transport.ZMQContext.NewSocket(zmq.REQ)
		if err != nil {
			error_c <- fmt.Errorf("ZMQ:DTable:remoteSet - newsocket error - %s", err)
			return
		}
		req_sock.SetRcvtimeo(5 * time.Second)
		req_sock.SetSndtimeo(5 * time.Second)

		defer req_sock.Close()
		err = req_sock.Connect("tcp://" + remote.Host)
		if err != nil {
			error_c <- fmt.Errorf("ZMQ:DTable:remoteSet - connect error - %s", err)
			return
		}
		// Build request protobuf
		req := &PBDTableSetItem{
			Origin:   origin.ToProtobuf(),
			Dest:     remote.ToProtobuf(),
			Item:     reqItem.to_protobuf(),
			MinAcks:  proto.Int32(int32(minAcks)),
			Demoting: proto.Bool(demoting),
		}

		reqData, err := proto.Marshal(req)
		if err != nil {
			error_c <- fmt.Errorf("ZMQ:DTable:remoteSet - error while encoding request - %s", err)
			return
		}
		encoded := dt.transport.Encode(PbDtableSetItem, reqData)
		_, err = req_sock.SendBytes(encoded, 0)
		if err != nil {
			error_c <- fmt.Errorf("ZMQ:DTable:remoteSet - error while sending request - %s", err)
			return
		}

		// read response and decode it
		resp, err := req_sock.RecvBytes(0)
		if err != nil {
			error_c <- fmt.Errorf("ZMQ:DTable:remoteSet - error while reading response - %s", err)
			return
		}
		decoded, err := dt.transport.Decode(resp)
		if err != nil {
			error_c <- fmt.Errorf("ZMQ:DTable:remoteSet - error while decoding response - %s", err)
			return
		}

		// the payload comes from the network: use checked type assertions so
		// an unexpected message cannot panic this goroutine
		switch decoded.Type {
		case dendrite.PbErr:
			pbMsg, ok := decoded.TransportMsg.(dendrite.PBProtoErr)
			if !ok {
				error_c <- fmt.Errorf("ZMQ:DTable:remoteSet - malformed error response")
				return
			}
			error_c <- fmt.Errorf("ZMQ:DTable:remoteSet - got error response - %s", pbMsg.GetError())
		case PbDtableResponse:
			pbMsg, ok := decoded.TransportMsg.(PBDTableResponse)
			if !ok {
				error_c <- fmt.Errorf("ZMQ:DTable:remoteSet - malformed response")
				return
			}
			if pbMsg.GetOk() {
				resp_c <- true
				return
			}
			error_c <- fmt.Errorf("ZMQ:DTable:remoteSet - write error - %s", pbMsg.GetError())
		default:
			// unexpected response
			error_c <- fmt.Errorf("ZMQ:DTable:remoteSet - unexpected response")
		}
	}()

	select {
	case <-time.After(zmq_transport.ClientTimeout):
		done <- fmt.Errorf("ZMQ:DTable:remoteSet - command timed out!")
	case err := <-error_c:
		done <- err
	case <-resp_c:
		done <- nil
	}
}
Example #9
0
// remotePromoteKey is a client request that promotes a remote vnode for a key.
//
// It sends a PBDTablePromoteKey (remote, origin, reqItem) to remote.Host over
// a fresh ZMQ REQ socket with 5 second send/receive timeouts and waits for the
// answer. It returns nil when the remote reported success, otherwise an error
// describing what failed (including a timeout after the transport's
// ClientTimeout).
func (dt *DTable) remotePromoteKey(origin, remote *dendrite.Vnode, reqItem *kvItem) error {
	// both channels are buffered so the worker goroutine can always hand over
	// its result and exit, even if the select below has already timed out
	error_c := make(chan error, 1)
	resp_c := make(chan bool, 1)
	zmq_transport := dt.transport.(*dendrite.ZMQTransport)

	go func() {
		req_sock, err := zmq_transport.ZMQContext.NewSocket(zmq.REQ)
		if err != nil {
			error_c <- fmt.Errorf("ZMQ:DTable:remotePromoteKey - newsocket error - %s", err)
			return
		}
		req_sock.SetRcvtimeo(5 * time.Second)
		req_sock.SetSndtimeo(5 * time.Second)

		defer req_sock.Close()
		err = req_sock.Connect("tcp://" + remote.Host)
		if err != nil {
			error_c <- fmt.Errorf("ZMQ:DTable:remotePromoteKey - connect error - %s", err)
			return
		}

		// Build request protobuf
		req := &PBDTablePromoteKey{
			Dest:   remote.ToProtobuf(),
			Origin: origin.ToProtobuf(),
			Item:   reqItem.to_protobuf(),
		}

		reqData, err := proto.Marshal(req)
		if err != nil {
			error_c <- fmt.Errorf("ZMQ:DTable:remotePromoteKey - error while encoding request - %s", err)
			return
		}
		encoded := dt.transport.Encode(PbDtablePromoteKey, reqData)
		_, err = req_sock.SendBytes(encoded, 0)
		if err != nil {
			error_c <- fmt.Errorf("ZMQ:DTable:remotePromoteKey - error while sending request - %s", err)
			return
		}

		// read response and decode it
		resp, err := req_sock.RecvBytes(0)
		if err != nil {
			error_c <- fmt.Errorf("ZMQ:DTable:remotePromoteKey - error while reading response - %s", err)
			return
		}
		decoded, err := dt.transport.Decode(resp)
		if err != nil {
			error_c <- fmt.Errorf("ZMQ:DTable:remotePromoteKey - error while decoding response - %s", err)
			return
		}

		// the payload comes from the network: use checked type assertions so
		// an unexpected message cannot panic this goroutine
		switch decoded.Type {
		case dendrite.PbErr:
			pbMsg, ok := decoded.TransportMsg.(dendrite.PBProtoErr)
			if !ok {
				error_c <- fmt.Errorf("ZMQ:DTable:remotePromoteKey - malformed error response")
				return
			}
			error_c <- fmt.Errorf("ZMQ:DTable:remotePromoteKey - got error response - %s", pbMsg.GetError())
		case PbDtableResponse:
			pbMsg, ok := decoded.TransportMsg.(PBDTableResponse)
			if !ok {
				error_c <- fmt.Errorf("ZMQ:DTable:remotePromoteKey - malformed response")
				return
			}
			if pbMsg.GetOk() {
				resp_c <- true
				return
			}
			error_c <- fmt.Errorf("ZMQ:DTable:remotePromoteKey - error - %s", pbMsg.GetError())
		default:
			// unexpected response
			error_c <- fmt.Errorf("ZMQ:DTable:remotePromoteKey - unexpected response")
		}
	}()

	select {
	case <-time.After(zmq_transport.ClientTimeout):
		return fmt.Errorf("ZMQ:DTable:remotePromoteKey - command timed out!")
	case err := <-error_c:
		return err
	case <-resp_c:
		return nil
	}
}
Example #10
0
// remoteGet is a client request that gets the value for a key from a remote host.
//
// It sends a PBDTableGetItem carrying reqItem.keyHash to remote.Host over a
// fresh ZMQ REQ socket with 2 second send/receive timeouts and waits for the
// answer.
//
// Returns:
//   - (item, true, nil) when the remote has the key; item is a new kvItem with
//     copies of reqItem's Key and keyHash and the value from the response
//   - (nil, false, nil) when the remote reports the key as not found
//   - (nil, false, err) on any failure, including a timeout after the
//     transport's ClientTimeout
func (dt *DTable) remoteGet(remote *dendrite.Vnode, reqItem *kvItem) (*kvItem, bool, error) {
	// all channels are buffered so the worker goroutine can always hand over
	// its result and exit, even if the select below has already timed out
	error_c := make(chan error, 1)
	resp_c := make(chan *kvItem, 1)
	notfound_c := make(chan bool, 1)
	zmq_transport := dt.transport.(*dendrite.ZMQTransport)
	go func() {

		req_sock, err := zmq_transport.ZMQContext.NewSocket(zmq.REQ)
		if err != nil {
			error_c <- fmt.Errorf("ZMQ:DTable:remoteGet - newsocket error - %s", err)
			return
		}
		req_sock.SetRcvtimeo(2 * time.Second)
		req_sock.SetSndtimeo(2 * time.Second)

		defer req_sock.Close()
		err = req_sock.Connect("tcp://" + remote.Host)
		if err != nil {
			error_c <- fmt.Errorf("ZMQ:DTable:remoteGet - connect error - %s", err)
			return
		}
		// Build request protobuf
		req := &PBDTableGetItem{
			Dest:    remote.ToProtobuf(),
			KeyHash: reqItem.keyHash,
		}

		reqData, err := proto.Marshal(req)
		if err != nil {
			error_c <- fmt.Errorf("ZMQ:DTable:remoteGet - error while encoding request - %s", err)
			return
		}
		encoded := dt.transport.Encode(PbDtableGetItem, reqData)
		_, err = req_sock.SendBytes(encoded, 0)
		if err != nil {
			error_c <- fmt.Errorf("ZMQ:DTable:remoteGet - error while sending request - %s", err)
			return
		}

		// read response and decode it
		resp, err := req_sock.RecvBytes(0)
		if err != nil {
			error_c <- fmt.Errorf("ZMQ:DTable:remoteGet - error while reading response - %s", err)
			return
		}
		decoded, err := dt.transport.Decode(resp)
		if err != nil {
			error_c <- fmt.Errorf("ZMQ:DTable:remoteGet - error while decoding response - %s", err)
			return
		}

		// the payload comes from the network: use checked type assertions so
		// an unexpected message cannot panic this goroutine
		switch decoded.Type {
		case dendrite.PbErr:
			pbMsg, ok := decoded.TransportMsg.(dendrite.PBProtoErr)
			if !ok {
				error_c <- fmt.Errorf("ZMQ:DTable:remoteGet - malformed error response")
				return
			}
			error_c <- fmt.Errorf("ZMQ:DTable:remoteGet - got error response - %s", pbMsg.GetError())
		case PbDtableItem:
			pbMsg, ok := decoded.TransportMsg.(PBDTableItem)
			if !ok {
				error_c <- fmt.Errorf("ZMQ:DTable:remoteGet - malformed response")
				return
			}
			if !pbMsg.GetFound() {
				notfound_c <- true
				return
			}
			item := new(kvItem)
			// copy() into the nil slices of a fresh kvItem would copy zero
			// bytes (copy is bounded by len(dst)); append allocates and copies
			item.Key = append(item.Key, reqItem.Key...)
			item.keyHash = append(item.keyHash, reqItem.keyHash...)
			item.Val = pbMsg.GetVal()
			resp_c <- item
		default:
			// unexpected response
			error_c <- fmt.Errorf("ZMQ:DTable:remoteGet - unexpected response")
		}
	}()

	select {
	case <-time.After(zmq_transport.ClientTimeout):
		return nil, false, fmt.Errorf("ZMQ:DTable:remoteGet - command timed out!")
	case err := <-error_c:
		return nil, false, err
	case <-notfound_c:
		return nil, false, nil
	case item := <-resp_c:
		return item, true, nil
	}
}
Example #11
0
/* set writes to dtable's primary (non-replica) table. It is called from both Query api and
by remote clients via zmq.

It reports back on done chan when minAcks is reached so that clients can continue without
blocking while replication takes place.

The write happens in steps:
 1. item is stored in vn's primary table (this counts as the first ack)
 2. remote successors are looked up and an uncommitted copy of the item is written to each
 3. replica metadata (replica list, state, depth) is sent to every replica that accepted
    the copy, which also commits it there

On failure an error is sent on done and rollback() is called for the item. item.lock is
held for the whole call.

NOTE(review): with minAcks < 1 none of the "notify client" conditions below can match, so
a successful write would never signal done - confirm callers always pass minAcks >= 1.
*/
func (dt *DTable) set(vn *dendrite.Vnode, item *kvItem, minAcks int, done chan error) {
	// make sure we have local handler before doing any write
	handler, _ := dt.transport.GetVnodeHandler(vn)
	if handler == nil {
		done <- fmt.Errorf("local handler could not be found for vnode %x", vn.Id)
		return
	}
	// write_count counts the local (primary) write only
	write_count := 0
	vn_table, _ := dt.table[vn.String()]

	item.lock.Lock()
	defer item.lock.Unlock()

	item.replicaInfo.master = vn
	err := vn_table.put(item)
	if err != nil {
		done <- err
		return
	}

	write_count++
	// repwrite_count counts replicas whose metadata update (phase 2) succeeded
	repwrite_count := 0
	// returned records that done has already been signalled with nil
	returned := false
	item.replicaInfo.state = replicaIncomplete

	// should we return to client immediately?
	if minAcks == write_count {
		// cover the case where ring.Replicas() returns 0
		if dt.ring.Replicas() == repwrite_count {
			item.replicaInfo.state = replicaStable
			item.commited = true
			done <- nil
			dt.callHooks(item)
			return
		}
		item.commited = true
		done <- nil
		returned = true
	}

	// find remote successors to write replicas to
	remote_succs, err := handler.FindRemoteSuccessors(dt.ring.Replicas())
	if err != nil {
		// only report the error if the client has not been answered yet;
		// rollback() runs in either case
		if !returned {
			done <- fmt.Errorf("could not find replica nodes due to error %s", err)
		}
		dt.Logf(LogDebug, "could not find replica nodes due to error %s\n", err)
		dt.rollback(vn, item)
		return
	}

	// don't write any replica if not enough replica nodes have been found for requested consistency
	// (+1 accounts for the primary write; cannot be true once returned is set, because then minAcks == 1)
	if minAcks > len(remote_succs)+1 {
		done <- fmt.Errorf("insufficient nodes found for requested consistency level (%d)\n", minAcks)
		dt.rollback(vn, item)
		return
	}

	// now lets write replicas
	// phase 1: push an uncommitted copy to every remote successor, remembering who accepted it
	item_replicas := make([]*dendrite.Vnode, 0)
	repl_item := item.dup()
	repl_item.commited = false

	for _, succ := range remote_succs {
		err := dt.remoteWriteReplica(vn, succ, repl_item)
		if err != nil {
			dt.Logf(LogDebug, "could not write replica due to error: %s\n", err)
			continue
		}
		item_replicas = append(item_replicas, succ)
	}

	// check if we have enough written replicas for requested minAcks
	if minAcks > len(item_replicas)+1 {
		done <- fmt.Errorf("insufficient active nodes found for requested consistency level (%d)\n", minAcks)
		dt.rollback(vn, item)
		return
	}

	// update replication state based on available replicas
	var target_state replicaState
	if dt.ring.Replicas() <= len(item_replicas) {
		target_state = replicaStable
	} else {
		target_state = replicaPartial
	}

	// replicas have been written, lets now update metadata
	// phase 2: real_idx is the depth assigned to the next replica that accepts the metadata
	real_idx := 0
	fail_count := 0
	repl_item.commited = true
	repl_item.replicaInfo.vnodes = item_replicas
	repl_item.replicaInfo.state = target_state
	repl_item.replicaInfo.master = vn

	for _, replica := range item_replicas {
		// update metadata/commit on remote
		repl_item.replicaInfo.depth = real_idx
		err := dt.remoteSetReplicaInfo(replica, repl_item)
		if err != nil {
			fail_count++
			// NOTE(review): unlike the checks above, this one does not add 1 for the
			// primary write, so it looks stricter than the earlier ones - confirm intent.
			// Once the client has been answered (returned == true) failures are skipped
			// without a rollback.
			if !returned && len(item_replicas)-fail_count < minAcks {
				done <- fmt.Errorf("insufficient (phase2) active nodes found for requested consistency level (%d)\n", minAcks)
				dt.rollback(vn, item)
				return
			}
			continue
		}
		real_idx++
		repwrite_count++

		// notify client if enough replicas have been written
		if !returned && repwrite_count+1 == minAcks {
			done <- nil
			returned = true
		}
	}
	// the primary copy is marked committed and hooks run once phase 2 is over
	item.replicaInfo.state = target_state
	item.commited = true
	dt.callHooks(item)

}