// promote() - called when remote predecessor died or left the ring // because we're only first REMOTE node from original master // it doesn't mean that we're the actual successor for all the replicated data with depth 0 // if we are, we promote ourselves // if not, we must find actual successor for each key, and promote that vnode for each key func (dt *DTable) promote(vnode *dendrite.Vnode) { //log.Printf("Node left me: %X for %X now replicating to:\n", localVn.Id, new_pred.Id) rtable := dt.rtable[vnode.String()] vn_table := dt.table[vnode.String()] for key_str, ritem := range rtable { if ritem.replicaInfo.depth != 0 { continue } // check if we're real successor for this key succs, err := dt.ring.Lookup(1, ritem.keyHash) if err != nil { dt.Logf(LogInfo, "Could not promote key, Lookup() failed: %s\n", err.Error()) continue } if bytes.Compare(succs[0].Id, vnode.Id) == 0 { // this key should be promoted locally new_ritem := ritem.dup() new_ritem.replicaInfo.vnodes[0] = nil new_ritem.commited = true new_ritem.lock.Lock() vn_table.put(new_ritem) dt.Logf(LogDebug, "Promoted local key: %s - running replicator now replicas are %+v \n", key_str, new_ritem.replicaInfo.vnodes) delete(rtable, key_str) dt.Logf(LogDebug, "Promote calling replicateKey for key %s\n", key_str) dt.replicateKey(vnode, new_ritem, dt.ring.Replicas()) new_ritem.lock.Unlock() dt.Logf(LogDebug, "Promote finishing key %s, replicaVnodes are: %+v\n", key_str, new_ritem.replicaInfo.vnodes) } else { // TODO promote remote vnode dt.Logf(LogDebug, "Promoting remote vnode %s for key %s\n", succs[0].String(), key_str) delete(rtable, key_str) dt.remotePromoteKey(vnode, succs[0], ritem) } } }
// changeReplicas() -- callend when replica set changes // func (dt *DTable) changeReplicas(vnode *dendrite.Vnode, new_replicas []*dendrite.Vnode) { for _, item := range dt.table[vnode.String()] { if !item.commited { continue } item.lock.Lock() dt.replicateKey(vnode, item, dt.ring.Replicas()) item.lock.Unlock() } }
// rollback is called on failed set() func (dt *DTable) rollback(vn *dendrite.Vnode, item *kvItem) { if item.replicaInfo != nil { for _, replica := range item.replicaInfo.vnodes { if replica != nil { dt.remoteClearReplica(replica, item, false) } } } delete(dt.table[vn.String()], item.keyHashString()) }
// handle remote replica requests func (dt *DTable) setReplica(vnode *dendrite.Vnode, item *kvItem) { key_str := item.keyHashString() if item.Val == nil { //log.Println("SetReplica() - value for key", key_str, "is nil, removing item") delete(dt.rtable[vnode.String()], key_str) } else { //log.Println("SetReplica() - success for key", key_str) item.commited = true dt.rtable[vnode.String()][key_str] = item } }
// promoteKey() -- called when remote wants to promote a key to us func (dt *DTable) promoteKey(vnode *dendrite.Vnode, reqItem *kvItem) { rtable := dt.rtable[vnode.String()] vn_table := dt.table[vnode.String()] // if we're already primary node for this key, just replicate again because one replica could be deleted if _, ok := vn_table[reqItem.keyHashString()]; ok { dt.replicateKey(vnode, reqItem, dt.ring.Replicas()) return } delete(rtable, reqItem.keyHashString()) reqItem.lock.Lock() vn_table.put(reqItem) dt.replicateKey(vnode, reqItem, dt.ring.Replicas()) reqItem.lock.Unlock() }
// processDemoteKey is called when our successor is demoting key to us. // We fix replicas for the key and when we're done we make a call to origin // (old primary for this key) to clear demotedItem there. func (dt *DTable) processDemoteKey(vnode, origin, old_master *dendrite.Vnode, reqItem *kvItem) { // find the key in our primary table key_str := reqItem.keyHashString() if _, ok := dt.table[vnode.String()][key_str]; ok { dt.replicateKey(vnode, reqItem, dt.ring.Replicas()) // now clear demoted item on origin err := dt.remoteClearReplica(origin, reqItem, true) if err != nil { dt.Logf(LogInfo, "processDemoteKey() - failed while removing demoted key from origin %x for key %s\n", origin.Id, key_str) } } else { dt.Logln(LogInfo, "processDemoteKey failed - key not found:", key_str) return } }
/* demote() - promotes new predecessor with keys from primary table if new predecessor is local: - move all of my replica keys to new vnode - replica scheme of remote successors doesn't change here we just need to update metadata on all replica nodes to reflect this change if new predecessor is remote: - for all keys in primary table, that are <= new_pred.Id: 1. move key to demoted table and wait there for cleanup call from new master 2. call demoteKey() to commit to new_pred's primary table + let that vnode know where existing replicas are 3. demoteKey() will callback to cleanup each key from demoted table after it's written new replicas - handle replica-0 table such that: 1. for each key, check if master vnode is located on same physical node as new_pred - if it is, we don't need to do anything because we're still natural remote successor - if not 1. call demoteReplica() to let master know existing replica setup and about newRemoteSucc 2. master will reconfigure replicas around and delete unnecessary copies (if any) */ func (dt *DTable) demote(vnode, new_pred *dendrite.Vnode) { // determine if new_pred is on this node isLocal := false for _, lvn := range dt.ring.MyVnodes() { if lvn.Host == new_pred.Host { isLocal = true } } switch isLocal { case true: // move all replica keys to new vnode vn_rtable := dt.rtable[vnode.String()] for rkey, ritem := range vn_rtable { if !ritem.commited { continue } ritem.replicaInfo.vnodes[ritem.replicaInfo.depth] = new_pred ritem.lock.Lock() dt.rtable[new_pred.String()].put(ritem) delete(vn_rtable, rkey) // update metadata on all replicas new_state := ritem.replicaInfo.state for idx, replica := range ritem.replicaInfo.vnodes { // skip ourselves if idx == ritem.replicaInfo.depth { continue } new_ritem := ritem.dup() new_ritem.replicaInfo.depth = idx new_ritem.replicaInfo.state = new_state err := dt.remoteSetReplicaInfo(replica, new_ritem) if err != nil { dt.Logf(LogInfo, "Error updating replicaMeta on demote() -", err) new_state = replicaIncomplete continue } } ritem.lock.Unlock() } case false: // loop over primary table to find keys that should belong to new predecessor vn_table := dt.table[vnode.String()] for key_str, item := range vn_table { if !item.commited { continue } if dendrite.Between(vnode.Id, new_pred.Id, item.keyHash, true) { //log.Printf("Analyzed key for demoting %s and pushing to %s\n", key_str, new_pred.String()) // copy the key to demoted table and remove it from primary one dt.demoted_table[vnode.String()][item.keyHashString()] = item.to_demoted(new_pred) delete(vn_table, key_str) done_c := make(chan error) go dt.remoteSet(vnode, new_pred, item, dt.ring.Replicas(), true, done_c) err := <-done_c if err != nil { dt.Logln(LogInfo, "Error demoting key to new predecessor -", err) continue } } } } }
// Client Request: set value for a key to remote host func (dt *DTable) remoteSet(origin, remote *dendrite.Vnode, reqItem *kvItem, minAcks int, demoting bool, done chan error) { //fmt.Printf("REMOTESET CALLED from %s to %s for key %s\n", origin.String(), remote.String(), reqItem.keyHashString()) error_c := make(chan error, 1) resp_c := make(chan bool, 1) zmq_transport := dt.transport.(*dendrite.ZMQTransport) go func() { req_sock, err := zmq_transport.ZMQContext.NewSocket(zmq.REQ) if err != nil { error_c <- fmt.Errorf("ZMQ:DTable:remoteSet - newsocket error - %s", err) return } req_sock.SetRcvtimeo(5 * time.Second) req_sock.SetSndtimeo(5 * time.Second) defer req_sock.Close() err = req_sock.Connect("tcp://" + remote.Host) if err != nil { error_c <- fmt.Errorf("ZMQ:DTable:remoteSet - connect error - %s", err) return } // Build request protobuf req := &PBDTableSetItem{ Origin: origin.ToProtobuf(), Dest: remote.ToProtobuf(), Item: reqItem.to_protobuf(), MinAcks: proto.Int32(int32(minAcks)), Demoting: proto.Bool(demoting), } reqData, _ := proto.Marshal(req) encoded := dt.transport.Encode(PbDtableSetItem, reqData) _, err = req_sock.SendBytes(encoded, 0) if err != nil { error_c <- fmt.Errorf("ZMQ:DTable:remoteSet - error while sending request - %s", err) return } // read response and decode it resp, err := req_sock.RecvBytes(0) if err != nil { error_c <- fmt.Errorf("ZMQ:DTable:remoteSet - error while reading response - %s", err) return } decoded, err := dt.transport.Decode(resp) if err != nil { error_c <- fmt.Errorf("ZMQ:DTable:remoteSet - error while decoding response - %s", err) return } switch decoded.Type { case dendrite.PbErr: pbMsg := decoded.TransportMsg.(dendrite.PBProtoErr) error_c <- fmt.Errorf("ZMQ:DTable:remoteSet - got error response - %s", pbMsg.GetError()) case PbDtableResponse: pbMsg := decoded.TransportMsg.(PBDTableResponse) success := pbMsg.GetOk() if success { resp_c <- true return } error_c <- fmt.Errorf("ZMQ:DTable:remoteSet - write error - %s", pbMsg.GetError()) return default: // unexpected response error_c <- fmt.Errorf("ZMQ:DTable:remoteSet - unexpected response") return } }() select { case <-time.After(zmq_transport.ClientTimeout): done <- fmt.Errorf("ZMQ:DTable:remoteSet - command timed out!") case err := <-error_c: done <- err case _ = <-resp_c: done <- nil } }
// Client Request: promote remote vnode for a key func (dt *DTable) remotePromoteKey(origin, remote *dendrite.Vnode, reqItem *kvItem) error { error_c := make(chan error, 1) resp_c := make(chan bool, 1) zmq_transport := dt.transport.(*dendrite.ZMQTransport) go func() { req_sock, err := zmq_transport.ZMQContext.NewSocket(zmq.REQ) if err != nil { error_c <- fmt.Errorf("ZMQ:DTable:remotePromoteKey - newsocket error - %s", err) return } req_sock.SetRcvtimeo(5 * time.Second) req_sock.SetSndtimeo(5 * time.Second) defer req_sock.Close() err = req_sock.Connect("tcp://" + remote.Host) if err != nil { error_c <- fmt.Errorf("ZMQ:DTable:remotePromoteKey - connect error - %s", err) return } // Build request protobuf req := &PBDTablePromoteKey{ Dest: remote.ToProtobuf(), Origin: origin.ToProtobuf(), Item: reqItem.to_protobuf(), } reqData, _ := proto.Marshal(req) encoded := dt.transport.Encode(PbDtablePromoteKey, reqData) _, err = req_sock.SendBytes(encoded, 0) if err != nil { error_c <- fmt.Errorf("ZMQ:DTable:remotePromoteKey - error while sending request - %s", err) return } // read response and decode it resp, err := req_sock.RecvBytes(0) if err != nil { error_c <- fmt.Errorf("ZMQ:DTable:remotePromoteKey - error while reading response - %s", err) return } decoded, err := dt.transport.Decode(resp) if err != nil { error_c <- fmt.Errorf("ZMQ:DTable:remotePromoteKey - error while decoding response - %s", err) return } switch decoded.Type { case dendrite.PbErr: pbMsg := decoded.TransportMsg.(dendrite.PBProtoErr) error_c <- fmt.Errorf("ZMQ:DTable:remotePromoteKey - got error response - %s", pbMsg.GetError()) case PbDtableResponse: pbMsg := decoded.TransportMsg.(PBDTableResponse) success := pbMsg.GetOk() if success { resp_c <- true return } error_c <- fmt.Errorf("ZMQ:DTable:remotePromoteKey - error - %s", pbMsg.GetError()) return default: // unexpected response error_c <- fmt.Errorf("ZMQ:DTable:remotePromoteKey - unexpected response") return } }() select { case <-time.After(zmq_transport.ClientTimeout): return fmt.Errorf("ZMQ:DTable:remotePromoteKey - command timed out!") case err := <-error_c: return err case _ = <-resp_c: return nil } }
// Client Request: Get value for a key from remote host func (dt *DTable) remoteGet(remote *dendrite.Vnode, reqItem *kvItem) (*kvItem, bool, error) { error_c := make(chan error, 1) resp_c := make(chan *kvItem, 1) notfound_c := make(chan bool, 1) zmq_transport := dt.transport.(*dendrite.ZMQTransport) go func() { req_sock, err := zmq_transport.ZMQContext.NewSocket(zmq.REQ) if err != nil { error_c <- fmt.Errorf("ZMQ:DTable:remoteGet - newsocket error - %s", err) return } req_sock.SetRcvtimeo(2 * time.Second) req_sock.SetSndtimeo(2 * time.Second) defer req_sock.Close() err = req_sock.Connect("tcp://" + remote.Host) if err != nil { error_c <- fmt.Errorf("ZMQ:DTable:remoteGet - connect error - %s", err) return } // Build request protobuf req := &PBDTableGetItem{ Dest: remote.ToProtobuf(), KeyHash: reqItem.keyHash, } reqData, _ := proto.Marshal(req) encoded := dt.transport.Encode(PbDtableGetItem, reqData) _, err = req_sock.SendBytes(encoded, 0) if err != nil { error_c <- fmt.Errorf("ZMQ:DTable:remoteGet - error while sending request - %s", err) return } // read response and decode it resp, err := req_sock.RecvBytes(0) if err != nil { error_c <- fmt.Errorf("ZMQ:DTable:remoteGet - error while reading response - %s", err) return } decoded, err := dt.transport.Decode(resp) if err != nil { error_c <- fmt.Errorf("ZMQ:DTable:remoteGet - error while decoding response - %s", err) return } switch decoded.Type { case dendrite.PbErr: pbMsg := decoded.TransportMsg.(dendrite.PBProtoErr) error_c <- fmt.Errorf("ZMQ:DTable:remoteGet - got error response - %s", pbMsg.GetError()) case PbDtableItem: pbMsg := decoded.TransportMsg.(PBDTableItem) if found := pbMsg.GetFound(); !found { notfound_c <- true return } item := new(kvItem) copy(item.Key, reqItem.Key) copy(item.keyHash, reqItem.keyHash) item.Val = pbMsg.GetVal() resp_c <- item return default: // unexpected response error_c <- fmt.Errorf("ZMQ:DTable:remoteGet - unexpected response") return } }() select { case <-time.After(zmq_transport.ClientTimeout): return nil, false, fmt.Errorf("ZMQ:DTable:remoteGet - command timed out!") case err := <-error_c: return nil, false, err case _ = <-notfound_c: return nil, false, nil case item := <-resp_c: return item, true, nil } }
/* set writes to dtable's primary(non-replica table). It is called from both Query api and by remote clients via zmq. It reports back on done chan when minAcks is reached so that clients can continue without blocking while replication takes place. */ func (dt *DTable) set(vn *dendrite.Vnode, item *kvItem, minAcks int, done chan error) { // make sure we have local handler before doing any write handler, _ := dt.transport.GetVnodeHandler(vn) if handler == nil { done <- fmt.Errorf("local handler could not be found for vnode %x", vn.Id) return } write_count := 0 vn_table, _ := dt.table[vn.String()] item.lock.Lock() defer item.lock.Unlock() item.replicaInfo.master = vn err := vn_table.put(item) if err != nil { done <- err return } write_count++ repwrite_count := 0 returned := false item.replicaInfo.state = replicaIncomplete // should we return to client immediately? if minAcks == write_count { // cover the case where ring.Replicas() returns 0 if dt.ring.Replicas() == repwrite_count { item.replicaInfo.state = replicaStable item.commited = true done <- nil dt.callHooks(item) return } item.commited = true done <- nil returned = true } // find remote successors to write replicas to remote_succs, err := handler.FindRemoteSuccessors(dt.ring.Replicas()) if err != nil { if !returned { done <- fmt.Errorf("could not find replica nodes due to error %s", err) } dt.Logf(LogDebug, "could not find replica nodes due to error %s\n", err) dt.rollback(vn, item) return } // don't write any replica if not enough replica nodes have been found for requested consistency if minAcks > len(remote_succs)+1 { done <- fmt.Errorf("insufficient nodes found for requested consistency level (%d)\n", minAcks) dt.rollback(vn, item) return } // now lets write replicas item_replicas := make([]*dendrite.Vnode, 0) repl_item := item.dup() repl_item.commited = false for _, succ := range remote_succs { err := dt.remoteWriteReplica(vn, succ, repl_item) if err != nil { dt.Logf(LogDebug, "could not write replica due to error: %s\n", err) continue } item_replicas = append(item_replicas, succ) } // check if we have enough written replicas for requested minAcks if minAcks > len(item_replicas)+1 { done <- fmt.Errorf("insufficient active nodes found for requested consistency level (%d)\n", minAcks) dt.rollback(vn, item) return } // update replication state based on available replicas var target_state replicaState if dt.ring.Replicas() <= len(item_replicas) { target_state = replicaStable } else { target_state = replicaPartial } // replicas have been written, lets now update metadata real_idx := 0 fail_count := 0 repl_item.commited = true repl_item.replicaInfo.vnodes = item_replicas repl_item.replicaInfo.state = target_state repl_item.replicaInfo.master = vn for _, replica := range item_replicas { // update metadata/commit on remote repl_item.replicaInfo.depth = real_idx err := dt.remoteSetReplicaInfo(replica, repl_item) if err != nil { fail_count++ if !returned && len(item_replicas)-fail_count < minAcks { done <- fmt.Errorf("insufficient (phase2) active nodes found for requested consistency level (%d)\n", minAcks) dt.rollback(vn, item) return } continue } real_idx++ repwrite_count++ // notify client if enough replicas have been written if !returned && repwrite_count+1 == minAcks { done <- nil returned = true } } item.replicaInfo.state = target_state item.commited = true dt.callHooks(item) }