func TestGossiperAddRemoveGetNode(t *testing.T) {
	printTestInfo()

	g := NewGossiperImpl("0.0.0.0:9010", "0")
	nodes := []string{"0.0.0.0:90011", "0.0.0.0:90012",
		"0.0.0.0:90013", "0.0.0.0:90014"}

	// Test adding nodes.
	i := 1
	for _, node := range nodes {
		err := g.AddNode(node, types.NodeId(strconv.Itoa(i)))
		if err != nil {
			t.Error("Error adding new node")
		}
		i++
	}

	// Try adding an existing node.
	err := g.AddNode(nodes[0], types.NodeId(strconv.Itoa(1)))
	if err == nil {
		t.Error("Duplicate node addition did not fail")
	}

	// Check that the node list matches what was added.
	peerNodes := g.GetNodes()
	if len(peerNodes) != len(nodes) {
		t.Error("Peer nodes len does not match added nodes, got: ",
			peerNodes, " expected: ", nodes)
	}
outer:
	for _, origNode := range nodes {
		for _, peerNode := range peerNodes {
			if origNode == peerNode {
				continue outer
			}
		}
		t.Error("Peer nodes does not have added node: ", origNode)
	}

	// Test removing nodes.
	for _, node := range nodes {
		err := g.RemoveNode(node)
		if err != nil {
			t.Error("Error removing node ", node)
		}
	}

	// Try removing a non-existing node.
	err = g.RemoveNode("0.0.0.0:9020")
	if err == nil {
		t.Error("Non-existing node removal did not fail")
	}

	g.Stop()
}
func TestGossipStoreSubset(t *testing.T) {
	printTestInfo()

	g := NewGossipStore(ID)

	// Case: empty store with an empty and a non-empty node list.
	log.Info("Testing: empty store and empty nodelist")
	diff := make(types.StoreNodes, 0)
	sv := g.Subset(diff)
	if len(sv) != 0 {
		t.Error("Empty subset expected, got: ", sv)
	}

	nodeLen := 10
	for i := 0; i < nodeLen*2; i++ {
		diff = append(diff, types.NodeId(strconv.Itoa(i)))
	}

	log.Info("Testing: empty store and non-empty nodelist")
	sv = g.Subset(diff)
	if len(sv) != 0 {
		t.Error("Empty subset expected, got: ", sv)
	}

	// Case: non-empty store. The diff asks for 20 nodes, but the store
	// holds only a subset of them, as well as some keys it does not
	// know about.
	keyList := []types.StoreKey{"key1", "key2", "key3"}
	g.nodeMap = make(types.NodeInfoMap)
	for _, key := range keyList {
		fillUpNodeInfoMap(g.nodeMap, key, nodeLen)
	}

	diff = make(types.StoreNodes, 0)
	for i := 0; i < nodeLen; i++ {
		diff = append(diff, types.NodeId(strconv.Itoa(2*i)))
	}

	log.Info("Testing: non-empty store and non-empty nodelist")
	sv = g.Subset(diff)
	if len(sv) != nodeLen/2 {
		t.Error("Subset has more keys than requested: ", sv)
	}
	for id, info := range sv {
		gInfo, ok := g.nodeMap[id]
		if !ok {
			t.Error("Subset returned id which was not originally present ", id)
		}
		if !compareNodeInfo(info, gInfo) {
			t.Error("Node info does not match, d:", info, " o:", gInfo)
		}
	}
}
func verifyGossiperEquality(g1 *GossiperImpl, g2 *GossiperImpl, t *testing.T) {
	// Check that both gossipers hold the same keys and values.
	g1Keys := g1.GetStoreKeys()
	g2Keys := g2.GetStoreKeys()
	if len(g1Keys) != len(g2Keys) {
		t.Error("Keys mismatch, g1: ", g1Keys, " g2:", g2Keys)
	}

	for _, key := range g1Keys {
		g1Values := g1.GetStoreKeyValue(key)
		g2Values := g2.GetStoreKeyValue(key)
		t.Log("g1Values: ", g1Values)
		t.Log("g2Values: ", g2Values)

		if len(g1Values) != len(g2Values) {
			t.Fatal("Length mismatch between g1 and g2 values")
		}
		for i := 0; i < len(g1Values); i++ {
			id := types.NodeId(strconv.Itoa(i))
			if g1Values[id].Id != g2Values[id].Id {
				t.Error("Values mismatch between g1 and g2, g1:\n",
					g1Values[id].Id, "\ng2:", g2Values[id].Id)
			}
		}
	}
}
func fillUpNodeInfo(node *types.NodeInfo, key types.StoreKey, i int) {
	node.Id = types.NodeId(strconv.Itoa(i))
	node.LastUpdateTs = time.Now()
	node.Status = types.NODE_STATUS_UP

	node.Value = make(types.StoreMap)
	node.Value[types.StoreKey(CPU+key)] = node.Id
	node.Value[types.StoreKey(MEMORY+key)] = node.Id
}
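// fillUpNodeInfoMap is called by the store tests above but is not shown in
// this section. The version below is a minimal sketch of the assumed
// behavior, inferred from how fillUpNodeInfo is used: populate numOfNodes
// entries in the given NodeInfoMap, one per node id. The signature and body
// are assumptions for illustration, not the actual helper.
func fillUpNodeInfoMap(nodes types.NodeInfoMap, key types.StoreKey,
	numOfNodes int) {
	for i := 0; i < numOfNodes; i++ {
		// Build one node with values under CPU+key and MEMORY+key,
		// then index it by its string node id.
		var node types.NodeInfo
		fillUpNodeInfo(&node, key, i)
		nodes[node.Id] = node
	}
}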
// Initialize the node and alert listeners that we are joining the cluster.
func (c *ClusterManager) joinCluster(db *Database, self *api.Node, exist bool) error {
	var err error

	// If I am already in the cluster map, don't add me again.
	if exist {
		goto found
	}

	// Alert all listeners that we are a new node joining an existing cluster.
	for e := c.listeners.Front(); e != nil; e = e.Next() {
		err = e.Value.(ClusterListener).Init(self, db)
		if err != nil {
			self.Status = api.Status_STATUS_ERROR
			dlog.Warnf("Failed to initialize %s: %v",
				e.Value.(ClusterListener).String(), err)
			c.cleanupInit(db, self)
			goto done
		}
	}

found:
	// Alert all listeners that we are joining the cluster.
	for e := c.listeners.Front(); e != nil; e = e.Next() {
		err = e.Value.(ClusterListener).Join(self, db)
		if err != nil {
			self.Status = api.Status_STATUS_ERROR
			dlog.Warnf("Failed to join %s: %v",
				e.Value.(ClusterListener).String(), err)
			if !exist {
				c.cleanupInit(db, self)
			}
			goto done
		}
	}

	for id, n := range db.NodeEntries {
		if id != c.config.NodeId {
			// Check to see if the IP is the same. If it is, then we have a stale entry.
			if n.MgmtIp == self.MgmtIp {
				dlog.Warnf("Detected node %s with the same IP %s in the database. Will not connect to this node.",
					id, n.MgmtIp)
			} else {
				// Gossip with this node.
				dlog.Infof("Connecting to node %s with IP %s.", id, n.MgmtIp)
				c.gossip.AddNode(n.MgmtIp+":9002", types.NodeId(id))
			}
		}
	}

done:
	return err
}
func TestTransportSendAndRcvData(t *testing.T) {
	printTestInfo()
	time.Sleep(10 * time.Second)

	data1 := &TestData{}
	data2 := &TestData{}
	data1.Data = make(map[types.StoreKey]types.NodeInfo)
	data2.Data = make(map[types.StoreKey]types.NodeInfo)

	var handler types.OnMessageRcv = func(c types.MessageChannel) {
		err := c.RcvData(&data2)
		if err != nil {
			t.Error("Error receiving data: ", err)
		} else {
			t.Log("Done receiving")
		}
	}

	ipString := "0.0.0.0:19002"
	r := NewRunnableMessageChannel(ipString, handler)
	go r.RunOnRcvData()
	time.Sleep(1 * time.Second)

	keyList := []types.StoreKey{"key1", "key2"}
	for i, key := range keyList {
		var node types.NodeInfo
		node.Id = types.NodeId(strconv.Itoa(i))
		node.Value = make(types.StoreMap)
		node.Value[key] = "some data"
		data1.Data[key] = node
	}

	s := NewMessageChannel(ipString)
	if s == nil {
		t.Fatal("Error creating send channel, failing test")
	}
	go s.SendData(&data1)

	time.Sleep(1 * time.Second)
	s.Close()
	r.Close()

	if len(data1.Data) != len(data2.Data) {
		t.Error("Sent and rcvd messages mismatch, sent: ", data1,
			" got: ", data2)
	}
}
func TestGossipStoreGetStoreKeyValue(t *testing.T) {
	printTestInfo()

	// Case: empty store
	// Case: key absent
	g := NewGossipStore(ID)

	keyList := []types.StoreKey{"key1", "key2"}

	nodeInfoMap := g.GetStoreKeyValue(keyList[0])
	if len(nodeInfoMap) != 0 {
		t.Error("Expected empty node info list, got: ", nodeInfoMap)
	}

	// Case: key present, with holes in the node ids.
	fillUpNodeInfoMap(g.nodeMap, keyList[0], 6)
	if len(g.nodeMap) != 6 {
		t.Error("Failed to fill up node info map properly, got: ", g.nodeMap)
	}

	keyCheck := types.StoreKey(CPU + keyList[0])
	delete(g.nodeMap["0"].Value, keyCheck)
	delete(g.nodeMap["2"].Value, keyCheck)
	delete(g.nodeMap["4"].Value, keyCheck)

	nodeInfoMap = g.GetStoreKeyValue(keyCheck)
	if len(nodeInfoMap) != 3 {
		t.Error("Expected a map with 3 elements, got: ", nodeInfoMap)
	}

	for i := 0; i < len(nodeInfoMap); i++ {
		id := types.NodeId(strconv.Itoa(i))
		if i%2 == 0 {
			if _, ok := nodeInfoMap[id]; ok {
				t.Error("No node expected, got: ", nodeInfoMap[id])
			}
			continue
		}
		nodeValue := nodeInfoMap[id].Value.(types.NodeId)
		if nodeInfoMap[id].Id != id ||
			nodeInfoMap[id].Status != types.NODE_STATUS_UP ||
			nodeValue != id {
			t.Error("Invalid node content received, got: ", nodeInfoMap[id])
		}
	}
}
// Get the latest config.
func (c *ClusterManager) watchDB(key string, opaque interface{},
	kvp *kvdb.KVPair, err error) error {

	db, err := readDatabase()
	if err != nil {
		dlog.Warnln("Failed to read database after update ", err)
		return nil
	}

	// The only value we rely on during an update is the cluster size.
	c.size = db.Size

	for id, n := range db.NodeEntries {
		if id != c.config.NodeId {
			// Refresh the gossip peer with the latest management IP for this node.
			c.gossip.UpdateNode(n.MgmtIp+":9002", types.NodeId(id))
		}
	}

	return nil
}
func (c *ClusterManager) Start() error {
	log.Info("Cluster manager starting...")
	kvdb := kv.Instance()

	// Start the gossip protocol.
	// XXX Make the port configurable.
	gob.Register(api.Node{})
	c.g = gossip.New("0.0.0.0:9002", gossiptypes.NodeId(c.config.NodeId))
	c.g.SetGossipInterval(2 * time.Second)

	kvlock, err := kvdb.Lock("cluster/lock", 60)
	if err != nil {
		log.Panic("Fatal, Unable to obtain cluster lock.", err)
	}

	db, err := readDatabase()
	if err != nil {
		log.Panic(err)
	}

	if db.Status == api.StatusInit {
		log.Info("Will initialize a new cluster.")

		c.status = api.StatusOk
		db.Status = api.StatusOk
		self, _ := c.initNode(&db)

		err = c.initCluster(&db, self, false)
		if err != nil {
			kvdb.Unlock(kvlock)
			log.Error("Failed to initialize the cluster.", err)
			log.Panic(err)
		}

		// Update the new state of the cluster in the KV database.
		err = writeDatabase(&db)
		if err != nil {
			log.Error("Failed to save the database.", err)
			log.Panic(err)
		}

		err = kvdb.Unlock(kvlock)
		if err != nil {
			log.Panic("Fatal, unable to unlock cluster... Did something take too long to initialize?", err)
		}
	} else if db.Status&api.StatusOk > 0 {
		log.Info("Cluster state is OK... Joining the cluster.")

		c.status = api.StatusOk
		self, exist := c.initNode(&db)

		err = c.joinCluster(&db, self, exist)
		if err != nil {
			kvdb.Unlock(kvlock)
			log.Panic(err)
		}

		err = writeDatabase(&db)
		if err != nil {
			log.Panic(err)
		}

		err = kvdb.Unlock(kvlock)
		if err != nil {
			log.Panic("Fatal, unable to unlock cluster... Did something take too long to initialize?", err)
		}
	} else {
		kvdb.Unlock(kvlock)
		err = errors.New("Fatal, Cluster is in an unexpected state.")
		log.Panic(err)
	}

	// Start heartbeating to other nodes.
	go c.heartBeat()

	return nil
}
func clearKey(nodes types.NodeInfoMap, key types.StoreKey, id int) {
	nodeId := types.NodeId(strconv.Itoa(id))
	nodeInfo := nodes[nodeId]
	delete(nodeInfo.Value, types.StoreKey(CPU+key))
	delete(nodeInfo.Value, types.StoreKey(MEMORY+key))
}
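// compareNodeInfo and compareNodeValues are used by the tests in this
// section but their definitions are not shown here. The versions below are
// minimal sketches of the assumed behavior: compare two NodeInfo structs
// field by field, and compare only their value maps. They are illustrations
// under those assumptions, not the actual helpers.
func compareNodeInfo(d types.NodeInfo, o types.NodeInfo) bool {
	// Compare identity, status, update timestamp and the value maps.
	if d.Id != o.Id || d.Status != o.Status ||
		!d.LastUpdateTs.Equal(o.LastUpdateTs) ||
		len(d.Value) != len(o.Value) {
		return false
	}
	for key, value := range d.Value {
		oValue, ok := o.Value[key]
		if !ok || value != oValue {
			return false
		}
	}
	return true
}

func compareNodeValues(d types.NodeInfo, o types.NodeInfo, t *testing.T) {
	// Only the per-key values are compared; ids are checked by the callers.
	if len(d.Value) != len(o.Value) {
		t.Error("Value lengths mismatch, d: ", d.Value, " o: ", o.Value)
	}
	for key, value := range d.Value {
		if o.Value[key] != value {
			t.Error("Values mismatch for key ", key, " d: ", value,
				" o: ", o.Value[key])
		}
	}
}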
func TestTransportFailures(t *testing.T) {
	printTestInfo()
	time.Sleep(10 * time.Second)

	data1 := &TestData{}
	data2 := &TestData{}
	data1.Data = make(map[types.StoreKey]types.NodeInfo)
	data2.Data = make(map[types.StoreKey]types.NodeInfo)

	var handler types.OnMessageRcv = func(c types.MessageChannel) {
		err := c.RcvData(&data2)
		if err == nil {
			t.Error("Did not receive expected error")
		}
	}

	fmt.Println("Close without sending data")

	ipString := "0.0.0.0:17016"
	r := NewRunnableMessageChannel(ipString, handler)
	go r.RunOnRcvData()
	time.Sleep(5 * time.Second)

	keyList := []types.StoreKey{"key1", "key2"}
	for i, key := range keyList {
		var node types.NodeInfo
		node.Id = types.NodeId(strconv.Itoa(i))
		node.Value = make(types.StoreMap)
		node.Value[key] = "some data"
		data1.Data[key] = node
	}

	// Close the channel without sending any message.
	s := NewMessageChannel(ipString)
	if s == nil {
		t.Fatal("Error creating send channel, failing test")
	}
	time.Sleep(10 * time.Millisecond)
	s.Close()
	time.Sleep(10 * time.Millisecond)
	r.Close()
	time.Sleep(10 * time.Millisecond)

	fmt.Println("Close the channel and then send")

	// Open and then close the channel.
	ipString = "0.0.0.0:17617"
	r = NewRunnableMessageChannel(ipString, handler)
	go r.RunOnRcvData()
	time.Sleep(1 * time.Second)
	r.Close()
	time.Sleep(2 * time.Second)

	// Try sending data to the closed end.
	s = NewMessageChannel(ipString)
	if s != nil {
		t.Error("Error, expected nil sender")
	}

	fmt.Println("Send non-marshalable data")

	// Try sending non-marshalable data.
	ipString = "0.0.0.0:17418"
	r = NewRunnableMessageChannel(ipString, handler)
	go r.RunOnRcvData()
	time.Sleep(1 * time.Second)

	s = NewMessageChannel(ipString)
	if s == nil {
		t.Fatal("Error creating send channel, failing test")
	}

	nonMarshalData := make(map[types.StoreKey]map[types.StoreKey]types.NodeInfoMap)
	err := s.SendData(nonMarshalData)
	if err == nil {
		t.Error("Expected error sending non-marshalable data")
	}

	s.Close()
	r.Close()
}
func TestGossiperOneNodeNeverGossips(t *testing.T) {
	printTestInfo()

	nodes := []string{"0.0.0.0:9222", "0.0.0.0:9223", "0.0.0.0:9224"}

	rand.Seed(time.Now().UnixNano())
	gossipers := make(map[int]*GossiperImpl)
	for i, nodeId := range nodes {
		if i == 0 {
			// Node 0 never comes up.
			continue
		}
		id := types.NodeId(strconv.Itoa(i))
		g := NewGossiperImpl(nodeId, id)
		g.SetGossipInterval(time.Duration(200+rand.Intn(200)) * time.Millisecond)

		for j, peer := range nodes {
			if i == j {
				continue
			}
			g.AddNode(peer, types.NodeId(strconv.Itoa(j)))
		}
		gossipers[i] = g
	}

	// Each node must eventually mark node 0 as down.
	key := types.StoreKey("somekey")
	value := "someValue"

	for i, g := range gossipers {
		g.UpdateSelf(key, value+strconv.Itoa(i))
	}

	// Before any gossip rounds have run, node 0 must not be marked down yet.
	for i, g := range gossipers {
		res := g.GetStoreKeyValue(key)
		for nodeId, n := range res {
			if nodeId != n.Id {
				t.Error("Gossiper ", i, "Id does not match ", nodeId, " n:", n.Id)
			}
			nid, err := strconv.Atoi(string(nodeId))
			if err != nil {
				t.Error("Failed to convert node to id ", nodeId, " n.Id", n.Id)
			}
			if nid == 0 {
				if n.Status == types.NODE_STATUS_DOWN {
					t.Error("Gossiper ", i, "Expected node status not to be down: ",
						nodeId, " n:", n)
				}
			}
		}
	}

	time.Sleep(2 * time.Second)

	// After gossiping, node 0 must be down and all other nodes up.
	for i, g := range gossipers {
		res := g.GetStoreKeyValue(key)
		for nodeId, n := range res {
			if nodeId != n.Id {
				t.Error("Gossiper ", i, "Id does not match ", nodeId, " n:", n.Id)
			}
			nid, err := strconv.Atoi(string(nodeId))
			if err != nil {
				t.Error("Failed to convert node to id ", nodeId, " n.Id", n.Id)
			}
			if nid == 0 {
				if n.Status != types.NODE_STATUS_DOWN {
					t.Error("Gossiper ", i, "Expected node status to be down: ",
						nodeId, " n:", n)
				}
			} else {
				if n.Status != types.NODE_STATUS_UP {
					t.Error("Gossiper ", i, "Expected node to be up: ", nodeId, " n:", n)
				}
			}
		}
	}

	for _, g := range gossipers {
		g.Stop()
	}
}
func (c *ClusterManager) Start() error {
	logrus.Info("Cluster manager starting...")

	c.gEnabled = true
	c.selfNode = api.Node{}
	c.selfNode.GenNumber = uint64(time.Now().UnixNano())
	c.selfNode.Id = c.config.NodeId
	c.selfNode.Status = api.StatusOk
	c.selfNode.Ip, _ = externalIp(&c.config)
	c.selfNode.NodeData = make(map[string]interface{})
	c.system = systemutils.New()

	// Start the gossip protocol.
	// XXX Make the port configurable.
	gob.Register(api.Node{})
	c.g = gossip.New("0.0.0.0:9002", types.NodeId(c.config.NodeId),
		c.selfNode.GenNumber)
	c.g.SetGossipInterval(2 * time.Second)

	kvdb := kvdb.Instance()
	kvlock, err := kvdb.Lock("cluster/lock", 60)
	if err != nil {
		logrus.Panic("Fatal, Unable to obtain cluster lock.", err)
	}
	defer kvdb.Unlock(kvlock)

	db, err := readDatabase()
	if err != nil {
		logrus.Panic(err)
	}

	if db.Status == api.StatusInit {
		logrus.Info("Will initialize a new cluster.")

		c.status = api.StatusOk
		db.Status = api.StatusOk
		self, _ := c.initNode(&db)

		err = c.initCluster(&db, self, false)
		if err != nil {
			logrus.Error("Failed to initialize the cluster.", err)
			return err
		}

		// Update the new state of the cluster in the KV database.
		err := writeDatabase(&db)
		if err != nil {
			logrus.Error("Failed to save the database.", err)
			return err
		}
	} else if db.Status&api.StatusOk > 0 {
		logrus.Info("Cluster state is OK... Joining the cluster.")

		c.status = api.StatusOk
		self, exist := c.initNode(&db)

		err = c.joinCluster(&db, self, exist)
		if err != nil {
			logrus.Error("Failed to join cluster.", err)
			return err
		}

		err := writeDatabase(&db)
		if err != nil {
			return err
		}
	} else {
		return errors.New("Fatal, Cluster is in an unexpected state.")
	}

	// Start heartbeating to other nodes.
	c.g.Start()
	go c.heartBeat()

	return nil
}
func (c *ClusterManager) heartBeat() {
	gossipStoreKey := types.StoreKey(heartbeatKey + c.config.ClusterId)

	lastUpdateTs := time.Now()
	for {
		node := c.getCurrentState()
		c.nodeCache[node.Id] = *node

		currTime := time.Now()
		if currTime.Sub(lastUpdateTs) > 10*time.Second {
			logrus.Warn("No gossip update for 10 seconds")
		}
		c.g.UpdateSelf(gossipStoreKey, *node)
		lastUpdateTs = currTime

		// Process heartbeats from other nodes...
		gossipValues := c.g.GetStoreKeyValue(gossipStoreKey)

		for id, nodeInfo := range gossipValues {
			if id == types.NodeId(node.Id) {
				continue
			}

			cachedNodeInfo, nodeFoundInCache := c.nodeCache[string(id)]
			n := cachedNodeInfo
			ok := false
			if nodeInfo.Value != nil {
				n, ok = nodeInfo.Value.(api.Node)
				if !ok {
					logrus.Errorf("Received a bad broadcast packet: %v", nodeInfo.Value)
					continue
				}
			}

			if nodeFoundInCache {
				if n.Status != api.StatusOk {
					logrus.Warn("Detected node ", n.Id, " to be unhealthy.")

					for e := c.listeners.Front(); e != nil && c.gEnabled; e = e.Next() {
						err := e.Value.(ClusterListener).Update(&n)
						if err != nil {
							logrus.Warn("Failed to notify ", e.Value.(ClusterListener).String())
						}
					}

					delete(c.nodeCache, n.Id)
					continue
				} else if nodeInfo.Status == types.NODE_STATUS_DOWN {
					ne := c.getLatestNodeConfig(string(id))
					if ne != nil && nodeInfo.GenNumber < ne.GenNumber {
						logrus.Warn("Detected stale update for node ", id,
							" going down, ignoring it")
						c.g.MarkNodeHasOldGen(id)
						delete(c.nodeCache, cachedNodeInfo.Id)
						continue
					}

					logrus.Warn("Detected node ", id, " to be offline due to inactivity.")

					n.Status = api.StatusOffline
					for e := c.listeners.Front(); e != nil && c.gEnabled; e = e.Next() {
						err := e.Value.(ClusterListener).Update(&n)
						if err != nil {
							logrus.Warn("Failed to notify ", e.Value.(ClusterListener).String())
						}
					}

					delete(c.nodeCache, cachedNodeInfo.Id)
				} else if nodeInfo.Status == types.NODE_STATUS_DOWN_WAITING_FOR_NEW_UPDATE {
					logrus.Warn("Detected node ", n.Id, " to be offline due to inactivity.")

					n.Status = api.StatusOffline
					for e := c.listeners.Front(); e != nil && c.gEnabled; e = e.Next() {
						err := e.Value.(ClusterListener).Update(&n)
						if err != nil {
							logrus.Warn("Failed to notify ", e.Value.(ClusterListener).String())
						}
					}

					delete(c.nodeCache, cachedNodeInfo.Id)
				} else {
					// Node may be up or waiting for a new update;
					// no need to tell listeners as yet.
					c.nodeCache[cachedNodeInfo.Id] = n
				}
			} else if nodeInfo.Status == types.NODE_STATUS_UP {
				// A node discovered in the cluster.
				logrus.Warn("Detected node ", n.Id, " to be in the cluster.")

				c.nodeCache[n.Id] = n
				for e := c.listeners.Front(); e != nil && c.gEnabled; e = e.Next() {
					err := e.Value.(ClusterListener).Add(&n)
					if err != nil {
						logrus.Warn("Failed to notify ", e.Value.(ClusterListener).String())
					}
				}
			}
		}

		time.Sleep(2 * time.Second)
	}
}
func (c *ClusterManager) Start() error {
	dlog.Infoln("Cluster manager starting...")

	c.gEnabled = true
	c.selfNode = api.Node{}
	c.selfNode.GenNumber = uint64(time.Now().UnixNano())
	c.selfNode.Id = c.config.NodeId
	c.selfNode.Status = api.Status_STATUS_OK
	c.selfNode.MgmtIp, c.selfNode.DataIp, _ = ExternalIp(&c.config)
	c.selfNode.NodeData = make(map[string]interface{})
	c.system = systemutils.New()

	// Start the gossip protocol.
	// XXX Make the port configurable.
	gob.Register(api.Node{})
	c.gossip = gossip.New("0.0.0.0:9002", types.NodeId(c.config.NodeId),
		c.selfNode.GenNumber)
	c.gossip.SetGossipInterval(2 * time.Second)

	kvdb := kvdb.Instance()
	kvlock, err := kvdb.Lock(clusterLockKey, 60)
	if err != nil {
		dlog.Panicln("Fatal, Unable to obtain cluster lock.", err)
	}
	defer kvdb.Unlock(kvlock)

	db, err := readDatabase()
	if err != nil {
		dlog.Panicln(err)
	}

	// Cluster database max size... 0 if unlimited.
	c.size = db.Size

	if db.Status == api.Status_STATUS_INIT {
		dlog.Infoln("Will initialize a new cluster.")

		c.status = api.Status_STATUS_OK
		db.Status = api.Status_STATUS_OK
		self, _ := c.initNode(&db)

		err = c.initCluster(&db, self, false)
		if err != nil {
			dlog.Errorln("Failed to initialize the cluster.", err)
			return err
		}

		// Update the new state of the cluster in the KV database.
		err := writeDatabase(&db)
		if err != nil {
			dlog.Errorln("Failed to save the database.", err)
			return err
		}
	} else if db.Status&api.Status_STATUS_OK > 0 {
		dlog.Infoln("Cluster state is OK... Joining the cluster.")

		c.status = api.Status_STATUS_OK
		self, exist := c.initNode(&db)

		err = c.joinCluster(&db, self, exist)
		if err != nil {
			dlog.Errorln("Failed to join cluster.", err)
			return err
		}

		err := writeDatabase(&db)
		if err != nil {
			return err
		}
	} else {
		return errors.New("Fatal, Cluster is in an unexpected state.")
	}

	// Start heartbeating to other nodes.
	go c.startHeartBeat()
	go c.updateClusterStatus()

	kvdb.WatchKey(ClusterDBKey, 0, nil, c.watchDB)

	return nil
}
func (c *ClusterManager) updateClusterStatus() {
	gossipStoreKey := types.StoreKey(heartbeatKey + c.config.ClusterId)

	for {
		node := c.getCurrentState()
		c.nodeCache[node.Id] = *node

		// Process heartbeats from other nodes...
		gossipValues := c.gossip.GetStoreKeyValue(gossipStoreKey)

		numNodes := 0
		for id, nodeInfo := range gossipValues {
			numNodes = numNodes + 1

			// Check to make sure we are not exceeding the size of the cluster.
			if c.size > 0 && numNodes > c.size {
				dlog.Fatalf("Fatal, number of nodes in the cluster has "+
					"exceeded the cluster size: %d > %d", numNodes, c.size)
				os.Exit(-1)
			}

			// Ignore updates from the self node.
			if id == types.NodeId(node.Id) {
				continue
			}

			// Notify node status change if required.
			newNodeInfo := api.Node{}
			newNodeInfo.Id = string(id)
			newNodeInfo.Status = api.Status_STATUS_OK

			switch {
			case nodeInfo.Status == types.NODE_STATUS_DOWN:
				newNodeInfo.Status = api.Status_STATUS_OFFLINE
				lastStatus, ok := c.nodeStatuses[string(id)]
				if ok && lastStatus == newNodeInfo.Status {
					break
				}

				// Check if it is a stale update.
				ne := c.getLatestNodeConfig(string(id))
				if ne != nil && nodeInfo.GenNumber != 0 &&
					nodeInfo.GenNumber < ne.GenNumber {
					dlog.Warnln("Detected stale update for node ", id,
						" going down, ignoring it")
					c.gossip.MarkNodeHasOldGen(id)
					break
				}

				c.nodeStatuses[string(id)] = newNodeInfo.Status
				dlog.Warnln("Detected node ", id,
					" to be offline due to inactivity.")

				for e := c.listeners.Front(); e != nil && c.gEnabled; e = e.Next() {
					err := e.Value.(ClusterListener).Update(&newNodeInfo)
					if err != nil {
						dlog.Warnln("Failed to notify ",
							e.Value.(ClusterListener).String())
					}
				}

			case nodeInfo.Status == types.NODE_STATUS_DOWN_WAITING_FOR_NEW_UPDATE:
				newNodeInfo.Status = api.Status_STATUS_OFFLINE
				lastStatus, ok := c.nodeStatuses[string(id)]
				if ok && lastStatus == newNodeInfo.Status {
					break
				}
				c.nodeStatuses[string(id)] = newNodeInfo.Status

				dlog.Warnln("Detected node ", newNodeInfo.Id,
					" to be offline due to inactivity.")

				for e := c.listeners.Front(); e != nil && c.gEnabled; e = e.Next() {
					err := e.Value.(ClusterListener).Update(&newNodeInfo)
					if err != nil {
						dlog.Warnln("Failed to notify ",
							e.Value.(ClusterListener).String())
					}
				}

			case nodeInfo.Status == types.NODE_STATUS_UP:
				newNodeInfo.Status = api.Status_STATUS_OK
				lastStatus, ok := c.nodeStatuses[string(id)]
				if ok && lastStatus == newNodeInfo.Status {
					break
				}
				c.nodeStatuses[string(id)] = newNodeInfo.Status

				// A node discovered in the cluster.
				dlog.Warnln("Detected node ", newNodeInfo.Id,
					" to be in the cluster.")

				for e := c.listeners.Front(); e != nil && c.gEnabled; e = e.Next() {
					err := e.Value.(ClusterListener).Add(&newNodeInfo)
					if err != nil {
						dlog.Warnln("Failed to notify ",
							e.Value.(ClusterListener).String())
					}
				}
			}

			// Update the cache.
			if nodeInfo.Value != nil {
				n, ok := nodeInfo.Value.(api.Node)
				if ok {
					n.Status = newNodeInfo.Status
					c.nodeCache[n.Id] = n
				} else {
					c.nodeCache[newNodeInfo.Id] = newNodeInfo
				}
			} else {
				newNodeInfo.Status = api.Status_STATUS_OFFLINE
				c.nodeCache[newNodeInfo.Id] = newNodeInfo
			}
		}

		time.Sleep(2 * time.Second)
	}
}
func TestTransportTwoWayExchange(t *testing.T) {
	printTestInfo()
	time.Sleep(10 * time.Second)

	data1 := &TestData{}
	data2 := &TestData{}
	data3 := &TestData{}
	data4 := &TestData{}
	data1.Data = make(map[types.StoreKey]types.NodeInfo)
	data2.Data = make(map[types.StoreKey]types.NodeInfo)
	data3.Data = make(map[types.StoreKey]types.NodeInfo)
	data4.Data = make(map[types.StoreKey]types.NodeInfo)

	var handler types.OnMessageRcv = func(c types.MessageChannel) {
		err := c.RcvData(&data2)
		if err != nil {
			t.Error("Error receiving data2: ", err)
		} else {
			t.Log("Done receiving")
		}

		for key, nodeInfo := range data2.Data {
			intId, _ := strconv.Atoi(string(nodeInfo.Id))
			nodeInfo.Id = types.NodeId(strconv.Itoa(intId + 1))
			data2.Data[key] = nodeInfo
		}

		err = c.SendData(data2)
		if err != nil {
			t.Error("Error sending data2: ", err)
		} else {
			t.Log("Done sending data2")
		}
		time.Sleep(20 * time.Millisecond)

		err = c.RcvData(&data4)
		if err != nil {
			t.Error("Error receiving data4: ", err)
		} else {
			t.Log("Done receiving data4")
		}
		time.Sleep(20 * time.Millisecond)
	}

	r := NewRunnableMessageChannel("0.0.0.0:19422", handler)
	go r.RunOnRcvData()
	time.Sleep(1 * time.Second)

	keyList := []types.StoreKey{"key1", "key2"}
	for i, key := range keyList {
		var node types.NodeInfo
		node.Id = types.NodeId(strconv.Itoa(i))
		node.Value = make(types.StoreMap)
		node.Value[key] = "some data"
		data1.Data[key] = node
	}

	ipString := "0.0.0.0:19422"
	s := NewMessageChannel(ipString)
	if s == nil {
		t.Fatal("Error creating send channel, failing test")
	}
	s.SendData(&data1)
	time.Sleep(20 * time.Millisecond)

	err := s.RcvData(&data3)
	if err != nil {
		t.Fatal("Error receiving data3: ", err)
	}

	for key, nodeInfo := range data3.Data {
		intId, _ := strconv.Atoi(string(nodeInfo.Id))
		nodeInfo.Id = types.NodeId(strconv.Itoa(intId + 1))
		data3.Data[key] = nodeInfo
	}
	time.Sleep(20 * time.Millisecond)

	err = s.SendData(&data3)
	if err != nil {
		t.Fatal("Error sending data3: ", err)
	}
	time.Sleep(20 * time.Millisecond)

	if len(data1.Data) != len(data2.Data) ||
		len(data1.Data) != len(data3.Data) ||
		len(data1.Data) != len(data4.Data) {
		t.Error("Data sent and received not matching Data1:", data1.Data,
			"\nData2:", data2.Data, "\nData3:", data3.Data,
			"\nData4:", data4.Data)
	}

	for key, nodeInfo := range data1.Data {
		nodeInfo2 := data2.Data[key]
		nodeInfo3 := data3.Data[key]
		nodeInfo4 := data4.Data[key]

		intId, _ := strconv.Atoi(string(nodeInfo.Id))
		id1 := types.NodeId(strconv.Itoa(intId + 1))
		id2 := types.NodeId(strconv.Itoa(intId + 2))
		if nodeInfo2.Id != id1 || nodeInfo3.Id != id2 || nodeInfo4.Id != id2 {
			t.Error("Data mismatch, Data1: ", nodeInfo, "\nData2:", nodeInfo2,
				"\nData3:", nodeInfo3, "\nData4:", nodeInfo4)
		}
		compareNodeValues(nodeInfo2, nodeInfo, t)
		compareNodeValues(nodeInfo3, nodeInfo, t)
		compareNodeValues(nodeInfo4, nodeInfo, t)
	}

	s.Close()
	r.Close()
}
func TestGossiperGossipMarkOldGenNode(t *testing.T) {
	printTestInfo()

	nodes := []string{"0.0.0.0:9225", "0.0.0.0:9226", "0.0.0.0:9227"}

	rand.Seed(time.Now().UnixNano())
	gossipers := make(map[int]*GossiperImpl)
	for i, nodeId := range nodes {
		if i == 0 {
			// Node 0 never comes up.
			continue
		}
		id := types.NodeId(strconv.Itoa(i))
		g := NewGossiperImpl(nodeId, id)
		g.SetGossipInterval(time.Duration(200+rand.Intn(200)) * time.Millisecond)

		for j, peer := range nodes {
			if i == j {
				continue
			}
			g.AddNode(peer, types.NodeId(strconv.Itoa(j)))
		}
		gossipers[i] = g
	}

	// Each node must eventually mark node 0 as down.
	key := types.StoreKey("somekey")
	value := "someValue"

	for i, g := range gossipers {
		g.UpdateSelf(key, value+strconv.Itoa(i))
	}

	// Before any gossip rounds have run, node 0 must not be marked down yet.
	for i, g := range gossipers {
		res := g.GetStoreKeyValue(key)
		for nodeId, n := range res {
			if nodeId != n.Id {
				t.Error("Gossiper ", i, "Id does not match ", nodeId, " n:", n.Id)
			}
			nid, err := strconv.Atoi(string(nodeId))
			if err != nil {
				t.Error("Failed to convert node to id ", nodeId, " n.Id", n.Id)
			}
			if nid == 0 {
				if n.Status == types.NODE_STATUS_DOWN {
					t.Error("Gossiper ", i, "Expected node status not to be down: ",
						nodeId, " n:", n)
				}
			}
		}
	}

	time.Sleep(2 * time.Second)

	// After gossiping, node 0 must be down and all other nodes up.
	for i, g := range gossipers {
		res := g.GetStoreKeyValue(key)
		for nodeId, n := range res {
			if nodeId != n.Id {
				t.Error("Gossiper ", i, "Id does not match ", nodeId, " n:", n.Id)
			}
			nid, err := strconv.Atoi(string(nodeId))
			if err != nil {
				t.Error("Failed to convert node to id ", nodeId, " n.Id", n.Id)
			}
			if nid == 0 {
				if n.Status != types.NODE_STATUS_DOWN {
					t.Error("Gossiper ", i, "Expected node status to be down: ",
						nodeId, " n:", n)
				}
			} else {
				if n.Status != types.NODE_STATUS_UP {
					t.Error("Gossiper ", i, "Expected node to be up: ", nodeId, " n:", n)
				}
			}
		}
	}

	// Now reset both node 0 and node 1 in node 2.
	nid0 := types.NodeId(strconv.Itoa(0))
	nid1 := types.NodeId(strconv.Itoa(1))
	nid2 := types.NodeId(strconv.Itoa(2))
	g := gossipers[2]
	g.MarkNodeHasOldGen(nid0)
	g.MarkNodeHasOldGen(nid1)

	// Update the value in node 1.
	g1 := gossipers[1]
	g1.UpdateSelf(key, value+"__1")

	// Immediately after the reset, the reset nodes must be waiting for a new update.
	g = gossipers[2]
	res := g.GetStoreKeyValue(key)
	for nodeId, n := range res {
		if nid2 == nodeId {
			continue
		}
		if nodeId != n.Id {
			t.Error("Id does not match ", nodeId, " n:", n.Id)
		}
		_, err := strconv.Atoi(string(nodeId))
		if err != nil {
			t.Error("Failed to convert node to id ", nodeId, " n.Id", n.Id)
		}
		if n.Status != types.NODE_STATUS_WAITING_FOR_NEW_UPDATE {
			t.Error("Expected node status to be waiting for new update: ",
				nodeId, " n:", n)
		}
	}

	// Once node 1's new update propagates, it must come back up.
	time.Sleep(800 * time.Millisecond)
	res = g.GetStoreKeyValue(key)
	g1Node := res[nid1]
	if g1Node.Status != types.NODE_STATUS_UP {
		t.Error("Expected node to be up ", g1Node)
	}

	// Sleep past the node death interval; both nodes must be marked down.
	time.Sleep(35 * time.Second)
	res = g.GetStoreKeyValue(key)
	for nodeId, n := range res {
		if nid2 == nodeId {
			continue
		}
		if nodeId != n.Id {
			t.Error("Id does not match ", nodeId, " n:", n.Id)
		}
		_, err := strconv.Atoi(string(nodeId))
		if err != nil {
			t.Error("Failed to convert node to id ", nodeId, " n.Id", n.Id)
		}
		if n.Status != types.NODE_STATUS_DOWN {
			t.Error("Expected node to be down: ", nodeId, " n:", n)
		}
	}

	// Test that after a reset the node is not marked up again unless it
	// sends a new update.
	g.MarkNodeHasOldGen(nid1)
	time.Sleep(35 * time.Second)
	res = g.GetStoreKeyValue(key)
	g1Node = res[nid1]
	if g1Node.Status != types.NODE_STATUS_DOWN_WAITING_FOR_NEW_UPDATE {
		t.Error("Expected node to be down waiting for update ", g1Node)
	}

	for _, g := range gossipers {
		g.Stop()
	}
}
func TestGossiperMultipleNodesGoingUpDown(t *testing.T) {
	printTestInfo()

	nodes := []string{"0.0.0.0:9152", "0.0.0.0:9153",
		"0.0.0.0:9154", "0.0.0.0:9155",
		"0.0.0.0:9156", "0.0.0.0:9157",
		"0.0.0.0:9158", "0.0.0.0:9159",
		"0.0.0.0:9160", "0.0.0.0:9161"}

	rand.Seed(time.Now().UnixNano())
	gossipers := make(map[string]*GossiperImpl)
	for i, nodeId := range nodes {
		g := NewGossiperImpl(nodeId, types.NodeId(strconv.Itoa(i)))
		g.SetGossipInterval(time.Duration(1500+rand.Intn(200)) * time.Millisecond)

		// Add the next neighbor.
		if i < len(nodes)-1 {
			err := g.AddNode(nodes[i+1], types.NodeId(strconv.Itoa(i+1)))
			if err != nil {
				t.Error("Unexpected error adding node to id: ", nodeId,
					" node: ", nodes[i+1])
			}
		} else {
			err := g.AddNode(nodes[0], types.NodeId(strconv.Itoa(0)))
			if err != nil {
				t.Error("Unexpected error adding node to id: ", nodeId,
					" node: ", nodes[0])
			}
		}

		// To this gossiper, add two random peers.
		for count := 0; count < 2; {
			randId := rand.Intn(len(nodes))
			if randId == i {
				continue
			}
			err := g.AddNode(nodes[randId], types.NodeId(strconv.Itoa(randId)))
			if err != nil {
				t.Log("Unexpected error adding node to id: ", nodeId,
					" node: ", nodes[randId], " err: ", err)
			} else {
				count++
			}
		}
		gossipers[nodeId] = g
		time.Sleep(2000 * time.Millisecond)
	}

	updateFunc := func(g *GossiperImpl, id string, max int, t *testing.T) {
		for i := 0; i < max; i++ {
			t.Log("Updating data for ", id)
			g.UpdateSelf("sameKey", strconv.Itoa(i))
			g.UpdateSelf(types.StoreKey(g.NodeId()), strconv.Itoa(i*i))
			time.Sleep(g.GossipInterval() + time.Duration(rand.Intn(100)))
		}
	}

	for id, g := range gossipers {
		go updateFunc(g, id, 10, t)
	}

	// Max duration for one update is 1500 + 200 + 100 ms, so ten updates
	// take about 18 seconds, plus roughly 2 seconds to fork the goroutines.
	// Let gossip go on for a while longer, after which it must settle.
	time.Sleep(1 * time.Minute)

	// Verify that all gossipers have converged to the same state.
	for i := 1; i < len(nodes); i++ {
		t.Log("Checking equality of ", nodes[0], " and ", nodes[i])
		verifyGossiperEquality(gossipers[nodes[0]], gossipers[nodes[i]], t)
	}

	// Start another update round; however, we will shut down some
	// machines in between.
	for id, g := range gossipers {
		go updateFunc(g, id, 10, t)
	}

	shutdownNodes := make(map[int]bool)
	for {
		randId := rand.Intn(len(nodes))
		if randId == 0 {
			continue
		}
		_, ok := shutdownNodes[randId]
		if ok == false {
			shutdownNodes[randId] = true
			gossipers[nodes[randId]].Stop()
			if len(shutdownNodes) == 3 {
				break
			}
		}
	}

	time.Sleep(1 * time.Minute)

	// Verify that the surviving gossipers agree and that the stopped
	// nodes are marked down.
	for i := 1; i < len(nodes); i++ {
		_, ok := shutdownNodes[i]
		if ok {
			continue
		}
		t.Log("Checking equality of ", nodes[0], " and ", nodes[i])
		verifyGossiperEquality(gossipers[nodes[0]], gossipers[nodes[i]], t)

		g := gossipers[nodes[i]]
		keys := g.GetStoreKeys()
		for _, key := range keys {
			values := g.GetStoreKeyValue(key)
			for j, nodeInfo := range values {
				nodeId, _ := strconv.Atoi(string(j))
				_, ok := shutdownNodes[nodeId]
				if ok && nodeInfo.Status == types.NODE_STATUS_UP {
					t.Error("Node not marked down: ", nodeInfo, " for node: ", nodes[i])
				}
			}
		}
	}

	for i := 1; i < len(nodes); i++ {
		gossipers[nodes[i]].Stop()
	}
}