Example #1
func TestGossiperAddRemoveGetNode(t *testing.T) {
	printTestInfo()
	g := NewGossiperImpl("0.0.0.0:9010", "0")

	nodes := []string{"0.0.0.0:90011",
		"0.0.0.0:90012", "0.0.0.0:90013",
		"0.0.0.0:90014"}

	// test add nodes
	i := 1
	for _, node := range nodes {
		err := g.AddNode(node, types.NodeId(strconv.Itoa(i)))
		if err != nil {
			t.Error("Error adding new node")
		}
		i++
	}

	// try adding existing node
	err := g.AddNode(nodes[0], types.NodeId(strconv.Itoa(1)))
	if err == nil {
		t.Error("Duplicate node addition did not fail")
	}

	// check the nodelist is same
	peerNodes := g.GetNodes()
	if len(peerNodes) != len(nodes) {
		t.Error("Peer nodes len does not match added nodes, got: ",
			peerNodes, " expected: ", nodes)
	}
outer:
	for _, origNode := range nodes {
		for _, peerNode := range peerNodes {
			if origNode == peerNode {
				continue outer
			}
		}
		t.Error("Peer nodes does not have added node: ", origNode)
	}

	// test remove nodes
	for _, node := range nodes {
		err := g.RemoveNode(node)
		if err != nil {
			t.Error("Error removing new node")
		}
	}

	// try removing non-existing node
	err = g.RemoveNode("0.0.0.0:9020")
	if err == nil {
		t.Error("Non-existing node removal did not fail")
	}

	g.Stop()
}
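The printTestInfo helper called at the top of each test does not appear among these examples. A minimal sketch of what it plausibly does, assuming it only logs the calling test's name (the runtime and fmt imports and the exact output format are assumptions):

// Hypothetical sketch: log the name of the calling test function.
// The actual helper is not shown in these examples.
func printTestInfo() {
	pc := make([]uintptr, 1)
	runtime.Callers(2, pc) // skip runtime.Callers and printTestInfo itself
	fn := runtime.FuncForPC(pc[0])
	fmt.Println("RUNNING TEST: ", fn.Name())
}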
Example #2
func TestGossipStoreSubset(t *testing.T) {
	printTestInfo()

	g := NewGossipStore(ID)

	log.Info("Testing: empty store and empty nodelist")
	// empty store and empty nodelist
	diff := make(types.StoreNodes, 0)
	sv := g.Subset(diff)
	if len(sv) != 0 {
		t.Error("Emtpy subset expected, got: ", sv)
	}

	nodeLen := 10
	for i := 0; i < nodeLen*2; i++ {
		diff = append(diff, types.NodeId(strconv.Itoa(i)))
	}

	log.Info("Testing: empty store and non-empty nodelist")
	sv = g.Subset(diff)
	if len(sv) != 0 {
		t.Error("Emtpy subset expected, got: ", sv)
	}

	// Now populate the store; the diff asks for 10 even node ids
	// (0, 2, ..., 18) but the store only knows ids 0..9, so only
	// half of the requested ids can be returned.
	keyList := []types.StoreKey{"key1", "key2", "key3"}
	g.nodeMap = make(types.NodeInfoMap)
	for _, key := range keyList {
		fillUpNodeInfoMap(g.nodeMap, key, nodeLen)
	}

	diff = make(types.StoreNodes, 0)
	for i := 0; i < nodeLen; i++ {
		diff = append(diff, types.NodeId(strconv.Itoa(2*i)))
	}

	log.Info("Testing: empty store and non-empty nodelist")
	sv = g.Subset(diff)
	if len(sv) != nodeLen/2 {
		t.Error("Subset has more keys then requested: ", sv)
	}
	for id, info := range sv {
		gInfo, ok := g.nodeMap[id]
		if !ok {
			t.Error("Subset returned id which was not originally present ", id)
		}
		if !compareNodeInfo(info, gInfo) {
			t.Error("Node info does not match, d:", info, " o:", gInfo)
		}
	}
}
Example #3
func verifyGossiperEquality(g1 *GossiperImpl, g2 *GossiperImpl, t *testing.T) {
	// check for the equality
	g1Keys := g1.GetStoreKeys()
	g2Keys := g2.GetStoreKeys()
	if len(g1Keys) != len(g2Keys) {
		t.Error("Keys mismatch, g1: ", g1Keys, " g2:", g2Keys)
	}

	for _, key := range g1Keys {
		g1Values := g1.GetStoreKeyValue(key)
		g2Values := g2.GetStoreKeyValue(key)

		t.Log("g1Values: ", g1Values)
		t.Log("g2Values: ", g2Values)

		if len(g1Values) != len(g2Values) {
			t.Fatal("Lens mismatch between g1 and g2 values")
		}

		for i := 0; i < len(g1Values); i++ {
			id := types.NodeId(strconv.Itoa(i))
			if g1Values[id].Id != g2Values[id].Id {
				t.Error("Values mismtach between g1 and g2, g1:\n",
					g1Values[id].Id, "\ng2:", g2Values[id].Id)
			}
		}
	}
}
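Neither compareNodeInfo (used in Example #2) nor compareNodeValues (used in Example #17) appears in these examples. Sketches inferred purely from their call sites, assuming field-by-field comparison with reflect.DeepEqual for the Value payload (the reflect import is an assumption):

// Hypothetical sketches of the comparison helpers; the real
// implementations are not shown in these examples.
func compareNodeInfo(n1 types.NodeInfo, n2 types.NodeInfo) bool {
	return n1.Id == n2.Id &&
		n1.Status == n2.Status &&
		n1.LastUpdateTs.Equal(n2.LastUpdateTs) &&
		reflect.DeepEqual(n1.Value, n2.Value)
}

func compareNodeValues(got types.NodeInfo, want types.NodeInfo, t *testing.T) {
	if !reflect.DeepEqual(got.Value, want.Value) {
		t.Error("Node values mismatch, got: ", got.Value,
			" want: ", want.Value)
	}
}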
Example #4
func fillUpNodeInfo(node *types.NodeInfo, key types.StoreKey, i int) {
	node.Id = types.NodeId(strconv.Itoa(i))
	node.LastUpdateTs = time.Now()
	node.Status = types.NODE_STATUS_UP

	node.Value = make(types.StoreMap)
	node.Value[types.StoreKey(CPU+key)] = node.Id
	node.Value[types.StoreKey(MEMORY+key)] = node.Id
}
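fillUpNodeInfoMap, which Examples #2 and #7 use to seed the store, is also not shown. Assuming it simply builds one entry per id by reusing fillUpNodeInfo above, a sketch consistent with those call sites:

// Hypothetical sketch: populate numOfNodes entries keyed by their
// stringified ids, reusing fillUpNodeInfo. Inferred from the call
// sites; the real helper is not shown in these examples.
func fillUpNodeInfoMap(nodes types.NodeInfoMap, key types.StoreKey, numOfNodes int) {
	for i := 0; i < numOfNodes; i++ {
		var node types.NodeInfo
		fillUpNodeInfo(&node, key, i)
		nodes[node.Id] = node
	}
}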
Example #5
// Initialize node and alert listeners that we are joining the cluster.
func (c *ClusterManager) joinCluster(db *Database, self *api.Node, exist bool) error {
	var err error

	// If I am already in the cluster map, don't add me again.
	if exist {
		goto found
	}

	// Alert all listeners that we are a new node joining an existing cluster.
	for e := c.listeners.Front(); e != nil; e = e.Next() {
		err = e.Value.(ClusterListener).Init(self, db)
		if err != nil {
			self.Status = api.Status_STATUS_ERROR
			dlog.Warnf("Failed to initialize Init %s: %v",
				e.Value.(ClusterListener).String(), err)
			c.cleanupInit(db, self)
			goto done
		}
	}

found:
	// Alert all listeners that we are joining the cluster.
	for e := c.listeners.Front(); e != nil; e = e.Next() {
		err = e.Value.(ClusterListener).Join(self, db)
		if err != nil {
			self.Status = api.Status_STATUS_ERROR
			dlog.Warnf("Failed to initialize Join %s: %v",
				e.Value.(ClusterListener).String(), err)

			if !exist {
				c.cleanupInit(db, self)
			}
			goto done
		}
	}

	for id, n := range db.NodeEntries {
		if id != c.config.NodeId {
			// Check to see if the IP is the same.  If it is, then we have a stale entry.
			if n.MgmtIp == self.MgmtIp {
				dlog.Warnf("Warning, Detected node %s with the same IP %s in the database.  Will not connect to this node.",
					id, n.MgmtIp)
			} else {
				// Gossip with this node.
				dlog.Infof("Connecting to node %s with IP %s.", id, n.MgmtIp)
				c.gossip.AddNode(n.MgmtIp+":9002", types.NodeId(id))
			}
		}
	}

done:
	return err
}
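The ClusterListener interface that joinCluster fans out to is not defined in these examples. Inferred solely from the e.Value.(ClusterListener) call sites here and in the heartbeat code, it plausibly includes at least the following; the real interface may have more methods (e.g. whatever cleanupInit invokes):

// Hypothetical sketch of the listener interface, reconstructed only
// from the methods these examples actually invoke.
type ClusterListener interface {
	String() string
	Init(self *api.Node, db *Database) error
	Join(self *api.Node, db *Database) error
	Add(node *api.Node) error
	Update(node *api.Node) error
}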
Example #6
func TestTransportSendAndRcvData(t *testing.T) {
	printTestInfo()
	time.Sleep(10 * time.Second)
	data1 := &TestData{}
	data2 := &TestData{}

	data1.Data = make(map[types.StoreKey]types.NodeInfo)
	data2.Data = make(map[types.StoreKey]types.NodeInfo)

	var handler types.OnMessageRcv = func(c types.MessageChannel) {
		err := c.RcvData(&data2)
		if err != nil {
			t.Error("Error receiving data: ", err)
		} else {
			t.Log("Done receiving")
		}
	}

	ipString := "0.0.0.0:19002"
	r := NewRunnableMessageChannel(ipString, handler)
	go r.RunOnRcvData()
	time.Sleep(1 * time.Second)

	keyList := []types.StoreKey{"key1", "key2"}
	for i, key := range keyList {
		var node types.NodeInfo
		node.Id = types.NodeId(strconv.Itoa(i))
		node.Value = make(types.StoreMap)
		node.Value[key] = "some data"
		data1.Data[key] = node
	}

	s := NewMessageChannel(ipString)
	if s == nil {
		t.Fatal("Error creating send channel, failing test")
	}
	go s.SendData(&data1)
	time.Sleep(1 * time.Second)
	s.Close()
	r.Close()

	if len(data1.Data) != len(data2.Data) {
		t.Error("Sent and rcvd messages mismatch, sent: ", data1,
			" got: ", data2)
	}
}
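TestData, the wrapper the transport tests gob-encode back and forth, is not defined in these examples either. From its usage it is presumably just:

// Hypothetical sketch, inferred from usage in the transport tests;
// the actual definition is not shown in these examples.
type TestData struct {
	Data map[types.StoreKey]types.NodeInfo
}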
Example #7
func TestGossipStoreGetStoreKeyValue(t *testing.T) {
	printTestInfo()

	// Case: empty store
	// Case: key absent
	g := NewGossipStore(ID)

	keyList := []types.StoreKey{"key1", "key2"}

	nodeInfoMap := g.GetStoreKeyValue(keyList[0])
	if len(nodeInfoMap) != 0 {
		t.Error("Expected empty node info list, got: ", nodeInfoMap)
	}

	// Case: key present with nodes with holes in node ids
	fillUpNodeInfoMap(g.nodeMap, keyList[0], 6)
	if len(g.nodeMap) != 6 {
		t.Error("Failed to fillup node info map properly, got: ",
			g.nodeMap)
	}
	keyCheck := types.StoreKey(CPU + keyList[0])
	delete(g.nodeMap["0"].Value, keyCheck)
	delete(g.nodeMap["2"].Value, keyCheck)
	delete(g.nodeMap["4"].Value, keyCheck)
	nodeInfoMap = g.GetStoreKeyValue(keyCheck)
	if len(nodeInfoMap) != 3 {
		t.Error("Expected list with atleast 6 elements, got: ", nodeInfoMap)
	}

	for i := 0; i < len(nodeInfoMap); i++ {
		id := types.NodeId(strconv.Itoa(i))
		if i%2 == 0 {
			if _, ok := nodeInfoMap[id]; ok {
				t.Error("No node expected, got: ", nodeInfoMap[id])
			}
			continue
		}
		infoMap := nodeInfoMap[id].Value.(types.NodeId)
		if nodeInfoMap[id].Id != id ||
			nodeInfoMap[id].Status != types.NODE_STATUS_UP ||
			infoMap != id {
			t.Error("Invalid node content received, got: ", nodeInfoMap[id])
		}
	}
}
Example #8
// Get the latest config.
func (c *ClusterManager) watchDB(key string, opaque interface{},
	kvp *kvdb.KVPair, err error) error {

	db, err := readDatabase()
	if err != nil {
		dlog.Warnln("Failed to read database after update ", err)
		return nil
	}

	// The only value we rely on during an update is the cluster size.
	c.size = db.Size
	for id, n := range db.NodeEntries {
		if id != c.config.NodeId {
			// Refresh this node's gossip endpoint with the latest IP from the database.
			c.gossip.UpdateNode(n.MgmtIp+":9002", types.NodeId(id))
		}
	}

	return nil
}
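For context, this callback is registered in Example #15 by watching the cluster database key, so every write to the database funnels into watchDB. The registration line below is quoted from that example:

// Registration as it appears in Example #15.
kvdb.WatchKey(ClusterDBKey, 0, nil, c.watchDB)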
Example #9
func (c *ClusterManager) Start() error {
	log.Info("Cluster manager starting...")
	kvdb := kv.Instance()

	// Start the gossip protocol.
	// XXX Make the port configurable.
	gob.Register(api.Node{})
	c.g = gossip.New("0.0.0.0:9002", gossiptypes.NodeId(c.config.NodeId))
	c.g.SetGossipInterval(2 * time.Second)

	kvlock, err := kvdb.Lock("cluster/lock", 60)
	if err != nil {
		log.Panic("Fatal, Unable to obtain cluster lock.", err)
	}

	db, err := readDatabase()
	if err != nil {
		log.Panic(err)
	}

	if db.Status == api.StatusInit {
		log.Info("Will initialize a new cluster.")

		c.status = api.StatusOk
		db.Status = api.StatusOk
		self, _ := c.initNode(&db)

		err = c.initCluster(&db, self, false)
		if err != nil {
			kvdb.Unlock(kvlock)
			log.Error("Failed to initialize the cluster.", err)
			log.Panic(err)
		}

		// Update the new state of the cluster in the KV Database
		err = writeDatabase(&db)
		if err != nil {
			log.Errorf("Failed to save the database.", err)
			log.Panic(err)
		}

		err = kvdb.Unlock(kvlock)
		if err != nil {
			log.Panic("Fatal, unable to unlock cluster... Did something take too long to initialize?", err)
		}
	} else if db.Status&api.StatusOk > 0 {
		log.Info("Cluster state is OK... Joining the cluster.")

		c.status = api.StatusOk
		self, exist := c.initNode(&db)

		err = c.joinCluster(&db, self, exist)
		if err != nil {
			kvdb.Unlock(kvlock)
			log.Panic(err)
		}

		err = writeDatabase(&db)
		if err != nil {
			log.Panic(err)
		}

		err = kvdb.Unlock(kvlock)
		if err != nil {
			log.Panic("Fatal, unable to unlock cluster... Did something take too long to initialize?", err)
		}
	} else {
		kvdb.Unlock(kvlock)
		err = errors.New("Fatal, Cluster is in an unexpected state.")
		log.Panic(err)
	}

	// Start heartbeating to other nodes.
	go c.heartBeat()

	return nil
}
Example #10
func clearKey(nodes types.NodeInfoMap, key types.StoreKey, id int) {
	nodeId := types.NodeId(strconv.Itoa(id))
	nodeInfo := nodes[nodeId]
	delete(nodeInfo.Value, types.StoreKey(CPU+key))
	delete(nodeInfo.Value, types.StoreKey(MEMORY+key))
}
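A hypothetical usage sketch combining clearKey with the fillUpNodeInfoMap helper sketched earlier: seed six nodes for a key, then punch holes in the even ids, mirroring what Example #7 does by deleting the CPU sub-key directly.

// Hypothetical usage sketch; the helper names follow the test
// helpers above and are assumptions where not shown.
func buildMapWithHoles() types.NodeInfoMap {
	nodes := make(types.NodeInfoMap)
	fillUpNodeInfoMap(nodes, "key1", 6)
	for _, id := range []int{0, 2, 4} {
		clearKey(nodes, "key1", id)
	}
	return nodes
}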
Example #11
func TestTransportFailures(t *testing.T) {
	printTestInfo()
	time.Sleep(10 * time.Second)
	data1 := &TestData{}
	data2 := &TestData{}

	data1.Data = make(map[types.StoreKey]types.NodeInfo)
	data2.Data = make(map[types.StoreKey]types.NodeInfo)

	var handler types.OnMessageRcv = func(c types.MessageChannel) {
		err := c.RcvData(&data2)
		if err == nil {
			t.Error("Did not receive expected error")
		}
	}

	fmt.Println("Close without sending data")
	ipString := "0.0.0.0:17016"
	r := NewRunnableMessageChannel(ipString, handler)
	go r.RunOnRcvData()
	time.Sleep(5 * time.Second)

	keyList := []types.StoreKey{"key1", "key2"}
	for i, key := range keyList {
		var node types.NodeInfo
		node.Id = types.NodeId(strconv.Itoa(i))
		node.Value = make(types.StoreMap)
		node.Value[key] = "some data"
		data1.Data[key] = node
	}

	// close the channel without sending any message
	s := NewMessageChannel(ipString)
	if s == nil {
		t.Fatal("Error creating send channel, failing test")
	}
	time.Sleep(10 * time.Millisecond)
	s.Close()
	time.Sleep(10 * time.Millisecond)
	r.Close()
	time.Sleep(10 * time.Millisecond)

	fmt.Println("Close the channel and then send")
	// open and then close the channel
	ipString = "0.0.0.0:17617"
	r = NewRunnableMessageChannel(ipString, handler)
	go r.RunOnRcvData()
	time.Sleep(1 * time.Second)
	r.Close()

	time.Sleep(2 * time.Second)
	// try sending data to closed end
	s = NewMessageChannel(ipString)
	if s != nil {
		t.Error("Error, expected nil sender")
	}

	fmt.Println("Send non-marshalable data")
	// try sending non-marshalable data
	ipString = "0.0.0.0:17418"
	r = NewRunnableMessageChannel(ipString, handler)
	go r.RunOnRcvData()
	time.Sleep(1 * time.Second)
	s = NewMessageChannel(ipString)
	if s == nil {
		t.Fatal("Error creating send channel, failing test")
	}
	nonMarshalData := make(map[types.StoreKey]map[types.StoreKey]types.NodeInfoMap)
	err := s.SendData(nonMarshalData)
	if err == nil {
		t.Error("Expected error sending non-marshalable data")
	}
	s.Close()
	r.Close()
}
Example #12
func TestGossiperOneNodeNeverGossips(t *testing.T) {
	printTestInfo()

	nodes := []string{"0.0.0.0:9222", "0.0.0.0:9223",
		"0.0.0.0:9224"}

	rand.Seed(time.Now().UnixNano())
	gossipers := make(map[int]*GossiperImpl)
	for i, nodeId := range nodes {
		if i == 0 {
			// node 0 never comes up
			continue
		}
		id := types.NodeId(strconv.Itoa(i))
		g := NewGossiperImpl(nodeId, id)
		g.SetGossipInterval(time.Duration(200+rand.Intn(200)) * time.Millisecond)
		for j, peer := range nodes {
			if i == j {
				continue
			}
			g.AddNode(peer, types.NodeId(strconv.Itoa(j)))
		}
		gossipers[i] = g
	}

	// each node must eventually mark node 0 as down
	key := types.StoreKey("somekey")
	value := "someValue"
	for i, g := range gossipers {
		g.UpdateSelf(key, value+strconv.Itoa(i))
	}

	for i, g := range gossipers {
		res := g.GetStoreKeyValue(key)
		for nodeId, n := range res {
			if nodeId != n.Id {
				t.Error("Gossiper ", i, "Id does not match ",
					nodeId, " n:", n.Id)
			}
			nid, err := strconv.Atoi(string(nodeId))
			if err != nil {
				t.Error("Failed to convert node to id ", nodeId, " n.Id", n.Id)
			}
			if nid == 0 {
				if n.Status == types.NODE_STATUS_DOWN {
					t.Error("Gossiper ", i,
						"Expected node status not to be down: ", nodeId, " n:", n)
				}
			}
		}
	}

	time.Sleep(2 * time.Second)
	for i, g := range gossipers {
		res := g.GetStoreKeyValue(key)
		for nodeId, n := range res {
			if nodeId != n.Id {
				t.Error("Gossiper ", i, "Id does not match ",
					nodeId, " n:", n.Id)
			}
			nid, err := strconv.Atoi(string(nodeId))
			if err != nil {
				t.Error("Failed to convert node to id ", nodeId, " n.Id", n.Id)
			}
			if nid == 0 {
				if n.Status != types.NODE_STATUS_DOWN {
					t.Error("Gossiper ", i,
						"Expected node status to be down: ", nodeId, " n:", n)
				}
			} else {
				if n.Status != types.NODE_STATUS_UP {
					t.Error("Gossiper ", i, "Expected node to be up: ", nodeId,
						" n:", n)
				}
			}
		}
	}

	for _, g := range gossipers {
		g.Stop()
	}
}
Example #13
func (c *ClusterManager) Start() error {
	logrus.Info("Cluster manager starting...")

	c.gEnabled = true
	c.selfNode = api.Node{}
	c.selfNode.GenNumber = uint64(time.Now().UnixNano())
	c.selfNode.Id = c.config.NodeId
	c.selfNode.Status = api.StatusOk
	c.selfNode.Ip, _ = externalIp(&c.config)
	c.selfNode.NodeData = make(map[string]interface{})
	c.system = systemutils.New()

	// Start the gossip protocol.
	// XXX Make the port configurable.
	gob.Register(api.Node{})
	c.g = gossip.New("0.0.0.0:9002", types.NodeId(c.config.NodeId),
		c.selfNode.GenNumber)
	c.g.SetGossipInterval(2 * time.Second)

	kvdb := kvdb.Instance()
	kvlock, err := kvdb.Lock("cluster/lock", 60)
	if err != nil {
		logrus.Panic("Fatal, Unable to obtain cluster lock.", err)
	}
	defer kvdb.Unlock(kvlock)

	db, err := readDatabase()
	if err != nil {
		logrus.Panic(err)
	}

	if db.Status == api.StatusInit {
		logrus.Info("Will initialize a new cluster.")

		c.status = api.StatusOk
		db.Status = api.StatusOk
		self, _ := c.initNode(&db)

		err = c.initCluster(&db, self, false)
		if err != nil {
			logrus.Error("Failed to initialize the cluster.", err)
			return err
		}

		// Update the new state of the cluster in the KV Database
		err := writeDatabase(&db)
		if err != nil {
			logrus.Error("Failed to save the database.", err)
			return err
		}

	} else if db.Status&api.StatusOk > 0 {
		logrus.Info("Cluster state is OK... Joining the cluster.")

		c.status = api.StatusOk
		self, exist := c.initNode(&db)

		err = c.joinCluster(&db, self, exist)
		if err != nil {
			logrus.Error("Failed to join cluster.", err)
			return err
		}

		err := writeDatabase(&db)
		if err != nil {
			return err
		}

	} else {
		return errors.New("Fatal, Cluster is in an unexpected state.")
	}

	// Start heartbeating to other nodes.
	c.g.Start()
	go c.heartBeat()

	return nil
}
Example #14
func (c *ClusterManager) heartBeat() {
	gossipStoreKey := types.StoreKey(heartbeatKey + c.config.ClusterId)
	lastUpdateTs := time.Now()

	for {
		node := c.getCurrentState()
		c.nodeCache[node.Id] = *node

		currTime := time.Now()
		if currTime.Sub(lastUpdateTs) > 10*time.Second {
			logrus.Warn("No gossip update for 10 seconds")
		}
		c.g.UpdateSelf(gossipStoreKey, *node)
		lastUpdateTs = currTime

		// Process heartbeats from other nodes...
		gossipValues := c.g.GetStoreKeyValue(gossipStoreKey)

		for id, nodeInfo := range gossipValues {
			if id == types.NodeId(node.Id) {
				continue
			}

			cachedNodeInfo, nodeFoundInCache := c.nodeCache[string(id)]
			n := cachedNodeInfo
			ok := false
			if nodeInfo.Value != nil {
				n, ok = nodeInfo.Value.(api.Node)
				if !ok {
					logrus.Error("Received a bad broadcast packet: %v", nodeInfo.Value)
					continue
				}
			}

			if nodeFoundInCache {
				if n.Status != api.StatusOk {
					logrus.Warn("Detected node ", n.Id, " to be unhealthy.")

					for e := c.listeners.Front(); e != nil && c.gEnabled; e = e.Next() {
						err := e.Value.(ClusterListener).Update(&n)
						if err != nil {
							logrus.Warn("Failed to notify ", e.Value.(ClusterListener).String())
						}
					}

					delete(c.nodeCache, n.Id)
					continue
				} else if nodeInfo.Status == types.NODE_STATUS_DOWN {
					ne := c.getLatestNodeConfig(string(id))
					if ne != nil && nodeInfo.GenNumber < ne.GenNumber {
						logrus.Warn("Detected stale update for node ", id,
							" going down, ignoring it")
						c.g.MarkNodeHasOldGen(id)
						delete(c.nodeCache, cachedNodeInfo.Id)
						continue
					}

					logrus.Warn("Detected node ", id, " to be offline due to inactivity.")

					n.Status = api.StatusOffline
					for e := c.listeners.Front(); e != nil && c.gEnabled; e = e.Next() {
						err := e.Value.(ClusterListener).Update(&n)
						if err != nil {
							logrus.Warn("Failed to notify ", e.Value.(ClusterListener).String())
						}
					}

					delete(c.nodeCache, cachedNodeInfo.Id)
				} else if nodeInfo.Status == types.NODE_STATUS_DOWN_WAITING_FOR_NEW_UPDATE {
					logrus.Warn("Detected node ", n.Id, " to be offline due to inactivity.")

					n.Status = api.StatusOffline
					for e := c.listeners.Front(); e != nil && c.gEnabled; e = e.Next() {
						err := e.Value.(ClusterListener).Update(&n)
						if err != nil {
							logrus.Warn("Failed to notify ", e.Value.(ClusterListener).String())
						}
					}

					delete(c.nodeCache, cachedNodeInfo.Id)
				} else {
					// node may be up or waiting for new update,
					// no need to tell listeners as yet.
					c.nodeCache[cachedNodeInfo.Id] = n
				}
			} else if nodeInfo.Status == types.NODE_STATUS_UP {
				// A node discovered in the cluster.
				logrus.Warn("Detected node ", n.Id, " to be in the cluster.")

				c.nodeCache[n.Id] = n
				for e := c.listeners.Front(); e != nil && c.gEnabled; e = e.Next() {
					err := e.Value.(ClusterListener).Add(&n)
					if err != nil {
						logrus.Warn("Failed to notify ", e.Value.(ClusterListener).String())
					}
				}
			}
		}

		time.Sleep(2 * time.Second)
	}
}
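getLatestNodeConfig, consulted above to discard stale generation numbers, is not among these examples. A plausible sketch that rereads the cluster database, assuming a NodeEntry value type in db.NodeEntries carrying a GenNumber field:

// Hypothetical sketch: fetch the persisted entry for a node so its
// GenNumber can be compared with the incoming gossip update. The
// NodeEntry type and the error handling here are assumptions.
func (c *ClusterManager) getLatestNodeConfig(nodeId string) *NodeEntry {
	db, err := readDatabase()
	if err != nil {
		logrus.Warn("Failed to read database for node config: ", err)
		return nil
	}
	ne, exists := db.NodeEntries[nodeId]
	if !exists {
		return nil
	}
	return &ne
}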
Example #15
func (c *ClusterManager) Start() error {
	dlog.Infoln("Cluster manager starting...")

	c.gEnabled = true
	c.selfNode = api.Node{}
	c.selfNode.GenNumber = uint64(time.Now().UnixNano())
	c.selfNode.Id = c.config.NodeId
	c.selfNode.Status = api.Status_STATUS_OK
	c.selfNode.MgmtIp, c.selfNode.DataIp, _ = ExternalIp(&c.config)
	c.selfNode.NodeData = make(map[string]interface{})
	c.system = systemutils.New()

	// Start the gossip protocol.
	// XXX Make the port configurable.
	gob.Register(api.Node{})
	c.gossip = gossip.New("0.0.0.0:9002", types.NodeId(c.config.NodeId),
		c.selfNode.GenNumber)
	c.gossip.SetGossipInterval(2 * time.Second)

	kvdb := kvdb.Instance()
	kvlock, err := kvdb.Lock(clusterLockKey, 60)
	if err != nil {
		dlog.Panicln("Fatal, Unable to obtain cluster lock.", err)
	}
	defer kvdb.Unlock(kvlock)

	db, err := readDatabase()
	if err != nil {
		dlog.Panicln(err)
	}

	// Cluster database max size... 0 if unlimited.
	c.size = db.Size

	if db.Status == api.Status_STATUS_INIT {
		dlog.Infoln("Will initialize a new cluster.")

		c.status = api.Status_STATUS_OK
		db.Status = api.Status_STATUS_OK
		self, _ := c.initNode(&db)

		err = c.initCluster(&db, self, false)
		if err != nil {
			dlog.Errorln("Failed to initialize the cluster.", err)
			return err
		}

		// Update the new state of the cluster in the KV Database
		err := writeDatabase(&db)
		if err != nil {
			dlog.Errorln("Failed to save the database.", err)
			return err
		}

	} else if db.Status&api.Status_STATUS_OK > 0 {
		dlog.Infoln("Cluster state is OK... Joining the cluster.")

		c.status = api.Status_STATUS_OK
		self, exist := c.initNode(&db)

		err = c.joinCluster(&db, self, exist)
		if err != nil {
			dlog.Errorln("Failed to join cluster.", err)
			return err
		}

		err := writeDatabase(&db)
		if err != nil {
			return err
		}

	} else {
		return errors.New("Fatal, Cluster is in an unexpected state.")
	}

	// Start heartbeating to other nodes.
	go c.startHeartBeat()
	go c.updateClusterStatus()

	kvdb.WatchKey(ClusterDBKey, 0, nil, c.watchDB)

	return nil
}
Example #16
func (c *ClusterManager) updateClusterStatus() {
	gossipStoreKey := types.StoreKey(heartbeatKey + c.config.ClusterId)

	for {
		node := c.getCurrentState()
		c.nodeCache[node.Id] = *node

		// Process heartbeats from other nodes...
		gossipValues := c.gossip.GetStoreKeyValue(gossipStoreKey)

		numNodes := 0
		for id, nodeInfo := range gossipValues {
			numNodes = numNodes + 1

			// Check to make sure we are not exceeding the size of the cluster.
			if c.size > 0 && numNodes > c.size {
				dlog.Fatalf("Fatal, number of nodes in the cluster has"+
					"exceeded the cluster size: %d > %d", numNodes, c.size)
				os.Exit(-1)
			}

			// Ignore updates from self node.
			if id == types.NodeId(node.Id) {
				continue
			}

			// Notify node status change if required.
			newNodeInfo := api.Node{}
			newNodeInfo.Id = string(id)
			newNodeInfo.Status = api.Status_STATUS_OK

			switch {
			case nodeInfo.Status == types.NODE_STATUS_DOWN:
				newNodeInfo.Status = api.Status_STATUS_OFFLINE
				lastStatus, ok := c.nodeStatuses[string(id)]
				if ok && lastStatus == newNodeInfo.Status {
					break
				}

				// Check if it is a stale update
				ne := c.getLatestNodeConfig(string(id))
				if ne != nil && nodeInfo.GenNumber != 0 &&
					nodeInfo.GenNumber < ne.GenNumber {
					dlog.Warnln("Detected stale update for node ", id,
						" going down, ignoring it")
					c.gossip.MarkNodeHasOldGen(id)
					break
				}
				c.nodeStatuses[string(id)] = newNodeInfo.Status

				dlog.Warnln("Detected node ", id,
					" to be offline due to inactivity.")

				for e := c.listeners.Front(); e != nil && c.gEnabled; e = e.Next() {
					err := e.Value.(ClusterListener).Update(&newNodeInfo)
					if err != nil {
						dlog.Warnln("Failed to notify ",
							e.Value.(ClusterListener).String())
					}
				}

			case nodeInfo.Status == types.NODE_STATUS_DOWN_WAITING_FOR_NEW_UPDATE:
				newNodeInfo.Status = api.Status_STATUS_OFFLINE
				lastStatus, ok := c.nodeStatuses[string(id)]
				if ok && lastStatus == newNodeInfo.Status {
					break
				}
				c.nodeStatuses[string(id)] = newNodeInfo.Status

				dlog.Warnln("Detected node ", newNodeInfo.Id,
					" to be offline due to inactivity.")

				for e := c.listeners.Front(); e != nil && c.gEnabled; e = e.Next() {
					err := e.Value.(ClusterListener).Update(&newNodeInfo)
					if err != nil {
						dlog.Warnln("Failed to notify ",
							e.Value.(ClusterListener).String())
					}
				}

			case nodeInfo.Status == types.NODE_STATUS_UP:
				newNodeInfo.Status = api.Status_STATUS_OK
				lastStatus, ok := c.nodeStatuses[string(id)]
				if ok && lastStatus == newNodeInfo.Status {
					break
				}
				c.nodeStatuses[string(id)] = newNodeInfo.Status

				// A node discovered in the cluster.
				dlog.Warnln("Detected node ", newNodeInfo.Id,
					" to be in the cluster.")

				for e := c.listeners.Front(); e != nil && c.gEnabled; e = e.Next() {
					err := e.Value.(ClusterListener).Add(&newNodeInfo)
					if err != nil {
						dlog.Warnln("Failed to notify ",
							e.Value.(ClusterListener).String())
					}
				}
			}

			// Update cache.
			if nodeInfo.Value != nil {
				n, ok := nodeInfo.Value.(api.Node)
				if ok {
					n.Status = newNodeInfo.Status
					c.nodeCache[n.Id] = n
				} else {
					c.nodeCache[newNodeInfo.Id] = newNodeInfo
				}
			} else {
				newNodeInfo.Status = api.Status_STATUS_OFFLINE
				c.nodeCache[newNodeInfo.Id] = newNodeInfo
			}
		}

		time.Sleep(2 * time.Second)
	}
}
Example #17
func TestTransportTwoWayExchange(t *testing.T) {
	printTestInfo()
	time.Sleep(10 * time.Second)
	data1 := &TestData{}
	data2 := &TestData{}
	data3 := &TestData{}
	data4 := &TestData{}

	data1.Data = make(map[types.StoreKey]types.NodeInfo)
	data2.Data = make(map[types.StoreKey]types.NodeInfo)
	data3.Data = make(map[types.StoreKey]types.NodeInfo)
	data4.Data = make(map[types.StoreKey]types.NodeInfo)

	var handler types.OnMessageRcv = func(c types.MessageChannel) {
		err := c.RcvData(&data2)
		if err != nil {
			t.Error("Error receiving data2: ", err)
		} else {
			t.Log("Done receiving")
		}

		for key, nodeInfo := range data2.Data {
			intId, _ := strconv.Atoi(string(nodeInfo.Id))
			nodeInfo.Id = types.NodeId(strconv.Itoa(intId + 1))
			data2.Data[key] = nodeInfo
		}
		err = c.SendData(data2)
		if err != nil {
			t.Error("Error sending data2: ", err)
		} else {
			t.Log("Done Sending data2")
		}
		time.Sleep(20 * time.Millisecond)

		err = c.RcvData(&data4)
		if err != nil {
			t.Error("Error sending data4: ", err)
		} else {
			t.Log("Done receving data4")
		}
		time.Sleep(20 * time.Millisecond)
	}

	r := NewRunnableMessageChannel("0.0.0.0:19422", handler)
	go r.RunOnRcvData()
	time.Sleep(1 * time.Second)

	keyList := []types.StoreKey{"key1", "key2"}
	for i, key := range keyList {
		var node types.NodeInfo
		node.Id = types.NodeId(strconv.Itoa(i))
		node.Value = make(types.StoreMap)
		node.Value[key] = "some data"
		data1.Data[key] = node
	}

	ipString := "0.0.0.0:19422"
	s := NewMessageChannel(ipString)
	if s == nil {
		t.Fatal("Error creating send channel, failing test")
	}
	s.SendData(&data1)
	time.Sleep(20 * time.Millisecond)

	err := s.RcvData(&data3)
	if err != nil {
		t.Fatal("Error receving data3: ", err)
	}
	for key, nodeInfo := range data3.Data {
		intId, _ := strconv.Atoi(string(nodeInfo.Id))
		nodeInfo.Id = types.NodeId(strconv.Itoa(intId + 1))
		data3.Data[key] = nodeInfo
	}
	time.Sleep(20 * time.Millisecond)

	err = s.SendData(&data3)
	if err != nil {
		t.Fatal("Error sending data3: ", err)
	}
	time.Sleep(20 * time.Millisecond)

	if len(data1.Data) != len(data2.Data) ||
		len(data1.Data) != len(data3.Data) ||
		len(data1.Data) != len(data4.Data) {
		t.Error("Data sent and received not matching Data1:",
			data1.Data, "\nData2:", data2.Data,
			"\nData3:", data3.Data, "\nData4:", data4.Data)
	}

	for key, nodeInfo := range data1.Data {
		nodeInfo2 := data2.Data[key]
		nodeInfo3 := data3.Data[key]
		nodeInfo4 := data4.Data[key]

		intId, _ := strconv.Atoi(string(nodeInfo.Id))
		id1 := types.NodeId(strconv.Itoa(intId + 1))
		id2 := types.NodeId(strconv.Itoa(intId + 2))
		if nodeInfo2.Id != id1 ||
			nodeInfo3.Id != id2 ||
			nodeInfo4.Id != id2 {
			t.Error("Data mismatch, Data1: ",
				nodeInfo, "\nData2:", nodeInfo2,
				"\nData3:", nodeInfo3, "\nData4:", nodeInfo4)
		}

		compareNodeValues(nodeInfo2, nodeInfo, t)
		compareNodeValues(nodeInfo3, nodeInfo, t)
		compareNodeValues(nodeInfo4, nodeInfo, t)
	}

	s.Close()
	r.Close()
}
Example #18
func TestGossiperGossipMarkOldGenNode(t *testing.T) {
	printTestInfo()

	nodes := []string{"0.0.0.0:9225", "0.0.0.0:9226", "0.0.0.0:9227"}

	rand.Seed(time.Now().UnixNano())
	gossipers := make(map[int]*GossiperImpl)
	for i, nodeId := range nodes {
		if i == 0 {
			// node 0 never comes up
			continue
		}
		id := types.NodeId(strconv.Itoa(i))
		g := NewGossiperImpl(nodeId, id)

		g.SetGossipInterval(time.Duration(200+rand.Intn(200)) * time.Millisecond)
		for j, peer := range nodes {
			if i == j {
				continue
			}
			g.AddNode(peer, types.NodeId(strconv.Itoa(j)))
		}
		gossipers[i] = g
	}

	// each node must eventually mark node 0 as down
	key := types.StoreKey("somekey")
	value := "someValue"
	for i, g := range gossipers {
		g.UpdateSelf(key, value+strconv.Itoa(i))
	}

	for i, g := range gossipers {
		res := g.GetStoreKeyValue(key)
		for nodeId, n := range res {
			if nodeId != n.Id {
				t.Error("Gossiper ", i, "Id does not match ",
					nodeId, " n:", n.Id)
			}
			nid, err := strconv.Atoi(string(nodeId))
			if err != nil {
				t.Error("Failed to convert node to id ", nodeId, " n.Id", n.Id)
			}
			if nid == 0 {
				if n.Status == types.NODE_STATUS_DOWN {
					t.Error("Gossiper ", i,
						"Expected node status not to be down: ", nodeId, " n:", n)
				}
			}
		}
	}

	time.Sleep(2 * time.Second)
	for i, g := range gossipers {
		res := g.GetStoreKeyValue(key)
		for nodeId, n := range res {
			if nodeId != n.Id {
				t.Error("Gossiper ", i, "Id does not match ",
					nodeId, " n:", n.Id)
			}
			nid, err := strconv.Atoi(string(nodeId))
			if err != nil {
				t.Error("Failed to convert node to id ", nodeId, " n.Id", n.Id)
			}
			if nid == 0 {
				if n.Status != types.NODE_STATUS_DOWN {
					t.Error("Gossiper ", i,
						"Expected node status to be down: ", nodeId, " n:", n)
				}
			} else {
				if n.Status != types.NODE_STATUS_UP {
					t.Error("Gossiper ", i, "Expected node to be up: ", nodeId,
						" n:", n)
				}
			}
		}
	}

	// Now Reset both node 0 and node 1 in node 2.
	nid0 := types.NodeId(strconv.Itoa(0))
	nid1 := types.NodeId(strconv.Itoa(1))
	nid2 := types.NodeId(strconv.Itoa(2))

	g := gossipers[2]
	g.MarkNodeHasOldGen(nid0)
	g.MarkNodeHasOldGen(nid1)
	// Update value in node 1
	g = gossipers[1]
	g.UpdateSelf(key, value+"__1")

	// Both nodes must now be waiting for a new update
	g = gossipers[2]
	res := g.GetStoreKeyValue(key)
	for nodeId, n := range res {
		if nid2 == nodeId {
			continue
		}
		if nodeId != n.Id {
			t.Error("Id does not match ", nodeId, " n:", n.Id)
		}
		_, err := strconv.Atoi(string(nodeId))
		if err != nil {
			t.Error("Failed to convert node to id ", nodeId, " n.Id", n.Id)
		}
		if n.Status != types.NODE_STATUS_WAITING_FOR_NEW_UPDATE {
			t.Error("Expected node status to be down: ", nodeId, " n:", n)
		}
	}

	time.Sleep(800 * time.Millisecond)
	res = g.GetStoreKeyValue(key)
	g1Node := res[nid1]
	if g1Node.Status != types.NODE_STATUS_UP {
		t.Error("Expected node to be up ", g1Node)
	}

	// Sleep well past the node death interval; after this both
	// nodes must be marked down.
	time.Sleep(35 * time.Second)
	res = g.GetStoreKeyValue(key)
	for nodeId, n := range res {
		if nid2 == nodeId {
			continue
		}
		if nodeId != n.Id {
			t.Error("Id does not match ", nodeId, " n:", n.Id)
		}
		_, err := strconv.Atoi(string(nodeId))
		if err != nil {
			t.Error("Failed to convert node to id ", nodeId, " n.Id", n.Id)
		}
		if n.Status != types.NODE_STATUS_DOWN {
			t.Error("Expected node to be down: ", nodeId, " n:", n)
		}
	}

	// Test that after a reset the node stays down, waiting for a new update
	g.MarkNodeHasOldGen(nid1)
	time.Sleep(35 * time.Second)
	res = g.GetStoreKeyValue(key)
	g1Node = res[nid1]
	if g1Node.Status != types.NODE_STATUS_DOWN_WAITING_FOR_NEW_UPDATE {
		t.Error("Expected node to be down waiting for update ", g1Node)
	}

	for _, g := range gossipers {
		g.Stop()
	}
}
Example #19
func TestGossiperMultipleNodesGoingUpDown(t *testing.T) {
	printTestInfo()

	nodes := []string{"0.0.0.0:9152", "0.0.0.0:9153",
		"0.0.0.0:9154", "0.0.0.0:9155",
		"0.0.0.0:9156", "0.0.0.0:9157",
		"0.0.0.0:9158", "0.0.0.0:9159",
		"0.0.0.0:9160", "0.0.0.0:9161"}

	rand.Seed(time.Now().UnixNano())
	gossipers := make(map[string]*GossiperImpl)
	for i, nodeId := range nodes {
		g := NewGossiperImpl(nodeId, types.NodeId(strconv.Itoa(i)))

		g.SetGossipInterval(time.Duration(1500+rand.Intn(200)) * time.Millisecond)
		// add one neighbor and 2 random peers
		if i < len(nodes)-1 {
			err := g.AddNode(nodes[i+1], types.NodeId(strconv.Itoa(i+1)))
			if err != nil {
				t.Error("Unexpected error adding node to id: ", nodeId,
					" node: ", nodes[i+1])
			}
		} else {
			err := g.AddNode(nodes[0], types.NodeId(strconv.Itoa(0)))
			if err != nil {
				t.Error("Unexpected error adding node to id: ", nodeId,
					" node: ", nodes[0])
			}
		}

		// to this gossiper, add two random peers
		for count := 0; count < 2; {
			randId := rand.Intn(len(nodes))
			if randId == i {
				continue
			}

			err := g.AddNode(nodes[randId], types.NodeId(strconv.Itoa(randId)))
			if err != nil {
				t.Log("Unexpected error adding node to id: ", nodeId,
					" node: ", nodes[randId], " err: ", err)
			} else {
				count++
			}
		}
		gossipers[nodeId] = g
		time.Sleep(2000 * time.Millisecond)
	}

	updateFunc := func(g *GossiperImpl, id string, max int, t *testing.T) {
		for i := 0; i < max; i++ {
			t.Log("Updting data for ", id)
			g.UpdateSelf("sameKey", strconv.Itoa(i))
			g.UpdateSelf(types.StoreKey(g.NodeId()), strconv.Itoa(i*i))
			time.Sleep(g.GossipInterval() + time.Duration(rand.Intn(100)))
		}
	}

	for id, g := range gossipers {
		go updateFunc(g, id, 10, t)
	}

	// One update takes at most (1500 + 200 + 100) ms, so 10 updates
	// take ~18 seconds, plus ~2 seconds per goroutine spawned above.
	// Let gossip run for a full minute so it has time to settle.
	time.Sleep(1 * time.Minute)

	// verify all of them are same
	for i := 1; i < len(nodes); i++ {
		t.Log("Checking equality of ", nodes[0], " and ", nodes[i])
		verifyGossiperEquality(gossipers[nodes[0]], gossipers[nodes[i]], t)
	}

	// start another update round; however, we will shut down some
	// machines in between
	for id, g := range gossipers {
		go updateFunc(g, id, 10, t)
	}

	shutdownNodes := make(map[int]bool)
	for {
		randId := rand.Intn(len(nodes))
		if randId == 0 {
			continue
		}
		_, ok := shutdownNodes[randId]
		if !ok {
			shutdownNodes[randId] = true
			gossipers[nodes[randId]].Stop()
			if len(shutdownNodes) == 3 {
				break
			}
		}
	}

	time.Sleep(1 * time.Minute)
	// verify all of them are same
	for i := 1; i < len(nodes); i++ {
		_, ok := shutdownNodes[i]
		if ok {
			continue
		}
		t.Log("Checking equality of ", nodes[0], " and ", nodes[i])
		verifyGossiperEquality(gossipers[nodes[0]], gossipers[nodes[i]], t)

		g := gossipers[nodes[i]]
		keys := g.GetStoreKeys()
		for _, key := range keys {
			values := g.GetStoreKeyValue(key)

			for j, nodeInfo := range values {
				nodeId, _ := strconv.Atoi(string(j))
				_, ok := shutdownNodes[nodeId]
				if ok && nodeInfo.Status == types.NODE_STATUS_UP {
					t.Error("Node not marked down: ", nodeInfo, " for node: ", nodes[i])
				}
			}
		}
	}

	for i := 1; i < len(nodes); i++ {
		gossipers[nodes[i]].Stop()
	}
}