Example #1
0
func TestDemoteDownedManager(t *testing.T) {
	numWorker, numManager := 0, 3
	cl := newCluster(t, numWorker, numManager)
	defer func() {
		require.NoError(t, cl.Stop())
	}()

	leader, err := cl.Leader()
	require.NoError(t, err)

	// find a manager (not the leader) to demote
	var demotee *testNode
	for _, n := range cl.nodes {
		if n.IsManager() && n.node.NodeID() != leader.node.NodeID() {
			demotee = n
			break
		}
	}

	nodeID := demotee.node.NodeID()

	resp, err := cl.api.GetNode(context.Background(), &api.GetNodeRequest{NodeID: nodeID})
	require.NoError(t, err)
	spec := resp.Node.Spec.Copy()
	spec.Role = api.NodeRoleWorker

	// stop the node, then demote it, and start it back up again so that when it comes back up it has to
	// realize it is no longer a manager
	require.NoError(t, demotee.Pause())

	// demote node, but don't use SetNodeRole, which waits until it successfully becomes a worker, since
	// the node is currently down
	require.NoError(t, raftutils.PollFuncWithTimeout(nil, func() error {
		_, err := cl.api.UpdateNode(context.Background(), &api.UpdateNodeRequest{
			NodeID:      nodeID,
			Spec:        spec,
			NodeVersion: &resp.Node.Meta.Version,
		})
		return err
	}, opsTimeout))

	// start it back up again
	require.NoError(t, cl.StartNode(nodeID))

	// wait to become worker
	require.NoError(t, raftutils.PollFuncWithTimeout(nil, func() error {
		if demotee.IsManager() {
			return fmt.Errorf("node is still not a worker")
		}
		return nil
	}, opsTimeout))

	// agents 1, managers 2
	numWorker++
	numManager--
	pollClusterReady(t, cl, numWorker, numManager)
}
Example #2
0
// SetNodeRole sets the role of a node through the control API.
func (c *testCluster) SetNodeRole(id string, role api.NodeRole) error {
	node, ok := c.nodes[id]
	if !ok {
		return fmt.Errorf("set node role: node %s not found", id)
	}
	if node.IsManager() && role == api.NodeRoleManager {
		return fmt.Errorf("node is already manager")
	}
	if !node.IsManager() && role == api.NodeRoleWorker {
		return fmt.Errorf("node is already worker")
	}

	var initialTimeout time.Duration
	// version might change between get and update, so retry
	for i := 0; i < 5; i++ {
		time.Sleep(initialTimeout)
		initialTimeout += 500 * time.Millisecond
		resp, err := c.api.GetNode(context.Background(), &api.GetNodeRequest{NodeID: id})
		if err != nil {
			return err
		}
		spec := resp.Node.Spec.Copy()
		spec.Role = role
		if _, err := c.api.UpdateNode(context.Background(), &api.UpdateNodeRequest{
			NodeID:      id,
			Spec:        spec,
			NodeVersion: &resp.Node.Meta.Version,
		}); err != nil {
			// the update can fail because the redirecting node or the leader
			// might be shutting down
			if grpc.ErrorDesc(err) == "update out of sequence" {
				continue
			}
			return err
		}
		if role == api.NodeRoleManager {
			// wait to become manager
			return raftutils.PollFuncWithTimeout(nil, func() error {
				if !node.IsManager() {
					return fmt.Errorf("node is still not a manager")
				}
				return nil
			}, opsTimeout)
		}
		// wait to become worker
		return raftutils.PollFuncWithTimeout(nil, func() error {
			if node.IsManager() {
				return fmt.Errorf("node is still not a worker")
			}
			return nil
		}, opsTimeout)
	}
	return fmt.Errorf("set role %s for node %s, got sequence error 5 times", role, id)
}
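
// A minimal usage sketch (not from the source): promoting a worker with
// SetNodeRole and waiting for the cluster to settle. The test name and the
// initial node counts are illustrative assumptions; the helpers (newCluster,
// pollClusterReady) are the ones defined in this package.
func TestPromoteWorkerSketch(t *testing.T) {
	numWorker, numManager := 1, 1
	cl := newCluster(t, numWorker, numManager)
	defer func() {
		require.NoError(t, cl.Stop())
	}()

	// pick any worker to promote
	var workerID string
	for id, n := range cl.nodes {
		if !n.IsManager() {
			workerID = id
			break
		}
	}

	// SetNodeRole retries "update out of sequence" errors and blocks until the
	// node itself reports the new role
	require.NoError(t, cl.SetNodeRole(workerID, api.NodeRoleManager))

	numWorker--
	numManager++
	pollClusterReady(t, cl, numWorker, numManager)
}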
Example #3
0
// RemoveNode removes a node from the cluster entirely, demoting it first if it is a manager.
func (c *testCluster) RemoveNode(id string, graceful bool) error {
	node, ok := c.nodes[id]
	if !ok {
		return fmt.Errorf("remove node: node %s not found", id)
	}
	// demote before removal
	if node.IsManager() {
		if err := c.SetNodeRole(id, api.NodeRoleWorker); err != nil {
			return fmt.Errorf("demote manager: %v", err)
		}
	}
	if err := node.Stop(); err != nil {
		return err
	}
	delete(c.nodes, id)
	if graceful {
		if err := raftutils.PollFuncWithTimeout(nil, func() error {
			resp, err := c.api.GetNode(context.Background(), &api.GetNodeRequest{NodeID: id})
			if err != nil {
				return fmt.Errorf("get node: %v", err)
			}
			if resp.Node.Status.State != api.NodeStatus_DOWN {
				return fmt.Errorf("node %s is still not down", id)
			}
			return nil
		}, opsTimeout); err != nil {
			return err
		}
	}
	if _, err := c.api.RemoveNode(context.Background(), &api.RemoveNodeRequest{NodeID: id, Force: !graceful}); err != nil {
		return fmt.Errorf("remove node: %v", err)
	}
	return nil
}
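
// A minimal usage sketch (not from the source): removing a worker gracefully
// with RemoveNode and confirming that the cluster converges to the remaining
// counts. The test name and node counts are illustrative assumptions.
func TestRemoveWorkerSketch(t *testing.T) {
	numWorker, numManager := 2, 1
	cl := newCluster(t, numWorker, numManager)
	defer func() {
		require.NoError(t, cl.Stop())
	}()

	var workerID string
	for id, n := range cl.nodes {
		if !n.IsManager() {
			workerID = id
			break
		}
	}

	// graceful removal stops the node, waits for it to report DOWN, and only
	// then calls the RemoveNode control API without Force
	require.NoError(t, cl.RemoveNode(workerID, true))

	numWorker--
	pollClusterReady(t, cl, numWorker, numManager)
}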
Example #4
0
// pollClusterReady calls the control API until all of the following conditions are true:
// * all nodes are ready
// * all nodes have membership == accepted
// * every node's certificate is issued and matches its desired role
// * all managers have reachability == reachable
// * one node is the leader
// * the numbers of workers and managers equal the expected counts
func pollClusterReady(t *testing.T, c *testCluster, numWorker, numManager int) {
	pollFunc := func() error {
		res, err := c.api.ListNodes(context.Background(), &api.ListNodesRequest{})
		if err != nil {
			return err
		}
		var mCount int
		var leaderFound bool
		for _, n := range res.Nodes {
			if n.Status.State != api.NodeStatus_READY {
				return fmt.Errorf("node %s with desired role %s isn't ready, status %s, message %s", n.ID, n.Spec.DesiredRole, n.Status.State, n.Status.Message)
			}
			if n.Spec.Membership != api.NodeMembershipAccepted {
				return fmt.Errorf("node %s with desired role %s isn't accepted to cluster, membership %s", n.ID, n.Spec.DesiredRole, n.Spec.Membership)
			}
			if n.Certificate.Role != n.Spec.DesiredRole {
				return fmt.Errorf("node %s had different roles in spec and certificate, %s and %s respectively", n.ID, n.Spec.DesiredRole, n.Certificate.Role)
			}
			if n.Certificate.Status.State != api.IssuanceStateIssued {
				return fmt.Errorf("node %s with desired role %s has no issued certificate, issuance state %s", n.ID, n.Spec.DesiredRole, n.Certificate.Status.State)
			}
			if n.Role == api.NodeRoleManager {
				if n.ManagerStatus == nil {
					return fmt.Errorf("manager node %s has no ManagerStatus field", n.ID)
				}
				if n.ManagerStatus.Reachability != api.RaftMemberStatus_REACHABLE {
					return fmt.Errorf("manager node %s has reachable status: %s", n.ID, n.ManagerStatus.Reachability)
				}
				mCount++
				if n.ManagerStatus.Leader {
					leaderFound = true
				}
			} else {
				if n.ManagerStatus != nil {
					return fmt.Errorf("worker node %s should not have manager status, returned %s", n.ID, n.ManagerStatus)
				}
			}
		}
		if !leaderFound {
			return fmt.Errorf("leader of cluster is not found")
		}
		wCount := len(res.Nodes) - mCount
		if mCount != numManager {
			return fmt.Errorf("unexpected number of managers: %d, expected %d", mCount, numManager)
		}
		if wCount != numWorker {
			return fmt.Errorf("unexpected number of workers: %d, expected %d", wCount, numWorker)
		}
		return nil
	}
	err := raftutils.PollFuncWithTimeout(nil, pollFunc, opsTimeout)
	require.NoError(t, err)
}
Example #5
0
func TestClusterStorePasshphraseRotationForRootCA(t *testing.T) {
	if !*integrationTests {
		t.Skip("integration test")
	}
	os.Setenv(ca.PassphraseENVVar, "password1")
	defer os.Setenv(ca.PassphraseENVVar, "")
	defer os.Setenv(ca.PassphraseENVVarPrev, "")

	mCount, aCount := 5, 15
	c := createManagersCluster(t, mCount, aCount)
	require.NoError(t, testutils.PollFunc(nil, c.pollRegister))

	// Get the leader
	leader, err := c.leader()
	assert.NoError(t, err)

	// check key material in store
	var clusters []*api.Cluster
	leader.m.RaftNode.MemoryStore().View(func(tx store.ReadTx) {
		clusters, err = store.FindClusters(tx, store.All)
	})
	assert.NoError(t, err)
	assert.Len(t, clusters, 1, "there should be one cluster")
	assert.NotNil(t, clusters[0].RootCA.CACert)
	assert.NotNil(t, clusters[0].RootCA.CAKey)
	assert.Contains(t, string(clusters[0].RootCA.CAKey), "Proc-Type: 4,ENCRYPTED")

	firstEncryptedKey := clusters[0].RootCA.CAKey

	// Set an ENV passphrase and kill the current leader
	os.Setenv(ca.PassphraseENVVarPrev, "password1")
	os.Setenv(ca.PassphraseENVVar, "password2")
	require.NoError(t, c.destroyLeader())

	// ensure that the cluster converges to the expected number of agents; we need a big timeout because of heartbeat intervals
	require.NoError(t, testutils.PollFuncWithTimeout(nil, c.pollRegister, 30*time.Second))

	// Get the new leader
	leader, err = c.leader()
	assert.NoError(t, err)
	// check key material in store
	leader.m.RaftNode.MemoryStore().View(func(tx store.ReadTx) {
		clusters, err = store.FindClusters(tx, store.All)
	})
	assert.NoError(t, err)
	assert.Len(t, clusters, 1, "there should be one cluster")
	assert.NotNil(t, clusters[0].RootCA.CACert)
	assert.NotNil(t, clusters[0].RootCA.CAKey)
	assert.Contains(t, string(clusters[0].RootCA.CAKey), "Proc-Type: 4,ENCRYPTED")
	assert.NotEqual(t, firstEncryptedKey, clusters[0].RootCA.CAKey)
}
Example #6
0
func TestGetUnlockKey(t *testing.T) {
	t.Parallel()

	tc := testutils.NewTestCA(t)
	defer tc.Stop()

	var cluster *api.Cluster
	tc.MemoryStore.View(func(tx store.ReadTx) {
		clusters, err := store.FindClusters(tx, store.ByName(store.DefaultClusterName))
		require.NoError(t, err)
		cluster = clusters[0]
	})

	resp, err := tc.CAClients[0].GetUnlockKey(context.Background(), &api.GetUnlockKeyRequest{})
	require.NoError(t, err)
	require.Nil(t, resp.UnlockKey)
	require.Equal(t, cluster.Meta.Version, resp.Version)

	// Update the unlock key
	require.NoError(t, tc.MemoryStore.Update(func(tx store.Tx) error {
		cluster = store.GetCluster(tx, cluster.ID)
		cluster.Spec.EncryptionConfig.AutoLockManagers = true
		cluster.UnlockKeys = []*api.EncryptionKey{{
			Subsystem: ca.ManagerRole,
			Key:       []byte("secret"),
		}}
		return store.UpdateCluster(tx, cluster)
	}))

	tc.MemoryStore.View(func(tx store.ReadTx) {
		cluster = store.GetCluster(tx, cluster.ID)
	})

	require.NoError(t, raftutils.PollFuncWithTimeout(nil, func() error {
		resp, err = tc.CAClients[0].GetUnlockKey(context.Background(), &api.GetUnlockKeyRequest{})
		if err != nil {
			return fmt.Errorf("get unlock key: %v", err)
		}
		if !bytes.Equal(resp.UnlockKey, []byte("secret")) {
			return fmt.Errorf("secret hasn't rotated yet")
		}
		if cluster.Meta.Version.Index > resp.Version.Index {
			return fmt.Errorf("hasn't updated to the right version yet")
		}
		return nil
	}, 250*time.Millisecond))
}
Example #7
0
// pollServiceReady polls the control API until every task in the cluster is running.
func pollServiceReady(t *testing.T, c *testCluster, sid string) {
	pollFunc := func() error {
		req := &api.ListTasksRequest{}
		res, err := c.api.ListTasks(context.Background(), req)
		require.NoError(t, err)
		if len(res.Tasks) == 0 {
			return fmt.Errorf("tasks list is empty")
		}
		for _, task := range res.Tasks {
			if task.Status.State != api.TaskStateRunning {
				return fmt.Errorf("task %s is not running, status %s", task.ID, task.Status.State)
			}
		}
		return nil
	}
	require.NoError(t, raftutils.PollFuncWithTimeout(nil, pollFunc, opsTimeout))
}
Example #8
0
// ControlClient returns a grpc client for the node's ControlAPI. It will panic
// for non-manager nodes.
func (n *testNode) ControlClient(ctx context.Context) (api.ControlClient, error) {
	ctx, cancel := context.WithTimeout(ctx, opsTimeout)
	defer cancel()
	connChan := n.node.ListenControlSocket(ctx)
	var controlConn *grpc.ClientConn
	if err := raftutils.PollFuncWithTimeout(nil, func() error {
		select {
		case controlConn = <-connChan:
		default:
		}
		if controlConn == nil {
			return fmt.Errorf("didn't get control api connection")
		}
		return nil
	}, opsTimeout); err != nil {
		return nil, err
	}
	return api.NewControlClient(controlConn), nil
}
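
// A minimal usage sketch (not from the source): obtaining a ControlAPI client
// from the current leader and listing nodes through it. The test name and node
// counts are illustrative assumptions; ControlClient must only be called on
// manager nodes.
func TestLeaderControlClientSketch(t *testing.T) {
	numWorker, numManager := 1, 1
	cl := newCluster(t, numWorker, numManager)
	defer func() {
		require.NoError(t, cl.Stop())
	}()

	leader, err := cl.Leader()
	require.NoError(t, err)

	client, err := leader.ControlClient(context.Background())
	require.NoError(t, err)

	resp, err := client.ListNodes(context.Background(), &api.ListNodesRequest{})
	require.NoError(t, err)
	require.NotEmpty(t, resp.Nodes)
}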
Example #9
0
func TestRaftWipedState(t *testing.T) {
	t.Parallel()

	nodes, clockSource := raftutils.NewRaftCluster(t, tc)
	defer raftutils.TeardownCluster(t, nodes)

	// Stop node 3
	nodes[3].Server.Stop()
	nodes[3].ShutdownRaft()

	// Remove its state
	os.RemoveAll(nodes[3].StateDir)

	raftutils.AdvanceTicks(clockSource, 5)

	// Restart node 3
	nodes[3] = raftutils.RestartNode(t, clockSource, nodes[3], false)

	// Make sure this doesn't panic.
	raftutils.PollFuncWithTimeout(clockSource, func() error { return errors.New("keep the poll going") }, time.Second)
}
Example #10
0
func TestClusterReelection(t *testing.T) {
	if !*integrationTests {
		t.Skip("integration test")
	}
	mCount, aCount := 5, 15
	c := createManagersCluster(t, mCount, aCount)
	require.NoError(t, testutils.PollFunc(nil, c.pollRegister))

	require.NoError(t, c.destroyLeader())
	// let's bring down some agents in the meantime
	require.NoError(t, c.destroyAgents(5))
	// ensure that the cluster converges to the expected number of agents; we need a big timeout because of heartbeat intervals
	require.NoError(t, testutils.PollFuncWithTimeout(nil, c.pollRegister, 30*time.Second))

	leader, err := c.leader()
	assert.NoError(t, err)

	// check nodes in store
	var nodes []*api.Node
	leader.m.RaftNode.MemoryStore().View(func(tx store.ReadTx) {
		ns, err := store.FindNodes(tx, store.All)
		assert.NoError(t, err)
		for _, n := range ns {
			if n.Spec.Role == api.NodeRoleWorker {
				nodes = append(nodes, n)
			}
		}
	})
	assert.NoError(t, err)
	assert.Len(t, nodes, aCount, "all worker nodes should be in the store")
	var downAgentsCount int
	for _, node := range nodes {
		if node.Status.State == api.NodeStatus_DOWN {
			downAgentsCount++
			continue
		}
		assert.Equal(t, api.NodeStatus_READY, node.Status.State, "there should be only down and ready nodes at this point")
	}
	assert.Equal(t, 5, downAgentsCount, "unexpected number of down agents")
}
Example #11
0
func TestRestartLeader(t *testing.T) {
	t.Parallel()

	numWorker, numManager := 5, 3
	cl := newCluster(t, numWorker, numManager)
	defer func() {
		require.NoError(t, cl.Stop())
	}()
	leader, err := cl.Leader()
	require.NoError(t, err)

	origLeaderID := leader.node.NodeID()

	require.NoError(t, leader.Pause(false))

	require.NoError(t, raftutils.PollFuncWithTimeout(nil, func() error {
		resp, err := cl.api.ListNodes(context.Background(), &api.ListNodesRequest{})
		if err != nil {
			return err
		}
		for _, node := range resp.Nodes {
			if node.ID == origLeaderID {
				continue
			}
			require.False(t, node.Status.State == api.NodeStatus_DOWN, "nodes shouldn't go down")
			if node.Status.State != api.NodeStatus_READY {
				return errors.Errorf("node %s is still not ready", node.ID)
			}
		}
		return nil
	}, opsTimeout))

	require.NoError(t, cl.StartNode(origLeaderID))

	pollClusterReady(t, cl, numWorker, numManager)
}
Example #12
0
// Tests locking and unlocking the manager and key rotations
func TestManagerLockUnlock(t *testing.T) {
	ctx := context.Background()

	temp, err := ioutil.TempFile("", "test-manager-lock")
	require.NoError(t, err)
	require.NoError(t, temp.Close())
	require.NoError(t, os.Remove(temp.Name()))

	defer os.RemoveAll(temp.Name())

	stateDir, err := ioutil.TempDir("", "test-raft")
	require.NoError(t, err)
	defer os.RemoveAll(stateDir)

	tc := testutils.NewTestCA(t)
	defer tc.Stop()

	managerSecurityConfig, err := tc.NewNodeConfig(ca.ManagerRole)
	require.NoError(t, err)

	_, _, err = managerSecurityConfig.KeyReader().Read()
	require.NoError(t, err)

	m, err := New(&Config{
		RemoteAPI:      RemoteAddrs{ListenAddr: "127.0.0.1:0"},
		ControlAPI:     temp.Name(),
		StateDir:       stateDir,
		SecurityConfig: managerSecurityConfig,
		// start off without any encryption
	})
	require.NoError(t, err)
	require.NotNil(t, m)

	done := make(chan error)
	defer close(done)
	go func() {
		done <- m.Run(ctx)
	}()

	opts := []grpc.DialOption{
		grpc.WithTimeout(10 * time.Second),
		grpc.WithTransportCredentials(managerSecurityConfig.ClientTLSCreds),
	}

	conn, err := grpc.Dial(m.Addr(), opts...)
	require.NoError(t, err)
	defer func() {
		require.NoError(t, conn.Close())
	}()

	// check that there is no kek currently - we are using the API because this
	// also lets us wait until the manager is up and listening
	var cluster *api.Cluster
	client := api.NewControlClient(conn)

	require.NoError(t, raftutils.PollFuncWithTimeout(nil, func() error {
		resp, err := client.ListClusters(ctx, &api.ListClustersRequest{})
		if err != nil {
			return err
		}
		if len(resp.Clusters) == 0 {
			return fmt.Errorf("no clusters yet")
		}
		cluster = resp.Clusters[0]
		return nil
	}, 1*time.Second))

	require.Nil(t, cluster.UnlockKeys)

	// tls key is unencrypted, but there is a DEK
	key, err := ioutil.ReadFile(tc.Paths.Node.Key)
	require.NoError(t, err)
	keyBlock, _ := pem.Decode(key)
	require.NotNil(t, keyBlock)
	require.False(t, x509.IsEncryptedPEMBlock(keyBlock))
	require.Len(t, keyBlock.Headers, 2)
	currentDEK, err := decodePEMHeaderValue(keyBlock.Headers[pemHeaderRaftDEK], nil)
	require.NoError(t, err)
	require.NotEmpty(t, currentDEK)

	// update the lock key - this may fail due to update out of sequence errors, so try again
	for {
		getResp, err := client.GetCluster(ctx, &api.GetClusterRequest{ClusterID: cluster.ID})
		require.NoError(t, err)
		cluster = getResp.Cluster

		spec := cluster.Spec.Copy()
		spec.EncryptionConfig.AutoLockManagers = true
		updateResp, err := client.UpdateCluster(ctx, &api.UpdateClusterRequest{
			ClusterID:      cluster.ID,
			ClusterVersion: &cluster.Meta.Version,
			Spec:           spec,
		})
		if grpc.ErrorDesc(err) == "update out of sequence" {
			continue
		}
		// if there is any other type of error, this should fail
		if err == nil {
			cluster = updateResp.Cluster
		}
		break
	}
	require.NoError(t, err)

	caConn := api.NewCAClient(conn)
	unlockKeyResp, err := caConn.GetUnlockKey(ctx, &api.GetUnlockKeyRequest{})
	require.NoError(t, err)

	// this should update the TLS key, rotate the DEK, and finish snapshotting
	var updatedKey []byte
	require.NoError(t, raftutils.PollFuncWithTimeout(nil, func() error {
		updatedKey, err = ioutil.ReadFile(tc.Paths.Node.Key)
		require.NoError(t, err) // this should never error due to atomic writes

		if bytes.Equal(key, updatedKey) {
			return fmt.Errorf("TLS key should have been re-encrypted at least")
		}

		keyBlock, _ = pem.Decode(updatedKey)
		require.NotNil(t, keyBlock) // this should never error due to atomic writes

		if !x509.IsEncryptedPEMBlock(keyBlock) {
			return fmt.Errorf("Key not encrypted")
		}

		// we don't check that the TLS key has been rotated, because that may take
		// a little bit, and is best effort only

		currentDEKString, ok := keyBlock.Headers[pemHeaderRaftDEK]
		require.True(t, ok) // there should never NOT be a current header
		nowCurrentDEK, err := decodePEMHeaderValue(currentDEKString, unlockKeyResp.UnlockKey)
		require.NoError(t, err) // it should always be encrypted
		if bytes.Equal(currentDEK, nowCurrentDEK) {
			return fmt.Errorf("snapshot has not been finished yet")
		}

		currentDEK = nowCurrentDEK
		return nil
	}, 1*time.Second))

	_, ok := keyBlock.Headers[pemHeaderRaftPendingDEK]
	require.False(t, ok) // once the snapshot is done, there should no longer be a pending DEK

	_, ok = keyBlock.Headers[pemHeaderRaftDEKNeedsRotation]
	require.False(t, ok)

	// verify that the snapshot is readable with the new DEK
	encrypter, decrypter := encryption.Defaults(currentDEK)
	// we can't use the raftLogger, because the WALs are still locked while the raft node is up.  And once we remove
	// the manager, they'll be deleted.
	snapshot, err := storage.NewSnapFactory(encrypter, decrypter).New(filepath.Join(stateDir, "raft", "snap-v3-encrypted")).Load()
	require.NoError(t, err)
	require.NotNil(t, snapshot)

	// update the lock key to nil
	for i := 0; i < 3; i++ {
		getResp, err := client.GetCluster(ctx, &api.GetClusterRequest{ClusterID: cluster.ID})
		require.NoError(t, err)
		cluster = getResp.Cluster

		spec := cluster.Spec.Copy()
		spec.EncryptionConfig.AutoLockManagers = false
		_, err = client.UpdateCluster(ctx, &api.UpdateClusterRequest{
			ClusterID:      cluster.ID,
			ClusterVersion: &cluster.Meta.Version,
			Spec:           spec,
		})
		if grpc.ErrorDesc(err) == "update out of sequence" {
			continue
		}
		require.NoError(t, err)
	}

	// this should update the TLS key
	var unlockedKey []byte
	require.NoError(t, raftutils.PollFuncWithTimeout(nil, func() error {
		unlockedKey, err = ioutil.ReadFile(tc.Paths.Node.Key)
		if err != nil {
			return err
		}

		if bytes.Equal(unlockedKey, updatedKey) {
			return fmt.Errorf("TLS key should have been rotated")
		}

		return nil
	}, 1*time.Second))

	// the new key should not be encrypted, and the DEK should also be unencrypted
	// but not rotated
	keyBlock, _ = pem.Decode(unlockedKey)
	require.NotNil(t, keyBlock)
	require.False(t, x509.IsEncryptedPEMBlock(keyBlock))

	unencryptedDEK, err := decodePEMHeaderValue(keyBlock.Headers[pemHeaderRaftDEK], nil)
	require.NoError(t, err)
	require.NotNil(t, unencryptedDEK)
	require.Equal(t, currentDEK, unencryptedDEK)

	m.Stop(ctx)

	// After stopping, we MAY receive an error from ListenAndServe if all of
	// this happened before WaitForLeader completed, so don't check the
	// error.
	<-done
}
Example #13
0
// startDispatcher starts a dispatcher server secured by a test CA and returns
// gRPC clients connected to it with different credentials.
func startDispatcher(c *Config) (*grpcDispatcher, error) {
	l, err := net.Listen("tcp", "127.0.0.1:0")
	if err != nil {
		return nil, err
	}

	tca := testutils.NewTestCA(nil, testutils.AcceptancePolicy(true, true, ""))
	agentSecurityConfig1, err := tca.NewNodeConfig(ca.AgentRole)
	if err != nil {
		return nil, err
	}
	agentSecurityConfig2, err := tca.NewNodeConfig(ca.AgentRole)
	if err != nil {
		return nil, err
	}
	managerSecurityConfig, err := tca.NewNodeConfig(ca.ManagerRole)
	if err != nil {
		return nil, err
	}

	serverOpts := []grpc.ServerOption{grpc.Creds(managerSecurityConfig.ServerTLSCreds)}

	s := grpc.NewServer(serverOpts...)
	tc := &testCluster{addr: l.Addr().String(), store: tca.MemoryStore}
	d := New(tc, c)

	authorize := func(ctx context.Context, roles []string) error {
		_, err := ca.AuthorizeForwardedRoleAndOrg(ctx, roles, []string{ca.ManagerRole}, tca.Organization)
		return err
	}
	authenticatedDispatcherAPI := api.NewAuthenticatedWrapperDispatcherServer(d, authorize)

	api.RegisterDispatcherServer(s, authenticatedDispatcherAPI)
	go func() {
		// Serve will always return an error (even when properly stopped).
		// Explicitly ignore it.
		_ = s.Serve(l)
	}()
	go d.Run(context.Background())
	if err := raftutils.PollFuncWithTimeout(nil, func() error {
		d.mu.Lock()
		defer d.mu.Unlock()
		if !d.isRunning() {
			return fmt.Errorf("dispatcher is not running")
		}
		return nil
	}, 5*time.Second); err != nil {
		return nil, err
	}

	clientOpts := []grpc.DialOption{grpc.WithTimeout(10 * time.Second)}
	clientOpts1 := append(clientOpts, grpc.WithTransportCredentials(agentSecurityConfig1.ClientTLSCreds))
	clientOpts2 := append(clientOpts, grpc.WithTransportCredentials(agentSecurityConfig2.ClientTLSCreds))
	clientOpts3 := append(clientOpts, grpc.WithTransportCredentials(credentials.NewTLS(&tls.Config{InsecureSkipVerify: true})))

	conn1, err := grpc.Dial(l.Addr().String(), clientOpts1...)
	if err != nil {
		return nil, err
	}

	conn2, err := grpc.Dial(l.Addr().String(), clientOpts2...)
	if err != nil {
		return nil, err
	}

	conn3, err := grpc.Dial(l.Addr().String(), clientOpts3...)
	if err != nil {
		return nil, err
	}

	clients := []api.DispatcherClient{api.NewDispatcherClient(conn1), api.NewDispatcherClient(conn2), api.NewDispatcherClient(conn3)}
	securityConfigs := []*ca.SecurityConfig{agentSecurityConfig1, agentSecurityConfig2, managerSecurityConfig}
	conns := []*grpc.ClientConn{conn1, conn2, conn3}
	return &grpcDispatcher{
		Clients:          clients,
		SecurityConfigs:  securityConfigs,
		Store:            tc.MemoryStore(),
		dispatcherServer: d,
		conns:            conns,
		grpcServer:       s,
		testCA:           tca,
	}, nil
}
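
// A minimal usage sketch (not from the source): starting a test dispatcher and
// opening a session with one of the agent clients. The test name is an
// illustrative assumption, and so are DefaultConfig and gd.Close as this
// package's setup/teardown helpers.
func TestDispatcherSessionSketch(t *testing.T) {
	gd, err := startDispatcher(DefaultConfig())
	require.NoError(t, err)
	defer gd.Close()

	// Clients[0] is authenticated with the first agent's TLS credentials
	stream, err := gd.Clients[0].Session(context.Background(), &api.SessionRequest{})
	require.NoError(t, err)

	msg, err := stream.Recv()
	require.NoError(t, err)
	require.NotEmpty(t, msg.SessionID)
}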
Example #14
0
func TestForceNewCluster(t *testing.T) {
	t.Parallel()

	// create an external CA so that we can use it to generate expired certificates
	tempDir, err := ioutil.TempDir("", "external-ca")
	require.NoError(t, err)
	defer os.RemoveAll(tempDir)
	rootCA, err := ca.CreateRootCA("externalRoot", ca.NewConfigPaths(tempDir).RootCA)
	require.NoError(t, err)

	// start a new cluster with the external CA bootstrapped
	numWorker, numManager := 0, 1
	cl := newTestCluster()
	defer func() {
		require.NoError(t, cl.Stop())
	}()
	require.NoError(t, cl.AddManager(false, &rootCA), "manager number 1")
	pollClusterReady(t, cl, numWorker, numManager)

	leader, err := cl.Leader()
	require.NoError(t, err)

	sid, err := cl.CreateService("test_service", 2)
	require.NoError(t, err)
	pollServiceReady(t, cl, sid)

	// generate an expired certificate
	rootKey, err := helpers.ParsePrivateKeyPEM(rootCA.Key)
	require.NoError(t, err)
	rootCert, err := helpers.ParseCertificatePEM(rootCA.Cert)
	require.NoError(t, err)

	managerCertFile := filepath.Join(leader.stateDir, "certificates", "swarm-node.crt")
	certBytes, err := ioutil.ReadFile(managerCertFile)
	require.NoError(t, err)
	managerCerts, err := helpers.ParseCertificatesPEM(certBytes)
	require.NoError(t, err)
	expiredCertTemplate := managerCerts[0]
	expiredCertTemplate.NotBefore = time.Now().Add(time.Hour * -5)
	expiredCertTemplate.NotAfter = time.Now().Add(time.Hour * -3)
	expiredCertDERBytes, err := x509.CreateCertificate(rand.Reader, expiredCertTemplate, rootCert, expiredCertTemplate.PublicKey, rootKey)
	require.NoError(t, err)
	expiredCertPEM := pem.EncodeToMemory(&pem.Block{
		Type:  "CERTIFICATE",
		Bytes: expiredCertDERBytes,
	})

	// restart node with an expired certificate while forcing a new cluster - it should start without error and the certificate should be renewed
	nodeID := leader.node.NodeID()
	require.NoError(t, leader.Pause(true))
	require.NoError(t, ioutil.WriteFile(managerCertFile, expiredCertPEM, 0644))
	require.NoError(t, cl.StartNode(nodeID))
	pollClusterReady(t, cl, numWorker, numManager)
	pollServiceReady(t, cl, sid)

	err = raftutils.PollFuncWithTimeout(nil, func() error {
		certBytes, err := ioutil.ReadFile(managerCertFile)
		if err != nil {
			return err
		}
		managerCerts, err := helpers.ParseCertificatesPEM(certBytes)
		if err != nil {
			return err
		}
		if managerCerts[0].NotAfter.Before(time.Now()) {
			return errors.New("certificate hasn't been renewed yet")
		}
		return nil
	}, opsTimeout)
	require.NoError(t, err)

	// restart node with an expired certificate without forcing a new cluster - it should error on start
	require.NoError(t, leader.Pause(true))
	require.NoError(t, ioutil.WriteFile(managerCertFile, expiredCertPEM, 0644))
	require.Error(t, cl.StartNode(nodeID))
}
Example #15
0
func TestDemoteDownedManager(t *testing.T) {
	t.Parallel()

	numWorker, numManager := 0, 3
	cl := newCluster(t, numWorker, numManager)
	defer func() {
		require.NoError(t, cl.Stop())
	}()

	leader, err := cl.Leader()
	require.NoError(t, err)

	// Find a manager (not the leader) to demote. It must not be the third
	// manager we added, because there may not have been enough time for
	// that one to write anything to its WAL.
	var demotee *testNode
	for _, n := range cl.nodes {
		nodeID := n.node.NodeID()
		if n.IsManager() && nodeID != leader.node.NodeID() && cl.nodesOrder[nodeID] != 3 {
			demotee = n
			break
		}
	}

	nodeID := demotee.node.NodeID()

	resp, err := cl.api.GetNode(context.Background(), &api.GetNodeRequest{NodeID: nodeID})
	require.NoError(t, err)
	spec := resp.Node.Spec.Copy()
	spec.DesiredRole = api.NodeRoleWorker

	// stop the node, then demote it, and start it back up again so that when it comes back up it has to
	// realize it is no longer a manager
	require.NoError(t, demotee.Pause(false))

	// demote node, but don't use SetNodeRole, which waits until it successfully becomes a worker, since
	// the node is currently down
	require.NoError(t, raftutils.PollFuncWithTimeout(nil, func() error {
		_, err := cl.api.UpdateNode(context.Background(), &api.UpdateNodeRequest{
			NodeID:      nodeID,
			Spec:        spec,
			NodeVersion: &resp.Node.Meta.Version,
		})
		return err
	}, opsTimeout))

	// start it back up again
	require.NoError(t, cl.StartNode(nodeID))

	// wait to become worker
	require.NoError(t, raftutils.PollFuncWithTimeout(nil, func() error {
		if demotee.IsManager() {
			return fmt.Errorf("node is still not a worker")
		}
		return nil
	}, opsTimeout))

	// agents 1, managers 2
	numWorker++
	numManager--
	pollClusterReady(t, cl, numWorker, numManager)
}