// newManager starts a manager listening on a random local TCP port. If
// joinAddr is non-empty, the manager joins the existing raft cluster at that
// address; otherwise it bootstraps a new cluster.
func newManager(t *testing.T, joinAddr string, securityConfig *ca.SecurityConfig) (*testManager, error) {
	ltcp, err := net.Listen("tcp", "127.0.0.1:0")
	if err != nil {
		return nil, err
	}

	stateDir, err := ioutil.TempDir("", "test-raft")
	if err != nil {
		return nil, err
	}

	m, err := manager.New(&manager.Config{
		ProtoListener:  map[string]net.Listener{"tcp": ltcp},
		StateDir:       stateDir,
		JoinRaft:       joinAddr,
		SecurityConfig: securityConfig,
	})
	if err != nil {
		return nil, err
	}
	go m.Run(context.Background())
	time.Sleep(100 * time.Millisecond)

	return &testManager{
		m:    m,
		addr: ltcp.Addr().String(),
	}, nil
}
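// Hypothetical usage sketch (not part of the original source): it assumes only
// the testManager type and newManager helper above, plus the test CA helpers
// already used in TestManager below. It shows how a second manager could join
// the first through the returned raft address. The test name is illustrative.
func TestTwoManagersSketch(t *testing.T) {
	tc := testutils.NewTestCA(t)
	defer tc.Stop()

	managerSecurityConfig, err := tc.NewNodeConfig(ca.ManagerRole)
	assert.NoError(t, err)

	// First manager bootstraps a new cluster (empty join address).
	m1, err := newManager(t, "", managerSecurityConfig)
	assert.NoError(t, err)
	defer m1.m.Stop(context.Background())

	// Second manager joins the raft cluster through the first manager's address.
	m2, err := newManager(t, m1.addr, managerSecurityConfig)
	assert.NoError(t, err)
	defer m2.m.Stop(context.Background())
}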
func TestManager(t *testing.T) {
	ctx := context.TODO()
	store := store.NewMemoryStore(nil)
	assert.NotNil(t, store)

	temp, err := ioutil.TempFile("", "test-socket")
	assert.NoError(t, err)
	assert.NoError(t, temp.Close())
	assert.NoError(t, os.Remove(temp.Name()))
	defer os.RemoveAll(temp.Name())

	lunix, err := net.Listen("unix", temp.Name())
	assert.NoError(t, err)
	ltcp, err := net.Listen("tcp", "127.0.0.1:0")
	assert.NoError(t, err)

	stateDir, err := ioutil.TempDir("", "test-raft")
	assert.NoError(t, err)
	defer os.RemoveAll(stateDir)

	tc := testutils.NewTestCA(t)
	defer tc.Stop()

	agentSecurityConfig, err := tc.NewNodeConfig(ca.AgentRole)
	assert.NoError(t, err)
	agentDiffOrgSecurityConfig, err := tc.NewNodeConfigOrg(ca.AgentRole, "another-org")
	assert.NoError(t, err)
	managerSecurityConfig, err := tc.NewNodeConfig(ca.ManagerRole)
	assert.NoError(t, err)

	m, err := manager.New(&manager.Config{
		ProtoListener:  map[string]net.Listener{"unix": lunix, "tcp": ltcp},
		StateDir:       stateDir,
		SecurityConfig: managerSecurityConfig,
	})
	assert.NoError(t, err)
	assert.NotNil(t, m)

	done := make(chan error)
	defer close(done)
	go func() {
		done <- m.Run(ctx)
	}()

	opts := []grpc.DialOption{
		grpc.WithTimeout(10 * time.Second),
		grpc.WithTransportCredentials(agentSecurityConfig.ClientTLSCreds),
	}
	conn, err := grpc.Dial(ltcp.Addr().String(), opts...)
	assert.NoError(t, err)
	defer func() {
		assert.NoError(t, conn.Close())
	}()

	// We have to send a dummy request to verify if the connection is actually up.
	client := api.NewDispatcherClient(conn)
	_, err = client.Heartbeat(context.Background(), &api.HeartbeatRequest{})
	assert.Equal(t, dispatcher.ErrNodeNotRegistered.Error(), grpc.ErrorDesc(err))

	// Try to have a client in a different org access this manager.
	opts = []grpc.DialOption{
		grpc.WithTimeout(10 * time.Second),
		grpc.WithTransportCredentials(agentDiffOrgSecurityConfig.ClientTLSCreds),
	}
	conn2, err := grpc.Dial(ltcp.Addr().String(), opts...)
	assert.NoError(t, err)
	defer func() {
		assert.NoError(t, conn2.Close())
	}()

	// We have to send a dummy request to verify if the connection is actually up.
	client = api.NewDispatcherClient(conn2)
	_, err = client.Heartbeat(context.Background(), &api.HeartbeatRequest{})
	assert.Contains(t, grpc.ErrorDesc(err), "Permission denied: unauthorized peer role: rpc error: code = 7 desc = Permission denied: remote certificate not part of organization")

	// Verify that requests to the various GRPC services running on TCP
	// are rejected if they don't have certs.
	opts = []grpc.DialOption{
		grpc.WithTimeout(10 * time.Second),
		grpc.WithTransportCredentials(credentials.NewTLS(&tls.Config{InsecureSkipVerify: true})),
	}
	noCertConn, err := grpc.Dial(ltcp.Addr().String(), opts...)
	assert.NoError(t, err)
	defer func() {
		assert.NoError(t, noCertConn.Close())
	}()

	client = api.NewDispatcherClient(noCertConn)
	_, err = client.Heartbeat(context.Background(), &api.HeartbeatRequest{})
	assert.EqualError(t, err, "rpc error: code = 7 desc = Permission denied: unauthorized peer role: rpc error: code = 7 desc = no client certificates in request")

	controlClient := api.NewControlClient(noCertConn)
	_, err = controlClient.ListNodes(context.Background(), &api.ListNodesRequest{})
	assert.EqualError(t, err, "rpc error: code = 7 desc = Permission denied: unauthorized peer role: rpc error: code = 7 desc = no client certificates in request")

	raftClient := api.NewRaftMembershipClient(noCertConn)
	_, err = raftClient.Join(context.Background(), &api.JoinRequest{})
	assert.EqualError(t, err, "rpc error: code = 7 desc = Permission denied: unauthorized peer role: rpc error: code = 7 desc = no client certificates in request")

	m.Stop(ctx)

	// After stopping we may receive an error from ListenAndServe if all this
	// happened before WaitForLeader completed, so don't check the error.
	<-done
}
func (n *Node) runManager(ctx context.Context, securityConfig *ca.SecurityConfig, ready chan struct{}) error {
	for {
		if err := n.waitRole(ctx, ca.ManagerRole); err != nil {
			return err
		}

		remoteAddr, _ := n.remotes.Select(n.NodeID())
		m, err := manager.New(&manager.Config{
			ForceNewCluster: n.config.ForceNewCluster,
			ProtoAddr: map[string]string{
				"tcp":  n.config.ListenRemoteAPI,
				"unix": n.config.ListenControlAPI,
			},
			AdvertiseAddr:  n.config.AdvertiseRemoteAPI,
			SecurityConfig: securityConfig,
			ExternalCAs:    n.config.ExternalCAs,
			JoinRaft:       remoteAddr.Addr,
			StateDir:       n.config.StateDir,
			HeartbeatTick:  n.config.HeartbeatTick,
			ElectionTick:   n.config.ElectionTick,
		})
		if err != nil {
			return err
		}
		done := make(chan struct{})
		var runErr error
		go func() {
			runErr = m.Run(context.Background())
			close(done)
		}()

		n.Lock()
		n.manager = m
		n.Unlock()

		connCtx, connCancel := context.WithCancel(ctx)
		go n.initManagerConnection(connCtx, ready)

		// this happens only on initial start
		if ready != nil {
			go func(ready chan struct{}) {
				select {
				case <-ready:
					addr, err := n.RemoteAPIAddr()
					if err != nil {
						log.G(ctx).WithError(err).Errorf("get remote api addr")
					} else {
						n.remotes.Observe(api.Peer{NodeID: n.NodeID(), Addr: addr}, remotes.DefaultObservationWeight)
					}
				case <-connCtx.Done():
				}
			}(ready)
			ready = nil
		}

		roleChanged := make(chan error)
		waitCtx, waitCancel := context.WithCancel(ctx)
		go func() {
			err := n.waitRole(waitCtx, ca.WorkerRole)
			roleChanged <- err
		}()

		select {
		case <-done:
			// Fail out if m.Run() returns error, otherwise wait for
			// role change.
			if runErr != nil {
				err = runErr
			} else {
				err = <-roleChanged
			}
		case err = <-roleChanged:
		}

		n.Lock()
		n.manager = nil
		n.Unlock()

		select {
		case <-done:
		case <-ctx.Done():
			err = ctx.Err()
			m.Stop(context.Background())
			<-done
		}
		connCancel()
		n.setControlSocket(nil)
		waitCancel()

		if err != nil {
			return err
		}
	}
}
func (n *Node) runManager(ctx context.Context, securityConfig *ca.SecurityConfig, ready chan struct{}) error {
	for {
		n.waitRole(ctx, ca.ManagerRole)
		if ctx.Err() != nil {
			return ctx.Err()
		}
		remoteAddr, _ := n.remotes.Select(n.nodeID)
		m, err := manager.New(&manager.Config{
			ForceNewCluster: n.config.ForceNewCluster,
			ProtoAddr: map[string]string{
				"tcp":  n.config.ListenRemoteAPI,
				"unix": n.config.ListenControlAPI,
			},
			AdvertiseAddr:  n.config.AdvertiseRemoteAPI,
			SecurityConfig: securityConfig,
			ExternalCAs:    n.config.ExternalCAs,
			JoinRaft:       remoteAddr.Addr,
			StateDir:       n.config.StateDir,
			HeartbeatTick:  n.config.HeartbeatTick,
			ElectionTick:   n.config.ElectionTick,
		})
		if err != nil {
			return err
		}
		done := make(chan struct{})
		go func() {
			m.Run(context.Background()) // todo: store error
			close(done)
		}()

		n.Lock()
		n.manager = m
		n.Unlock()

		connCtx, connCancel := context.WithCancel(ctx)
		go n.initManagerConnection(connCtx, ready)

		// this happens only on initial start
		if ready != nil {
			go func(ready chan struct{}) {
				select {
				case <-ready:
					n.remotes.Observe(api.Peer{NodeID: n.nodeID, Addr: n.config.ListenRemoteAPI}, 5)
				case <-connCtx.Done():
				}
			}(ready)
			ready = nil
		}

		n.waitRole(ctx, ca.AgentRole)

		n.Lock()
		n.manager = nil
		n.Unlock()

		select {
		case <-done:
		case <-ctx.Done():
			err = ctx.Err()
			m.Stop(context.Background())
			<-done
		}
		connCancel()

		if err != nil {
			return err
		}
	}
}
func (n *Node) runManager(ctx context.Context, securityConfig *ca.SecurityConfig, ready chan struct{}) error {
	remoteAddr, _ := n.remotes.Select(n.NodeID())
	m, err := manager.New(&manager.Config{
		ForceNewCluster: n.config.ForceNewCluster,
		RemoteAPI: manager.RemoteAddrs{
			ListenAddr:    n.config.ListenRemoteAPI,
			AdvertiseAddr: n.config.AdvertiseRemoteAPI,
		},
		ControlAPI:       n.config.ListenControlAPI,
		SecurityConfig:   securityConfig,
		ExternalCAs:      n.config.ExternalCAs,
		JoinRaft:         remoteAddr.Addr,
		StateDir:         n.config.StateDir,
		HeartbeatTick:    n.config.HeartbeatTick,
		ElectionTick:     n.config.ElectionTick,
		AutoLockManagers: n.config.AutoLockManagers,
		UnlockKey:        n.unlockKey,
		Availability:     n.config.Availability,
	})
	if err != nil {
		return err
	}
	done := make(chan struct{})
	var runErr error
	go func() {
		if err := m.Run(context.Background()); err != nil && err != raft.ErrMemberRemoved {
			runErr = err
		}
		close(done)
	}()

	workerRole := make(chan struct{})
	waitRoleCtx, waitRoleCancel := context.WithCancel(ctx)
	defer waitRoleCancel()
	go func() {
		n.waitRole(waitRoleCtx, ca.WorkerRole)
		close(workerRole)
	}()

	defer func() {
		n.Lock()
		n.manager = nil
		n.Unlock()
		m.Stop(ctx)
		<-done
		n.setControlSocket(nil)
	}()

	n.Lock()
	n.manager = m
	n.Unlock()

	connCtx, connCancel := context.WithCancel(ctx)
	defer connCancel()
	go n.initManagerConnection(connCtx, ready)

	// this happens only on initial start
	if ready != nil {
		go func(ready chan struct{}) {
			select {
			case <-ready:
				addr, err := n.RemoteAPIAddr()
				if err != nil {
					log.G(ctx).WithError(err).Errorf("get remote api addr")
				} else {
					n.remotes.Observe(api.Peer{NodeID: n.NodeID(), Addr: addr}, remotes.DefaultObservationWeight)
				}
			case <-connCtx.Done():
			}
		}(ready)
	}

	// Wait for the manager to stop or for the role to change. If the manager
	// stopped before the role changed, wait up to 16 seconds for the new role,
	// then just restart the manager; we might have missed that event. We need
	// to wait for the role to prevent the manager from starting again with the
	// wrong certificate.
	select {
	case <-done:
		timer := time.NewTimer(16 * time.Second)
		defer timer.Stop()

		select {
		case <-timer.C:
			log.G(ctx).Warn("failed to get worker role after manager stop, restart manager")
		case <-workerRole:
		case <-ctx.Done():
			return ctx.Err()
		}
		return runErr
	case <-workerRole:
		log.G(ctx).Info("role changed to worker, wait for manager to stop")
		select {
		case <-done:
			return runErr
		case <-ctx.Done():
			return ctx.Err()
		}
	case <-ctx.Done():
		return ctx.Err()
	}
}
func (n *Node) runManager(ctx context.Context, securityConfig *ca.SecurityConfig, ready chan struct{}) error {
	for {
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-n.managerRoleCh:
			if ctx.Err() != nil {
				return ctx.Err()
			}
			n.Lock()
			// in case we missed some notifications
			if n.role != ca.ManagerRole {
				n.Unlock()
				continue
			}
			n.Unlock()
			remoteAddr, _ := n.remotes.Select(n.nodeID)
			m, err := manager.New(&manager.Config{
				ForceNewCluster: n.config.ForceNewCluster,
				ProtoAddr: map[string]string{
					"tcp":  n.config.ListenRemoteAPI,
					"unix": n.config.ListenControlAPI,
				},
				SecurityConfig: securityConfig,
				JoinRaft:       remoteAddr.Addr,
				StateDir:       n.config.StateDir,
				HeartbeatTick:  n.config.HeartbeatTick,
				ElectionTick:   n.config.ElectionTick,
			})
			if err != nil {
				return err
			}
			done := make(chan struct{})
			go func() {
				m.Run(context.Background()) // todo: store error
				close(done)
			}()

			n.Lock()
			n.manager = m
			n.Unlock()

			go n.initManagerConnection(ctx, ready)

			go func() {
				select {
				case <-ready:
				case <-ctx.Done():
				}
				if ctx.Err() == nil {
					n.remotes.Observe(api.Peer{NodeID: n.nodeID, Addr: n.config.ListenRemoteAPI}, 5)
				}
			}()

			select {
			case <-ctx.Done():
				m.Stop(context.Background()) // todo: this should be sync like other components
				<-done
			// in case of demotion the manager will stop itself
			case <-done:
			}

			ready = nil // ready event happens once, even on multiple starts

			n.Lock()
			n.manager = nil
			if n.conn != nil {
				n.conn.Close()
			}
			n.Unlock()

			if ctx.Err() != nil {
				return err
			}
		}
	}
}
func (n *Node) runManager(ctx context.Context, securityConfig *ca.SecurityConfig, ready chan struct{}, workerRole <-chan struct{}) error {
	remoteAddr, _ := n.remotes.Select(n.NodeID())
	m, err := manager.New(&manager.Config{
		ForceNewCluster: n.config.ForceNewCluster,
		RemoteAPI: manager.RemoteAddrs{
			ListenAddr:    n.config.ListenRemoteAPI,
			AdvertiseAddr: n.config.AdvertiseRemoteAPI,
		},
		ControlAPI:       n.config.ListenControlAPI,
		SecurityConfig:   securityConfig,
		ExternalCAs:      n.config.ExternalCAs,
		JoinRaft:         remoteAddr.Addr,
		StateDir:         n.config.StateDir,
		HeartbeatTick:    n.config.HeartbeatTick,
		ElectionTick:     n.config.ElectionTick,
		AutoLockManagers: n.config.AutoLockManagers,
		UnlockKey:        n.unlockKey,
		Availability:     n.config.Availability,
		PluginGetter:     n.config.PluginGetter,
	})
	if err != nil {
		return err
	}
	done := make(chan struct{})
	var runErr error
	go func() {
		if err := m.Run(context.Background()); err != nil {
			runErr = err
		}
		close(done)
	}()

	var clearData bool
	defer func() {
		n.Lock()
		n.manager = nil
		n.Unlock()
		m.Stop(ctx, clearData)
		<-done
		n.setControlSocket(nil)
	}()

	n.Lock()
	n.manager = m
	n.Unlock()

	connCtx, connCancel := context.WithCancel(ctx)
	defer connCancel()
	go n.initManagerConnection(connCtx, ready)

	// wait for manager stop or for role change
	select {
	case <-done:
		return runErr
	case <-workerRole:
		log.G(ctx).Info("role changed to worker, stopping manager")
		clearData = true
	case <-m.RemovedFromRaft():
		log.G(ctx).Info("manager removed from raft cluster, stopping manager")
		clearData = true
	case <-ctx.Done():
		return ctx.Err()
	}
	return nil
}
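// Hypothetical caller sketch (not part of the original source): the final
// runManager variant above expects its caller to supply the workerRole
// channel. This sketch wires it up the same way an earlier variant did
// internally, by closing the channel once waitRole observes the worker role.
// The function name superviseManagerSketch is illustrative only.
func (n *Node) superviseManagerSketch(ctx context.Context, securityConfig *ca.SecurityConfig, ready chan struct{}) error {
	workerRole := make(chan struct{})
	waitRoleCtx, waitRoleCancel := context.WithCancel(ctx)
	defer waitRoleCancel()
	go func() {
		// Close workerRole only when the role actually changed, not when the
		// context was canceled.
		if n.waitRole(waitRoleCtx, ca.WorkerRole) == nil {
			close(workerRole)
		}
	}()
	return n.runManager(ctx, securityConfig, ready, workerRole)
}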