// RenewTLSConfig will continuously monitor for the necessity of renewing the local certificates, either by // issuing them locally if key-material is available, or requesting them from a remote CA. func RenewTLSConfig(ctx context.Context, s *SecurityConfig, remotes remotes.Remotes, renew <-chan struct{}) <-chan CertificateUpdate { updates := make(chan CertificateUpdate) go func() { var retry time.Duration defer close(updates) for { ctx = log.WithModule(ctx, "tls") log := log.G(ctx).WithFields(logrus.Fields{ "node.id": s.ClientTLSCreds.NodeID(), "node.role": s.ClientTLSCreds.Role(), }) // Our starting default will be 5 minutes retry = 5 * time.Minute // Since the expiration of the certificate is managed remotely we should update our // retry timer on every iteration of this loop. // Retrieve the current certificate expiration information. validFrom, validUntil, err := readCertValidity(s.KeyReader()) if err != nil { // We failed to read the expiration, let's stick with the starting default log.Errorf("failed to read the expiration of the TLS certificate in: %s", s.KeyReader().Target()) updates <- CertificateUpdate{Err: errors.New("failed to read certificate expiration")} } else { // If we have an expired certificate, let's stick with the starting default in // the hope that this is a temporary clock skew. if validUntil.Before(time.Now()) { log.Errorf("the current TLS certificate is expired") updates <- CertificateUpdate{Err: errors.New("TLS certificate is expired")} } else { // Random retry time between 50% and 80% of the total time to expiration retry = calculateRandomExpiry(validFrom, validUntil) } } log.WithFields(logrus.Fields{ "time": time.Now().Add(retry), }).Debugf("next certificate renewal scheduled") select { case <-time.After(retry): log.Infof("renewing certificate") case <-renew: log.Infof("forced certificate renewal") case <-ctx.Done(): log.Infof("shutting down certificate renewal routine") return } // ignore errors - it will just try again later if err := RenewTLSConfigNow(ctx, s, remotes); err != nil { updates <- CertificateUpdate{Err: err} } else { updates <- CertificateUpdate{Role: s.ClientTLSCreds.Role()} } } }() return updates }
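// calculateRandomExpiry is referenced above but not shown. A minimal, hypothetical sketch of such a
// helper follows, assuming it simply picks a random point between 50% and 80% of the certificate's
// validity window (as the comment above describes) and returns the time from now until that point.
// It assumes only the standard "time" and "math/rand" packages; the real implementation may differ.
func calculateRandomExpirySketch(validFrom, validUntil time.Time) time.Duration {
	lifetime := validUntil.Sub(validFrom)
	// Pick a random fraction in [0.5, 0.8) of the total lifetime.
	fraction := 0.5 + rand.Float64()*0.3
	renewAt := validFrom.Add(time.Duration(float64(lifetime) * fraction))
	if retry := time.Until(renewAt); retry > 0 {
		return retry
	}
	// The chosen point is already in the past; renew as soon as possible.
	return 0
}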
// Init prepares the worker for assignments. func (w *worker) Init(ctx context.Context) error { w.mu.Lock() defer w.mu.Unlock() ctx = log.WithModule(ctx, "worker") // TODO(stevvooe): Start task cleanup process. // read the tasks from the database and start any task managers that may be needed. return w.db.Update(func(tx *bolt.Tx) error { return WalkTasks(tx, func(task *api.Task) error { if !TaskAssigned(tx, task.ID) { // NOTE(stevvooe): If tasks can survive worker restart, we need // to startup the controller and ensure they are removed. For // now, we can simply remove them from the database. if err := DeleteTask(tx, task.ID); err != nil { log.G(ctx).WithError(err).Errorf("error removing task %v", task.ID) } return nil } status, err := GetTaskStatus(tx, task.ID) if err != nil { log.G(ctx).WithError(err).Error("unable to read task status") return nil } task.Status = *status // merges the status into the task, ensuring we start at the right point. return w.startTask(ctx, tx, task) }) }) }
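// WalkTasks, TaskAssigned, GetTaskStatus and DeleteTask are bolt-backed helpers that are not shown
// above. Purely as an illustration of the pattern, a walk like WalkTasks could be written as a
// ForEach over a bucket, assuming (hypothetically) that tasks are stored as serialized api.Task
// messages in a top-level "tasks" bucket keyed by task ID and decoded with gogo/protobuf.
func walkTasksSketch(tx *bolt.Tx, fn func(task *api.Task) error) error {
	bkt := tx.Bucket([]byte("tasks")) // hypothetical bucket name, for illustration only
	if bkt == nil {
		return nil // nothing stored yet
	}
	return bkt.ForEach(func(k, v []byte) error {
		var task api.Task
		if err := proto.Unmarshal(v, &task); err != nil {
			return err
		}
		return fn(&task)
	})
}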
// CreateSecurityConfig creates a new key and cert for this node, either locally // or via a remote CA. func (rootCA RootCA) CreateSecurityConfig(ctx context.Context, krw *KeyReadWriter, config CertificateRequestConfig) (*SecurityConfig, error) { ctx = log.WithModule(ctx, "tls") var ( tlsKeyPair *tls.Certificate err error ) if rootCA.CanSign() { // Create a new random ID for this certificate cn := identity.NewID() org := identity.NewID() proposedRole := ManagerRole tlsKeyPair, err = rootCA.IssueAndSaveNewCertificates(krw, cn, proposedRole, org) if err != nil { log.G(ctx).WithFields(logrus.Fields{ "node.id": cn, "node.role": proposedRole, }).WithError(err).Errorf("failed to issue and save new certificate") return nil, err } log.G(ctx).WithFields(logrus.Fields{ "node.id": cn, "node.role": proposedRole, }).Debug("issued new TLS certificate") } else { // Request certificate issuance from a remote CA. // Last argument is nil because at this point we don't have any valid TLS creds tlsKeyPair, err = rootCA.RequestAndSaveNewCertificates(ctx, krw, config) if err != nil { log.G(ctx).WithError(err).Error("failed to request and save new certificate") return nil, err } } // Create the Server TLS Credentials for this node. These will not be used by workers. serverTLSCreds, err := rootCA.NewServerTLSCredentials(tlsKeyPair) if err != nil { return nil, err } // Create a TLSConfig to be used when this node connects as a client to another remote node. // We're using ManagerRole as remote serverName for TLS host verification clientTLSCreds, err := rootCA.NewClientTLSCredentials(tlsKeyPair, ManagerRole) if err != nil { return nil, err } log.G(ctx).WithFields(logrus.Fields{ "node.id": clientTLSCreds.NodeID(), "node.role": clientTLSCreds.Role(), }).Debugf("new node credentials generated: %s", krw.Target()) return NewSecurityConfig(&rootCA, krw, clientTLSCreds, serverTLSCreds), nil }
func (tm *taskManager) Logs(ctx context.Context, options api.LogSubscriptionOptions, publisher exec.LogPublisher) { ctx = log.WithModule(ctx, "taskmanager") logCtlr, ok := tm.ctlr.(exec.ControllerLogs) if !ok { return // no logs available } if err := logCtlr.Logs(ctx, publisher, options); err != nil { log.G(ctx).WithError(err).Errorf("logs call failed") } }
// RenewTLSConfigNow gets a new TLS cert and key, and updates the security config if provided. This is similar to // RenewTLSConfig, except while that monitors for expiry, and periodically renews, this renews once and is blocking func RenewTLSConfigNow(ctx context.Context, s *SecurityConfig, r remotes.Remotes) error { s.renewalMu.Lock() defer s.renewalMu.Unlock() ctx = log.WithModule(ctx, "tls") log := log.G(ctx).WithFields(logrus.Fields{ "node.id": s.ClientTLSCreds.NodeID(), "node.role": s.ClientTLSCreds.Role(), }) // Let's request new certs. Renewals don't require a token. rootCA := s.RootCA() tlsKeyPair, err := rootCA.RequestAndSaveNewCertificates(ctx, s.KeyWriter(), CertificateRequestConfig{ Remotes: r, Credentials: s.ClientTLSCreds, }) if err != nil { log.WithError(err).Errorf("failed to renew the certificate") return err } clientTLSConfig, err := NewClientTLSConfig(tlsKeyPair, rootCA.Pool, CARole) if err != nil { log.WithError(err).Errorf("failed to create a new client config") return err } serverTLSConfig, err := NewServerTLSConfig(tlsKeyPair, rootCA.Pool) if err != nil { log.WithError(err).Errorf("failed to create a new server config") return err } if err = s.ClientTLSCreds.LoadNewTLSConfig(clientTLSConfig); err != nil { log.WithError(err).Errorf("failed to update the client credentials") return err } // Update the external CA to use the new client TLS // config using a copy without a serverName specified. s.externalCA.UpdateTLSConfig(&tls.Config{ Certificates: clientTLSConfig.Certificates, RootCAs: clientTLSConfig.RootCAs, MinVersion: tls.VersionTLS12, }) if err = s.ServerTLSCreds.LoadNewTLSConfig(serverTLSConfig); err != nil { log.WithError(err).Errorf("failed to update the server TLS credentials") return err } return nil }
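// Illustrative usage only: because RenewTLSConfigNow blocks until the renewal attempt completes,
// it can be called directly wherever a synchronous, one-shot renewal is wanted, while the
// RenewTLSConfig loop above schedules it in the background. The variables below (ctx,
// securityConfig, remotes) are assumed to be in scope.
if err := RenewTLSConfigNow(ctx, securityConfig, remotes); err != nil {
	log.G(ctx).WithError(err).Error("one-shot certificate renewal failed")
}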
// Run starts the keymanager; it blocks until its context is cancelled. func (k *KeyManager) Run(ctx context.Context) error { k.mu.Lock() ctx = log.WithModule(ctx, "keymanager") var ( clusters []*api.Cluster err error ) k.store.View(func(readTx store.ReadTx) { clusters, err = store.FindClusters(readTx, store.ByName(k.config.ClusterName)) }) if err != nil { log.G(ctx).Errorf("reading cluster config failed: %v", err) k.mu.Unlock() return err } cluster := clusters[0] if len(cluster.NetworkBootstrapKeys) == 0 { for _, subsys := range k.config.Subsystems { for i := 0; i < keyringSize; i++ { k.keyRing.keys = append(k.keyRing.keys, k.allocateKey(ctx, subsys)) } } if err := k.updateKey(cluster); err != nil { log.G(ctx).Errorf("store update failed: %v", err) } } else { k.keyRing.lClock = cluster.EncryptionKeyLamportClock k.keyRing.keys = cluster.NetworkBootstrapKeys k.rotateKey(ctx) } ticker := time.NewTicker(k.config.RotationInterval) defer ticker.Stop() k.ctx, k.cancel = context.WithCancel(ctx) k.mu.Unlock() for { select { case <-ticker.C: k.rotateKey(ctx) case <-k.ctx.Done(): return nil } } }
// Run runs dispatcher tasks which should be run on leader dispatcher. // Dispatcher can be stopped with cancelling ctx or calling Stop(). func (d *Dispatcher) Run(ctx context.Context) error { d.mu.Lock() if d.isRunning() { d.mu.Unlock() return errors.New("dispatcher is already running") } ctx = log.WithModule(ctx, "dispatcher") if err := d.markNodesUnknown(ctx); err != nil { log.G(ctx).Errorf(`failed to move all nodes to "unknown" state: %v`, err) } configWatcher, cancel, err := store.ViewAndWatch( d.store, func(readTx store.ReadTx) error { clusters, err := store.FindClusters(readTx, store.ByName(store.DefaultClusterName)) if err != nil { return err } if err == nil && len(clusters) == 1 { heartbeatPeriod, err := gogotypes.DurationFromProto(clusters[0].Spec.Dispatcher.HeartbeatPeriod) if err == nil && heartbeatPeriod > 0 { d.config.HeartbeatPeriod = heartbeatPeriod } if clusters[0].NetworkBootstrapKeys != nil { d.networkBootstrapKeys = clusters[0].NetworkBootstrapKeys } } return nil }, state.EventUpdateCluster{}, ) if err != nil { d.mu.Unlock() return err } // set queues here to guarantee that Close will close them d.mgrQueue = watch.NewQueue() d.keyMgrQueue = watch.NewQueue() peerWatcher, peerCancel := d.cluster.SubscribePeers() defer peerCancel() d.lastSeenManagers = getWeightedPeers(d.cluster) defer cancel() d.ctx, d.cancel = context.WithCancel(ctx) ctx = d.ctx d.wg.Add(1) defer d.wg.Done() d.mu.Unlock() publishManagers := func(peers []*api.Peer) { var mgrs []*api.WeightedPeer for _, p := range peers { mgrs = append(mgrs, &api.WeightedPeer{ Peer: p, Weight: remotes.DefaultObservationWeight, }) } d.mu.Lock() d.lastSeenManagers = mgrs d.mu.Unlock() d.mgrQueue.Publish(mgrs) } batchTimer := time.NewTimer(maxBatchInterval) defer batchTimer.Stop() for { select { case ev := <-peerWatcher: publishManagers(ev.([]*api.Peer)) case <-d.processUpdatesTrigger: d.processUpdates(ctx) batchTimer.Reset(maxBatchInterval) case <-batchTimer.C: d.processUpdates(ctx) batchTimer.Reset(maxBatchInterval) case v := <-configWatcher: cluster := v.(state.EventUpdateCluster) d.mu.Lock() if cluster.Cluster.Spec.Dispatcher.HeartbeatPeriod != nil { // ignore error, since Spec has passed validation before heartbeatPeriod, _ := gogotypes.DurationFromProto(cluster.Cluster.Spec.Dispatcher.HeartbeatPeriod) if heartbeatPeriod != d.config.HeartbeatPeriod { // only call d.nodes.updatePeriod when heartbeatPeriod changes d.config.HeartbeatPeriod = heartbeatPeriod d.nodes.updatePeriod(d.config.HeartbeatPeriod, d.config.HeartbeatEpsilon, d.config.GracePeriodMultiplier) } } d.networkBootstrapKeys = cluster.Cluster.NetworkBootstrapKeys d.mu.Unlock() d.keyMgrQueue.Publish(cluster.Cluster.NetworkBootstrapKeys) case <-ctx.Done(): return nil } } }
// Run runs the CA signer main loop. // The CA signer can be stopped with cancelling ctx or calling Stop(). func (s *Server) Run(ctx context.Context) error { s.mu.Lock() if s.isRunning() { s.mu.Unlock() return errors.New("CA signer is already running") } s.wg.Add(1) s.mu.Unlock() defer s.wg.Done() ctx = log.WithModule(ctx, "ca") // Retrieve the channels to keep track of changes in the cluster // Retrieve all the currently registered nodes var nodes []*api.Node updates, cancel, err := store.ViewAndWatch( s.store, func(readTx store.ReadTx) error { clusters, err := store.FindClusters(readTx, store.ByName(store.DefaultClusterName)) if err != nil { return err } if len(clusters) != 1 { return errors.New("could not find cluster object") } s.updateCluster(ctx, clusters[0]) nodes, err = store.FindNodes(readTx, store.All) return err }, state.EventCreateNode{}, state.EventUpdateNode{}, state.EventUpdateCluster{}, ) // Do this after updateCluster has been called, so isRunning never // returns true without joinTokens being set correctly. s.mu.Lock() s.ctx, s.cancel = context.WithCancel(ctx) s.mu.Unlock() close(s.started) if err != nil { log.G(ctx).WithFields(logrus.Fields{ "method": "(*Server).Run", }).WithError(err).Errorf("snapshot store view failed") return err } defer cancel() // We might have missed some updates if there was a leader election, // so let's pick up the slack. if err := s.reconcileNodeCertificates(ctx, nodes); err != nil { // We don't return here because that means the Run loop would // never run. Log an error instead. log.G(ctx).WithFields(logrus.Fields{ "method": "(*Server).Run", }).WithError(err).Errorf("error attempting to reconcile certificates") } ticker := time.NewTicker(s.reconciliationRetryInterval) defer ticker.Stop() // Watch for new nodes being created, new nodes being updated, and changes // to the cluster for { select { case event := <-updates: switch v := event.(type) { case state.EventCreateNode: s.evaluateAndSignNodeCert(ctx, v.Node) case state.EventUpdateNode: // If this certificate is already at a final state // no need to evaluate and sign it. if !isFinalState(v.Node.Certificate.Status) { s.evaluateAndSignNodeCert(ctx, v.Node) } case state.EventUpdateCluster: s.updateCluster(ctx, v.Cluster) } case <-ticker.C: for _, node := range s.pending { if err := s.evaluateAndSignNodeCert(ctx, node); err != nil { // If this sign operation did not succeed, the rest are // unlikely to. Yield so that we don't hammer an external CA. // Since the map iteration order is randomized, there is no // risk of getting stuck on a problematic CSR. break } } case <-ctx.Done(): return ctx.Err() case <-s.ctx.Done(): return nil } } }
func (n *Node) run(ctx context.Context) (err error) { defer func() { n.err = err close(n.closed) }() ctx, cancel := context.WithCancel(ctx) defer cancel() ctx = log.WithModule(ctx, "node") go func() { select { case <-ctx.Done(): case <-n.stopped: cancel() } }() // NOTE: When this node is created by NewNode(), our nodeID is set if // n.loadCertificates() succeeded in loading TLS credentials. if n.config.JoinAddr == "" && n.nodeID == "" { if err := n.bootstrapCA(); err != nil { return err } } if n.config.JoinAddr != "" || n.config.ForceNewCluster { n.remotes = newPersistentRemotes(filepath.Join(n.config.StateDir, stateFilename)) if n.config.JoinAddr != "" { n.remotes.Observe(api.Peer{Addr: n.config.JoinAddr}, remotes.DefaultObservationWeight) } } // Obtain new certs and setup TLS certificates renewal for this node: // - We call LoadOrCreateSecurityConfig which blocks until a valid certificate has been issued // - We retrieve the nodeID from LoadOrCreateSecurityConfig through the info channel. This allows // us to display the ID before the certificate gets issued (for potential approval). // - We wait for LoadOrCreateSecurityConfig to finish since we need a certificate to operate. // - Given a valid certificate, spin a renewal go-routine that will ensure that certificates stay // up to date. issueResponseChan := make(chan api.IssueNodeCertificateResponse, 1) go func() { select { case <-ctx.Done(): case resp := <-issueResponseChan: log.G(log.WithModule(ctx, "tls")).WithFields(logrus.Fields{ "node.id": resp.NodeID, }).Debugf("requesting certificate") n.Lock() n.nodeID = resp.NodeID n.nodeMembership = resp.NodeMembership n.Unlock() close(n.certificateRequested) } }() certDir := filepath.Join(n.config.StateDir, "certificates") securityConfig, err := ca.LoadOrCreateSecurityConfig(ctx, certDir, n.config.JoinToken, ca.ManagerRole, n.remotes, issueResponseChan) if err != nil { return err } taskDBPath := filepath.Join(n.config.StateDir, "worker/tasks.db") if err := os.MkdirAll(filepath.Dir(taskDBPath), 0777); err != nil { return err } db, err := bolt.Open(taskDBPath, 0666, nil) if err != nil { return err } defer db.Close() if err := n.loadCertificates(); err != nil { return err } forceCertRenewal := make(chan struct{}) renewCert := func() { select { case forceCertRenewal <- struct{}{}: case <-ctx.Done(): } } go func() { for { select { case <-ctx.Done(): return case node := <-n.notifyNodeChange: // If the server is sending us a ForceRenewal State, renew if node.Certificate.Status.State == api.IssuanceStateRotate { renewCert() continue } n.Lock() // If we got a role change, renew lastRole := n.role role := ca.WorkerRole if node.Spec.Role == api.NodeRoleManager { role = ca.ManagerRole } if lastRole == role { n.Unlock() continue } // switch role to agent immediately to shutdown manager early if role == ca.WorkerRole { n.role = role n.roleCond.Broadcast() } n.Unlock() renewCert() } } }() updates := ca.RenewTLSConfig(ctx, securityConfig, certDir, n.remotes, forceCertRenewal) go func() { for { select { case certUpdate := <-updates: if certUpdate.Err != nil { logrus.Warnf("error renewing TLS certificate: %v", certUpdate.Err) continue } n.Lock() n.role = certUpdate.Role n.roleCond.Broadcast() n.Unlock() case <-ctx.Done(): return } } }() role := n.role managerReady := make(chan struct{}) agentReady := make(chan struct{}) var managerErr error var agentErr error var wg sync.WaitGroup wg.Add(2) go func() { managerErr = n.runManager(ctx, securityConfig, managerReady) // store err and loop wg.Done() cancel() }() go 
func() { agentErr = n.runAgent(ctx, db, securityConfig.ClientTLSCreds, agentReady) wg.Done() cancel() }() go func() { <-agentReady if role == ca.ManagerRole { <-managerReady } close(n.ready) }() wg.Wait() if managerErr != nil && managerErr != context.Canceled { return managerErr } if agentErr != nil && agentErr != context.Canceled { return agentErr } return err }
func (a *Agent) run(ctx context.Context) { ctx, cancel := context.WithCancel(ctx) defer cancel() defer close(a.closed) // full shutdown. ctx = log.WithModule(ctx, "agent") log.G(ctx).Debugf("(*Agent).run") defer log.G(ctx).Debugf("(*Agent).run exited") var ( backoff time.Duration session = newSession(ctx, a, backoff) // start the initial session registered = session.registered ready = a.ready // first session ready sessionq chan sessionOperation ) if err := a.worker.Init(ctx); err != nil { log.G(ctx).WithError(err).Error("worker initialization failed") a.err = err return // fatal? } // setup a reliable reporter to call back to us. reporter := newStatusReporter(ctx, a) defer reporter.Close() a.worker.Listen(ctx, reporter) for { select { case operation := <-sessionq: operation.response <- operation.fn(session) case msg := <-session.tasks: if err := a.worker.Assign(ctx, msg.Tasks); err != nil { log.G(ctx).WithError(err).Error("task assignment failed") } case msg := <-session.messages: if err := a.handleSessionMessage(ctx, msg); err != nil { log.G(ctx).WithError(err).Error("session message handler failed") } case <-registered: log.G(ctx).Debugln("agent: registered") if ready != nil { close(ready) } ready = nil registered = nil // we only care about this once per session backoff = 0 // reset backoff sessionq = a.sessionq case err := <-session.errs: // TODO(stevvooe): This may actually block if a session is closed // but no error was sent. Session.close must only be called here // for this to work. if err != nil { log.G(ctx).WithError(err).Error("agent: session failed") backoff = initialSessionFailureBackoff + 2*backoff if backoff > maxSessionFailureBackoff { backoff = maxSessionFailureBackoff } } if err := session.close(); err != nil { log.G(ctx).WithError(err).Error("agent: closing session failed") } sessionq = nil // if we're here before <-registered, do nothing for that event registered = nil case <-session.closed: log.G(ctx).Debugf("agent: rebuild session") // select a session registration delay from backoff range. delay := time.Duration(rand.Int63n(int64(backoff))) session = newSession(ctx, a, delay) registered = session.registered sessionq = a.sessionq case <-a.stopped: // TODO(stevvooe): Wait on shutdown and cleanup. May need to pump // this loop a few times. return case <-ctx.Done(): if a.err == nil { a.err = ctx.Err() } session.close() return } } }
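// The session retry logic above grows the backoff as
// backoff = initialSessionFailureBackoff + 2*backoff, capped at maxSessionFailureBackoff, and then
// sleeps for a random delay drawn from that backoff before rebuilding the session. A small sketch
// of that schedule, with purely illustrative constants (the real values live elsewhere in the package):
func nextSessionDelaySketch(backoff time.Duration) (newBackoff, delay time.Duration) {
	const (
		initialSessionFailureBackoff = 100 * time.Millisecond // assumed value, for illustration
		maxSessionFailureBackoff     = 8 * time.Second        // assumed value, for illustration
	)
	newBackoff = initialSessionFailureBackoff + 2*backoff
	if newBackoff > maxSessionFailureBackoff {
		newBackoff = maxSessionFailureBackoff
	}
	// rand.Int63n requires a positive argument; newBackoff is always > 0 here.
	delay = time.Duration(rand.Int63n(int64(newBackoff)))
	return newBackoff, delay
}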
// RenewTLSConfig will continuously monitor for the necessity of renewing the local certificates, either by // issuing them locally if key-material is available, or requesting them from a remote CA. func RenewTLSConfig(ctx context.Context, s *SecurityConfig, baseCertDir string, remotes remotes.Remotes, renew <-chan struct{}) <-chan CertificateUpdate { paths := NewConfigPaths(baseCertDir) updates := make(chan CertificateUpdate) go func() { var retry time.Duration defer close(updates) for { ctx = log.WithModule(ctx, "tls") log := log.G(ctx).WithFields(logrus.Fields{ "node.id": s.ClientTLSCreds.NodeID(), "node.role": s.ClientTLSCreds.Role(), }) // Our starting default will be 5 minutes retry = 5 * time.Minute // Since the expiration of the certificate is managed remotely we should update our // retry timer on every iteration of this loop. // Retrieve the time until the certificate expires. expiresIn, err := readCertExpiration(paths.Node) if err != nil { // We failed to read the expiration, let's stick with the starting default log.Errorf("failed to read the expiration of the TLS certificate in: %s", paths.Node.Cert) updates <- CertificateUpdate{Err: fmt.Errorf("failed to read certificate expiration")} } else { // If we have an expired certificate, let's stick with the starting default in // the hope that this is a temporary clock skew. if expiresIn.Minutes() < 0 { log.Errorf("the current TLS certificate is expired") updates <- CertificateUpdate{Err: fmt.Errorf("TLS certificate is expired")} } else { // Random retry time between 50% and 80% of the total time to expiration retry = calculateRandomExpiry(expiresIn) } } log.WithFields(logrus.Fields{ "time": time.Now().Add(retry), }).Debugf("next certificate renewal scheduled") select { case <-time.After(retry): log.Infof("renewing certificate") case <-renew: log.Infof("forced certificate renewal") case <-ctx.Done(): log.Infof("shutting down certificate renewal routine") return } // Let's request new certs. Renewals don't require a token. rootCA := s.RootCA() tlsKeyPair, err := rootCA.RequestAndSaveNewCertificates(ctx, paths.Node, "", remotes, s.ClientTLSCreds, nil) if err != nil { log.WithError(err).Errorf("failed to renew the certificate") updates <- CertificateUpdate{Err: err} continue } clientTLSConfig, err := NewClientTLSConfig(tlsKeyPair, rootCA.Pool, CARole) if err != nil { log.WithError(err).Errorf("failed to create a new client config") updates <- CertificateUpdate{Err: err} } serverTLSConfig, err := NewServerTLSConfig(tlsKeyPair, rootCA.Pool) if err != nil { log.WithError(err).Errorf("failed to create a new server config") updates <- CertificateUpdate{Err: err} } err = s.ClientTLSCreds.LoadNewTLSConfig(clientTLSConfig) if err != nil { log.WithError(err).Errorf("failed to update the client credentials") updates <- CertificateUpdate{Err: err} } // Update the external CA to use the new client TLS // config using a copy without a serverName specified. s.externalCA.UpdateTLSConfig(&tls.Config{ Certificates: clientTLSConfig.Certificates, RootCAs: clientTLSConfig.RootCAs, MinVersion: tls.VersionTLS12, }) err = s.ServerTLSCreds.LoadNewTLSConfig(serverTLSConfig) if err != nil { log.WithError(err).Errorf("failed to update the server TLS credentials") updates <- CertificateUpdate{Err: err} } updates <- CertificateUpdate{Role: s.ClientTLSCreds.Role()} } }() return updates }
// LoadSecurityConfig loads TLS credentials from disk, or returns an error if // these credentials do not exist or are unusable. func LoadSecurityConfig(ctx context.Context, rootCA RootCA, krw *KeyReadWriter) (*SecurityConfig, error) { ctx = log.WithModule(ctx, "tls") // At this point we've successfully loaded the CA details from disk, or // successfully downloaded them remotely. The next step is to try to // load our certificates. // Read both the Cert and Key from disk cert, key, err := krw.Read() if err != nil { return nil, err } // Create an x509 certificate out of the contents on disk certBlock, _ := pem.Decode([]byte(cert)) if certBlock == nil { return nil, errors.New("failed to parse certificate PEM") } // Create an X509Cert so we can .Verify() X509Cert, err := x509.ParseCertificate(certBlock.Bytes) if err != nil { return nil, err } // Include our root pool opts := x509.VerifyOptions{ Roots: rootCA.Pool, } // Check to see if this certificate was signed by our CA, and isn't expired if _, err := X509Cert.Verify(opts); err != nil { return nil, err } // Now that we know this certificate is valid, create a TLS Certificate for our // credentials keyPair, err := tls.X509KeyPair(cert, key) if err != nil { return nil, err } // Load the Certificates as server credentials serverTLSCreds, err := rootCA.NewServerTLSCredentials(&keyPair) if err != nil { return nil, err } // Load the Certificates also as client credentials. // Both workers and managers always connect to remote managers, // so ServerName is always set to ManagerRole here. clientTLSCreds, err := rootCA.NewClientTLSCredentials(&keyPair, ManagerRole) if err != nil { return nil, err } log.G(ctx).WithFields(logrus.Fields{ "node.id": clientTLSCreds.NodeID(), "node.role": clientTLSCreds.Role(), }).Debug("loaded node credentials") return NewSecurityConfig(&rootCA, krw, clientTLSCreds, serverTLSCreds), nil }
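// Illustrative usage only, stitched together from helpers that appear elsewhere in this section
// (NewConfigPaths, GetLocalRootCA, NewKeyReadWriter): load the root CA and key material from disk,
// then build the SecurityConfig from the stored credentials. The nil arguments to NewKeyReadWriter
// stand in for "no unlock key" and "no extra key headers" and are assumptions of this sketch.
func loadSecurityConfigExample(ctx context.Context, baseCertDir string) (*SecurityConfig, error) {
	paths := NewConfigPaths(baseCertDir)
	rootCA, err := GetLocalRootCA(paths.RootCA)
	if err != nil {
		return nil, err
	}
	krw := NewKeyReadWriter(paths.Node, nil, nil)
	return LoadSecurityConfig(ctx, rootCA, krw)
}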
// Run runs dispatcher tasks which should be run on leader dispatcher. // Dispatcher can be stopped with cancelling ctx or calling Stop(). func (d *Dispatcher) Run(ctx context.Context) error { d.mu.Lock() if d.isRunning() { d.mu.Unlock() return fmt.Errorf("dispatcher is already running") } ctx = log.WithModule(ctx, "dispatcher") if err := d.markNodesUnknown(ctx); err != nil { log.G(ctx).Errorf(`failed to move all nodes to "unknown" state: %v`, err) } configWatcher, cancel, err := store.ViewAndWatch( d.store, func(readTx store.ReadTx) error { clusters, err := store.FindClusters(readTx, store.ByName(store.DefaultClusterName)) if err != nil { return err } if err == nil && len(clusters) == 1 { heartbeatPeriod, err := ptypes.Duration(clusters[0].Spec.Dispatcher.HeartbeatPeriod) if err == nil && heartbeatPeriod > 0 { d.config.HeartbeatPeriod = heartbeatPeriod } if clusters[0].NetworkBootstrapKeys != nil { d.networkBootstrapKeys = clusters[0].NetworkBootstrapKeys } } return nil }, state.EventUpdateCluster{}, ) if err != nil { d.mu.Unlock() return err } defer cancel() d.ctx, d.cancel = context.WithCancel(ctx) d.mu.Unlock() publishManagers := func() { mgrs := getWeightedPeers(d.cluster) sort.Sort(weightedPeerByNodeID(mgrs)) d.mu.Lock() if reflect.DeepEqual(mgrs, d.lastSeenManagers) { d.mu.Unlock() return } d.lastSeenManagers = mgrs d.mu.Unlock() d.mgrQueue.Publish(mgrs) } publishManagers() publishTicker := time.NewTicker(1 * time.Second) defer publishTicker.Stop() batchTimer := time.NewTimer(maxBatchInterval) defer batchTimer.Stop() for { select { case <-publishTicker.C: publishManagers() case <-d.processUpdatesTrigger: d.processUpdates() batchTimer.Reset(maxBatchInterval) case <-batchTimer.C: d.processUpdates() batchTimer.Reset(maxBatchInterval) case v := <-configWatcher: cluster := v.(state.EventUpdateCluster) d.mu.Lock() if cluster.Cluster.Spec.Dispatcher.HeartbeatPeriod != nil { // ignore error, since Spec has passed validation before heartbeatPeriod, _ := ptypes.Duration(cluster.Cluster.Spec.Dispatcher.HeartbeatPeriod) if heartbeatPeriod != d.config.HeartbeatPeriod { // only call d.nodes.updatePeriod when heartbeatPeriod changes d.config.HeartbeatPeriod = heartbeatPeriod d.nodes.updatePeriod(d.config.HeartbeatPeriod, d.config.HeartbeatEpsilon, d.config.GracePeriodMultiplier) } } d.networkBootstrapKeys = cluster.Cluster.NetworkBootstrapKeys d.mu.Unlock() d.keyMgrQueue.Publish(struct{}{}) case <-d.ctx.Done(): return nil } } }
func (n *Node) loadSecurityConfig(ctx context.Context) (*ca.SecurityConfig, error) { paths := ca.NewConfigPaths(filepath.Join(n.config.StateDir, certDirectory)) var securityConfig *ca.SecurityConfig krw := ca.NewKeyReadWriter(paths.Node, n.unlockKey, &manager.RaftDEKData{}) if err := krw.Migrate(); err != nil { return nil, err } // Check if we already have valid certificates on disk. rootCA, err := ca.GetLocalRootCA(paths.RootCA) if err != nil && err != ca.ErrNoLocalRootCA { return nil, err } if err == nil { clientTLSCreds, serverTLSCreds, err := ca.LoadTLSCreds(rootCA, krw) _, ok := errors.Cause(err).(ca.ErrInvalidKEK) switch { case err == nil: securityConfig = ca.NewSecurityConfig(&rootCA, krw, clientTLSCreds, serverTLSCreds) log.G(ctx).Debug("loaded CA and TLS certificates") case ok: return nil, ErrInvalidUnlockKey case os.IsNotExist(err): break default: return nil, errors.Wrapf(err, "error while loading TLS certificate in %s", paths.Node.Cert) } } if securityConfig == nil { if n.config.JoinAddr == "" { // if we're not joining a cluster, bootstrap a new one - and we have to set the unlock key n.unlockKey = nil if n.config.AutoLockManagers { n.unlockKey = encryption.GenerateSecretKey() } krw = ca.NewKeyReadWriter(paths.Node, n.unlockKey, &manager.RaftDEKData{}) rootCA, err = ca.CreateRootCA(ca.DefaultRootCN, paths.RootCA) if err != nil { return nil, err } log.G(ctx).Debug("generated CA key and certificate") } else if err == ca.ErrNoLocalRootCA { // from previous error loading the root CA from disk rootCA, err = ca.DownloadRootCA(ctx, paths.RootCA, n.config.JoinToken, n.remotes) if err != nil { return nil, err } log.G(ctx).Debug("downloaded CA certificate") } // Obtain new certs and setup TLS certificates renewal for this node: // - We call LoadOrCreateSecurityConfig which blocks until a valid certificate has been issued // - We retrieve the nodeID from LoadOrCreateSecurityConfig through the info channel. This allows // us to display the ID before the certificate gets issued (for potential approval). // - We wait for LoadOrCreateSecurityConfig to finish since we need a certificate to operate. // - Given a valid certificate, spin a renewal go-routine that will ensure that certificates stay // up to date. issueResponseChan := make(chan api.IssueNodeCertificateResponse, 1) go func() { select { case <-ctx.Done(): case resp := <-issueResponseChan: log.G(log.WithModule(ctx, "tls")).WithFields(logrus.Fields{ "node.id": resp.NodeID, }).Debugf("loaded TLS certificate") n.Lock() n.nodeID = resp.NodeID n.nodeMembership = resp.NodeMembership n.Unlock() close(n.certificateRequested) } }() // LoadOrCreateSecurityConfig is the point at which a new node joining a cluster will retrieve TLS // certificates and write them to disk securityConfig, err = ca.LoadOrCreateSecurityConfig( ctx, rootCA, n.config.JoinToken, ca.ManagerRole, n.remotes, issueResponseChan, krw) if err != nil { if _, ok := errors.Cause(err).(ca.ErrInvalidKEK); ok { return nil, ErrInvalidUnlockKey } return nil, err } } n.Lock() n.role = securityConfig.ClientTLSCreds.Role() n.nodeID = securityConfig.ClientTLSCreds.NodeID() n.nodeMembership = api.NodeMembershipAccepted n.roleCond.Broadcast() n.Unlock() return securityConfig, nil }
func (a *Agent) run(ctx context.Context) { ctx, cancel := context.WithCancel(ctx) defer cancel() defer close(a.closed) // full shutdown. ctx = log.WithModule(ctx, "agent") log.G(ctx).Debugf("(*Agent).run") defer log.G(ctx).Debugf("(*Agent).run exited") // get the node description nodeDescription, err := a.nodeDescriptionWithHostname(ctx) if err != nil { log.G(ctx).WithError(err).WithField("agent", a.config.Executor).Errorf("agent: node description unavailable") } // nodeUpdateTicker is used to periodically check for updates to node description nodeUpdateTicker := time.NewTicker(nodeUpdatePeriod) defer nodeUpdateTicker.Stop() var ( backoff time.Duration session = newSession(ctx, a, backoff, "", nodeDescription) // start the initial session registered = session.registered ready = a.ready // first session ready sessionq chan sessionOperation leaving = a.leaving subscriptions = map[string]context.CancelFunc{} ) if err := a.worker.Init(ctx); err != nil { log.G(ctx).WithError(err).Error("worker initialization failed") a.err = err return // fatal? } defer a.worker.Close() // setup a reliable reporter to call back to us. reporter := newStatusReporter(ctx, a) defer reporter.Close() a.worker.Listen(ctx, reporter) for { select { case operation := <-sessionq: operation.response <- operation.fn(session) case <-leaving: leaving = nil // TODO(stevvooe): Signal to the manager that the node is leaving. // when leaving we remove all assignments. if err := a.worker.Assign(ctx, nil); err != nil { log.G(ctx).WithError(err).Error("failed removing all assignments") } case msg := <-session.assignments: // if we have left, accept no more assignments if leaving == nil { continue } switch msg.Type { case api.AssignmentsMessage_COMPLETE: // Need to assign secrets before tasks, because tasks might depend on new secrets if err := a.worker.Assign(ctx, msg.Changes); err != nil { log.G(ctx).WithError(err).Error("failed to synchronize worker assignments") } case api.AssignmentsMessage_INCREMENTAL: if err := a.worker.Update(ctx, msg.Changes); err != nil { log.G(ctx).WithError(err).Error("failed to update worker assignments") } } case msg := <-session.messages: if err := a.handleSessionMessage(ctx, msg); err != nil { log.G(ctx).WithError(err).Error("session message handler failed") } case sub := <-session.subscriptions: if sub.Close { if cancel, ok := subscriptions[sub.ID]; ok { cancel() } delete(subscriptions, sub.ID) continue } if _, ok := subscriptions[sub.ID]; ok { // Duplicate subscription continue } subCtx, subCancel := context.WithCancel(ctx) subscriptions[sub.ID] = subCancel go a.worker.Subscribe(subCtx, sub) case <-registered: log.G(ctx).Debugln("agent: registered") if ready != nil { close(ready) } ready = nil registered = nil // we only care about this once per session backoff = 0 // reset backoff sessionq = a.sessionq case err := <-session.errs: // TODO(stevvooe): This may actually block if a session is closed // but no error was sent. Session.close must only be called here // for this to work. 
if err != nil { log.G(ctx).WithError(err).Error("agent: session failed") backoff = initialSessionFailureBackoff + 2*backoff if backoff > maxSessionFailureBackoff { backoff = maxSessionFailureBackoff } } if err := session.close(); err != nil { log.G(ctx).WithError(err).Error("agent: closing session failed") } sessionq = nil // if we're here before <-registered, do nothing for that event registered = nil case <-session.closed: log.G(ctx).Debugf("agent: rebuild session") // select a session registration delay from backoff range. delay := time.Duration(0) if backoff > 0 { delay = time.Duration(rand.Int63n(int64(backoff))) } session = newSession(ctx, a, delay, session.sessionID, nodeDescription) registered = session.registered case <-nodeUpdateTicker.C: // skip this case if the registration isn't finished if registered != nil { continue } // get the current node description newNodeDescription, err := a.nodeDescriptionWithHostname(ctx) if err != nil { log.G(ctx).WithError(err).WithField("agent", a.config.Executor).Errorf("agent: updated node description unavailable") } // if newNodeDescription is nil, it will cause a panic when // trying to create a session. Typically this can happen // if the engine goes down if newNodeDescription == nil { continue } // if the node description has changed, update it to the new one // and close the session. The old session will be stopped and a // new one will be created with the updated description if !reflect.DeepEqual(nodeDescription, newNodeDescription) { nodeDescription = newNodeDescription // close the session log.G(ctx).Info("agent: found node update") session.sendError(nil) } case <-a.stopped: // TODO(stevvooe): Wait on shutdown and cleanup. May need to pump // this loop a few times. return case <-ctx.Done(): if a.err == nil { a.err = ctx.Err() } session.close() return } } }
func (tm *taskManager) run(ctx context.Context) { ctx, cancelAll := context.WithCancel(ctx) defer cancelAll() // cancel all child operations on exit. ctx = log.WithModule(ctx, "taskmanager") var ( opctx context.Context cancel context.CancelFunc run = make(chan struct{}, 1) statusq = make(chan *api.TaskStatus) errs = make(chan error) shutdown = tm.shutdown updated bool // true if the task was updated. ) defer func() { // closure picks up current value of cancel. if cancel != nil { cancel() } }() run <- struct{}{} // prime the pump for { select { case <-run: // always check for shutdown before running. select { case <-tm.shutdown: continue // ignore run request and handle shutdown case <-tm.closed: continue default: } opctx, cancel = context.WithCancel(ctx) // Several variables need to be snapshotted for the closure below. opcancel := cancel // fork for the closure running := tm.task.Copy() // clone the task before dispatch statusqLocal := statusq updatedLocal := updated // capture state of update for goroutine updated = false go runctx(ctx, tm.closed, errs, func(ctx context.Context) error { defer opcancel() if updatedLocal { // before we do anything, update the task for the controller. // always update the controller before running. if err := tm.ctlr.Update(opctx, running); err != nil { log.G(ctx).WithError(err).Error("updating task controller failed") return err } } status, err := exec.Do(opctx, running, tm.ctlr) if status != nil { // always report the status if we get one back. This // returns to the manager loop, then reports the status // upstream. select { case statusqLocal <- status: case <-ctx.Done(): // not opctx, since that may have been cancelled. } if err := tm.reporter.UpdateTaskStatus(ctx, running.ID, status); err != nil { log.G(ctx).WithError(err).Error("failed reporting status to agent") } } return err }) case err := <-errs: // This branch is always executed when an operation completes. The // goal is to decide whether or not we re-dispatch the operation. cancel = nil select { case <-tm.shutdown: shutdown = tm.shutdown // re-enable the shutdown branch continue // no dispatch if we are in shutdown. default: } switch err { case exec.ErrTaskNoop: if !updated { continue // wait till getting pumped via update. } case exec.ErrTaskRetry: // TODO(stevvooe): Add exponential backoff with random jitter // here. For now, this backoff is enough to keep the task // manager from running away with the CPU. time.AfterFunc(time.Second, func() { errs <- nil // repump this branch, with no err }) continue case nil, context.Canceled, context.DeadlineExceeded: // no log in this case default: log.G(ctx).WithError(err).Error("task operation failed") } select { case run <- struct{}{}: default: } case status := <-statusq: tm.task.Status = *status case task := <-tm.updateq: if equality.TasksEqualStable(task, tm.task) { continue // ignore the update } if task.ID != tm.task.ID { log.G(ctx).WithField("task.update.id", task.ID).Error("received update for incorrect task") continue } if task.DesiredState < tm.task.DesiredState { log.G(ctx).WithField("task.update.desiredstate", task.DesiredState). Error("ignoring task update with invalid desired state") continue } task = task.Copy() task.Status = tm.task.Status // overwrite our status, as it is canonical. tm.task = task updated = true // we have accepted the task update if cancel != nil { cancel() // cancel outstanding if necessary. } else { // If this channel op fails, it means there is already a // message on the run queue. 
select { case run <- struct{}{}: default: } } case <-shutdown: if cancel != nil { // cancel outstanding operation. cancel() // subtle: after a cancellation, we want to avoid busy wait // here. this gets re-enabled in the errs branch and we'll come // back around and try shutdown again. shutdown = nil // turn off this branch until op proceeds continue // wait until operation actually exits. } // TODO(stevvooe): This should be left for the reaper. // make an attempt at removing. this is best effort. any errors will be // retried by the reaper later. if err := tm.ctlr.Remove(ctx); err != nil { log.G(ctx).WithError(err).WithField("task.id", tm.task.ID).Error("remove task failed") } if err := tm.ctlr.Close(); err != nil { log.G(ctx).WithError(err).Error("error closing controller") } // disable everything, and prepare for closing. statusq = nil errs = nil shutdown = nil close(tm.closed) case <-tm.closed: return case <-ctx.Done(): return } } }
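// runctx is not shown above. Judging from its call site, it runs the provided function in a
// goroutine and delivers the resulting error on errs unless the task manager is closed or the
// context is cancelled first. A hypothetical sketch of such a helper:
func runctxSketch(ctx context.Context, closed chan struct{}, errs chan error, fn func(ctx context.Context) error) {
	go func() {
		select {
		case errs <- fn(ctx): // fn runs first; its error is then delivered if anyone still cares
		case <-closed:
		case <-ctx.Done():
		}
	}()
}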
// LoadOrCreateSecurityConfig encapsulates the security logic behind joining a cluster. // Every node requires at least a set of TLS certificates with which to join the cluster. // In the case of a manager, these certificates will be used both for client and server credentials. func LoadOrCreateSecurityConfig(ctx context.Context, baseCertDir, token, proposedRole string, remotes remotes.Remotes, nodeInfo chan<- api.IssueNodeCertificateResponse) (*SecurityConfig, error) { ctx = log.WithModule(ctx, "tls") paths := NewConfigPaths(baseCertDir) var ( rootCA RootCA serverTLSCreds, clientTLSCreds *MutableTLSCreds err error ) // Check if we already have a CA certificate on disk. We need a CA to have a valid SecurityConfig rootCA, err = GetLocalRootCA(baseCertDir) switch err { case nil: log.G(ctx).Debug("loaded CA certificate") case ErrNoLocalRootCA: log.G(ctx).WithError(err).Debugf("failed to load local CA certificate") // Get a digest for the optional CA hash string that we've been provided // If we were provided a non-empty string, and it is an invalid hash, return // otherwise, allow the invalid digest through. var d digest.Digest if token != "" { d, err = getCAHashFromToken(token) if err != nil { return nil, err } } // Get the remote CA certificate, verify integrity with the // hash provided. Retry up to 5 times, in case the manager we // first try to contact is not responding properly (it may have // just been demoted, for example). for i := 0; i != 5; i++ { rootCA, err = GetRemoteCA(ctx, d, remotes) if err == nil { break } log.G(ctx).WithError(err).Errorf("failed to retrieve remote root CA certificate") } if err != nil { return nil, err } // Save root CA certificate to disk if err = saveRootCA(rootCA, paths.RootCA); err != nil { return nil, err } log.G(ctx).Debugf("retrieved remote CA certificate: %s", paths.RootCA.Cert) default: return nil, err } // At this point we've successfully loaded the CA details from disk, or // successfully downloaded them remotely. The next step is to try to // load our certificates. clientTLSCreds, serverTLSCreds, err = LoadTLSCreds(rootCA, paths.Node) if err != nil { log.G(ctx).WithError(err).Debugf("no node credentials found in: %s", paths.Node.Cert) var ( tlsKeyPair *tls.Certificate err error ) if rootCA.CanSign() { // Create a new random ID for this certificate cn := identity.NewID() org := identity.NewID() if nodeInfo != nil { nodeInfo <- api.IssueNodeCertificateResponse{ NodeID: cn, NodeMembership: api.NodeMembershipAccepted, } } tlsKeyPair, err = rootCA.IssueAndSaveNewCertificates(paths.Node, cn, proposedRole, org) if err != nil { log.G(ctx).WithFields(logrus.Fields{ "node.id": cn, "node.role": proposedRole, }).WithError(err).Errorf("failed to issue and save new certificate") return nil, err } log.G(ctx).WithFields(logrus.Fields{ "node.id": cn, "node.role": proposedRole, }).Debug("issued new TLS certificate") } else { // There was an error loading our credentials, let's get a new certificate issued // Last argument is nil because at this point we don't have any valid TLS creds tlsKeyPair, err = rootCA.RequestAndSaveNewCertificates(ctx, paths.Node, token, remotes, nil, nodeInfo) if err != nil { log.G(ctx).WithError(err).Error("failed to request and save new certificate") return nil, err } } // Create the Server TLS Credentials for this node. These will not be used by workers. 
serverTLSCreds, err = rootCA.NewServerTLSCredentials(tlsKeyPair) if err != nil { return nil, err } // Create a TLSConfig to be used when this node connects as a client to another remote node. // We're using ManagerRole as remote serverName for TLS host verification clientTLSCreds, err = rootCA.NewClientTLSCredentials(tlsKeyPair, ManagerRole) if err != nil { return nil, err } log.G(ctx).WithFields(logrus.Fields{ "node.id": clientTLSCreds.NodeID(), "node.role": clientTLSCreds.Role(), }).Debugf("new node credentials generated: %s", paths.Node.Cert) } else { if nodeInfo != nil { nodeInfo <- api.IssueNodeCertificateResponse{ NodeID: clientTLSCreds.NodeID(), NodeMembership: api.NodeMembershipAccepted, } } log.G(ctx).WithFields(logrus.Fields{ "node.id": clientTLSCreds.NodeID(), "node.role": clientTLSCreds.Role(), }).Debug("loaded node credentials") } return NewSecurityConfig(&rootCA, clientTLSCreds, serverTLSCreds), nil }
func (n *Node) run(ctx context.Context) (err error) { defer func() { n.err = err close(n.closed) }() ctx, cancel := context.WithCancel(ctx) defer cancel() ctx = log.WithModule(ctx, "node") go func() { select { case <-ctx.Done(): case <-n.stopped: cancel() } }() securityConfig, err := n.loadSecurityConfig(ctx) if err != nil { return err } taskDBPath := filepath.Join(n.config.StateDir, "worker/tasks.db") if err := os.MkdirAll(filepath.Dir(taskDBPath), 0777); err != nil { return err } db, err := bolt.Open(taskDBPath, 0666, nil) if err != nil { return err } defer db.Close() forceCertRenewal := make(chan struct{}) renewCert := func() { select { case forceCertRenewal <- struct{}{}: case <-ctx.Done(): } } go func() { for { select { case <-ctx.Done(): return case node := <-n.notifyNodeChange: // If the server is sending us a ForceRenewal State, renew if node.Certificate.Status.State == api.IssuanceStateRotate { renewCert() continue } n.Lock() // If we got a role change, renew lastRole := n.role role := ca.WorkerRole if node.Spec.Role == api.NodeRoleManager { role = ca.ManagerRole } if lastRole == role { n.Unlock() continue } // switch role to agent immediately to shutdown manager early if role == ca.WorkerRole { n.role = role n.roleCond.Broadcast() } n.Unlock() renewCert() } } }() updates := ca.RenewTLSConfig(ctx, securityConfig, n.remotes, forceCertRenewal) go func() { for { select { case certUpdate := <-updates: if certUpdate.Err != nil { logrus.Warnf("error renewing TLS certificate: %v", certUpdate.Err) continue } n.Lock() n.role = certUpdate.Role n.roleCond.Broadcast() n.Unlock() case <-ctx.Done(): return } } }() role := n.role managerReady := make(chan struct{}) agentReady := make(chan struct{}) var managerErr error var agentErr error var wg sync.WaitGroup wg.Add(2) go func() { managerErr = n.superviseManager(ctx, securityConfig, managerReady) // store err and loop wg.Done() }() go func() { agentErr = n.runAgent(ctx, db, securityConfig.ClientTLSCreds, agentReady) wg.Done() }() go func() { <-agentReady if role == ca.ManagerRole { <-managerReady } close(n.ready) }() wg.Wait() if managerErr != nil && managerErr != context.Canceled { return managerErr } if agentErr != nil && agentErr != context.Canceled { return agentErr } return err }
// LoadOrCreateSecurityConfig encapsulates the security logic behind joining a cluster. // Every node requires at least a set of TLS certificates with which to join the cluster. // In the case of a manager, these certificates will be used both for client and server credentials. func LoadOrCreateSecurityConfig(ctx context.Context, rootCA RootCA, token, proposedRole string, remotes remotes.Remotes, nodeInfo chan<- api.IssueNodeCertificateResponse, krw *KeyReadWriter) (*SecurityConfig, error) { ctx = log.WithModule(ctx, "tls") // At this point we've successfully loaded the CA details from disk, or // successfully downloaded them remotely. The next step is to try to // load our certificates. clientTLSCreds, serverTLSCreds, err := LoadTLSCreds(rootCA, krw) if err != nil { if _, ok := errors.Cause(err).(ErrInvalidKEK); ok { return nil, err } log.G(ctx).WithError(err).Debugf("no node credentials found in: %s", krw.Target()) var ( tlsKeyPair *tls.Certificate err error ) if rootCA.CanSign() { // Create a new random ID for this certificate cn := identity.NewID() org := identity.NewID() if nodeInfo != nil { nodeInfo <- api.IssueNodeCertificateResponse{ NodeID: cn, NodeMembership: api.NodeMembershipAccepted, } } tlsKeyPair, err = rootCA.IssueAndSaveNewCertificates(krw, cn, proposedRole, org) if err != nil { log.G(ctx).WithFields(logrus.Fields{ "node.id": cn, "node.role": proposedRole, }).WithError(err).Errorf("failed to issue and save new certificate") return nil, err } log.G(ctx).WithFields(logrus.Fields{ "node.id": cn, "node.role": proposedRole, }).Debug("issued new TLS certificate") } else { // There was an error loading our credentials, let's get a new certificate issued // Last argument is nil because at this point we don't have any valid TLS creds tlsKeyPair, err = rootCA.RequestAndSaveNewCertificates(ctx, krw, token, remotes, nil, nodeInfo) if err != nil { log.G(ctx).WithError(err).Error("failed to request and save new certificate") return nil, err } } // Create the Server TLS Credentials for this node. These will not be used by workers. serverTLSCreds, err = rootCA.NewServerTLSCredentials(tlsKeyPair) if err != nil { return nil, err } // Create a TLSConfig to be used when this node connects as a client to another remote node. // We're using ManagerRole as remote serverName for TLS host verification clientTLSCreds, err = rootCA.NewClientTLSCredentials(tlsKeyPair, ManagerRole) if err != nil { return nil, err } log.G(ctx).WithFields(logrus.Fields{ "node.id": clientTLSCreds.NodeID(), "node.role": clientTLSCreds.Role(), }).Debugf("new node credentials generated: %s", krw.Target()) } else { if nodeInfo != nil { nodeInfo <- api.IssueNodeCertificateResponse{ NodeID: clientTLSCreds.NodeID(), NodeMembership: api.NodeMembershipAccepted, } } log.G(ctx).WithFields(logrus.Fields{ "node.id": clientTLSCreds.NodeID(), "node.role": clientTLSCreds.Role(), }).Debug("loaded node credentials") } return NewSecurityConfig(&rootCA, krw, clientTLSCreds, serverTLSCreds), nil }
// RenewTLSConfig will continuously monitor for the necessity of renewing the local certificates, either by // issuing them locally if key-material is available, or requesting them from a remote CA. func RenewTLSConfig(ctx context.Context, s *SecurityConfig, connBroker *connectionbroker.Broker, renew <-chan struct{}) <-chan CertificateUpdate { updates := make(chan CertificateUpdate) go func() { var retry time.Duration expBackoff := events.NewExponentialBackoff(RenewTLSExponentialBackoff) defer close(updates) for { ctx = log.WithModule(ctx, "tls") log := log.G(ctx).WithFields(logrus.Fields{ "node.id": s.ClientTLSCreds.NodeID(), "node.role": s.ClientTLSCreds.Role(), }) // Our starting default will be 5 minutes retry = 5 * time.Minute // Since the expiration of the certificate is managed remotely we should update our // retry timer on every iteration of this loop. // Retrieve the current certificate expiration information. validFrom, validUntil, err := readCertValidity(s.KeyReader()) if err != nil { // We failed to read the expiration, let's stick with the starting default log.Errorf("failed to read the expiration of the TLS certificate in: %s", s.KeyReader().Target()) select { case updates <- CertificateUpdate{Err: errors.New("failed to read certificate expiration")}: case <-ctx.Done(): log.Info("shutting down certificate renewal routine") return } } else { // If we have an expired certificate, try to renew immediately: the hope is that this is a temporary clock skew, or that // we can issue our own TLS certs. if validUntil.Before(time.Now()) { log.Warn("the current TLS certificate is expired, so an attempt to renew it will be made immediately") // retry immediately(ish) with exponential backoff retry = expBackoff.Proceed(nil) } else { // Random retry time between 50% and 80% of the total time to expiration retry = calculateRandomExpiry(validFrom, validUntil) } } log.WithFields(logrus.Fields{ "time": time.Now().Add(retry), }).Debugf("next certificate renewal scheduled for %v from now", retry) select { case <-time.After(retry): log.Info("renewing certificate") case <-renew: log.Info("forced certificate renewal") case <-ctx.Done(): log.Info("shutting down certificate renewal routine") return } // ignore errors - it will just try again later var certUpdate CertificateUpdate if err := RenewTLSConfigNow(ctx, s, connBroker); err != nil { certUpdate.Err = err expBackoff.Failure(nil, nil) } else { certUpdate.Role = s.ClientTLSCreds.Role() expBackoff = events.NewExponentialBackoff(RenewTLSExponentialBackoff) } select { case updates <- certUpdate: case <-ctx.Done(): log.Info("shutting down certificate renewal routine") return } } }() return updates }
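// This version replaces the fixed retry-on-expiry with an exponential backoff from
// github.com/docker/go-events. As used above, the contract appears to be: Proceed returns how long
// to wait before the next attempt, Failure records a failed attempt so later delays grow, and
// constructing a fresh backoff resets the schedule after a success. The sketch below only reuses
// the calls that already appear in the loop and is for illustration, not part of the original code.
func renewalBackoffExample() time.Duration {
	expBackoff := events.NewExponentialBackoff(RenewTLSExponentialBackoff)
	expBackoff.Failure(nil, nil)   // record a failed renewal attempt
	return expBackoff.Proceed(nil) // how long to wait before the next attempt
}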