Example #1
// RenewTLSConfig will continuously monitor for the necessity of renewing the local certificates, either by
// issuing them locally if key-material is available, or requesting them from a remote CA.
func RenewTLSConfig(ctx context.Context, s *SecurityConfig, remotes remotes.Remotes, renew <-chan struct{}) <-chan CertificateUpdate {
	updates := make(chan CertificateUpdate)

	go func() {
		var retry time.Duration
		defer close(updates)
		for {
			ctx = log.WithModule(ctx, "tls")
			log := log.G(ctx).WithFields(logrus.Fields{
				"node.id":   s.ClientTLSCreds.NodeID(),
				"node.role": s.ClientTLSCreds.Role(),
			})
			// Our starting default will be 5 minutes
			retry = 5 * time.Minute

			// Since the expiration of the certificate is managed remotely we should update our
			// retry timer on every iteration of this loop.
			// Retrieve the current certificate expiration information.
			validFrom, validUntil, err := readCertValidity(s.KeyReader())
			if err != nil {
				// We failed to read the expiration, let's stick with the starting default
				log.Errorf("failed to read the expiration of the TLS certificate in: %s", s.KeyReader().Target())
				updates <- CertificateUpdate{Err: errors.New("failed to read certificate expiration")}
			} else {
				// If the certificate is expired, let's stick with the starting default in
				// the hope that this is a temporary clock skew.
				if validUntil.Before(time.Now()) {
					log.Errorf("TLS certificate is expired")
					updates <- CertificateUpdate{Err: errors.New("TLS certificate is expired")}
				} else {
					// Random retry time between 50% and 80% of the total time to expiration
					retry = calculateRandomExpiry(validFrom, validUntil)
				}
			}

			log.WithFields(logrus.Fields{
				"time": time.Now().Add(retry),
			}).Debugf("next certificate renewal scheduled")

			select {
			case <-time.After(retry):
				log.Infof("renewing certificate")
			case <-renew:
				log.Infof("forced certificate renewal")
			case <-ctx.Done():
				log.Infof("shuting down certificate renewal routine")
				return
			}

			// ignore errors - it will just try again later
			if err := RenewTLSConfigNow(ctx, s, remotes); err != nil {
				updates <- CertificateUpdate{Err: err}
			} else {
				updates <- CertificateUpdate{Role: s.ClientTLSCreds.Role()}
			}
		}
	}()

	return updates
}
Example #2
// Init prepares the worker for assignments.
func (w *worker) Init(ctx context.Context) error {
	w.mu.Lock()
	defer w.mu.Unlock()

	ctx = log.WithModule(ctx, "worker")

	// TODO(stevvooe): Start task cleanup process.

	// read the tasks from the database and start any task managers that may be needed.
	return w.db.Update(func(tx *bolt.Tx) error {
		return WalkTasks(tx, func(task *api.Task) error {
			if !TaskAssigned(tx, task.ID) {
				// NOTE(stevvooe): If tasks can survive worker restart, we need
				// to startup the controller and ensure they are removed. For
				// now, we can simply remove them from the database.
				if err := DeleteTask(tx, task.ID); err != nil {
					log.G(ctx).WithError(err).Errorf("error removing task %v", task.ID)
				}
				return nil
			}

			status, err := GetTaskStatus(tx, task.ID)
			if err != nil {
				log.G(ctx).WithError(err).Error("unable to read tasks status")
				return nil
			}

			task.Status = *status // merges the status into the task, ensuring we start at the right point.
			return w.startTask(ctx, tx, task)
		})
	})
}
Example #3
// CreateSecurityConfig creates a new key and cert for this node, either locally
// or via a remote CA.
func (rootCA RootCA) CreateSecurityConfig(ctx context.Context, krw *KeyReadWriter, config CertificateRequestConfig) (*SecurityConfig, error) {
	ctx = log.WithModule(ctx, "tls")

	var (
		tlsKeyPair *tls.Certificate
		err        error
	)

	if rootCA.CanSign() {
		// Create a new random ID for this certificate
		cn := identity.NewID()
		org := identity.NewID()

		proposedRole := ManagerRole
		tlsKeyPair, err = rootCA.IssueAndSaveNewCertificates(krw, cn, proposedRole, org)
		if err != nil {
			log.G(ctx).WithFields(logrus.Fields{
				"node.id":   cn,
				"node.role": proposedRole,
			}).WithError(err).Errorf("failed to issue and save new certificate")
			return nil, err
		}

		log.G(ctx).WithFields(logrus.Fields{
			"node.id":   cn,
			"node.role": proposedRole,
		}).Debug("issued new TLS certificate")
	} else {
		// Request certificate issuance from a remote CA; at this point
		// we don't have any valid TLS creds to authenticate with.
		tlsKeyPair, err = rootCA.RequestAndSaveNewCertificates(ctx, krw, config)
		if err != nil {
			log.G(ctx).WithError(err).Error("failed to request save new certificate")
			return nil, err
		}
	}
	// Create the Server TLS Credentials for this node. These will not be used by workers.
	serverTLSCreds, err := rootCA.NewServerTLSCredentials(tlsKeyPair)
	if err != nil {
		return nil, err
	}

	// Create a TLSConfig to be used when this node connects as a client to another remote node.
	// We're using ManagerRole as remote serverName for TLS host verification
	clientTLSCreds, err := rootCA.NewClientTLSCredentials(tlsKeyPair, ManagerRole)
	if err != nil {
		return nil, err
	}
	log.G(ctx).WithFields(logrus.Fields{
		"node.id":   clientTLSCreds.NodeID(),
		"node.role": clientTLSCreds.Role(),
	}).Debugf("new node credentials generated: %s", krw.Target())

	return NewSecurityConfig(&rootCA, krw, clientTLSCreds, serverTLSCreds), nil
}
Example #4
func (tm *taskManager) Logs(ctx context.Context, options api.LogSubscriptionOptions, publisher exec.LogPublisher) {
	ctx = log.WithModule(ctx, "taskmanager")

	logCtlr, ok := tm.ctlr.(exec.ControllerLogs)
	if !ok {
		return // no logs available
	}
	if err := logCtlr.Logs(ctx, publisher, options); err != nil {
		log.G(ctx).WithError(err).Errorf("logs call failed")
	}
}
Example #5
// RenewTLSConfigNow gets a new TLS cert and key, and updates the security config if provided. This is similar to
// RenewTLSConfig, except that while RenewTLSConfig monitors for expiry and renews periodically, this renews once and blocks until done.
func RenewTLSConfigNow(ctx context.Context, s *SecurityConfig, r remotes.Remotes) error {
	s.renewalMu.Lock()
	defer s.renewalMu.Unlock()

	ctx = log.WithModule(ctx, "tls")
	log := log.G(ctx).WithFields(logrus.Fields{
		"node.id":   s.ClientTLSCreds.NodeID(),
		"node.role": s.ClientTLSCreds.Role(),
	})

	// Let's request new certs. Renewals don't require a token.
	rootCA := s.RootCA()
	tlsKeyPair, err := rootCA.RequestAndSaveNewCertificates(ctx,
		s.KeyWriter(),
		CertificateRequestConfig{
			Remotes:     r,
			Credentials: s.ClientTLSCreds,
		})
	if err != nil {
		log.WithError(err).Errorf("failed to renew the certificate")
		return err
	}

	clientTLSConfig, err := NewClientTLSConfig(tlsKeyPair, rootCA.Pool, CARole)
	if err != nil {
		log.WithError(err).Errorf("failed to create a new client config")
		return err
	}
	serverTLSConfig, err := NewServerTLSConfig(tlsKeyPair, rootCA.Pool)
	if err != nil {
		log.WithError(err).Errorf("failed to create a new server config")
		return err
	}

	if err = s.ClientTLSCreds.LoadNewTLSConfig(clientTLSConfig); err != nil {
		log.WithError(err).Errorf("failed to update the client credentials")
		return err
	}

	// Update the external CA to use the new client TLS
	// config using a copy without a serverName specified.
	s.externalCA.UpdateTLSConfig(&tls.Config{
		Certificates: clientTLSConfig.Certificates,
		RootCAs:      clientTLSConfig.RootCAs,
		MinVersion:   tls.VersionTLS12,
	})

	if err = s.ServerTLSCreds.LoadNewTLSConfig(serverTLSConfig); err != nil {
		log.WithError(err).Errorf("failed to update the server TLS credentials")
		return err
	}

	return nil
}
Example #6
// Run starts the key manager; it does not return until the context is cancelled or an error occurs.
func (k *KeyManager) Run(ctx context.Context) error {
	k.mu.Lock()
	ctx = log.WithModule(ctx, "keymanager")
	var (
		clusters []*api.Cluster
		err      error
	)
	k.store.View(func(readTx store.ReadTx) {
		clusters, err = store.FindClusters(readTx, store.ByName(k.config.ClusterName))
	})

	if err != nil {
		log.G(ctx).Errorf("reading cluster config failed, %v", err)
		k.mu.Unlock()
		return err
	}

	cluster := clusters[0]
	if len(cluster.NetworkBootstrapKeys) == 0 {
		for _, subsys := range k.config.Subsystems {
			for i := 0; i < keyringSize; i++ {
				k.keyRing.keys = append(k.keyRing.keys, k.allocateKey(ctx, subsys))
			}
		}
		if err := k.updateKey(cluster); err != nil {
			log.G(ctx).Errorf("store update failed %v", err)
		}
	} else {
		k.keyRing.lClock = cluster.EncryptionKeyLamportClock
		k.keyRing.keys = cluster.NetworkBootstrapKeys

		k.rotateKey(ctx)
	}

	ticker := time.NewTicker(k.config.RotationInterval)
	defer ticker.Stop()

	k.ctx, k.cancel = context.WithCancel(ctx)
	k.mu.Unlock()

	for {
		select {
		case <-ticker.C:
			k.rotateKey(ctx)
		case <-k.ctx.Done():
			return nil
		}
	}
}
Example #7
// Run runs dispatcher tasks which should be run on the leader dispatcher.
// The dispatcher can be stopped by cancelling ctx or calling Stop().
func (d *Dispatcher) Run(ctx context.Context) error {
	d.mu.Lock()
	if d.isRunning() {
		d.mu.Unlock()
		return errors.New("dispatcher is already running")
	}
	ctx = log.WithModule(ctx, "dispatcher")
	if err := d.markNodesUnknown(ctx); err != nil {
		log.G(ctx).Errorf(`failed to move all nodes to "unknown" state: %v`, err)
	}
	configWatcher, cancel, err := store.ViewAndWatch(
		d.store,
		func(readTx store.ReadTx) error {
			clusters, err := store.FindClusters(readTx, store.ByName(store.DefaultClusterName))
			if err != nil {
				return err
			}
			if err == nil && len(clusters) == 1 {
				heartbeatPeriod, err := gogotypes.DurationFromProto(clusters[0].Spec.Dispatcher.HeartbeatPeriod)
				if err == nil && heartbeatPeriod > 0 {
					d.config.HeartbeatPeriod = heartbeatPeriod
				}
				if clusters[0].NetworkBootstrapKeys != nil {
					d.networkBootstrapKeys = clusters[0].NetworkBootstrapKeys
				}
			}
			return nil
		},
		state.EventUpdateCluster{},
	)
	if err != nil {
		d.mu.Unlock()
		return err
	}
	// set queues here to guarantee that Close will close them
	d.mgrQueue = watch.NewQueue()
	d.keyMgrQueue = watch.NewQueue()

	peerWatcher, peerCancel := d.cluster.SubscribePeers()
	defer peerCancel()
	d.lastSeenManagers = getWeightedPeers(d.cluster)

	defer cancel()
	d.ctx, d.cancel = context.WithCancel(ctx)
	ctx = d.ctx
	d.wg.Add(1)
	defer d.wg.Done()
	d.mu.Unlock()

	publishManagers := func(peers []*api.Peer) {
		var mgrs []*api.WeightedPeer
		for _, p := range peers {
			mgrs = append(mgrs, &api.WeightedPeer{
				Peer:   p,
				Weight: remotes.DefaultObservationWeight,
			})
		}
		d.mu.Lock()
		d.lastSeenManagers = mgrs
		d.mu.Unlock()
		d.mgrQueue.Publish(mgrs)
	}

	batchTimer := time.NewTimer(maxBatchInterval)
	defer batchTimer.Stop()

	for {
		select {
		case ev := <-peerWatcher:
			publishManagers(ev.([]*api.Peer))
		case <-d.processUpdatesTrigger:
			d.processUpdates(ctx)
			batchTimer.Reset(maxBatchInterval)
		case <-batchTimer.C:
			d.processUpdates(ctx)
			batchTimer.Reset(maxBatchInterval)
		case v := <-configWatcher:
			cluster := v.(state.EventUpdateCluster)
			d.mu.Lock()
			if cluster.Cluster.Spec.Dispatcher.HeartbeatPeriod != nil {
				// ignore error, since Spec has passed validation before
				heartbeatPeriod, _ := gogotypes.DurationFromProto(cluster.Cluster.Spec.Dispatcher.HeartbeatPeriod)
				if heartbeatPeriod != d.config.HeartbeatPeriod {
					// only call d.nodes.updatePeriod when heartbeatPeriod changes
					d.config.HeartbeatPeriod = heartbeatPeriod
					d.nodes.updatePeriod(d.config.HeartbeatPeriod, d.config.HeartbeatEpsilon, d.config.GracePeriodMultiplier)
				}
			}
			d.networkBootstrapKeys = cluster.Cluster.NetworkBootstrapKeys
			d.mu.Unlock()
			d.keyMgrQueue.Publish(cluster.Cluster.NetworkBootstrapKeys)
		case <-ctx.Done():
			return nil
		}
	}
}
Example #8
// Run runs the CA signer main loop.
// The CA signer can be stopped by cancelling ctx or calling Stop().
func (s *Server) Run(ctx context.Context) error {
	s.mu.Lock()
	if s.isRunning() {
		s.mu.Unlock()
		return errors.New("CA signer is already running")
	}
	s.wg.Add(1)
	s.mu.Unlock()

	defer s.wg.Done()
	ctx = log.WithModule(ctx, "ca")

	// Retrieve the channels to keep track of changes in the cluster
	// Retrieve all the currently registered nodes
	var nodes []*api.Node
	updates, cancel, err := store.ViewAndWatch(
		s.store,
		func(readTx store.ReadTx) error {
			clusters, err := store.FindClusters(readTx, store.ByName(store.DefaultClusterName))
			if err != nil {
				return err
			}
			if len(clusters) != 1 {
				return errors.New("could not find cluster object")
			}
			s.updateCluster(ctx, clusters[0])

			nodes, err = store.FindNodes(readTx, store.All)
			return err
		},
		state.EventCreateNode{},
		state.EventUpdateNode{},
		state.EventUpdateCluster{},
	)

	// Do this after updateCluster has been called, so isRunning never
	// returns true without joinTokens being set correctly.
	s.mu.Lock()
	s.ctx, s.cancel = context.WithCancel(ctx)
	s.mu.Unlock()
	close(s.started)

	if err != nil {
		log.G(ctx).WithFields(logrus.Fields{
			"method": "(*Server).Run",
		}).WithError(err).Errorf("snapshot store view failed")
		return err
	}
	defer cancel()

	// We might have missed some updates if there was a leader election,
	// so let's pick up the slack.
	if err := s.reconcileNodeCertificates(ctx, nodes); err != nil {
		// We don't return here because that means the Run loop would
		// never run. Log an error instead.
		log.G(ctx).WithFields(logrus.Fields{
			"method": "(*Server).Run",
		}).WithError(err).Errorf("error attempting to reconcile certificates")
	}

	ticker := time.NewTicker(s.reconciliationRetryInterval)
	defer ticker.Stop()

	// Watch for new nodes being created, new nodes being updated, and changes
	// to the cluster
	for {
		select {
		case event := <-updates:
			switch v := event.(type) {
			case state.EventCreateNode:
				s.evaluateAndSignNodeCert(ctx, v.Node)
			case state.EventUpdateNode:
				// If this certificate is already at a final state
				// no need to evaluate and sign it.
				if !isFinalState(v.Node.Certificate.Status) {
					s.evaluateAndSignNodeCert(ctx, v.Node)
				}
			case state.EventUpdateCluster:
				s.updateCluster(ctx, v.Cluster)
			}
		case <-ticker.C:
			for _, node := range s.pending {
				if err := s.evaluateAndSignNodeCert(ctx, node); err != nil {
					// If this sign operation did not succeed, the rest are
					// unlikely to. Yield so that we don't hammer an external CA.
					// Since the map iteration order is randomized, there is no
					// risk of getting stuck on a problematic CSR.
					break
				}
			}
		case <-ctx.Done():
			return ctx.Err()
		case <-s.ctx.Done():
			return nil
		}
	}
}
Example #9
func (n *Node) run(ctx context.Context) (err error) {
	defer func() {
		n.err = err
		close(n.closed)
	}()
	ctx, cancel := context.WithCancel(ctx)
	defer cancel()
	ctx = log.WithModule(ctx, "node")

	go func() {
		select {
		case <-ctx.Done():
		case <-n.stopped:
			cancel()
		}
	}()

	// NOTE: When this node is created by NewNode(), our nodeID is set if
	// n.loadCertificates() succeeded in loading TLS credentials.
	if n.config.JoinAddr == "" && n.nodeID == "" {
		if err := n.bootstrapCA(); err != nil {
			return err
		}
	}

	if n.config.JoinAddr != "" || n.config.ForceNewCluster {
		n.remotes = newPersistentRemotes(filepath.Join(n.config.StateDir, stateFilename))
		if n.config.JoinAddr != "" {
			n.remotes.Observe(api.Peer{Addr: n.config.JoinAddr}, remotes.DefaultObservationWeight)
		}
	}

	// Obtain new certs and setup TLS certificates renewal for this node:
	// - We call LoadOrCreateSecurityConfig which blocks until a valid certificate has been issued
	// - We retrieve the nodeID from LoadOrCreateSecurityConfig through the info channel. This allows
	// us to display the ID before the certificate gets issued (for potential approval).
	// - We wait for LoadOrCreateSecurityConfig to finish since we need a certificate to operate.
	// - Given a valid certificate, spin a renewal go-routine that will ensure that certificates stay
	// up to date.
	issueResponseChan := make(chan api.IssueNodeCertificateResponse, 1)
	go func() {
		select {
		case <-ctx.Done():
		case resp := <-issueResponseChan:
			log.G(log.WithModule(ctx, "tls")).WithFields(logrus.Fields{
				"node.id": resp.NodeID,
			}).Debugf("requesting certificate")
			n.Lock()
			n.nodeID = resp.NodeID
			n.nodeMembership = resp.NodeMembership
			n.Unlock()
			close(n.certificateRequested)
		}
	}()

	certDir := filepath.Join(n.config.StateDir, "certificates")
	securityConfig, err := ca.LoadOrCreateSecurityConfig(ctx, certDir, n.config.JoinToken, ca.ManagerRole, n.remotes, issueResponseChan)
	if err != nil {
		return err
	}

	taskDBPath := filepath.Join(n.config.StateDir, "worker/tasks.db")
	if err := os.MkdirAll(filepath.Dir(taskDBPath), 0777); err != nil {
		return err
	}

	db, err := bolt.Open(taskDBPath, 0666, nil)
	if err != nil {
		return err
	}
	defer db.Close()

	if err := n.loadCertificates(); err != nil {
		return err
	}

	forceCertRenewal := make(chan struct{})
	renewCert := func() {
		select {
		case forceCertRenewal <- struct{}{}:
		case <-ctx.Done():
		}
	}

	go func() {
		for {
			select {
			case <-ctx.Done():
				return
			case node := <-n.notifyNodeChange:
				// If the server is sending us a ForceRenewal State, renew
				if node.Certificate.Status.State == api.IssuanceStateRotate {
					renewCert()
					continue
				}
				n.Lock()
				// If we got a role change, renew
				lastRole := n.role
				role := ca.WorkerRole
				if node.Spec.Role == api.NodeRoleManager {
					role = ca.ManagerRole
				}
				if lastRole == role {
					n.Unlock()
					continue
				}
				// switch role to agent immediately to shut down the manager early
				if role == ca.WorkerRole {
					n.role = role
					n.roleCond.Broadcast()
				}
				n.Unlock()
				renewCert()
			}
		}
	}()

	updates := ca.RenewTLSConfig(ctx, securityConfig, certDir, n.remotes, forceCertRenewal)
	go func() {
		for {
			select {
			case certUpdate := <-updates:
				if certUpdate.Err != nil {
					logrus.Warnf("error renewing TLS certificate: %v", certUpdate.Err)
					continue
				}
				n.Lock()
				n.role = certUpdate.Role
				n.roleCond.Broadcast()
				n.Unlock()
			case <-ctx.Done():
				return
			}
		}
	}()

	role := n.role

	managerReady := make(chan struct{})
	agentReady := make(chan struct{})
	var managerErr error
	var agentErr error
	var wg sync.WaitGroup
	wg.Add(2)
	go func() {
		managerErr = n.runManager(ctx, securityConfig, managerReady) // store err and loop
		wg.Done()
		cancel()
	}()
	go func() {
		agentErr = n.runAgent(ctx, db, securityConfig.ClientTLSCreds, agentReady)
		wg.Done()
		cancel()
	}()

	go func() {
		<-agentReady
		if role == ca.ManagerRole {
			<-managerReady
		}
		close(n.ready)
	}()

	wg.Wait()
	if managerErr != nil && managerErr != context.Canceled {
		return managerErr
	}
	if agentErr != nil && agentErr != context.Canceled {
		return agentErr
	}
	return err
}
Example #10
func (a *Agent) run(ctx context.Context) {
	ctx, cancel := context.WithCancel(ctx)
	defer cancel()
	defer close(a.closed) // full shutdown.

	ctx = log.WithModule(ctx, "agent")

	log.G(ctx).Debugf("(*Agent).run")
	defer log.G(ctx).Debugf("(*Agent).run exited")

	var (
		backoff    time.Duration
		session    = newSession(ctx, a, backoff) // start the initial session
		registered = session.registered
		ready      = a.ready // first session ready
		sessionq   chan sessionOperation
	)

	if err := a.worker.Init(ctx); err != nil {
		log.G(ctx).WithError(err).Error("worker initialization failed")
		a.err = err
		return // fatal?
	}

	// setup a reliable reporter to call back to us.
	reporter := newStatusReporter(ctx, a)
	defer reporter.Close()

	a.worker.Listen(ctx, reporter)

	for {
		select {
		case operation := <-sessionq:
			operation.response <- operation.fn(session)
		case msg := <-session.tasks:
			if err := a.worker.Assign(ctx, msg.Tasks); err != nil {
				log.G(ctx).WithError(err).Error("task assignment failed")
			}
		case msg := <-session.messages:
			if err := a.handleSessionMessage(ctx, msg); err != nil {
				log.G(ctx).WithError(err).Error("session message handler failed")
			}
		case <-registered:
			log.G(ctx).Debugln("agent: registered")
			if ready != nil {
				close(ready)
			}
			ready = nil
			registered = nil // we only care about this once per session
			backoff = 0      // reset backoff
			sessionq = a.sessionq
		case err := <-session.errs:
			// TODO(stevvooe): This may actually block if a session is closed
			// but no error was sent. Session.close must only be called here
			// for this to work.
			if err != nil {
				log.G(ctx).WithError(err).Error("agent: session failed")
				backoff = initialSessionFailureBackoff + 2*backoff
				if backoff > maxSessionFailureBackoff {
					backoff = maxSessionFailureBackoff
				}
			}

			if err := session.close(); err != nil {
				log.G(ctx).WithError(err).Error("agent: closing session failed")
			}
			sessionq = nil
			// if we're here before <-registered, do nothing for that event
			registered = nil
		case <-session.closed:
			log.G(ctx).Debugf("agent: rebuild session")

			// select a session registration delay from backoff range.
			delay := time.Duration(rand.Int63n(int64(backoff)))
			session = newSession(ctx, a, delay)
			registered = session.registered
			sessionq = a.sessionq
		case <-a.stopped:
			// TODO(stevvooe): Wait on shutdown and cleanup. May need to pump
			// this loop a few times.
			return
		case <-ctx.Done():
			if a.err == nil {
				a.err = ctx.Err()
			}
			session.close()

			return
		}
	}
}
Example #11
// RenewTLSConfig will continuously monitor for the necessity of renewing the local certificates, either by
// issuing them locally if key-material is available, or requesting them from a remote CA.
func RenewTLSConfig(ctx context.Context, s *SecurityConfig, baseCertDir string, remotes remotes.Remotes, renew <-chan struct{}) <-chan CertificateUpdate {
	paths := NewConfigPaths(baseCertDir)
	updates := make(chan CertificateUpdate)

	go func() {
		var retry time.Duration
		defer close(updates)
		for {
			ctx = log.WithModule(ctx, "tls")
			log := log.G(ctx).WithFields(logrus.Fields{
				"node.id":   s.ClientTLSCreds.NodeID(),
				"node.role": s.ClientTLSCreds.Role(),
			})
			// Our starting default will be 5 minutes
			retry = 5 * time.Minute

			// Since the expiration of the certificate is managed remotely we should update our
			// retry timer on every iteration of this loop.
			// Retrieve the time until the certificate expires.
			expiresIn, err := readCertExpiration(paths.Node)
			if err != nil {
				// We failed to read the expiration, let's stick with the starting default
				log.Errorf("failed to read the expiration of the TLS certificate in: %s", paths.Node.Cert)
				updates <- CertificateUpdate{Err: fmt.Errorf("failed to read certificate expiration")}
			} else {
				// If the certificate is expired, let's stick with the starting default in
				// the hope that this is a temporary clock skew.
				if expiresIn.Minutes() < 0 {
					log.Errorf("TLS certificate is expired")
					updates <- CertificateUpdate{Err: fmt.Errorf("TLS certificate is expired")}
				} else {
					// Random retry time between 50% and 80% of the total time to expiration
					retry = calculateRandomExpiry(expiresIn)
				}
			}

			log.WithFields(logrus.Fields{
				"time": time.Now().Add(retry),
			}).Debugf("next certificate renewal scheduled")

			select {
			case <-time.After(retry):
				log.Infof("renewing certificate")
			case <-renew:
				log.Infof("forced certificate renewal")
			case <-ctx.Done():
				log.Infof("shuting down certificate renewal routine")
				return
			}

			// Let's request new certs. Renewals don't require a token.
			rootCA := s.RootCA()
			tlsKeyPair, err := rootCA.RequestAndSaveNewCertificates(ctx,
				paths.Node,
				"",
				remotes,
				s.ClientTLSCreds,
				nil)
			if err != nil {
				log.WithError(err).Errorf("failed to renew the certificate")
				updates <- CertificateUpdate{Err: err}
				continue
			}

			clientTLSConfig, err := NewClientTLSConfig(tlsKeyPair, rootCA.Pool, CARole)
			if err != nil {
				log.WithError(err).Errorf("failed to create a new client config")
				updates <- CertificateUpdate{Err: err}
			}
			serverTLSConfig, err := NewServerTLSConfig(tlsKeyPair, rootCA.Pool)
			if err != nil {
				log.WithError(err).Errorf("failed to create a new server config")
				updates <- CertificateUpdate{Err: err}
			}

			err = s.ClientTLSCreds.LoadNewTLSConfig(clientTLSConfig)
			if err != nil {
				log.WithError(err).Errorf("failed to update the client credentials")
				updates <- CertificateUpdate{Err: err}
			}

			// Update the external CA to use the new client TLS
			// config using a copy without a serverName specified.
			s.externalCA.UpdateTLSConfig(&tls.Config{
				Certificates: clientTLSConfig.Certificates,
				RootCAs:      clientTLSConfig.RootCAs,
				MinVersion:   tls.VersionTLS12,
			})

			err = s.ServerTLSCreds.LoadNewTLSConfig(serverTLSConfig)
			if err != nil {
				log.WithError(err).Errorf("failed to update the server TLS credentials")
				updates <- CertificateUpdate{Err: err}
			}

			updates <- CertificateUpdate{Role: s.ClientTLSCreds.Role()}
		}
	}()

	return updates
}
Example #12
// LoadSecurityConfig loads TLS credentials from disk, or returns an error if
// these credentials do not exist or are unusable.
func LoadSecurityConfig(ctx context.Context, rootCA RootCA, krw *KeyReadWriter) (*SecurityConfig, error) {
	ctx = log.WithModule(ctx, "tls")

	// At this point we've successfully loaded the CA details from disk, or
	// successfully downloaded them remotely. The next step is to try to
	// load our certificates.

	// Read both the Cert and Key from disk
	cert, key, err := krw.Read()
	if err != nil {
		return nil, err
	}

	// Create an x509 certificate out of the contents on disk
	certBlock, _ := pem.Decode([]byte(cert))
	if certBlock == nil {
		return nil, errors.New("failed to parse certificate PEM")
	}

	// Create an X509Cert so we can .Verify()
	X509Cert, err := x509.ParseCertificate(certBlock.Bytes)
	if err != nil {
		return nil, err
	}

	// Include our root pool
	opts := x509.VerifyOptions{
		Roots: rootCA.Pool,
	}

	// Check to see if this certificate was signed by our CA, and isn't expired
	if _, err := X509Cert.Verify(opts); err != nil {
		return nil, err
	}

	// Now that we know this certificate is valid, create a TLS Certificate for our
	// credentials
	keyPair, err := tls.X509KeyPair(cert, key)
	if err != nil {
		return nil, err
	}

	// Load the Certificates as server credentials
	serverTLSCreds, err := rootCA.NewServerTLSCredentials(&keyPair)
	if err != nil {
		return nil, err
	}

	// Load the Certificates also as client credentials.
	// Both workers and managers always connect to remote managers,
	// so ServerName is always set to ManagerRole here.
	clientTLSCreds, err := rootCA.NewClientTLSCredentials(&keyPair, ManagerRole)
	if err != nil {
		return nil, err
	}

	log.G(ctx).WithFields(logrus.Fields{
		"node.id":   clientTLSCreds.NodeID(),
		"node.role": clientTLSCreds.Role(),
	}).Debug("loaded node credentials")

	return NewSecurityConfig(&rootCA, krw, clientTLSCreds, serverTLSCreds), nil
}
Example #13
// Run runs dispatcher tasks which should be run on the leader dispatcher.
// The dispatcher can be stopped by cancelling ctx or calling Stop().
func (d *Dispatcher) Run(ctx context.Context) error {
	d.mu.Lock()
	if d.isRunning() {
		d.mu.Unlock()
		return fmt.Errorf("dispatcher is already running")
	}
	ctx = log.WithModule(ctx, "dispatcher")
	if err := d.markNodesUnknown(ctx); err != nil {
		log.G(ctx).Errorf(`failed to move all nodes to "unknown" state: %v`, err)
	}
	configWatcher, cancel, err := store.ViewAndWatch(
		d.store,
		func(readTx store.ReadTx) error {
			clusters, err := store.FindClusters(readTx, store.ByName(store.DefaultClusterName))
			if err != nil {
				return err
			}
			if err == nil && len(clusters) == 1 {
				heartbeatPeriod, err := ptypes.Duration(clusters[0].Spec.Dispatcher.HeartbeatPeriod)
				if err == nil && heartbeatPeriod > 0 {
					d.config.HeartbeatPeriod = heartbeatPeriod
				}
				if clusters[0].NetworkBootstrapKeys != nil {
					d.networkBootstrapKeys = clusters[0].NetworkBootstrapKeys
				}
			}
			return nil
		},
		state.EventUpdateCluster{},
	)
	if err != nil {
		d.mu.Unlock()
		return err
	}
	defer cancel()
	d.ctx, d.cancel = context.WithCancel(ctx)
	d.mu.Unlock()

	publishManagers := func() {
		mgrs := getWeightedPeers(d.cluster)
		sort.Sort(weightedPeerByNodeID(mgrs))
		d.mu.Lock()
		if reflect.DeepEqual(mgrs, d.lastSeenManagers) {
			d.mu.Unlock()
			return
		}
		d.lastSeenManagers = mgrs
		d.mu.Unlock()
		d.mgrQueue.Publish(mgrs)
	}

	publishManagers()
	publishTicker := time.NewTicker(1 * time.Second)
	defer publishTicker.Stop()

	batchTimer := time.NewTimer(maxBatchInterval)
	defer batchTimer.Stop()

	for {
		select {
		case <-publishTicker.C:
			publishManagers()
		case <-d.processUpdatesTrigger:
			d.processUpdates()
			batchTimer.Reset(maxBatchInterval)
		case <-batchTimer.C:
			d.processUpdates()
			batchTimer.Reset(maxBatchInterval)
		case v := <-configWatcher:
			cluster := v.(state.EventUpdateCluster)
			d.mu.Lock()
			if cluster.Cluster.Spec.Dispatcher.HeartbeatPeriod != nil {
				// ignore error, since Spec has passed validation before
				heartbeatPeriod, _ := ptypes.Duration(cluster.Cluster.Spec.Dispatcher.HeartbeatPeriod)
				if heartbeatPeriod != d.config.HeartbeatPeriod {
					// only call d.nodes.updatePeriod when heartbeatPeriod changes
					d.config.HeartbeatPeriod = heartbeatPeriod
					d.nodes.updatePeriod(d.config.HeartbeatPeriod, d.config.HeartbeatEpsilon, d.config.GracePeriodMultiplier)
				}
			}
			d.networkBootstrapKeys = cluster.Cluster.NetworkBootstrapKeys
			d.mu.Unlock()
			d.keyMgrQueue.Publish(struct{}{})
		case <-d.ctx.Done():
			return nil
		}
	}
}
Example #14
func (n *Node) loadSecurityConfig(ctx context.Context) (*ca.SecurityConfig, error) {
	paths := ca.NewConfigPaths(filepath.Join(n.config.StateDir, certDirectory))
	var securityConfig *ca.SecurityConfig

	krw := ca.NewKeyReadWriter(paths.Node, n.unlockKey, &manager.RaftDEKData{})
	if err := krw.Migrate(); err != nil {
		return nil, err
	}

	// Check if we already have valid certificates on disk.
	rootCA, err := ca.GetLocalRootCA(paths.RootCA)
	if err != nil && err != ca.ErrNoLocalRootCA {
		return nil, err
	}
	if err == nil {
		clientTLSCreds, serverTLSCreds, err := ca.LoadTLSCreds(rootCA, krw)
		_, ok := errors.Cause(err).(ca.ErrInvalidKEK)
		switch {
		case err == nil:
			securityConfig = ca.NewSecurityConfig(&rootCA, krw, clientTLSCreds, serverTLSCreds)
			log.G(ctx).Debug("loaded CA and TLS certificates")
		case ok:
			return nil, ErrInvalidUnlockKey
		case os.IsNotExist(err):
			break
		default:
			return nil, errors.Wrapf(err, "error while loading TLS certificate in %s", paths.Node.Cert)
		}
	}

	if securityConfig == nil {
		if n.config.JoinAddr == "" {
			// if we're not joining a cluster, bootstrap a new one - and we have to set the unlock key
			n.unlockKey = nil
			if n.config.AutoLockManagers {
				n.unlockKey = encryption.GenerateSecretKey()
			}
			krw = ca.NewKeyReadWriter(paths.Node, n.unlockKey, &manager.RaftDEKData{})
			rootCA, err = ca.CreateRootCA(ca.DefaultRootCN, paths.RootCA)
			if err != nil {
				return nil, err
			}
			log.G(ctx).Debug("generated CA key and certificate")
		} else if err == ca.ErrNoLocalRootCA { // from previous error loading the root CA from disk
			rootCA, err = ca.DownloadRootCA(ctx, paths.RootCA, n.config.JoinToken, n.remotes)
			if err != nil {
				return nil, err
			}
			log.G(ctx).Debug("downloaded CA certificate")
		}

		// Obtain new certs and setup TLS certificates renewal for this node:
		// - We call LoadOrCreateSecurityConfig which blocks until a valid certificate has been issued
		// - We retrieve the nodeID from LoadOrCreateSecurityConfig through the info channel. This allows
		// us to display the ID before the certificate gets issued (for potential approval).
		// - We wait for LoadOrCreateSecurityConfig to finish since we need a certificate to operate.
		// - Given a valid certificate, spin a renewal go-routine that will ensure that certificates stay
		// up to date.
		issueResponseChan := make(chan api.IssueNodeCertificateResponse, 1)
		go func() {
			select {
			case <-ctx.Done():
			case resp := <-issueResponseChan:
				log.G(log.WithModule(ctx, "tls")).WithFields(logrus.Fields{
					"node.id": resp.NodeID,
				}).Debugf("loaded TLS certificate")
				n.Lock()
				n.nodeID = resp.NodeID
				n.nodeMembership = resp.NodeMembership
				n.Unlock()
				close(n.certificateRequested)
			}
		}()

		// LoadOrCreateSecurityConfig is the point at which a new node joining a cluster will retrieve TLS
		// certificates and write them to disk
		securityConfig, err = ca.LoadOrCreateSecurityConfig(
			ctx, rootCA, n.config.JoinToken, ca.ManagerRole, n.remotes, issueResponseChan, krw)
		if err != nil {
			if _, ok := errors.Cause(err).(ca.ErrInvalidKEK); ok {
				return nil, ErrInvalidUnlockKey
			}
			return nil, err
		}
	}

	n.Lock()
	n.role = securityConfig.ClientTLSCreds.Role()
	n.nodeID = securityConfig.ClientTLSCreds.NodeID()
	n.nodeMembership = api.NodeMembershipAccepted
	n.roleCond.Broadcast()
	n.Unlock()

	return securityConfig, nil
}
Example #15
func (a *Agent) run(ctx context.Context) {
	ctx, cancel := context.WithCancel(ctx)
	defer cancel()
	defer close(a.closed) // full shutdown.

	ctx = log.WithModule(ctx, "agent")

	log.G(ctx).Debugf("(*Agent).run")
	defer log.G(ctx).Debugf("(*Agent).run exited")

	// get the node description
	nodeDescription, err := a.nodeDescriptionWithHostname(ctx)
	if err != nil {
		log.G(ctx).WithError(err).WithField("agent", a.config.Executor).Errorf("agent: node description unavailable")
	}
	// nodeUpdateTicker is used to periodically check for updates to node description
	nodeUpdateTicker := time.NewTicker(nodeUpdatePeriod)
	defer nodeUpdateTicker.Stop()

	var (
		backoff       time.Duration
		session       = newSession(ctx, a, backoff, "", nodeDescription) // start the initial session
		registered    = session.registered
		ready         = a.ready // first session ready
		sessionq      chan sessionOperation
		leaving       = a.leaving
		subscriptions = map[string]context.CancelFunc{}
	)

	if err := a.worker.Init(ctx); err != nil {
		log.G(ctx).WithError(err).Error("worker initialization failed")
		a.err = err
		return // fatal?
	}
	defer a.worker.Close()

	// setup a reliable reporter to call back to us.
	reporter := newStatusReporter(ctx, a)
	defer reporter.Close()

	a.worker.Listen(ctx, reporter)

	for {
		select {
		case operation := <-sessionq:
			operation.response <- operation.fn(session)
		case <-leaving:
			leaving = nil

			// TODO(stevvooe): Signal to the manager that the node is leaving.

			// when leaving we remove all assignments.
			if err := a.worker.Assign(ctx, nil); err != nil {
				log.G(ctx).WithError(err).Error("failed removing all assignments")
			}
		case msg := <-session.assignments:
			// if we have left, accept no more assignments
			if leaving == nil {
				continue
			}

			switch msg.Type {
			case api.AssignmentsMessage_COMPLETE:
				// Need to assign secrets before tasks, because tasks might depend on new secrets
				if err := a.worker.Assign(ctx, msg.Changes); err != nil {
					log.G(ctx).WithError(err).Error("failed to synchronize worker assignments")
				}
			case api.AssignmentsMessage_INCREMENTAL:
				if err := a.worker.Update(ctx, msg.Changes); err != nil {
					log.G(ctx).WithError(err).Error("failed to update worker assignments")
				}
			}
		case msg := <-session.messages:
			if err := a.handleSessionMessage(ctx, msg); err != nil {
				log.G(ctx).WithError(err).Error("session message handler failed")
			}
		case sub := <-session.subscriptions:
			if sub.Close {
				if cancel, ok := subscriptions[sub.ID]; ok {
					cancel()
				}
				delete(subscriptions, sub.ID)
				continue
			}

			if _, ok := subscriptions[sub.ID]; ok {
				// Duplicate subscription
				continue
			}

			subCtx, subCancel := context.WithCancel(ctx)
			subscriptions[sub.ID] = subCancel
			go a.worker.Subscribe(subCtx, sub)
		case <-registered:
			log.G(ctx).Debugln("agent: registered")
			if ready != nil {
				close(ready)
			}
			ready = nil
			registered = nil // we only care about this once per session
			backoff = 0      // reset backoff
			sessionq = a.sessionq
		case err := <-session.errs:
			// TODO(stevvooe): This may actually block if a session is closed
			// but no error was sent. Session.close must only be called here
			// for this to work.
			if err != nil {
				log.G(ctx).WithError(err).Error("agent: session failed")
				backoff = initialSessionFailureBackoff + 2*backoff
				if backoff > maxSessionFailureBackoff {
					backoff = maxSessionFailureBackoff
				}
			}

			if err := session.close(); err != nil {
				log.G(ctx).WithError(err).Error("agent: closing session failed")
			}
			sessionq = nil
			// if we're here before <-registered, do nothing for that event
			registered = nil
		case <-session.closed:
			log.G(ctx).Debugf("agent: rebuild session")

			// select a session registration delay from backoff range.
			delay := time.Duration(0)
			if backoff > 0 {
				delay = time.Duration(rand.Int63n(int64(backoff)))
			}
			session = newSession(ctx, a, delay, session.sessionID, nodeDescription)
			registered = session.registered
		case <-nodeUpdateTicker.C:
			// skip this case if the registration isn't finished
			if registered != nil {
				continue
			}
			// get the current node description
			newNodeDescription, err := a.nodeDescriptionWithHostname(ctx)
			if err != nil {
				log.G(ctx).WithError(err).WithField("agent", a.config.Executor).Errorf("agent: updated node description unavailable")
			}

			// if newNodeDescription is nil, it will cause a panic when
			// trying to create a session. Typically this can happen
			// if the engine goes down
			if newNodeDescription == nil {
				continue
			}

			// if the node description has changed, update it to the new one
			// and close the session. The old session will be stopped and a
			// new one will be created with the updated description
			if !reflect.DeepEqual(nodeDescription, newNodeDescription) {
				nodeDescription = newNodeDescription
				// close the session
				log.G(ctx).Info("agent: found node update")
				session.sendError(nil)
			}
		case <-a.stopped:
			// TODO(stevvooe): Wait on shutdown and cleanup. May need to pump
			// this loop a few times.
			return
		case <-ctx.Done():
			if a.err == nil {
				a.err = ctx.Err()
			}
			session.close()

			return
		}
	}
}
Example #16
func (tm *taskManager) run(ctx context.Context) {
	ctx, cancelAll := context.WithCancel(ctx)
	defer cancelAll() // cancel all child operations on exit.

	ctx = log.WithModule(ctx, "taskmanager")

	var (
		opctx    context.Context
		cancel   context.CancelFunc
		run      = make(chan struct{}, 1)
		statusq  = make(chan *api.TaskStatus)
		errs     = make(chan error)
		shutdown = tm.shutdown
		updated  bool // true if the task was updated.
	)

	defer func() {
		// closure picks up the current value of cancel.
		if cancel != nil {
			cancel()
		}
	}()

	run <- struct{}{} // prime the pump
	for {
		select {
		case <-run:
			// always check for shutdown before running.
			select {
			case <-tm.shutdown:
				continue // ignore run request and handle shutdown
			case <-tm.closed:
				continue
			default:
			}

			opctx, cancel = context.WithCancel(ctx)

			// Several variables need to be snapshotted for the closure below.
			opcancel := cancel        // fork for the closure
			running := tm.task.Copy() // clone the task before dispatch
			statusqLocal := statusq
			updatedLocal := updated // capture state of update for goroutine
			updated = false
			go runctx(ctx, tm.closed, errs, func(ctx context.Context) error {
				defer opcancel()

				if updatedLocal {
					// before we do anything, update the task for the controller.
					// always update the controller before running.
					if err := tm.ctlr.Update(opctx, running); err != nil {
						log.G(ctx).WithError(err).Error("updating task controller failed")
						return err
					}
				}

				status, err := exec.Do(opctx, running, tm.ctlr)
				if status != nil {
					// always report the status if we get one back. This
					// returns to the manager loop, then reports the status
					// upstream.
					select {
					case statusqLocal <- status:
					case <-ctx.Done(): // not opctx, since that may have been cancelled.
					}

					if err := tm.reporter.UpdateTaskStatus(ctx, running.ID, status); err != nil {
						log.G(ctx).WithError(err).Error("failed reporting status to agent")
					}
				}

				return err
			})
		case err := <-errs:
			// This branch is always executed when an operations completes. The
			// goal is to decide whether or not we re-dispatch the operation.
			cancel = nil

			select {
			case <-tm.shutdown:
				shutdown = tm.shutdown // re-enable the shutdown branch
				continue               // no dispatch if we are in shutdown.
			default:
			}

			switch err {
			case exec.ErrTaskNoop:
				if !updated {
					continue // wait till getting pumped via update.
				}
			case exec.ErrTaskRetry:
				// TODO(stevvooe): Add exponential backoff with random jitter
				// here. For now, this backoff is enough to keep the task
				// manager from running away with the CPU.
				time.AfterFunc(time.Second, func() {
					errs <- nil // repump this branch, with no err
				})
				continue
			case nil, context.Canceled, context.DeadlineExceeded:
				// no log in this case
			default:
				log.G(ctx).WithError(err).Error("task operation failed")
			}

			select {
			case run <- struct{}{}:
			default:
			}
		case status := <-statusq:
			tm.task.Status = *status
		case task := <-tm.updateq:
			if equality.TasksEqualStable(task, tm.task) {
				continue // ignore the update
			}

			if task.ID != tm.task.ID {
				log.G(ctx).WithField("task.update.id", task.ID).Error("received update for incorrect task")
				continue
			}

			if task.DesiredState < tm.task.DesiredState {
				log.G(ctx).WithField("task.update.desiredstate", task.DesiredState).
					Error("ignoring task update with invalid desired state")
				continue
			}

			task = task.Copy()
			task.Status = tm.task.Status // overwrite our status, as it is canonical.
			tm.task = task
			updated = true

			// we have accepted the task update
			if cancel != nil {
				cancel() // cancel outstanding if necessary.
			} else {
				// If this channel op fails, it means there is already a
				// message on the run queue.
				select {
				case run <- struct{}{}:
				default:
				}
			}
		case <-shutdown:
			if cancel != nil {
				// cancel outstanding operation.
				cancel()

				// subtle: after a cancellation, we want to avoid busy wait
				// here. this gets re-enabled in the errs branch and we'll come
				// back around and try shutdown again.
				shutdown = nil // turn off this branch until op proceeds
				continue       // wait until operation actually exits.
			}

			// TODO(stevvooe): This should be left for the reaper.

			// make an attempt at removing. this is best effort. any errors will be
			// retried by the reaper later.
			if err := tm.ctlr.Remove(ctx); err != nil {
				log.G(ctx).WithError(err).WithField("task.id", tm.task.ID).Error("remove task failed")
			}

			if err := tm.ctlr.Close(); err != nil {
				log.G(ctx).WithError(err).Error("error closing controller")
			}
			// disable everything, and prepare for closing.
			statusq = nil
			errs = nil
			shutdown = nil
			close(tm.closed)
		case <-tm.closed:
			return
		case <-ctx.Done():
			return
		}
	}
}
Example #17
// LoadOrCreateSecurityConfig encapsulates the security logic behind joining a cluster.
// Every node requires at least a set of TLS certificates with which to join the cluster.
// In the case of a manager, these certificates will be used both for client and server credentials.
func LoadOrCreateSecurityConfig(ctx context.Context, baseCertDir, token, proposedRole string, remotes remotes.Remotes, nodeInfo chan<- api.IssueNodeCertificateResponse) (*SecurityConfig, error) {
	ctx = log.WithModule(ctx, "tls")
	paths := NewConfigPaths(baseCertDir)

	var (
		rootCA                         RootCA
		serverTLSCreds, clientTLSCreds *MutableTLSCreds
		err                            error
	)

	// Check if we already have a CA certificate on disk. We need a CA to have a valid SecurityConfig
	rootCA, err = GetLocalRootCA(baseCertDir)
	switch err {
	case nil:
		log.G(ctx).Debug("loaded CA certificate")
	case ErrNoLocalRootCA:
		log.G(ctx).WithError(err).Debugf("failed to load local CA certificate")

		// Get a digest for the optional CA hash string that we've been provided.
		// If we were provided a non-empty string and it is an invalid hash, return
		// an error; otherwise, proceed with an empty digest.
		var d digest.Digest
		if token != "" {
			d, err = getCAHashFromToken(token)
			if err != nil {
				return nil, err
			}
		}

		// Get the remote CA certificate, verify integrity with the
		// hash provided. Retry up to 5 times, in case the manager we
		// first try to contact is not responding properly (it may have
		// just been demoted, for example).

		for i := 0; i != 5; i++ {
			rootCA, err = GetRemoteCA(ctx, d, remotes)
			if err == nil {
				break
			}
			log.G(ctx).WithError(err).Errorf("failed to retrieve remote root CA certificate")
		}
		if err != nil {
			return nil, err
		}

		// Save root CA certificate to disk
		if err = saveRootCA(rootCA, paths.RootCA); err != nil {
			return nil, err
		}

		log.G(ctx).Debugf("retrieved remote CA certificate: %s", paths.RootCA.Cert)
	default:
		return nil, err
	}

	// At this point we've successfully loaded the CA details from disk, or
	// successfully downloaded them remotely. The next step is to try to
	// load our certificates.
	clientTLSCreds, serverTLSCreds, err = LoadTLSCreds(rootCA, paths.Node)
	if err != nil {
		log.G(ctx).WithError(err).Debugf("no node credentials found in: %s", paths.Node.Cert)

		var (
			tlsKeyPair *tls.Certificate
			err        error
		)

		if rootCA.CanSign() {
			// Create a new random ID for this certificate
			cn := identity.NewID()
			org := identity.NewID()

			if nodeInfo != nil {
				nodeInfo <- api.IssueNodeCertificateResponse{
					NodeID:         cn,
					NodeMembership: api.NodeMembershipAccepted,
				}
			}
			tlsKeyPair, err = rootCA.IssueAndSaveNewCertificates(paths.Node, cn, proposedRole, org)
			if err != nil {
				log.G(ctx).WithFields(logrus.Fields{
					"node.id":   cn,
					"node.role": proposedRole,
				}).WithError(err).Errorf("failed to issue and save new certificate")
				return nil, err
			}

			log.G(ctx).WithFields(logrus.Fields{
				"node.id":   cn,
				"node.role": proposedRole,
			}).Debug("issued new TLS certificate")
		} else {
			// There was an error loading our credentials, so let's get a new certificate issued.
			// The TLS credentials argument is nil because at this point we don't have any valid TLS creds.
			tlsKeyPair, err = rootCA.RequestAndSaveNewCertificates(ctx, paths.Node, token, remotes, nil, nodeInfo)
			if err != nil {
				log.G(ctx).WithError(err).Error("failed to request save new certificate")
				return nil, err
			}
		}
		// Create the Server TLS Credentials for this node. These will not be used by workers.
		serverTLSCreds, err = rootCA.NewServerTLSCredentials(tlsKeyPair)
		if err != nil {
			return nil, err
		}

		// Create a TLSConfig to be used when this node connects as a client to another remote node.
		// We're using ManagerRole as remote serverName for TLS host verification
		clientTLSCreds, err = rootCA.NewClientTLSCredentials(tlsKeyPair, ManagerRole)
		if err != nil {
			return nil, err
		}
		log.G(ctx).WithFields(logrus.Fields{
			"node.id":   clientTLSCreds.NodeID(),
			"node.role": clientTLSCreds.Role(),
		}).Debugf("new node credentials generated: %s", paths.Node.Cert)
	} else {
		if nodeInfo != nil {
			nodeInfo <- api.IssueNodeCertificateResponse{
				NodeID:         clientTLSCreds.NodeID(),
				NodeMembership: api.NodeMembershipAccepted,
			}
		}
		log.G(ctx).WithFields(logrus.Fields{
			"node.id":   clientTLSCreds.NodeID(),
			"node.role": clientTLSCreds.Role(),
		}).Debug("loaded node credentials")
	}

	return NewSecurityConfig(&rootCA, clientTLSCreds, serverTLSCreds), nil
}
Example #18
func (n *Node) run(ctx context.Context) (err error) {
	defer func() {
		n.err = err
		close(n.closed)
	}()
	ctx, cancel := context.WithCancel(ctx)
	defer cancel()
	ctx = log.WithModule(ctx, "node")

	go func() {
		select {
		case <-ctx.Done():
		case <-n.stopped:
			cancel()
		}
	}()

	securityConfig, err := n.loadSecurityConfig(ctx)
	if err != nil {
		return err
	}

	taskDBPath := filepath.Join(n.config.StateDir, "worker/tasks.db")
	if err := os.MkdirAll(filepath.Dir(taskDBPath), 0777); err != nil {
		return err
	}

	db, err := bolt.Open(taskDBPath, 0666, nil)
	if err != nil {
		return err
	}
	defer db.Close()

	forceCertRenewal := make(chan struct{})
	renewCert := func() {
		select {
		case forceCertRenewal <- struct{}{}:
		case <-ctx.Done():
		}
	}

	go func() {
		for {
			select {
			case <-ctx.Done():
				return
			case node := <-n.notifyNodeChange:
				// If the server is sending us a ForceRenewal State, renew
				if node.Certificate.Status.State == api.IssuanceStateRotate {
					renewCert()
					continue
				}
				n.Lock()
				// If we got a role change, renew
				lastRole := n.role
				role := ca.WorkerRole
				if node.Spec.Role == api.NodeRoleManager {
					role = ca.ManagerRole
				}
				if lastRole == role {
					n.Unlock()
					continue
				}
				// switch role to agent immediately to shut down the manager early
				if role == ca.WorkerRole {
					n.role = role
					n.roleCond.Broadcast()
				}
				n.Unlock()
				renewCert()
			}
		}
	}()

	updates := ca.RenewTLSConfig(ctx, securityConfig, n.remotes, forceCertRenewal)
	go func() {
		for {
			select {
			case certUpdate := <-updates:
				if certUpdate.Err != nil {
					logrus.Warnf("error renewing TLS certificate: %v", certUpdate.Err)
					continue
				}
				n.Lock()
				n.role = certUpdate.Role
				n.roleCond.Broadcast()
				n.Unlock()
			case <-ctx.Done():
				return
			}
		}
	}()

	role := n.role

	managerReady := make(chan struct{})
	agentReady := make(chan struct{})
	var managerErr error
	var agentErr error
	var wg sync.WaitGroup
	wg.Add(2)
	go func() {
		managerErr = n.superviseManager(ctx, securityConfig, managerReady) // store err and loop
		wg.Done()
	}()
	go func() {
		agentErr = n.runAgent(ctx, db, securityConfig.ClientTLSCreds, agentReady)
		wg.Done()
	}()

	go func() {
		<-agentReady
		if role == ca.ManagerRole {
			<-managerReady
		}
		close(n.ready)
	}()

	wg.Wait()
	if managerErr != nil && managerErr != context.Canceled {
		return managerErr
	}
	if agentErr != nil && agentErr != context.Canceled {
		return agentErr
	}
	return err
}
Example #19
// LoadOrCreateSecurityConfig encapsulates the security logic behind joining a cluster.
// Every node requires at least a set of TLS certificates with which to join the cluster.
// In the case of a manager, these certificates will be used both for client and server credentials.
func LoadOrCreateSecurityConfig(ctx context.Context, rootCA RootCA, token, proposedRole string, remotes remotes.Remotes, nodeInfo chan<- api.IssueNodeCertificateResponse, krw *KeyReadWriter) (*SecurityConfig, error) {
	ctx = log.WithModule(ctx, "tls")

	// At this point we've successfully loaded the CA details from disk, or
	// successfully downloaded them remotely. The next step is to try to
	// load our certificates.
	clientTLSCreds, serverTLSCreds, err := LoadTLSCreds(rootCA, krw)
	if err != nil {
		if _, ok := errors.Cause(err).(ErrInvalidKEK); ok {
			return nil, err
		}

		log.G(ctx).WithError(err).Debugf("no node credentials found in: %s", krw.Target())

		var (
			tlsKeyPair *tls.Certificate
			err        error
		)

		if rootCA.CanSign() {
			// Create a new random ID for this certificate
			cn := identity.NewID()
			org := identity.NewID()

			if nodeInfo != nil {
				nodeInfo <- api.IssueNodeCertificateResponse{
					NodeID:         cn,
					NodeMembership: api.NodeMembershipAccepted,
				}
			}
			tlsKeyPair, err = rootCA.IssueAndSaveNewCertificates(krw, cn, proposedRole, org)
			if err != nil {
				log.G(ctx).WithFields(logrus.Fields{
					"node.id":   cn,
					"node.role": proposedRole,
				}).WithError(err).Errorf("failed to issue and save new certificate")
				return nil, err
			}

			log.G(ctx).WithFields(logrus.Fields{
				"node.id":   cn,
				"node.role": proposedRole,
			}).Debug("issued new TLS certificate")
		} else {
			// There was an error loading our credentials, so let's get a new certificate issued.
			// The TLS credentials argument is nil because at this point we don't have any valid TLS creds.
			tlsKeyPair, err = rootCA.RequestAndSaveNewCertificates(ctx, krw, token, remotes, nil, nodeInfo)
			if err != nil {
				log.G(ctx).WithError(err).Error("failed to request save new certificate")
				return nil, err
			}
		}
		// Create the Server TLS Credentials for this node. These will not be used by workers.
		serverTLSCreds, err = rootCA.NewServerTLSCredentials(tlsKeyPair)
		if err != nil {
			return nil, err
		}

		// Create a TLSConfig to be used when this node connects as a client to another remote node.
		// We're using ManagerRole as remote serverName for TLS host verification
		clientTLSCreds, err = rootCA.NewClientTLSCredentials(tlsKeyPair, ManagerRole)
		if err != nil {
			return nil, err
		}
		log.G(ctx).WithFields(logrus.Fields{
			"node.id":   clientTLSCreds.NodeID(),
			"node.role": clientTLSCreds.Role(),
		}).Debugf("new node credentials generated: %s", krw.Target())
	} else {
		if nodeInfo != nil {
			nodeInfo <- api.IssueNodeCertificateResponse{
				NodeID:         clientTLSCreds.NodeID(),
				NodeMembership: api.NodeMembershipAccepted,
			}
		}
		log.G(ctx).WithFields(logrus.Fields{
			"node.id":   clientTLSCreds.NodeID(),
			"node.role": clientTLSCreds.Role(),
		}).Debug("loaded node credentials")
	}

	return NewSecurityConfig(&rootCA, krw, clientTLSCreds, serverTLSCreds), nil
}
Example #20
// RenewTLSConfig will continuously monitor for the necessity of renewing the local certificates, either by
// issuing them locally if key-material is available, or requesting them from a remote CA.
func RenewTLSConfig(ctx context.Context, s *SecurityConfig, connBroker *connectionbroker.Broker, renew <-chan struct{}) <-chan CertificateUpdate {
	updates := make(chan CertificateUpdate)

	go func() {
		var retry time.Duration
		expBackoff := events.NewExponentialBackoff(RenewTLSExponentialBackoff)
		defer close(updates)
		for {
			ctx = log.WithModule(ctx, "tls")
			log := log.G(ctx).WithFields(logrus.Fields{
				"node.id":   s.ClientTLSCreds.NodeID(),
				"node.role": s.ClientTLSCreds.Role(),
			})
			// Our starting default will be 5 minutes
			retry = 5 * time.Minute

			// Since the expiration of the certificate is managed remotely we should update our
			// retry timer on every iteration of this loop.
			// Retrieve the current certificate expiration information.
			validFrom, validUntil, err := readCertValidity(s.KeyReader())
			if err != nil {
				// We failed to read the expiration, let's stick with the starting default
				log.Errorf("failed to read the expiration of the TLS certificate in: %s", s.KeyReader().Target())

				select {
				case updates <- CertificateUpdate{Err: errors.New("failed to read certificate expiration")}:
				case <-ctx.Done():
					log.Info("shutting down certificate renewal routine")
					return
				}
			} else {
				// If we have an expired certificate, try to renew immediately, in the hope that this is a
				// temporary clock skew or that we can issue our own TLS certs.
				if validUntil.Before(time.Now()) {
					log.Warn("the current TLS certificate is expired, so an attempt to renew it will be made immediately")
					// retry immediately(ish) with exponential backoff
					retry = expBackoff.Proceed(nil)
				} else {
					// Random retry time between 50% and 80% of the total time to expiration
					retry = calculateRandomExpiry(validFrom, validUntil)
				}
			}

			log.WithFields(logrus.Fields{
				"time": time.Now().Add(retry),
			}).Debugf("next certificate renewal scheduled for %v from now", retry)

			select {
			case <-time.After(retry):
				log.Info("renewing certificate")
			case <-renew:
				log.Info("forced certificate renewal")
			case <-ctx.Done():
				log.Info("shutting down certificate renewal routine")
				return
			}

			// ignore errors - it will just try again later
			var certUpdate CertificateUpdate
			if err := RenewTLSConfigNow(ctx, s, connBroker); err != nil {
				certUpdate.Err = err
				expBackoff.Failure(nil, nil)
			} else {
				certUpdate.Role = s.ClientTLSCreds.Role()
				expBackoff = events.NewExponentialBackoff(RenewTLSExponentialBackoff)
			}

			select {
			case updates <- certUpdate:
			case <-ctx.Done():
				log.Info("shutting down certificate renewal routine")
				return
			}
		}
	}()

	return updates
}
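Every example above follows the same pattern: the function rebinds its context with log.WithModule(ctx, "<module>") and then pulls a module-scoped logger out of it with log.G(ctx). The following is only a minimal sketch of how such context-scoped logrus helpers could be implemented; it is an assumption for illustration, not the actual swarmkit log package.

package log

import (
	"context"

	"github.com/sirupsen/logrus"
)

// loggerKey is the private context key under which the current logger is stored.
type loggerKey struct{}

// L is the fallback logger entry used when a context carries no logger of its own.
var L = logrus.NewEntry(logrus.StandardLogger())

// WithLogger returns a new context that carries the provided logger entry.
func WithLogger(ctx context.Context, logger *logrus.Entry) context.Context {
	return context.WithValue(ctx, loggerKey{}, logger)
}

// WithModule returns a context whose logger is annotated with a "module" field,
// so every downstream G(ctx) call automatically includes it.
func WithModule(ctx context.Context, module string) context.Context {
	return WithLogger(ctx, G(ctx).WithField("module", module))
}

// G retrieves the logger stored in the context, falling back to L.
func G(ctx context.Context) *logrus.Entry {
	if logger, ok := ctx.Value(loggerKey{}).(*logrus.Entry); ok {
		return logger
	}
	return L
}

With helpers of this shape, the repeated `ctx = log.WithModule(ctx, "tls")` calls in the examples make every subsequent log.G(ctx) line for that subsystem carry the module field.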