func (u *Updater) useExistingTask(ctx context.Context, slot slot, existing *api.Task) {
	var removeTasks []*api.Task
	for _, t := range slot {
		if t != existing {
			removeTasks = append(removeTasks, t)
		}
	}
	if len(removeTasks) != 0 || existing.DesiredState != api.TaskStateRunning {
		_, err := u.store.Batch(func(batch *store.Batch) error {
			u.removeOldTasks(ctx, batch, removeTasks)

			if existing.DesiredState != api.TaskStateRunning {
				err := batch.Update(func(tx store.Tx) error {
					t := store.GetTask(tx, existing.ID)
					if t == nil {
						return fmt.Errorf("task %s not found while trying to start it", existing.ID)
					}
					if t.DesiredState >= api.TaskStateRunning {
						return fmt.Errorf("task %s was already started when reached by updater", existing.ID)
					}
					t.DesiredState = api.TaskStateRunning
					return store.UpdateTask(tx, t)
				})
				if err != nil {
					log.G(ctx).WithError(err).Errorf("starting task %s failed", existing.ID)
				}
			}
			return nil
		})
		if err != nil {
			log.G(ctx).WithError(err).Error("updater batch transaction failed")
		}
	}
}
func deleteServiceTasks(ctx context.Context, s *store.MemoryStore, service *api.Service) {
	var (
		tasks []*api.Task
		err   error
	)
	s.View(func(tx store.ReadTx) {
		tasks, err = store.FindTasks(tx, store.ByServiceID(service.ID))
	})
	if err != nil {
		log.G(ctx).WithError(err).Errorf("failed to list tasks")
		return
	}

	_, err = s.Batch(func(batch *store.Batch) error {
		for _, t := range tasks {
			err := batch.Update(func(tx store.Tx) error {
				if err := store.DeleteTask(tx, t.ID); err != nil {
					log.G(ctx).WithError(err).Errorf("failed to delete task")
				}
				return nil
			})
			if err != nil {
				return err
			}
		}
		return nil
	})
	if err != nil {
		log.G(ctx).WithError(err).Errorf("task deletion transaction failed")
	}
}
func (u *Updater) rollbackUpdate(ctx context.Context, serviceID, message string) {
	log.G(ctx).Debugf("starting rollback of service %s", serviceID)

	var service *api.Service
	err := u.store.Update(func(tx store.Tx) error {
		service = store.GetService(tx, serviceID)
		if service == nil {
			return nil
		}
		if service.UpdateStatus == nil {
			// The service was updated since we started this update
			return nil
		}

		service.UpdateStatus.State = api.UpdateStatus_ROLLBACK_STARTED
		service.UpdateStatus.Message = message

		if service.PreviousSpec == nil {
			return errors.New("cannot roll back service because no previous spec is available")
		}
		service.Spec = *service.PreviousSpec
		service.PreviousSpec = nil

		return store.UpdateService(tx, service)
	})
	if err != nil {
		log.G(ctx).WithError(err).Errorf("failed to start rollback of service %s", serviceID)
		return
	}
}
func (g *GlobalOrchestrator) removeTasksFromNode(ctx context.Context, node *api.Node) {
	var (
		tasks []*api.Task
		err   error
	)
	g.store.View(func(tx store.ReadTx) {
		tasks, err = store.FindTasks(tx, store.ByNodeID(node.ID))
	})
	if err != nil {
		log.G(ctx).WithError(err).Errorf("global orchestrator: removeTasksFromNode failed finding tasks")
		return
	}

	_, err = g.store.Batch(func(batch *store.Batch) error {
		for _, t := range tasks {
			// GlobalOrchestrator only removes tasks from globalServices
			if _, exists := g.globalServices[t.ServiceID]; exists {
				g.removeTask(ctx, batch, t)
			}
		}
		return nil
	})
	if err != nil {
		log.G(ctx).WithError(err).Errorf("global orchestrator: removeTasksFromNode failed")
	}
}
func (na *NetworkAllocator) deallocateVIP(vip *api.Endpoint_VirtualIP) error {
	localNet := na.getNetwork(vip.NetworkID)
	if localNet == nil {
		return fmt.Errorf("networkallocator: could not find local network state")
	}

	ipam, _, err := na.resolveIPAM(localNet.nw)
	if err != nil {
		return fmt.Errorf("failed to resolve IPAM while releasing VIP: %v", err)
	}

	// Retrieve the poolID and immediately remove the mapping.
	poolID := localNet.endpoints[vip.Addr]
	delete(localNet.endpoints, vip.Addr)

	ip, _, err := net.ParseCIDR(vip.Addr)
	if err != nil {
		log.G(context.TODO()).Errorf("Could not parse VIP address %s while releasing", vip.Addr)
		return err
	}

	if err := ipam.ReleaseAddress(poolID, ip); err != nil {
		log.G(context.TODO()).Errorf("IPAM failure while releasing VIP address %s: %v", vip.Addr, err)
		return err
	}

	return nil
}
// getIDs returns the set of IDs included in the given snapshot and
// the entries. The given snapshot/entries can contain two kinds of
// ID-related entry:
// - ConfChangeAddNode, in which case the contained ID will be added into the set.
// - ConfChangeRemoveNode, in which case the contained ID will be removed from the set.
func getIDs(snap *raftpb.Snapshot, ents []raftpb.Entry) []uint64 {
	ids := make(map[uint64]bool)
	if snap != nil {
		for _, id := range snap.Metadata.ConfState.Nodes {
			ids[id] = true
		}
	}
	for _, e := range ents {
		if e.Type != raftpb.EntryConfChange {
			continue
		}
		if snap != nil && e.Index < snap.Metadata.Index {
			continue
		}
		var cc raftpb.ConfChange
		if err := cc.Unmarshal(e.Data); err != nil {
			log.G(context.Background()).Panicf("unmarshal configuration change should never fail: %v", err)
		}
		switch cc.Type {
		case raftpb.ConfChangeAddNode:
			ids[cc.NodeID] = true
		case raftpb.ConfChangeRemoveNode:
			delete(ids, cc.NodeID)
		case raftpb.ConfChangeUpdateNode:
			// do nothing
		default:
			log.G(context.Background()).Panic("ConfChange Type should be either ConfChangeAddNode or ConfChangeRemoveNode!")
		}
	}
	var sids []uint64
	for id := range ids {
		sids = append(sids, id)
	}
	return sids
}
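// exampleGetIDs is an illustrative sketch added for this edit, not part of the
// original source: it shows how getIDs folds a snapshot's membership with
// later conf-change entries. Starting from members {1, 2} recorded in the
// snapshot, a ConfChangeRemoveNode entry for node 2 at a higher index leaves
// only node 1 in the result. The raftpb types are the ones used above;
// exampleGetIDs itself is hypothetical.
func exampleGetIDs() []uint64 {
	cc := raftpb.ConfChange{Type: raftpb.ConfChangeRemoveNode, NodeID: 2}
	data, err := cc.Marshal()
	if err != nil {
		panic(err)
	}
	snap := &raftpb.Snapshot{
		Metadata: raftpb.SnapshotMetadata{
			Index:     5,
			ConfState: raftpb.ConfState{Nodes: []uint64{1, 2}},
		},
	}
	ents := []raftpb.Entry{{Type: raftpb.EntryConfChange, Index: 6, Data: data}}
	return getIDs(snap, ents) // expected to contain only node 1
}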
func (u *Updater) pauseUpdate(ctx context.Context, serviceID, message string) {
	log.G(ctx).Debugf("pausing update of service %s", serviceID)

	err := u.store.Update(func(tx store.Tx) error {
		service := store.GetService(tx, serviceID)
		if service == nil {
			return nil
		}
		if service.UpdateStatus == nil {
			// The service was updated since we started this update
			return nil
		}

		if service.UpdateStatus.State == api.UpdateStatus_ROLLBACK_STARTED {
			service.UpdateStatus.State = api.UpdateStatus_ROLLBACK_PAUSED
		} else {
			service.UpdateStatus.State = api.UpdateStatus_PAUSED
		}
		service.UpdateStatus.Message = message

		return store.UpdateService(tx, service)
	})
	if err != nil {
		log.G(ctx).WithError(err).Errorf("failed to pause update of service %s", serviceID)
	}
}
// start begins the session and returns the first SessionMessage.
func (s *session) start(ctx context.Context) error {
	log.G(ctx).Debugf("(*session).start")

	client := api.NewDispatcherClient(s.agent.config.Conn)

	description, err := s.agent.config.Executor.Describe(ctx)
	if err != nil {
		log.G(ctx).WithError(err).WithField("executor", s.agent.config.Executor).
			Errorf("node description unavailable")
		return err
	}
	// Override hostname
	if s.agent.config.Hostname != "" {
		description.Hostname = s.agent.config.Hostname
	}

	stream, err := client.Session(ctx, &api.SessionRequest{
		Description: description,
	})
	if err != nil {
		return err
	}

	msg, err := stream.Recv()
	if err != nil {
		return err
	}

	s.sessionID = msg.SessionID
	s.session = stream

	return s.handleSessionMessage(ctx, msg)
}
func (a *Allocator) procUnallocatedServices(ctx context.Context) {
	nc := a.netCtx
	var allocatedServices []*api.Service
	for _, s := range nc.unallocatedServices {
		if !nc.nwkAllocator.IsServiceAllocated(s) {
			if err := a.allocateService(ctx, s); err != nil {
				log.G(ctx).WithError(err).Debugf("Failed allocation of unallocated service %s", s.ID)
				continue
			}
			allocatedServices = append(allocatedServices, s)
		}
	}

	if len(allocatedServices) == 0 {
		return
	}

	committed, err := a.store.Batch(func(batch *store.Batch) error {
		for _, s := range allocatedServices {
			if err := a.commitAllocatedService(ctx, batch, s); err != nil {
				log.G(ctx).WithError(err).Debugf("Failed to commit allocation of unallocated service %s", s.ID)
				continue
			}
		}
		return nil
	})
	if err != nil {
		log.G(ctx).WithError(err).Error("Failed to commit allocation of unallocated services")
	}

	for _, s := range allocatedServices[:committed] {
		delete(nc.unallocatedServices, s.ID)
	}
}
// reconcileServiceOneNode checks one service on one node
func (g *GlobalOrchestrator) reconcileServiceOneNode(ctx context.Context, serviceID string, nodeID string) {
	_, exists := g.nodes[nodeID]
	if !exists {
		return
	}
	service, exists := g.globalServices[serviceID]
	if !exists {
		return
	}
	// whether the node has completed this service
	completed := false
	// tasks for this node and service
	var (
		tasks []*api.Task
		err   error
	)
	g.store.View(func(tx store.ReadTx) {
		var tasksOnNode []*api.Task
		tasksOnNode, err = store.FindTasks(tx, store.ByNodeID(nodeID))
		if err != nil {
			return
		}
		for _, t := range tasksOnNode {
			// only interested in one service
			if t.ServiceID != serviceID {
				continue
			}
			if isTaskRunning(t) {
				tasks = append(tasks, t)
			} else {
				if isTaskCompleted(t, restartCondition(t)) {
					completed = true
				}
			}
		}
	})
	if err != nil {
		log.G(ctx).WithError(err).Errorf("global orchestrator: reconcile failed finding tasks")
		return
	}

	_, err = g.store.Batch(func(batch *store.Batch) error {
		// if the restart policy considers this node to have finished its
		// task, remove all running tasks
		if completed {
			g.removeTasks(ctx, batch, service, tasks)
			return nil
		}
		// this node needs to run 1 copy of the task
		if len(tasks) == 0 {
			g.addTask(ctx, batch, service, nodeID)
		} else {
			g.removeTasks(ctx, batch, service, tasks[1:])
		}
		return nil
	})
	if err != nil {
		log.G(ctx).WithError(err).Errorf("global orchestrator: reconcileServiceOneNode batch failed")
	}
}
// updateCluster is called when there are cluster changes, and it ensures that the local RootCA is
// always aware of changes in clusterExpiry and the Root CA key material
func (s *Server) updateCluster(ctx context.Context, cluster *api.Cluster) {
	s.mu.Lock()
	s.joinTokens = cluster.RootCA.JoinTokens.Copy()
	s.mu.Unlock()
	var err error

	// If the cluster has a RootCA, let's try to update our SecurityConfig to reflect the latest values
	rCA := cluster.RootCA
	if len(rCA.CACert) != 0 && len(rCA.CAKey) != 0 {
		expiry := DefaultNodeCertExpiration
		if cluster.Spec.CAConfig.NodeCertExpiry != nil {
			// NodeCertExpiry exists, let's try to parse the duration out of it
			clusterExpiry, err := ptypes.Duration(cluster.Spec.CAConfig.NodeCertExpiry)
			if err != nil {
				log.G(ctx).WithFields(logrus.Fields{
					"cluster.id": cluster.ID,
					"method":     "(*Server).updateCluster",
				}).WithError(err).Warn("failed to parse certificate expiration, using default")
			} else {
				// We were able to successfully parse the expiration out of the cluster.
				expiry = clusterExpiry
			}
		} else {
			// NodeCertExpiry is nil, fall back to the default expiration
			log.G(ctx).WithFields(logrus.Fields{
				"cluster.id": cluster.ID,
				"method":     "(*Server).updateCluster",
			}).Warn("no certificate expiration specified, using default")
		}
		// Attempt to update our local RootCA with the new parameters
		err = s.securityConfig.UpdateRootCA(rCA.CACert, rCA.CAKey, expiry)
		if err != nil {
			log.G(ctx).WithFields(logrus.Fields{
				"cluster.id": cluster.ID,
				"method":     "(*Server).updateCluster",
			}).WithError(err).Error("updating Root CA failed")
		} else {
			log.G(ctx).WithFields(logrus.Fields{
				"cluster.id": cluster.ID,
				"method":     "(*Server).updateCluster",
			}).Debugf("Root CA updated successfully")
		}
	}

	// Update our security config with the list of External CA URLs
	// from the new cluster state.
	// TODO(aaronl): In the future, this will be abstracted with an
	// ExternalCA interface that has different implementations for
	// different CA types. At the moment, only CFSSL is supported.
	var cfsslURLs []string
	for _, ca := range cluster.Spec.CAConfig.ExternalCAs {
		if ca.Protocol == api.ExternalCA_CAProtocolCFSSL {
			cfsslURLs = append(cfsslURLs, ca.URL)
		}
	}
	s.securityConfig.externalCA.UpdateURLs(cfsslURLs...)
}
// Init prepares the worker for assignments.
func (w *worker) Init(ctx context.Context) error {
	w.mu.Lock()
	defer w.mu.Unlock()

	ctx = log.WithModule(ctx, "worker")

	// TODO(stevvooe): Start task cleanup process.

	// read the tasks from the database and start any task managers that may be needed.
	return w.db.Update(func(tx *bolt.Tx) error {
		return WalkTasks(tx, func(task *api.Task) error {
			if !TaskAssigned(tx, task.ID) {
				// NOTE(stevvooe): If tasks can survive worker restart, we need
				// to start up the controller and ensure they are removed. For
				// now, we can simply remove them from the database.
				if err := DeleteTask(tx, task.ID); err != nil {
					log.G(ctx).WithError(err).Errorf("error removing task %v", task.ID)
				}
				return nil
			}

			status, err := GetTaskStatus(tx, task.ID)
			if err != nil {
				log.G(ctx).WithError(err).Error("unable to read task status")
				return nil
			}

			task.Status = *status // merges the status into the task, ensuring we start at the right point.
			return w.startTask(ctx, tx, task)
		})
	})
}
func (u *Updater) worker(ctx context.Context, queue <-chan *api.Task) {
	for t := range queue {
		updated := newTask(u.cluster, u.newService, t.Slot)
		updated.DesiredState = api.TaskStateReady
		if isGlobalService(u.newService) {
			updated.NodeID = t.NodeID
		}

		if err := u.updateTask(ctx, t, updated); err != nil {
			log.G(ctx).WithError(err).WithField("task.id", t.ID).Error("update failed")
		}

		if u.newService.Spec.Update != nil && (u.newService.Spec.Update.Delay.Seconds != 0 || u.newService.Spec.Update.Delay.Nanos != 0) {
			delay, err := ptypes.Duration(&u.newService.Spec.Update.Delay)
			if err != nil {
				log.G(ctx).WithError(err).Error("invalid update delay")
				continue
			}
			select {
			case <-time.After(delay):
			case <-u.stopChan:
				return
			}
		}
	}
}
// events issues a call to the events API and returns a channel with all
// events. The stream of events can be shut down by cancelling the context.
func (c *containerAdapter) events(ctx context.Context) <-chan events.Message {
	log.G(ctx).Debugf("waiting on events")
	buffer, l := c.backend.SubscribeToEvents(time.Time{}, time.Time{}, c.container.eventFilter())
	eventsq := make(chan events.Message, len(buffer))

	for _, event := range buffer {
		eventsq <- event
	}

	go func() {
		defer c.backend.UnsubscribeFromEvents(l)

		for {
			select {
			case ev := <-l:
				jev, ok := ev.(events.Message)
				if !ok {
					log.G(ctx).Warnf("unexpected event message: %q", ev)
					continue
				}
				select {
				case eventsq <- jev:
				case <-ctx.Done():
					return
				}
			case <-ctx.Done():
				return
			}
		}
	}()

	return eventsq
}
// UpdateTaskStatus attempts to send a task status update over the current session,
// blocking until the operation is completed.
//
// If an error is returned, the operation should be retried.
func (a *Agent) UpdateTaskStatus(ctx context.Context, taskID string, status *api.TaskStatus) error {
	log.G(ctx).WithField("task.id", taskID).Debugf("(*Agent).UpdateTaskStatus")
	ctx, cancel := context.WithCancel(ctx)
	defer cancel()

	errs := make(chan error, 1)
	if err := a.withSession(ctx, func(session *session) error {
		go func() {
			err := session.sendTaskStatus(ctx, taskID, status)
			if err != nil {
				if err == errTaskUnknown {
					err = nil // dispatcher no longer cares about this task.
				} else {
					log.G(ctx).WithError(err).Error("sending task status update failed")
				}
			} else {
				log.G(ctx).Debug("task status reported")
			}

			errs <- err
		}()
		return nil
	}); err != nil {
		return err
	}

	select {
	case err := <-errs:
		return err
	case <-ctx.Done():
		return ctx.Err()
	}
}
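// reportStatusWithRetry is a hypothetical caller sketch added for this edit,
// not part of the original source: the comment on UpdateTaskStatus says a
// returned error means the operation should be retried, so this wraps the
// call in a simple retry loop with a fixed one-second backoff (the backoff
// policy is an assumption, not something the source prescribes).
func reportStatusWithRetry(ctx context.Context, a *Agent, taskID string, status *api.TaskStatus) error {
	for {
		err := a.UpdateTaskStatus(ctx, taskID, status)
		if err == nil {
			return nil
		}
		log.G(ctx).WithError(err).Warn("task status update failed, retrying")
		select {
		case <-time.After(time.Second):
		case <-ctx.Done():
			return ctx.Err()
		}
	}
}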
func (a *Allocator) procUnallocatedNetworks(ctx context.Context) {
	nc := a.netCtx
	var allocatedNetworks []*api.Network
	for _, n := range nc.unallocatedNetworks {
		if !nc.nwkAllocator.IsAllocated(n) {
			if err := a.allocateNetwork(ctx, n); err != nil {
				log.G(ctx).WithError(err).Debugf("Failed allocation of unallocated network %s", n.ID)
				continue
			}
			allocatedNetworks = append(allocatedNetworks, n)
		}
	}

	if len(allocatedNetworks) == 0 {
		return
	}

	committed, err := a.store.Batch(func(batch *store.Batch) error {
		for _, n := range allocatedNetworks {
			if err := a.commitAllocatedNetwork(ctx, batch, n); err != nil {
				log.G(ctx).WithError(err).Debugf("Failed to commit allocation of unallocated network %s", n.ID)
				continue
			}
		}
		return nil
	})
	if err != nil {
		log.G(ctx).WithError(err).Error("Failed to commit allocation of unallocated networks")
	}

	for _, n := range allocatedNetworks[:committed] {
		delete(nc.unallocatedNetworks, n.ID)
	}
}
func (u *Updater) completeUpdate(ctx context.Context, serviceID string) {
	log.G(ctx).Debugf("update of service %s complete", serviceID)

	err := u.store.Update(func(tx store.Tx) error {
		service := store.GetService(tx, serviceID)
		if service == nil {
			return nil
		}
		if service.UpdateStatus == nil {
			// The service was changed since we started this update
			return nil
		}
		if service.UpdateStatus.State == api.UpdateStatus_ROLLBACK_STARTED {
			service.UpdateStatus.State = api.UpdateStatus_ROLLBACK_COMPLETED
			service.UpdateStatus.Message = "rollback completed"
		} else {
			service.UpdateStatus.State = api.UpdateStatus_COMPLETED
			service.UpdateStatus.Message = "update completed"
		}
		service.UpdateStatus.CompletedAt = ptypes.MustTimestampProto(time.Now())

		return store.UpdateService(tx, service)
	})
	if err != nil {
		log.G(ctx).WithError(err).Errorf("failed to mark update of service %s complete", serviceID)
	}
}
// start begins the session and returns the first SessionMessage.
func (s *session) start(ctx context.Context) error {
	log.G(ctx).Debugf("(*session).start")

	description, err := s.agent.config.Executor.Describe(ctx)
	if err != nil {
		log.G(ctx).WithError(err).WithField("executor", s.agent.config.Executor).
			Errorf("node description unavailable")
		return err
	}
	// Override hostname
	if s.agent.config.Hostname != "" {
		description.Hostname = s.agent.config.Hostname
	}

	errChan := make(chan error, 1)
	var (
		msg    *api.SessionMessage
		stream api.Dispatcher_SessionClient
	)
	// Note: we don't defer cancellation of this context, because the
	// streaming RPC is used after this function returned. We only cancel
	// it in the timeout case to make sure the goroutine completes.
	sessionCtx, cancelSession := context.WithCancel(ctx)

	// Need to run Session in a goroutine since there's no way to set a
	// timeout for an individual Recv call in a stream.
	go func() {
		client := api.NewDispatcherClient(s.conn)

		stream, err = client.Session(sessionCtx, &api.SessionRequest{
			Description: description,
			SessionID:   s.sessionID,
		})
		if err != nil {
			errChan <- err
			return
		}

		msg, err = stream.Recv()
		errChan <- err
	}()

	select {
	case err := <-errChan:
		if err != nil {
			return err
		}
	case <-time.After(dispatcherRPCTimeout):
		cancelSession()
		return errors.New("session initiation timed out")
	}

	s.sessionID = msg.SessionID
	s.session = stream

	return s.handleSessionMessage(ctx, msg)
}
// Stop stops the manager. It immediately closes all open connections and
// active RPCs as well as stopping the scheduler.
func (m *Manager) Stop(ctx context.Context) {
	log.G(ctx).Info("Stopping manager")
	// It's not safe to start shutting down while the manager is still
	// starting up.
	<-m.started

	// the mutex stops us from trying to stop while we're already stopping, or
	// from returning before we've finished stopping.
	m.mu.Lock()
	defer m.mu.Unlock()

	select {
	// check to see that we've already stopped
	case <-m.stopped:
		return
	default:
		// do nothing, we're stopping for the first time
	}

	// once we start stopping, send a signal that we're doing so. this tells
	// Run that we've started stopping when it gets the error from errServe;
	// it also prevents the loop from processing any more stuff.
	close(m.stopped)

	m.Dispatcher.Stop()
	m.caserver.Stop()

	if m.allocator != nil {
		m.allocator.Stop()
	}
	if m.replicatedOrchestrator != nil {
		m.replicatedOrchestrator.Stop()
	}
	if m.globalOrchestrator != nil {
		m.globalOrchestrator.Stop()
	}
	if m.taskReaper != nil {
		m.taskReaper.Stop()
	}
	if m.scheduler != nil {
		m.scheduler.Stop()
	}
	if m.keyManager != nil {
		m.keyManager.Stop()
	}

	if m.connSelector != nil {
		m.connSelector.Stop()
	}
	m.RaftNode.Shutdown()
	// some time after this point, Run will receive an error from one of these

	m.server.Stop()
	m.localserver.Stop()

	log.G(ctx).Info("Manager shut down")
	// mutex is released and Run can return now
}
func (r *ReplicatedOrchestrator) reconcile(ctx context.Context, service *api.Service) {
	var (
		tasks []*api.Task
		err   error
	)
	r.store.View(func(tx store.ReadTx) {
		tasks, err = store.FindTasks(tx, store.ByServiceID(service.ID))
	})
	if err != nil {
		log.G(ctx).WithError(err).Errorf("reconcile failed finding tasks")
		return
	}

	runningTasks := make([]*api.Task, 0, len(tasks))
	runningInstances := make(map[uint64]struct{}) // this could be a bitfield...
	for _, t := range tasks {
		// Technically the check below could just be
		// t.DesiredState <= api.TaskStateRunning, but ignoring tasks
		// with DesiredState == NEW simplifies the drainer unit tests.
		if t.DesiredState > api.TaskStateNew && t.DesiredState <= api.TaskStateRunning {
			runningTasks = append(runningTasks, t)
			runningInstances[t.Slot] = struct{}{}
		}
	}
	numTasks := len(runningTasks)

	deploy := service.Spec.GetMode().(*api.ServiceSpec_Replicated)
	specifiedInstances := int(deploy.Replicated.Replicas)

	// TODO(aaronl): Add support for restart delays.

	_, err = r.store.Batch(func(batch *store.Batch) error {
		switch {
		case specifiedInstances > numTasks:
			log.G(ctx).Debugf("Service %s was scaled up from %d to %d instances", service.ID, numTasks, specifiedInstances)
			// Update all current tasks then add missing tasks
			r.updater.Update(ctx, service, runningTasks)
			r.addTasks(ctx, batch, service, runningInstances, specifiedInstances-numTasks)

		case specifiedInstances < numTasks:
			// Update up to N tasks then remove the extra
			log.G(ctx).Debugf("Service %s was scaled down from %d to %d instances", service.ID, numTasks, specifiedInstances)
			r.updater.Update(ctx, service, runningTasks[:specifiedInstances])
			r.removeTasks(ctx, batch, service, runningTasks[specifiedInstances:])

		case specifiedInstances == numTasks:
			// Simple update, no scaling - update all tasks.
			r.updater.Update(ctx, service, runningTasks)
		}
		return nil
	})
	if err != nil {
		log.G(ctx).WithError(err).Errorf("reconcile batch failed")
	}
}
func (g *GlobalOrchestrator) reconcileOneService(ctx context.Context, service *api.Service) {
	var (
		tasks []*api.Task
		err   error
	)
	g.store.View(func(tx store.ReadTx) {
		tasks, err = store.FindTasks(tx, store.ByServiceID(service.ID))
	})
	if err != nil {
		log.G(ctx).WithError(err).Errorf("global orchestrator: reconcileOneService failed finding tasks")
		return
	}

	// a node may have completed this service
	nodeCompleted := make(map[string]struct{})
	// nodeID -> task list
	nodeTasks := make(map[string][]*api.Task)

	for _, t := range tasks {
		if isTaskRunning(t) {
			// Collect all running instances of this service
			nodeTasks[t.NodeID] = append(nodeTasks[t.NodeID], t)
		} else {
			// for finished tasks, check restartPolicy
			if isTaskCompleted(t, restartCondition(t)) {
				nodeCompleted[t.NodeID] = struct{}{}
			}
		}
	}

	_, err = g.store.Batch(func(batch *store.Batch) error {
		var updateTasks []*api.Task
		for nodeID := range g.nodes {
			ntasks := nodeTasks[nodeID]
			// if restart policy considers this node has finished its task
			// it should remove all running tasks
			if _, exists := nodeCompleted[nodeID]; exists {
				g.removeTasks(ctx, batch, service, ntasks)
				return nil
			}
			// this node needs to run 1 copy of the task
			if len(ntasks) == 0 {
				g.addTask(ctx, batch, service, nodeID)
			} else {
				updateTasks = append(updateTasks, ntasks[0])
				g.removeTasks(ctx, batch, service, ntasks[1:])
			}
		}
		if len(updateTasks) > 0 {
			g.updater.Update(ctx, service, updateTasks)
		}
		return nil
	})
	if err != nil {
		log.G(ctx).WithError(err).Errorf("global orchestrator: reconcileOneService transaction failed")
	}
}
func (u *Updater) worker(ctx context.Context, queue <-chan slot) {
	for slot := range queue {
		// Do we have a task with the new spec in desired state = RUNNING?
		// If so, all we have to do to complete the update is remove the
		// other tasks. Or if we have a task with the new spec that has
		// desired state < RUNNING, advance it to running and remove the
		// other tasks.
		var (
			runningTask *api.Task
			cleanTask   *api.Task
		)
		for _, t := range slot {
			if !u.isTaskDirty(t) {
				if t.DesiredState == api.TaskStateRunning {
					runningTask = t
					break
				}
				if t.DesiredState < api.TaskStateRunning {
					cleanTask = t
				}
			}
		}
		if runningTask != nil {
			if err := u.useExistingTask(ctx, slot, runningTask); err != nil {
				log.G(ctx).WithError(err).Error("update failed")
			}
		} else if cleanTask != nil {
			if err := u.useExistingTask(ctx, slot, cleanTask); err != nil {
				log.G(ctx).WithError(err).Error("update failed")
			}
		} else {
			updated := newTask(u.cluster, u.newService, slot[0].Slot)
			updated.DesiredState = api.TaskStateReady
			if isGlobalService(u.newService) {
				updated.NodeID = slot[0].NodeID
			}

			if err := u.updateTask(ctx, slot, updated); err != nil {
				log.G(ctx).WithError(err).WithField("task.id", updated.ID).Error("update failed")
			}
		}

		if u.newService.Spec.Update != nil && (u.newService.Spec.Update.Delay.Seconds != 0 || u.newService.Spec.Update.Delay.Nanos != 0) {
			delay, err := ptypes.Duration(&u.newService.Spec.Update.Delay)
			if err != nil {
				log.G(ctx).WithError(err).Error("invalid update delay")
				continue
			}
			select {
			case <-time.After(delay):
			case <-u.stopChan:
				return
			}
		}
	}
}
// CreateSecurityConfig creates a new key and cert for this node, either locally
// or via a remote CA.
func (rootCA RootCA) CreateSecurityConfig(ctx context.Context, krw *KeyReadWriter, config CertificateRequestConfig) (*SecurityConfig, error) {
	ctx = log.WithModule(ctx, "tls")

	var (
		tlsKeyPair *tls.Certificate
		err        error
	)

	if rootCA.CanSign() {
		// Create a new random ID for this certificate
		cn := identity.NewID()
		org := identity.NewID()

		proposedRole := ManagerRole
		tlsKeyPair, err = rootCA.IssueAndSaveNewCertificates(krw, cn, proposedRole, org)
		if err != nil {
			log.G(ctx).WithFields(logrus.Fields{
				"node.id":   cn,
				"node.role": proposedRole,
			}).WithError(err).Errorf("failed to issue and save new certificate")
			return nil, err
		}

		log.G(ctx).WithFields(logrus.Fields{
			"node.id":   cn,
			"node.role": proposedRole,
		}).Debug("issued new TLS certificate")
	} else {
		// Request certificate issuance from a remote CA; at this point we
		// don't have any valid TLS credentials of our own yet.
		tlsKeyPair, err = rootCA.RequestAndSaveNewCertificates(ctx, krw, config)
		if err != nil {
			log.G(ctx).WithError(err).Error("failed to request and save new certificate")
			return nil, err
		}
	}

	// Create the Server TLS Credentials for this node. These will not be used by workers.
	serverTLSCreds, err := rootCA.NewServerTLSCredentials(tlsKeyPair)
	if err != nil {
		return nil, err
	}

	// Create a TLSConfig to be used when this node connects as a client to another remote node.
	// We're using ManagerRole as remote serverName for TLS host verification
	clientTLSCreds, err := rootCA.NewClientTLSCredentials(tlsKeyPair, ManagerRole)
	if err != nil {
		return nil, err
	}
	log.G(ctx).WithFields(logrus.Fields{
		"node.id":   clientTLSCreds.NodeID(),
		"node.role": clientTLSCreds.Role(),
	}).Debugf("new node credentials generated: %s", krw.Target())

	return NewSecurityConfig(&rootCA, krw, clientTLSCreds, serverTLSCreds), nil
}
// events issues a call to the events API and returns a channel with all
// events. The stream of events can be shut down by cancelling the context.
//
// A chan struct{} is returned that will be closed if the event processing
// fails and needs to be restarted.
func (c *containerAdapter) events(ctx context.Context) (<-chan events.Message, <-chan struct{}, error) {
	// TODO(stevvooe): Move this to a single, global event dispatch. For
	// now, we create a connection per container.
	var (
		eventsq = make(chan events.Message)
		closed  = make(chan struct{})
	)

	log.G(ctx).Debugf("waiting on events")
	// TODO(stevvooe): For long running tasks, it is likely that we will have
	// to restart this under failure.
	rc, err := c.client.Events(ctx, types.EventsOptions{
		Since:   "0",
		Filters: c.container.eventFilter(),
	})
	if err != nil {
		return nil, nil, err
	}

	go func(rc io.ReadCloser) {
		defer rc.Close()
		defer close(closed)

		select {
		case <-ctx.Done():
			// exit
			return
		default:
		}

		dec := json.NewDecoder(rc)

		for {
			var event events.Message
			if err := dec.Decode(&event); err != nil {
				// TODO(stevvooe): This error handling isn't quite right.
				if err == io.EOF {
					return
				}

				log.G(ctx).Errorf("error decoding event: %v", err)
				return
			}

			select {
			case eventsq <- event:
			case <-ctx.Done():
				return
			}
		}
	}(rc)

	return eventsq, closed, nil
}
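// consumeEvents is a hypothetical consumer sketch added for this edit, not
// part of the original source: it shows one way to use the two channels
// returned by events above. The closed channel signals that event decoding
// failed and the stream should be re-established by the caller; the function
// name and the restart policy are assumptions.
func consumeEvents(ctx context.Context, c *containerAdapter) error {
	eventsq, closed, err := c.events(ctx)
	if err != nil {
		return err
	}
	for {
		select {
		case ev := <-eventsq:
			log.G(ctx).Debugf("container event: %s", ev.Action)
		case <-closed:
			// the decoder goroutine exited; the caller is expected to call
			// events again to restart the stream
			return errors.New("event stream closed")
		case <-ctx.Done():
			return ctx.Err()
		}
	}
}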
func (a *Agent) handleSessionMessage(ctx context.Context, message *api.SessionMessage) error {
	seen := map[api.Peer]struct{}{}
	for _, manager := range message.Managers {
		if manager.Peer.Addr == "" {
			log.G(ctx).WithField("manager.addr", manager.Peer.Addr).
				Warnf("skipping bad manager address")
			continue
		}

		a.config.Managers.Observe(*manager.Peer, int(manager.Weight))
		seen[*manager.Peer] = struct{}{}
	}

	if message.Node != nil {
		if a.node == nil || !nodesEqual(a.node, message.Node) {
			if a.config.NotifyRoleChange != nil {
				a.config.NotifyRoleChange <- message.Node.Spec.Role
			}
			a.node = message.Node.Copy()
			if err := a.config.Executor.Configure(ctx, a.node); err != nil {
				log.G(ctx).WithError(err).Error("node configure failed")
			}
		}
	}

	// prune managers not in list.
	for peer := range a.config.Managers.Weights() {
		if _, ok := seen[peer]; !ok {
			a.config.Managers.Remove(peer)
		}
	}

	if message.NetworkBootstrapKeys == nil {
		return nil
	}

	for _, key := range message.NetworkBootstrapKeys {
		same := false
		for _, agentKey := range a.keys {
			if agentKey.LamportTime == key.LamportTime {
				same = true
			}
		}
		if !same {
			a.keys = message.NetworkBootstrapKeys
			if err := a.config.Executor.SetNetworkBootstrapKeys(a.keys); err != nil {
				panic(fmt.Errorf("configuring network key failed"))
			}
		}
	}

	return nil
}
func releasePools(ipam ipamapi.Ipam, icList []*api.IPAMConfig, pools map[string]string) {
	for _, ic := range icList {
		if err := ipam.ReleaseAddress(pools[ic.Subnet], net.ParseIP(ic.Gateway)); err != nil {
			log.G(context.TODO()).Errorf("Failed to release address %s: %v", ic.Subnet, err)
		}
	}

	for k, p := range pools {
		if err := ipam.ReleasePool(p); err != nil {
			log.G(context.TODO()).Errorf("Failed to release pool %s: %v", k, err)
		}
	}
}
// createConfigChangeEnts creates a series of Raft entries (i.e.
// EntryConfChange) to remove the set of given IDs from the cluster. The ID
// `self` is _not_ removed, even if present in the set.
// If `self` is not inside the given ids, it creates a Raft entry to add a
// default member with the given `self`.
func createConfigChangeEnts(ids []uint64, self uint64, term, index uint64) []raftpb.Entry {
	var ents []raftpb.Entry
	next := index + 1
	found := false
	for _, id := range ids {
		if id == self {
			found = true
			continue
		}
		cc := &raftpb.ConfChange{
			Type:   raftpb.ConfChangeRemoveNode,
			NodeID: id,
		}
		data, err := cc.Marshal()
		if err != nil {
			log.G(context.Background()).Panicf("marshal configuration change should never fail: %v", err)
		}
		e := raftpb.Entry{
			Type:  raftpb.EntryConfChange,
			Data:  data,
			Term:  term,
			Index: next,
		}
		ents = append(ents, e)
		next++
	}
	if !found {
		node := &api.RaftMember{RaftID: self}
		meta, err := node.Marshal()
		if err != nil {
			log.G(context.Background()).Panicf("marshal member should never fail: %v", err)
		}
		cc := &raftpb.ConfChange{
			Type:    raftpb.ConfChangeAddNode,
			NodeID:  self,
			Context: meta,
		}
		data, err := cc.Marshal()
		if err != nil {
			log.G(context.Background()).Panicf("marshal configuration change should never fail: %v", err)
		}
		e := raftpb.Entry{
			Type:  raftpb.EntryConfChange,
			Data:  data,
			Term:  term,
			Index: next,
		}
		ents = append(ents, e)
	}
	return ents
}
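// exampleCreateConfigChangeEnts is an illustrative sketch added for this edit,
// not part of the original source: with members {1, 2, 3} and self = 2, the
// helper above yields two ConfChangeRemoveNode entries (for nodes 1 and 3) at
// indexes 11 and 12, and no ConfChangeAddNode entry because self is already
// in the set.
func exampleCreateConfigChangeEnts() []raftpb.Entry {
	return createConfigChangeEnts([]uint64{1, 2, 3}, 2, 1, 10)
}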
// register is used for registration of a node with a particular dispatcher.
func (d *Dispatcher) register(ctx context.Context, nodeID string, description *api.NodeDescription) (string, error) {
	// prevent register until we're ready to accept it
	if err := d.isRunningLocked(); err != nil {
		return "", err
	}

	if err := d.nodes.CheckRateLimit(nodeID); err != nil {
		return "", err
	}

	// create or update node in store
	// TODO(stevvooe): Validate node specification.
	var node *api.Node
	err := d.store.Update(func(tx store.Tx) error {
		node = store.GetNode(tx, nodeID)
		if node == nil {
			return ErrNodeNotFound
		}

		node.Description = description
		node.Status = api.NodeStatus{
			State: api.NodeStatus_READY,
		}
		return store.UpdateNode(tx, node)
	})
	if err != nil {
		return "", err
	}

	expireFunc := func() {
		nodeStatus := api.NodeStatus{State: api.NodeStatus_DOWN, Message: "heartbeat failure"}
		log.G(ctx).Debugf("heartbeat expiration")
		if err := d.nodeRemove(nodeID, nodeStatus); err != nil {
			log.G(ctx).WithError(err).Errorf("failed deregistering node after heartbeat expiration")
		}
	}

	rn := d.nodes.Add(node, expireFunc)

	// NOTE(stevvooe): We need to be a little careful with re-registration. The
	// current implementation just matches the node id and then gives away the
	// sessionID. If we ever want to use sessionID as a secret, which we may
	// want to, this is giving away the keys to the kitchen.
	//
	// The right behavior is going to be informed by identity. Basically, each
	// time a node registers, we invalidate the session and issue a new
	// session, once identity is proven. This will cause misbehaved agents to
	// be kicked when multiple connections are made.
	return rn.SessionID, nil
}
// Run starts the keymanager; it doesn't return until the key manager is stopped.
func (k *KeyManager) Run(ctx context.Context) error {
	k.mu.Lock()
	ctx = log.WithModule(ctx, "keymanager")
	var (
		clusters []*api.Cluster
		err      error
	)
	k.store.View(func(readTx store.ReadTx) {
		clusters, err = store.FindClusters(readTx, store.ByName(k.config.ClusterName))
	})
	if err != nil {
		log.G(ctx).Errorf("reading cluster config failed: %v", err)
		k.mu.Unlock()
		return err
	}

	cluster := clusters[0]
	if len(cluster.NetworkBootstrapKeys) == 0 {
		for _, subsys := range k.config.Subsystems {
			for i := 0; i < keyringSize; i++ {
				k.keyRing.keys = append(k.keyRing.keys, k.allocateKey(ctx, subsys))
			}
		}
		if err := k.updateKey(cluster); err != nil {
			log.G(ctx).Errorf("store update failed: %v", err)
		}
	} else {
		k.keyRing.lClock = cluster.EncryptionKeyLamportClock
		k.keyRing.keys = cluster.NetworkBootstrapKeys
		k.rotateKey(ctx)
	}

	ticker := time.NewTicker(k.config.RotationInterval)
	defer ticker.Stop()

	k.ctx, k.cancel = context.WithCancel(ctx)
	k.mu.Unlock()

	for {
		select {
		case <-ticker.C:
			k.rotateKey(ctx)
		case <-k.ctx.Done():
			return nil
		}
	}
}
// updateTaskStatus reports statuses to listeners; the read lock must be held.
func (w *worker) updateTaskStatus(ctx context.Context, tx *bolt.Tx, taskID string, status *api.TaskStatus) error {
	if err := PutTaskStatus(tx, taskID, status); err != nil {
		log.G(ctx).WithError(err).Error("failed writing status to disk")
		return err
	}

	// broadcast the task status out.
	for key := range w.listeners {
		if err := key.StatusReporter.UpdateTaskStatus(ctx, taskID, status); err != nil {
			log.G(ctx).WithError(err).Errorf("failed updating status for reporter %v", key.StatusReporter)
		}
	}

	return nil
}