Example #1
func (u *Updater) useExistingTask(ctx context.Context, slot slot, existing *api.Task) {
	var removeTasks []*api.Task
	for _, t := range slot {
		if t != existing {
			removeTasks = append(removeTasks, t)
		}
	}
	if len(removeTasks) != 0 || existing.DesiredState != api.TaskStateRunning {
		_, err := u.store.Batch(func(batch *store.Batch) error {
			u.removeOldTasks(ctx, batch, removeTasks)

			if existing.DesiredState != api.TaskStateRunning {
				err := batch.Update(func(tx store.Tx) error {
					t := store.GetTask(tx, existing.ID)
					if t == nil {
						return fmt.Errorf("task %s not found while trying to start it", existing.ID)
					}
					if t.DesiredState >= api.TaskStateRunning {
						return fmt.Errorf("task %s was already started when reached by updater", existing.ID)
					}
					t.DesiredState = api.TaskStateRunning
					return store.UpdateTask(tx, t)
				})
				if err != nil {
					log.G(ctx).WithError(err).Errorf("starting task %s failed", existing.ID)
				}
			}
			return nil
		})
		if err != nil {
			log.G(ctx).WithError(err).Error("updater batch transaction failed")
		}
	}
}
Example #2
func deleteServiceTasks(ctx context.Context, s *store.MemoryStore, service *api.Service) {
	var (
		tasks []*api.Task
		err   error
	)
	s.View(func(tx store.ReadTx) {
		tasks, err = store.FindTasks(tx, store.ByServiceID(service.ID))
	})
	if err != nil {
		log.G(ctx).WithError(err).Errorf("failed to list tasks")
		return
	}

	_, err = s.Batch(func(batch *store.Batch) error {
		for _, t := range tasks {
			err := batch.Update(func(tx store.Tx) error {
				if err := store.DeleteTask(tx, t.ID); err != nil {
					log.G(ctx).WithError(err).Errorf("failed to delete task")
				}
				return nil
			})
			if err != nil {
				return err
			}
		}
		return nil
	})
	if err != nil {
		log.G(ctx).WithError(err).Errorf("task search transaction failed")
	}
}
Example #3
func (u *Updater) rollbackUpdate(ctx context.Context, serviceID, message string) {
	log.G(ctx).Debugf("starting rollback of service %s", serviceID)

	var service *api.Service
	err := u.store.Update(func(tx store.Tx) error {
		service = store.GetService(tx, serviceID)
		if service == nil {
			return nil
		}
		if service.UpdateStatus == nil {
			// The service was updated since we started this update
			return nil
		}

		service.UpdateStatus.State = api.UpdateStatus_ROLLBACK_STARTED
		service.UpdateStatus.Message = message

		if service.PreviousSpec == nil {
			return errors.New("cannot roll back service because no previous spec is available")
		}
		service.Spec = *service.PreviousSpec
		service.PreviousSpec = nil

		return store.UpdateService(tx, service)
	})

	if err != nil {
		log.G(ctx).WithError(err).Errorf("failed to start rollback of service %s", serviceID)
		return
	}
}
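The read-modify-write transaction above (and in the pause/complete helpers later in this list) can be distilled into a small helper. The sketch below is hypothetical — the mutate callback is not part of the original code — and assumes the same Updater, store and api packages used above: fetch the service inside store.Update, bail out quietly if it vanished or its UpdateStatus was cleared concurrently, otherwise mutate and write it back in the same transaction.

// updateServiceStatus is a hypothetical distillation of the transactions used
// by rollbackUpdate, pauseUpdate and completeUpdate in these examples.
func (u *Updater) updateServiceStatus(serviceID string, mutate func(*api.Service) error) error {
	return u.store.Update(func(tx store.Tx) error {
		service := store.GetService(tx, serviceID)
		if service == nil {
			// The service was removed; nothing to update.
			return nil
		}
		if service.UpdateStatus == nil {
			// The update we were tracking was superseded.
			return nil
		}
		if err := mutate(service); err != nil {
			return err
		}
		return store.UpdateService(tx, service)
	})
}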
Example #4
func (g *GlobalOrchestrator) removeTasksFromNode(ctx context.Context, node *api.Node) {
	var (
		tasks []*api.Task
		err   error
	)
	g.store.View(func(tx store.ReadTx) {
		tasks, err = store.FindTasks(tx, store.ByNodeID(node.ID))
	})
	if err != nil {
		log.G(ctx).WithError(err).Errorf("global orchestrator: removeTasksFromNode failed finding tasks")
		return
	}

	_, err = g.store.Batch(func(batch *store.Batch) error {
		for _, t := range tasks {
			// GlobalOrchestrator only removes tasks from globalServices
			if _, exists := g.globalServices[t.ServiceID]; exists {
				g.removeTask(ctx, batch, t)
			}
		}
		return nil
	})
	if err != nil {
		log.G(ctx).WithError(err).Errorf("global orchestrator: removeTasksFromNode failed")
	}
}
Example #5
func (na *NetworkAllocator) deallocateVIP(vip *api.Endpoint_VirtualIP) error {
	localNet := na.getNetwork(vip.NetworkID)
	if localNet == nil {
		return fmt.Errorf("networkallocator: could not find local network state")
	}

	ipam, _, err := na.resolveIPAM(localNet.nw)
	if err != nil {
		return fmt.Errorf("failed to resolve IPAM while allocating : %v", err)
	}

	// Retrieve the poolID and immediately nuke
	// out the mapping.
	poolID := localNet.endpoints[vip.Addr]
	delete(localNet.endpoints, vip.Addr)

	ip, _, err := net.ParseCIDR(vip.Addr)
	if err != nil {
		log.G(context.TODO()).Errorf("Could not parse VIP address %s while releasing", vip.Addr)
		return err
	}

	if err := ipam.ReleaseAddress(poolID, ip); err != nil {
		log.G(context.TODO()).Errorf("IPAM failure while releasing VIP address %s: %v", vip.Addr, err)
		return err
	}

	return nil
}
Example #6
// getIDs returns the set of IDs included in the given snapshot and
// the entries. The given snapshot/entries can contain two kinds of
// ID-related entry:
// - ConfChangeAddNode, in which case the contained ID will be added into the set.
// - ConfChangeRemoveNode, in which case the contained ID will be removed from the set.
func getIDs(snap *raftpb.Snapshot, ents []raftpb.Entry) []uint64 {
	ids := make(map[uint64]bool)
	if snap != nil {
		for _, id := range snap.Metadata.ConfState.Nodes {
			ids[id] = true
		}
	}
	for _, e := range ents {
		if e.Type != raftpb.EntryConfChange {
			continue
		}
		if snap != nil && e.Index < snap.Metadata.Index {
			continue
		}
		var cc raftpb.ConfChange
		if err := cc.Unmarshal(e.Data); err != nil {
			log.G(context.Background()).Panicf("unmarshal configuration change should never fail: %v", err)
		}
		switch cc.Type {
		case raftpb.ConfChangeAddNode:
			ids[cc.NodeID] = true
		case raftpb.ConfChangeRemoveNode:
			delete(ids, cc.NodeID)
		case raftpb.ConfChangeUpdateNode:
			// do nothing
		default:
			log.G(context.Background()).Panic("ConfChange Type should be either ConfChangeAddNode or ConfChangeRemoveNode!")
		}
	}
	var sids []uint64
	for id := range ids {
		sids = append(sids, id)
	}
	return sids
}
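As a quick illustration of the semantics described in the comment above, the hypothetical walkthrough below (placed in the same package as getIDs and using the same raftpb import) builds a snapshot whose ConfState contains nodes 1 and 2, plus conf-change entries adding node 3 and removing node 2; getIDs over that input yields the set {1, 3}.

// exampleGetIDs is a hypothetical usage sketch for getIDs.
func exampleGetIDs() []uint64 {
	snap := &raftpb.Snapshot{}
	snap.Metadata.ConfState.Nodes = []uint64{1, 2}
	snap.Metadata.Index = 10

	var ents []raftpb.Entry
	for i, cc := range []raftpb.ConfChange{
		{Type: raftpb.ConfChangeAddNode, NodeID: 3},
		{Type: raftpb.ConfChangeRemoveNode, NodeID: 2},
	} {
		data, err := cc.Marshal()
		if err != nil {
			panic(err)
		}
		ents = append(ents, raftpb.Entry{
			Type:  raftpb.EntryConfChange,
			Data:  data,
			Term:  1,
			Index: snap.Metadata.Index + uint64(i) + 1,
		})
	}

	return getIDs(snap, ents) // contains 1 and 3, in unspecified order
}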
Example #7
func (u *Updater) pauseUpdate(ctx context.Context, serviceID, message string) {
	log.G(ctx).Debugf("pausing update of service %s", serviceID)

	err := u.store.Update(func(tx store.Tx) error {
		service := store.GetService(tx, serviceID)
		if service == nil {
			return nil
		}
		if service.UpdateStatus == nil {
			// The service was updated since we started this update
			return nil
		}

		if service.UpdateStatus.State == api.UpdateStatus_ROLLBACK_STARTED {
			service.UpdateStatus.State = api.UpdateStatus_ROLLBACK_PAUSED
		} else {
			service.UpdateStatus.State = api.UpdateStatus_PAUSED
		}
		service.UpdateStatus.Message = message

		return store.UpdateService(tx, service)
	})

	if err != nil {
		log.G(ctx).WithError(err).Errorf("failed to pause update of service %s", serviceID)
	}
}
Example #8
// start begins the session and returns the first SessionMessage.
func (s *session) start(ctx context.Context) error {
	log.G(ctx).Debugf("(*session).start")

	client := api.NewDispatcherClient(s.agent.config.Conn)

	description, err := s.agent.config.Executor.Describe(ctx)
	if err != nil {
		log.G(ctx).WithError(err).WithField("executor", s.agent.config.Executor).
			Errorf("node description unavailable")
		return err
	}
	// Override hostname
	if s.agent.config.Hostname != "" {
		description.Hostname = s.agent.config.Hostname
	}

	stream, err := client.Session(ctx, &api.SessionRequest{
		Description: description,
	})
	if err != nil {
		return err
	}

	msg, err := stream.Recv()
	if err != nil {
		return err
	}

	s.sessionID = msg.SessionID
	s.session = stream

	return s.handleSessionMessage(ctx, msg)
}
Example #9
func (a *Allocator) procUnallocatedServices(ctx context.Context) {
	nc := a.netCtx
	var allocatedServices []*api.Service
	for _, s := range nc.unallocatedServices {
		if !nc.nwkAllocator.IsServiceAllocated(s) {
			if err := a.allocateService(ctx, s); err != nil {
				log.G(ctx).WithError(err).Debugf("Failed allocation of unallocated service %s", s.ID)
				continue
			}
			allocatedServices = append(allocatedServices, s)
		}
	}

	if len(allocatedServices) == 0 {
		return
	}

	committed, err := a.store.Batch(func(batch *store.Batch) error {
		for _, s := range allocatedServices {
			if err := a.commitAllocatedService(ctx, batch, s); err != nil {
				log.G(ctx).WithError(err).Debugf("Failed to commit allocation of unallocated service %s", s.ID)
				continue
			}
		}
		return nil
	})

	if err != nil {
		log.G(ctx).WithError(err).Error("Failed to commit allocation of unallocated services")
	}

	for _, s := range allocatedServices[:committed] {
		delete(nc.unallocatedServices, s.ID)
	}
}
Example #10
// reconcileServiceOneNode checks one service on one node
func (g *GlobalOrchestrator) reconcileServiceOneNode(ctx context.Context, serviceID string, nodeID string) {
	_, exists := g.nodes[nodeID]
	if !exists {
		return
	}
	service, exists := g.globalServices[serviceID]
	if !exists {
		return
	}
	// the node has completed this service
	completed := false
	// tasks for this node and service
	var (
		tasks []*api.Task
		err   error
	)
	g.store.View(func(tx store.ReadTx) {
		var tasksOnNode []*api.Task
		tasksOnNode, err = store.FindTasks(tx, store.ByNodeID(nodeID))
		if err != nil {
			return
		}
		for _, t := range tasksOnNode {
			// only interested in one service
			if t.ServiceID != serviceID {
				continue
			}
			if isTaskRunning(t) {
				tasks = append(tasks, t)
			} else {
				if isTaskCompleted(t, restartCondition(t)) {
					completed = true
				}
			}
		}
	})
	if err != nil {
		log.G(ctx).WithError(err).Errorf("global orchestrator: reconcile failed finding tasks")
		return
	}

	_, err = g.store.Batch(func(batch *store.Batch) error {
		// if the restart policy considers this node to have finished its
		// task, remove all of its running tasks
		if completed {
			g.removeTasks(ctx, batch, service, tasks)
			return nil
		}
		// this node needs to run 1 copy of the task
		if len(tasks) == 0 {
			g.addTask(ctx, batch, service, nodeID)
		} else {
			g.removeTasks(ctx, batch, service, tasks[1:])
		}
		return nil
	})
	if err != nil {
		log.G(ctx).WithError(err).Errorf("global orchestrator: reconcileServiceOneNode batch failed")
	}
}
Example #11
// updateCluster is called when there are cluster changes, and it ensures that the local RootCA is
// always aware of changes in clusterExpiry and the Root CA key material
func (s *Server) updateCluster(ctx context.Context, cluster *api.Cluster) {
	s.mu.Lock()
	s.joinTokens = cluster.RootCA.JoinTokens.Copy()
	s.mu.Unlock()
	var err error

	// If the cluster has a RootCA, let's try to update our SecurityConfig to reflect the latest values
	rCA := cluster.RootCA
	if len(rCA.CACert) != 0 && len(rCA.CAKey) != 0 {
		expiry := DefaultNodeCertExpiration
		if cluster.Spec.CAConfig.NodeCertExpiry != nil {
			// NodeCertExpiry exists, let's try to parse the duration out of it
			clusterExpiry, err := ptypes.Duration(cluster.Spec.CAConfig.NodeCertExpiry)
			if err != nil {
				log.G(ctx).WithFields(logrus.Fields{
					"cluster.id": cluster.ID,
					"method":     "(*Server).updateCluster",
				}).WithError(err).Warn("failed to parse certificate expiration, using default")
			} else {
				// We were able to successfully parse the expiration out of the cluster.
				expiry = clusterExpiry
			}
		} else {
			// NodeCertExpiry is nil; fall back to the default expiration
			log.G(ctx).WithFields(logrus.Fields{
				"cluster.id": cluster.ID,
				"method":     "(*Server).updateCluster",
			}).Warn("no certificate expiration specified, using default")
		}
		// Attempt to update our local RootCA with the new parameters
		err = s.securityConfig.UpdateRootCA(rCA.CACert, rCA.CAKey, expiry)
		if err != nil {
			log.G(ctx).WithFields(logrus.Fields{
				"cluster.id": cluster.ID,
				"method":     "(*Server).updateCluster",
			}).WithError(err).Error("updating Root CA failed")
		} else {
			log.G(ctx).WithFields(logrus.Fields{
				"cluster.id": cluster.ID,
				"method":     "(*Server).updateCluster",
			}).Debugf("Root CA updated successfully")
		}
	}

	// Update our security config with the list of External CA URLs
	// from the new cluster state.

	// TODO(aaronl): In the future, this will be abstracted with an
	// ExternalCA interface that has different implementations for
	// different CA types. At the moment, only CFSSL is supported.
	var cfsslURLs []string
	for _, ca := range cluster.Spec.CAConfig.ExternalCAs {
		if ca.Protocol == api.ExternalCA_CAProtocolCFSSL {
			cfsslURLs = append(cfsslURLs, ca.URL)
		}
	}

	s.securityConfig.externalCA.UpdateURLs(cfsslURLs...)
}
Example #12
// Init prepares the worker for assignments.
func (w *worker) Init(ctx context.Context) error {
	w.mu.Lock()
	defer w.mu.Unlock()

	ctx = log.WithModule(ctx, "worker")

	// TODO(stevvooe): Start task cleanup process.

	// read the tasks from the database and start any task managers that may be needed.
	return w.db.Update(func(tx *bolt.Tx) error {
		return WalkTasks(tx, func(task *api.Task) error {
			if !TaskAssigned(tx, task.ID) {
				// NOTE(stevvooe): If tasks can survive worker restart, we need
				// to startup the controller and ensure they are removed. For
				// now, we can simply remove them from the database.
				if err := DeleteTask(tx, task.ID); err != nil {
					log.G(ctx).WithError(err).Errorf("error removing task %v", task.ID)
				}
				return nil
			}

			status, err := GetTaskStatus(tx, task.ID)
			if err != nil {
				log.G(ctx).WithError(err).Error("unable to read tasks status")
				return nil
			}

			task.Status = *status // merges the status into the task, ensuring we start at the right point.
			return w.startTask(ctx, tx, task)
		})
	})
}
Example #13
func (u *Updater) worker(ctx context.Context, queue <-chan *api.Task) {
	for t := range queue {
		updated := newTask(u.cluster, u.newService, t.Slot)
		updated.DesiredState = api.TaskStateReady
		if isGlobalService(u.newService) {
			updated.NodeID = t.NodeID
		}

		if err := u.updateTask(ctx, t, updated); err != nil {
			log.G(ctx).WithError(err).WithField("task.id", t.ID).Error("update failed")
		}

		if u.newService.Spec.Update != nil && (u.newService.Spec.Update.Delay.Seconds != 0 || u.newService.Spec.Update.Delay.Nanos != 0) {
			delay, err := ptypes.Duration(&u.newService.Spec.Update.Delay)
			if err != nil {
				log.G(ctx).WithError(err).Error("invalid update delay")
				continue
			}
			select {
			case <-time.After(delay):
			case <-u.stopChan:
				return
			}
		}
	}
}
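The tail of the loop above is a cancellable sleep: wait out the configured update delay unless the updater is stopped first. A minimal standard-library sketch of that select, with hypothetical names mirroring the snippet:

// sleepOrStop waits for delay, but returns early (false) if stopChan fires,
// mirroring the time.After / stopChan select at the end of the worker loop.
func sleepOrStop(delay time.Duration, stopChan <-chan struct{}) bool {
	select {
	case <-time.After(delay):
		return true // waited the full delay
	case <-stopChan:
		return false // updater stopped; abort
	}
}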
Example #14
// events issues a call to the events API and returns a channel with all
// events. The stream of events can be shut down by cancelling the context.
func (c *containerAdapter) events(ctx context.Context) <-chan events.Message {
	log.G(ctx).Debugf("waiting on events")
	buffer, l := c.backend.SubscribeToEvents(time.Time{}, time.Time{}, c.container.eventFilter())
	eventsq := make(chan events.Message, len(buffer))

	for _, event := range buffer {
		eventsq <- event
	}

	go func() {
		defer c.backend.UnsubscribeFromEvents(l)

		for {
			select {
			case ev := <-l:
				jev, ok := ev.(events.Message)
				if !ok {
					log.G(ctx).Warnf("unexpected event message: %q", ev)
					continue
				}
				select {
				case eventsq <- jev:
				case <-ctx.Done():
					return
				}
			case <-ctx.Done():
				return
			}
		}
	}()

	return eventsq
}
Example #15
// UpdateTaskStatus attempts to send a task status update over the current session,
// blocking until the operation is completed.
//
// If an error is returned, the operation should be retried.
func (a *Agent) UpdateTaskStatus(ctx context.Context, taskID string, status *api.TaskStatus) error {
	log.G(ctx).WithField("task.id", taskID).Debugf("(*Agent).UpdateTaskStatus")
	ctx, cancel := context.WithCancel(ctx)
	defer cancel()

	errs := make(chan error, 1)
	if err := a.withSession(ctx, func(session *session) error {
		go func() {
			err := session.sendTaskStatus(ctx, taskID, status)
			if err != nil {
				if err == errTaskUnknown {
					err = nil // dispatcher no longer cares about this task.
				} else {
					log.G(ctx).WithError(err).Error("sending task status update failed")
				}
			} else {
				log.G(ctx).Debug("task status reported")
			}

			errs <- err
		}()

		return nil
	}); err != nil {
		return err
	}

	select {
	case err := <-errs:
		return err
	case <-ctx.Done():
		return ctx.Err()
	}
}
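The doc comment above says a failed update should be retried. A hypothetical retry wrapper might look like the sketch below; the backoff values are assumptions, not part of the original code.

// updateTaskStatusWithRetry keeps calling Agent.UpdateTaskStatus until it
// succeeds or the context is cancelled, backing off between attempts.
func updateTaskStatusWithRetry(ctx context.Context, a *Agent, taskID string, status *api.TaskStatus) error {
	backoff := 100 * time.Millisecond
	for {
		err := a.UpdateTaskStatus(ctx, taskID, status)
		if err == nil {
			return nil
		}
		log.G(ctx).WithError(err).WithField("task.id", taskID).Warn("retrying task status update")

		select {
		case <-time.After(backoff):
		case <-ctx.Done():
			return ctx.Err()
		}
		if backoff < 2*time.Second {
			backoff *= 2
		}
	}
}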
Example #16
func (a *Allocator) procUnallocatedNetworks(ctx context.Context) {
	nc := a.netCtx
	var allocatedNetworks []*api.Network
	for _, n := range nc.unallocatedNetworks {
		if !nc.nwkAllocator.IsAllocated(n) {
			if err := a.allocateNetwork(ctx, n); err != nil {
				log.G(ctx).WithError(err).Debugf("Failed allocation of unallocated network %s", n.ID)
				continue
			}
			allocatedNetworks = append(allocatedNetworks, n)
		}
	}

	if len(allocatedNetworks) == 0 {
		return
	}

	committed, err := a.store.Batch(func(batch *store.Batch) error {
		for _, n := range allocatedNetworks {
			if err := a.commitAllocatedNetwork(ctx, batch, n); err != nil {
				log.G(ctx).WithError(err).Debugf("Failed to commit allocation of unallocated network %s", n.ID)
				continue
			}
		}
		return nil
	})

	if err != nil {
		log.G(ctx).WithError(err).Error("Failed to commit allocation of unallocated networks")
	}

	for _, n := range allocatedNetworks[:committed] {
		delete(nc.unallocatedNetworks, n.ID)
	}
}
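Both this function and procUnallocatedServices rely on the integer returned by Batch: it reports how many of the queued updates were committed, and only that prefix of the input is removed from the unallocated set. A hypothetical generalization of the pattern, where commitOne and markDone stand in for commitAllocatedNetwork and the map deletion:

// commitAndTrim commits pending allocations in a batch and returns the items
// that were not committed, so the caller can retry them on a later pass.
func commitAndTrim(ctx context.Context, s *store.MemoryStore, pending []*api.Network,
	commitOne func(context.Context, *store.Batch, *api.Network) error,
	markDone func(*api.Network)) []*api.Network {

	committed, err := s.Batch(func(batch *store.Batch) error {
		for _, n := range pending {
			if err := commitOne(ctx, batch, n); err != nil {
				log.G(ctx).WithError(err).Debugf("failed to commit allocation of network %s", n.ID)
			}
		}
		return nil
	})
	if err != nil {
		log.G(ctx).WithError(err).Error("failed to commit allocations")
	}

	for _, n := range pending[:committed] {
		markDone(n)
	}
	return pending[committed:]
}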
Example #17
func (u *Updater) completeUpdate(ctx context.Context, serviceID string) {
	log.G(ctx).Debugf("update of service %s complete", serviceID)

	err := u.store.Update(func(tx store.Tx) error {
		service := store.GetService(tx, serviceID)
		if service == nil {
			return nil
		}
		if service.UpdateStatus == nil {
			// The service was changed since we started this update
			return nil
		}
		if service.UpdateStatus.State == api.UpdateStatus_ROLLBACK_STARTED {
			service.UpdateStatus.State = api.UpdateStatus_ROLLBACK_COMPLETED
			service.UpdateStatus.Message = "rollback completed"
		} else {
			service.UpdateStatus.State = api.UpdateStatus_COMPLETED
			service.UpdateStatus.Message = "update completed"
		}
		service.UpdateStatus.CompletedAt = ptypes.MustTimestampProto(time.Now())

		return store.UpdateService(tx, service)
	})

	if err != nil {
		log.G(ctx).WithError(err).Errorf("failed to mark update of service %s complete", serviceID)
	}
}
Example #18
// start begins the session and returns the first SessionMessage.
func (s *session) start(ctx context.Context) error {
	log.G(ctx).Debugf("(*session).start")

	description, err := s.agent.config.Executor.Describe(ctx)
	if err != nil {
		log.G(ctx).WithError(err).WithField("executor", s.agent.config.Executor).
			Errorf("node description unavailable")
		return err
	}
	// Override hostname
	if s.agent.config.Hostname != "" {
		description.Hostname = s.agent.config.Hostname
	}

	errChan := make(chan error, 1)
	var (
		msg    *api.SessionMessage
		stream api.Dispatcher_SessionClient
	)
	// Note: we don't defer cancellation of this context, because the
	// streaming RPC is used after this function returned. We only cancel
	// it in the timeout case to make sure the goroutine completes.
	sessionCtx, cancelSession := context.WithCancel(ctx)

	// Need to run Session in a goroutine since there's no way to set a
	// timeout for an individual Recv call in a stream.
	go func() {
		client := api.NewDispatcherClient(s.conn)

		stream, err = client.Session(sessionCtx, &api.SessionRequest{
			Description: description,
			SessionID:   s.sessionID,
		})
		if err != nil {
			errChan <- err
			return
		}

		msg, err = stream.Recv()
		errChan <- err
	}()

	select {
	case err := <-errChan:
		if err != nil {
			return err
		}
	case <-time.After(dispatcherRPCTimeout):
		cancelSession()
		return errors.New("session initiation timed out")
	}

	s.sessionID = msg.SessionID
	s.session = stream

	return s.handleSessionMessage(ctx, msg)
}
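The timeout handling above — run the blocking Session/Recv pair in a goroutine and race it against a timer, cancelling the derived context only on timeout so a successful stream can outlive the call — generalizes to a small helper. A standard-library sketch; callWithTimeout is a hypothetical name:

// callWithTimeout runs a blocking call in a goroutine and races it against a
// timer. As in session.start above, the derived context is cancelled only on
// timeout, so whatever the call set up (e.g. a stream) can outlive the helper.
func callWithTimeout(parent context.Context, timeout time.Duration, call func(context.Context) error) error {
	ctx, cancel := context.WithCancel(parent)
	errChan := make(chan error, 1)
	go func() {
		errChan <- call(ctx)
	}()

	select {
	case err := <-errChan:
		return err
	case <-time.After(timeout):
		cancel()
		return errors.New("operation timed out")
	}
}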
Example #19
// Stop stops the manager. It immediately closes all open connections and
// active RPCs, and stops the scheduler.
func (m *Manager) Stop(ctx context.Context) {
	log.G(ctx).Info("Stopping manager")

	// It's not safe to start shutting down while the manager is still
	// starting up.
	<-m.started

	// the mutex stops us from trying to stop while we're already stopping, or
	// from returning before we've finished stopping.
	m.mu.Lock()
	defer m.mu.Unlock()
	select {
	// check whether we've already stopped
	case <-m.stopped:
		return
	default:
		// do nothing, we're stopping for the first time
	}

	// once we start stopping, send a signal that we're doing so. this tells
	// Run that we've started stopping when it gets the error from errServe,
	// and it prevents the loop from processing any more stuff.
	close(m.stopped)

	m.Dispatcher.Stop()
	m.caserver.Stop()

	if m.allocator != nil {
		m.allocator.Stop()
	}
	if m.replicatedOrchestrator != nil {
		m.replicatedOrchestrator.Stop()
	}
	if m.globalOrchestrator != nil {
		m.globalOrchestrator.Stop()
	}
	if m.taskReaper != nil {
		m.taskReaper.Stop()
	}
	if m.scheduler != nil {
		m.scheduler.Stop()
	}
	if m.keyManager != nil {
		m.keyManager.Stop()
	}

	if m.connSelector != nil {
		m.connSelector.Stop()
	}
	m.RaftNode.Shutdown()
	// some time after this point, Run will receive an error from one of these
	m.server.Stop()
	m.localserver.Stop()

	log.G(ctx).Info("Manager shut down")
	// mutex is released and Run can return now
}
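The started/stopped channel choreography in Stop is a reusable idiom: wait for startup to finish, let only the first caller perform teardown, and hold a mutex so later callers do not return before teardown completes. A minimal standard-library sketch with entirely hypothetical names:

// lifecycle sketches the start/stop signalling used by Manager.Stop above.
type lifecycle struct {
	mu      sync.Mutex
	started chan struct{} // closed once startup has finished
	stopped chan struct{} // closed exactly once by the first stop call
}

func newLifecycle() *lifecycle {
	return &lifecycle{
		started: make(chan struct{}),
		stopped: make(chan struct{}),
	}
}

func (l *lifecycle) markStarted() { close(l.started) }

func (l *lifecycle) stop(teardown func()) {
	<-l.started // never stop something that hasn't finished starting

	l.mu.Lock()
	defer l.mu.Unlock()
	select {
	case <-l.stopped:
		return // someone already stopped us
	default:
	}
	close(l.stopped)
	teardown()
}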
Example #20
func (r *ReplicatedOrchestrator) reconcile(ctx context.Context, service *api.Service) {
	var (
		tasks []*api.Task
		err   error
	)
	r.store.View(func(tx store.ReadTx) {
		tasks, err = store.FindTasks(tx, store.ByServiceID(service.ID))
	})
	if err != nil {
		log.G(ctx).WithError(err).Errorf("reconcile failed finding tasks")
		return
	}

	runningTasks := make([]*api.Task, 0, len(tasks))
	runningInstances := make(map[uint64]struct{}) // this could be a bitfield...
	for _, t := range tasks {
		// Technically the check below could just be
		// t.DesiredState <= api.TaskStateRunning, but ignoring tasks
		// with DesiredState == NEW simplifies the drainer unit tests.
		if t.DesiredState > api.TaskStateNew && t.DesiredState <= api.TaskStateRunning {
			runningTasks = append(runningTasks, t)
			runningInstances[t.Slot] = struct{}{}
		}
	}
	numTasks := len(runningTasks)

	deploy := service.Spec.GetMode().(*api.ServiceSpec_Replicated)
	specifiedInstances := int(deploy.Replicated.Replicas)

	// TODO(aaronl): Add support for restart delays.

	_, err = r.store.Batch(func(batch *store.Batch) error {
		switch {
		case specifiedInstances > numTasks:
			log.G(ctx).Debugf("Service %s was scaled up from %d to %d instances", service.ID, numTasks, specifiedInstances)
			// Update all current tasks then add missing tasks
			r.updater.Update(ctx, service, runningTasks)
			r.addTasks(ctx, batch, service, runningInstances, specifiedInstances-numTasks)

		case specifiedInstances < numTasks:
			// Update up to N tasks then remove the extra
			log.G(ctx).Debugf("Service %s was scaled down from %d to %d instances", service.ID, numTasks, specifiedInstances)
			r.updater.Update(ctx, service, runningTasks[:specifiedInstances])
			r.removeTasks(ctx, batch, service, runningTasks[specifiedInstances:])

		case specifiedInstances == numTasks:
			// Simple update, no scaling - update all tasks.
			r.updater.Update(ctx, service, runningTasks)
		}
		return nil
	})

	if err != nil {
		log.G(ctx).WithError(err).Errorf("reconcile batch failed")
	}
}
Example #21
func (g *GlobalOrchestrator) reconcileOneService(ctx context.Context, service *api.Service) {
	var (
		tasks []*api.Task
		err   error
	)
	g.store.View(func(tx store.ReadTx) {
		tasks, err = store.FindTasks(tx, store.ByServiceID(service.ID))
	})
	if err != nil {
		log.G(ctx).WithError(err).Errorf("global orchestrator: reconcileOneService failed finding tasks")
		return
	}
	// a node may have completed this service
	nodeCompleted := make(map[string]struct{})
	// nodeID -> task list
	nodeTasks := make(map[string][]*api.Task)

	for _, t := range tasks {
		if isTaskRunning(t) {
			// Collect all running instances of this service
			nodeTasks[t.NodeID] = append(nodeTasks[t.NodeID], t)
		} else {
			// for finished tasks, check restartPolicy
			if isTaskCompleted(t, restartCondition(t)) {
				nodeCompleted[t.NodeID] = struct{}{}
			}
		}
	}

	_, err = g.store.Batch(func(batch *store.Batch) error {
		var updateTasks []*api.Task
		for nodeID := range g.nodes {
			ntasks := nodeTasks[nodeID]
			// if the restart policy considers this node to have finished its
			// task, remove all of its running tasks
			if _, exists := nodeCompleted[nodeID]; exists {
				g.removeTasks(ctx, batch, service, ntasks)
				return nil
			}
			// this node needs to run 1 copy of the task
			if len(ntasks) == 0 {
				g.addTask(ctx, batch, service, nodeID)
			} else {
				updateTasks = append(updateTasks, ntasks[0])
				g.removeTasks(ctx, batch, service, ntasks[1:])
			}
		}
		if len(updateTasks) > 0 {
			g.updater.Update(ctx, service, updateTasks)
		}
		return nil
	})
	if err != nil {
		log.G(ctx).WithError(err).Errorf("global orchestrator: reconcileOneService transaction failed")
	}
}
Example #22
func (u *Updater) worker(ctx context.Context, queue <-chan slot) {
	for slot := range queue {
		// Do we have a task with the new spec in desired state = RUNNING?
		// If so, all we have to do to complete the update is remove the
		// other tasks. Or if we have a task with the new spec that has
		// desired state < RUNNING, advance it to running and remove the
		// other tasks.
		var (
			runningTask *api.Task
			cleanTask   *api.Task
		)
		for _, t := range slot {
			if !u.isTaskDirty(t) {
				if t.DesiredState == api.TaskStateRunning {
					runningTask = t
					break
				}
				if t.DesiredState < api.TaskStateRunning {
					cleanTask = t
				}
			}
		}
		if runningTask != nil {
			if err := u.useExistingTask(ctx, slot, runningTask); err != nil {
				log.G(ctx).WithError(err).Error("update failed")
			}
		} else if cleanTask != nil {
			if err := u.useExistingTask(ctx, slot, cleanTask); err != nil {
				log.G(ctx).WithError(err).Error("update failed")
			}
		} else {
			updated := newTask(u.cluster, u.newService, slot[0].Slot)
			updated.DesiredState = api.TaskStateReady
			if isGlobalService(u.newService) {
				updated.NodeID = slot[0].NodeID
			}

			if err := u.updateTask(ctx, slot, updated); err != nil {
				log.G(ctx).WithError(err).WithField("task.id", updated.ID).Error("update failed")
			}
		}

		if u.newService.Spec.Update != nil && (u.newService.Spec.Update.Delay.Seconds != 0 || u.newService.Spec.Update.Delay.Nanos != 0) {
			delay, err := ptypes.Duration(&u.newService.Spec.Update.Delay)
			if err != nil {
				log.G(ctx).WithError(err).Error("invalid update delay")
				continue
			}
			select {
			case <-time.After(delay):
			case <-u.stopChan:
				return
			}
		}
	}
}
Example #23
// CreateSecurityConfig creates a new key and cert for this node, either locally
// or via a remote CA.
func (rootCA RootCA) CreateSecurityConfig(ctx context.Context, krw *KeyReadWriter, config CertificateRequestConfig) (*SecurityConfig, error) {
	ctx = log.WithModule(ctx, "tls")

	var (
		tlsKeyPair *tls.Certificate
		err        error
	)

	if rootCA.CanSign() {
		// Create a new random ID for this certificate
		cn := identity.NewID()
		org := identity.NewID()

		proposedRole := ManagerRole
		tlsKeyPair, err = rootCA.IssueAndSaveNewCertificates(krw, cn, proposedRole, org)
		if err != nil {
			log.G(ctx).WithFields(logrus.Fields{
				"node.id":   cn,
				"node.role": proposedRole,
			}).WithError(err).Errorf("failed to issue and save new certificate")
			return nil, err
		}

		log.G(ctx).WithFields(logrus.Fields{
			"node.id":   cn,
			"node.role": proposedRole,
		}).Debug("issued new TLS certificate")
	} else {
		// Request certificate issuance from a remote CA; at this point we
		// don't have any valid TLS creds to present.
		tlsKeyPair, err = rootCA.RequestAndSaveNewCertificates(ctx, krw, config)
		if err != nil {
			log.G(ctx).WithError(err).Error("failed to request save new certificate")
			return nil, err
		}
	}
	// Create the Server TLS Credentials for this node. These will not be used by workers.
	serverTLSCreds, err := rootCA.NewServerTLSCredentials(tlsKeyPair)
	if err != nil {
		return nil, err
	}

	// Create a TLSConfig to be used when this node connects as a client to another remote node.
	// We're using ManagerRole as remote serverName for TLS host verification
	clientTLSCreds, err := rootCA.NewClientTLSCredentials(tlsKeyPair, ManagerRole)
	if err != nil {
		return nil, err
	}
	log.G(ctx).WithFields(logrus.Fields{
		"node.id":   clientTLSCreds.NodeID(),
		"node.role": clientTLSCreds.Role(),
	}).Debugf("new node credentials generated: %s", krw.Target())

	return NewSecurityConfig(&rootCA, krw, clientTLSCreds, serverTLSCreds), nil
}
Example #24
// events issues a call to the events API and returns a channel with all
// events. The stream of events can be shut down by cancelling the context.
//
// A chan struct{} is returned that will be closed if the event processing
// fails and needs to be restarted.
func (c *containerAdapter) events(ctx context.Context) (<-chan events.Message, <-chan struct{}, error) {
	// TODO(stevvooe): Move this to a single, global event dispatch. For
	// now, we create a connection per container.
	var (
		eventsq = make(chan events.Message)
		closed  = make(chan struct{})
	)

	log.G(ctx).Debugf("waiting on events")
	// TODO(stevvooe): For long running tasks, it is likely that we will have
	// to restart this under failure.
	rc, err := c.client.Events(ctx, types.EventsOptions{
		Since:   "0",
		Filters: c.container.eventFilter(),
	})
	if err != nil {
		return nil, nil, err
	}

	go func(rc io.ReadCloser) {
		defer rc.Close()
		defer close(closed)

		select {
		case <-ctx.Done():
			// exit
			return
		default:
		}

		dec := json.NewDecoder(rc)

		for {
			var event events.Message
			if err := dec.Decode(&event); err != nil {
				// TODO(stevvooe): This error handling isn't quite right.
				if err == io.EOF {
					return
				}

				log.G(ctx).Errorf("error decoding event: %v", err)
				return
			}

			select {
			case eventsq <- event:
			case <-ctx.Done():
				return
			}
		}
	}(rc)

	return eventsq, closed, nil
}
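The doc comment notes that the closed channel signals the consumer to restart event processing. A hypothetical consumer built on the (events, closed) pair returned above might look like the sketch below; restartDelay and the handle callback are assumptions.

// watchEvents processes events until the context ends; if the stream closes,
// it re-establishes the subscription after a short delay.
func watchEvents(ctx context.Context, c *containerAdapter, handle func(events.Message)) error {
	const restartDelay = time.Second
	for {
		eventsq, closed, err := c.events(ctx)
		if err != nil {
			return err
		}

	inner:
		for {
			select {
			case ev := <-eventsq:
				handle(ev)
			case <-closed:
				log.G(ctx).Warn("event stream closed; re-establishing")
				break inner
			case <-ctx.Done():
				return ctx.Err()
			}
		}

		select {
		case <-time.After(restartDelay):
		case <-ctx.Done():
			return ctx.Err()
		}
	}
}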
Example #25
func (a *Agent) handleSessionMessage(ctx context.Context, message *api.SessionMessage) error {
	seen := map[api.Peer]struct{}{}
	for _, manager := range message.Managers {
		if manager.Peer.Addr == "" {
			log.G(ctx).WithField("manager.addr", manager.Peer.Addr).
				Warnf("skipping bad manager address")
			continue
		}

		a.config.Managers.Observe(*manager.Peer, int(manager.Weight))
		seen[*manager.Peer] = struct{}{}
	}

	if message.Node != nil {
		if a.node == nil || !nodesEqual(a.node, message.Node) {
			if a.config.NotifyRoleChange != nil {
				a.config.NotifyRoleChange <- message.Node.Spec.Role
			}
			a.node = message.Node.Copy()
			if err := a.config.Executor.Configure(ctx, a.node); err != nil {
				log.G(ctx).WithError(err).Error("node configure failed")
			}
		}
	}

	// prune managers not in list.
	for peer := range a.config.Managers.Weights() {
		if _, ok := seen[peer]; !ok {
			a.config.Managers.Remove(peer)
		}
	}

	if message.NetworkBootstrapKeys == nil {
		return nil
	}

	for _, key := range message.NetworkBootstrapKeys {
		same := false
		for _, agentKey := range a.keys {
			if agentKey.LamportTime == key.LamportTime {
				same = true
			}
		}
		if !same {
			a.keys = message.NetworkBootstrapKeys
			if err := a.config.Executor.SetNetworkBootstrapKeys(a.keys); err != nil {
				panic(fmt.Errorf("configuring network key failed"))
			}
		}
	}

	return nil
}
Example #26
func releasePools(ipam ipamapi.Ipam, icList []*api.IPAMConfig, pools map[string]string) {
	for _, ic := range icList {
		if err := ipam.ReleaseAddress(pools[ic.Subnet], net.ParseIP(ic.Gateway)); err != nil {
			log.G(context.TODO()).Errorf("Failed to release address %s: %v", ic.Subnet, err)
		}
	}

	for k, p := range pools {
		if err := ipam.ReleasePool(p); err != nil {
			log.G(context.TODO()).Errorf("Failed to release pool %s: %v", k, err)
		}
	}
}
Example #27
// createConfigChangeEnts creates a series of Raft entries (i.e.
// EntryConfChange) to remove the set of given IDs from the cluster. The ID
// `self` is _not_ removed, even if present in the set.
// If `self` is not inside the given ids, it creates a Raft entry to add a
// default member with the given `self`.
func createConfigChangeEnts(ids []uint64, self uint64, term, index uint64) []raftpb.Entry {
	var ents []raftpb.Entry
	next := index + 1
	found := false
	for _, id := range ids {
		if id == self {
			found = true
			continue
		}
		cc := &raftpb.ConfChange{
			Type:   raftpb.ConfChangeRemoveNode,
			NodeID: id,
		}
		data, err := cc.Marshal()
		if err != nil {
			log.G(context.Background()).Panicf("marshal configuration change should never fail: %v", err)
		}
		e := raftpb.Entry{
			Type:  raftpb.EntryConfChange,
			Data:  data,
			Term:  term,
			Index: next,
		}
		ents = append(ents, e)
		next++
	}
	if !found {
		node := &api.RaftMember{RaftID: self}
		meta, err := node.Marshal()
		if err != nil {
			log.G(context.Background()).Panicf("marshal member should never fail: %v", err)
		}
		cc := &raftpb.ConfChange{
			Type:    raftpb.ConfChangeAddNode,
			NodeID:  self,
			Context: meta,
		}
		data, err := cc.Marshal()
		if err != nil {
			log.G(context.Background()).Panicf("marshal configuration change should never fail: %v", err)
		}
		e := raftpb.Entry{
			Type:  raftpb.EntryConfChange,
			Data:  data,
			Term:  term,
			Index: next,
		}
		ents = append(ents, e)
	}
	return ents
}
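getIDs (shown earlier) and createConfigChangeEnts are natural companions: one recovers the current membership from a snapshot and log entries, the other builds the entries that shrink that membership to a single node. A hypothetical composition, assuming both functions live in the same package:

// forceNewClusterEntries computes the members known to the given snapshot and
// entries, then returns conf-change entries that remove every member except
// self (adding self if it was missing).
func forceNewClusterEntries(snap *raftpb.Snapshot, ents []raftpb.Entry, self, term, index uint64) []raftpb.Entry {
	ids := getIDs(snap, ents)
	return createConfigChangeEnts(ids, self, term, index)
}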
Example #28
// register is used for registration of node with particular dispatcher.
func (d *Dispatcher) register(ctx context.Context, nodeID string, description *api.NodeDescription) (string, error) {
	// prevent registration until we're ready to accept it
	if err := d.isRunningLocked(); err != nil {
		return "", err
	}

	if err := d.nodes.CheckRateLimit(nodeID); err != nil {
		return "", err
	}

	// create or update node in store
	// TODO(stevvooe): Validate node specification.
	var node *api.Node
	err := d.store.Update(func(tx store.Tx) error {
		node = store.GetNode(tx, nodeID)
		if node == nil {
			return ErrNodeNotFound
		}

		node.Description = description
		node.Status = api.NodeStatus{
			State: api.NodeStatus_READY,
		}
		return store.UpdateNode(tx, node)

	})
	if err != nil {
		return "", err
	}

	expireFunc := func() {
		nodeStatus := api.NodeStatus{State: api.NodeStatus_DOWN, Message: "heartbeat failure"}
		log.G(ctx).Debugf("heartbeat expiration")
		if err := d.nodeRemove(nodeID, nodeStatus); err != nil {
			log.G(ctx).WithError(err).Errorf("failed deregistering node after heartbeat expiration")
		}
	}

	rn := d.nodes.Add(node, expireFunc)

	// NOTE(stevvooe): We need to be a little careful with re-registration. The
	// current implementation just matches the node id and then gives away the
	// sessionID. If we ever want to use sessionID as a secret, which we may
	// want to, this is giving away the keys to the kitchen.
	//
	// The right behavior is going to be informed by identity. Basically, each
	// time a node registers, we invalidate the session and issue a new
	// session, once identity is proven. This will cause misbehaved agents to
	// be kicked when multiple connections are made.
	return rn.SessionID, nil
}
Example #29
// Run starts the keymanager. It blocks until the context is cancelled or an error occurs.
func (k *KeyManager) Run(ctx context.Context) error {
	k.mu.Lock()
	ctx = log.WithModule(ctx, "keymanager")
	var (
		clusters []*api.Cluster
		err      error
	)
	k.store.View(func(readTx store.ReadTx) {
		clusters, err = store.FindClusters(readTx, store.ByName(k.config.ClusterName))
	})

	if err != nil {
		log.G(ctx).Errorf("reading cluster config failed, %v", err)
		k.mu.Unlock()
		return err
	}

	cluster := clusters[0]
	if len(cluster.NetworkBootstrapKeys) == 0 {
		for _, subsys := range k.config.Subsystems {
			for i := 0; i < keyringSize; i++ {
				k.keyRing.keys = append(k.keyRing.keys, k.allocateKey(ctx, subsys))
			}
		}
		if err := k.updateKey(cluster); err != nil {
			log.G(ctx).Errorf("store update failed %v", err)
		}
	} else {
		k.keyRing.lClock = cluster.EncryptionKeyLamportClock
		k.keyRing.keys = cluster.NetworkBootstrapKeys

		k.rotateKey(ctx)
	}

	ticker := time.NewTicker(k.config.RotationInterval)
	defer ticker.Stop()

	k.ctx, k.cancel = context.WithCancel(ctx)
	k.mu.Unlock()

	for {
		select {
		case <-ticker.C:
			k.rotateKey(ctx)
		case <-k.ctx.Done():
			return nil
		}
	}
}
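The final loop in Run is a standard rotate-on-ticker pattern. Distilled into a standalone, standard-library-only sketch (all names hypothetical):

// rotateLoop calls rotate on every tick until the context is cancelled.
func rotateLoop(ctx context.Context, interval time.Duration, rotate func(context.Context)) {
	ticker := time.NewTicker(interval)
	defer ticker.Stop()
	for {
		select {
		case <-ticker.C:
			rotate(ctx)
		case <-ctx.Done():
			return
		}
	}
}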
Example #30
// updateTaskStatus reports statuses to listeners; the read lock must be held.
func (w *worker) updateTaskStatus(ctx context.Context, tx *bolt.Tx, taskID string, status *api.TaskStatus) error {
	if err := PutTaskStatus(tx, taskID, status); err != nil {
		log.G(ctx).WithError(err).Error("failed writing status to disk")
		return err
	}

	// broadcast the task status out.
	for key := range w.listeners {
		if err := key.StatusReporter.UpdateTaskStatus(ctx, taskID, status); err != nil {
			log.G(ctx).WithError(err).Errorf("failed updating status for reporter %v", key.StatusReporter)
		}
	}

	return nil
}