Example #1
0
func (ce *ConstraintEnforcer) shutdownNoncompliantTasks(node *api.Node) {
	// If the availability is "drain", the orchestrator will
	// shut down all tasks.
	// If the availability is "pause", we shouldn't touch
	// the tasks on this node.
	if node.Spec.Availability != api.NodeAvailabilityActive {
		return
	}

	var (
		tasks []*api.Task
		err   error
	)

	ce.store.View(func(tx store.ReadTx) {
		tasks, err = store.FindTasks(tx, store.ByNodeID(node.ID))
	})

	if err != nil {
		log.L.WithError(err).Errorf("failed to list tasks for node ID %s", node.ID)
	}

	var availableMemoryBytes, availableNanoCPUs int64
	if node.Description != nil && node.Description.Resources != nil {
		availableMemoryBytes = node.Description.Resources.MemoryBytes
		availableNanoCPUs = node.Description.Resources.NanoCPUs
	}

	removeTasks := make(map[string]*api.Task)

	// TODO(aaronl): The set of tasks removed will be
	// nondeterministic because it depends on the order of
	// the slice returned from FindTasks. We could do
	// a separate pass over the tasks for each type of
	// resource, and sort by the size of the reservation
	// to remove the most resource-intensive tasks.
	for _, t := range tasks {
		if t.DesiredState < api.TaskStateAssigned || t.DesiredState > api.TaskStateRunning {
			continue
		}

		// Ensure that the task still meets scheduling
		// constraints.
		if t.Spec.Placement != nil && len(t.Spec.Placement.Constraints) != 0 {
			constraints, _ := constraint.Parse(t.Spec.Placement.Constraints)
			if !constraint.NodeMatches(constraints, node) {
				removeTasks[t.ID] = t
				continue
			}
		}

		// Ensure that the task assigned to the node
		// still satisfies the resource limits.
		if t.Spec.Resources != nil && t.Spec.Resources.Reservations != nil {
			if t.Spec.Resources.Reservations.MemoryBytes > availableMemoryBytes {
				removeTasks[t.ID] = t
				continue
			}
			if t.Spec.Resources.Reservations.NanoCPUs > availableNanoCPUs {
				removeTasks[t.ID] = t
				continue
			}
			availableMemoryBytes -= t.Spec.Resources.Reservations.MemoryBytes
			availableNanoCPUs -= t.Spec.Resources.Reservations.NanoCPUs
		}
	}

	if len(removeTasks) != 0 {
		_, err := ce.store.Batch(func(batch *store.Batch) error {
			for _, t := range removeTasks {
				err := batch.Update(func(tx store.Tx) error {
					t = store.GetTask(tx, t.ID)
					if t == nil || t.DesiredState > api.TaskStateRunning {
						return nil
					}

					t.DesiredState = api.TaskStateShutdown
					return store.UpdateTask(tx, t)
				})
				if err != nil {
					log.L.WithError(err).Errorf("failed to shut down task %s", t.ID)
				}
			}
			return nil
		})

		if err != nil {
			log.L.WithError(err).Errorf("failed to shut down tasks")
		}
	}
}
Example #2
0
func (g *Orchestrator) reconcileServices(ctx context.Context, serviceIDs []string) {
	nodeCompleted := make(map[string]map[string]struct{})
	nodeTasks := make(map[string]map[string][]*api.Task)

	g.store.View(func(tx store.ReadTx) {
		for _, serviceID := range serviceIDs {
			tasks, err := store.FindTasks(tx, store.ByServiceID(serviceID))
			if err != nil {
				log.G(ctx).WithError(err).Errorf("global orchestrator: reconcileServices failed finding tasks for service %s", serviceID)
				continue
			}

			// a node may have completed this service
			nodeCompleted[serviceID] = make(map[string]struct{})
			// nodeID -> task list
			nodeTasks[serviceID] = make(map[string][]*api.Task)

			for _, t := range tasks {
				if isTaskRunning(t) {
					// Collect all running instances of this service
					nodeTasks[serviceID][t.NodeID] = append(nodeTasks[serviceID][t.NodeID], t)
				} else {
					// for finished tasks, check restartPolicy
					if isTaskCompleted(t, orchestrator.RestartCondition(t)) {
						nodeCompleted[serviceID][t.NodeID] = struct{}{}
					}
				}
			}
		}
	})

	_, err := g.store.Batch(func(batch *store.Batch) error {
		var updateTasks []orchestrator.Slot
		for _, serviceID := range serviceIDs {
			if _, exists := nodeTasks[serviceID]; !exists {
				continue
			}

			service := g.globalServices[serviceID]

			for nodeID, node := range g.nodes {
				meetsConstraints := constraint.NodeMatches(service.constraints, node)
				ntasks := nodeTasks[serviceID][nodeID]
				delete(nodeTasks[serviceID], nodeID)

				// if restart policy considers this node has finished its task
				// it should remove all running tasks
				if _, exists := nodeCompleted[serviceID][nodeID]; exists || !meetsConstraints {
					g.removeTasks(ctx, batch, ntasks)
					continue
				}

				if node.Spec.Availability == api.NodeAvailabilityPause {
					// the node is paused, so we won't add or update
					// any tasks
					continue
				}

				// this node needs to run 1 copy of the task
				if len(ntasks) == 0 {
					g.addTask(ctx, batch, service.Service, nodeID)
				} else {
					updateTasks = append(updateTasks, ntasks)
				}
			}
			if len(updateTasks) > 0 {
				g.updater.Update(ctx, g.cluster, service.Service, updateTasks)
			}

			// Remove any tasks assigned to nodes not found in g.nodes.
			// These must be associated with nodes that are drained, or
			// nodes that no longer exist.
			for _, ntasks := range nodeTasks[serviceID] {
				g.removeTasks(ctx, batch, ntasks)
			}
		}
		return nil
	})
	if err != nil {
		log.G(ctx).WithError(err).Errorf("global orchestrator: reconcileServices transaction failed")
	}
}
Example #3
0
// reconcileServicesOneNode checks the specified services on one node
func (g *Orchestrator) reconcileServicesOneNode(ctx context.Context, serviceIDs []string, nodeID string) {
	node, exists := g.nodes[nodeID]
	if !exists {
		return
	}

	// whether each service has completed on the node
	completed := make(map[string]bool)
	// tasks by service
	tasks := make(map[string][]*api.Task)

	var (
		tasksOnNode []*api.Task
		err         error
	)

	g.store.View(func(tx store.ReadTx) {
		tasksOnNode, err = store.FindTasks(tx, store.ByNodeID(nodeID))
	})
	if err != nil {
		log.G(ctx).WithError(err).Errorf("global orchestrator: reconcile failed finding tasks on node %s", nodeID)
		return
	}

	for _, serviceID := range serviceIDs {
		for _, t := range tasksOnNode {
			if t.ServiceID != serviceID {
				continue
			}
			if isTaskRunning(t) {
				tasks[serviceID] = append(tasks[serviceID], t)
			} else {
				if isTaskCompleted(t, orchestrator.RestartCondition(t)) {
					completed[serviceID] = true
				}
			}
		}
	}

	_, err = g.store.Batch(func(batch *store.Batch) error {
		for _, serviceID := range serviceIDs {
			service, exists := g.globalServices[serviceID]
			if !exists {
				continue
			}

			if !constraint.NodeMatches(service.constraints, node) {
				continue
			}

			// if restart policy considers this node has finished its task
			// it should remove all running tasks
			if completed[serviceID] {
				g.removeTasks(ctx, batch, tasks[serviceID])
				continue
			}

			if node.Spec.Availability == api.NodeAvailabilityPause {
				// the node is paused, so we won't add or update tasks
				continue
			}

			if len(tasks) == 0 {
				g.addTask(ctx, batch, service.Service, nodeID)
			} else {
				// If task is out of date, update it. This can happen
				// on node reconciliation if, for example, we pause a
				// node, update the service, and then activate the node
				// later.

				// We don't use g.updater here for two reasons:
				// - This is not a rolling update. Since it was not
				//   triggered directly by updating the service, it
				//   should not observe the rolling update parameters
				//   or show status in UpdateStatus.
				// - Calling Update cancels any current rolling updates
				//   for the service, such as one triggered by service
				//   reconciliation.

				var (
					dirtyTasks []*api.Task
					cleanTasks []*api.Task
				)

				for _, t := range tasks[serviceID] {
					if orchestrator.IsTaskDirty(service.Service, t) {
						dirtyTasks = append(dirtyTasks, t)
					} else {
						cleanTasks = append(cleanTasks, t)
					}
				}

				if len(cleanTasks) == 0 {
					g.addTask(ctx, batch, service.Service, nodeID)
				} else {
					dirtyTasks = append(dirtyTasks, cleanTasks[1:]...)
				}
				g.removeTasks(ctx, batch, dirtyTasks)
			}
		}
		return nil
	})
	if err != nil {
		log.G(ctx).WithError(err).Errorf("global orchestrator: reconcileServiceOneNode batch failed")
	}
}
Example #4
0
// Check returns true if the task's constraint is supported by the given node.
func (f *ConstraintFilter) Check(n *NodeInfo) bool {
	return constraint.NodeMatches(f.constraints, n.Node)
}