Example #1
func (g *Orchestrator) reconcileServices(ctx context.Context, serviceIDs []string) {
	nodeCompleted := make(map[string]map[string]struct{})
	nodeTasks := make(map[string]map[string][]*api.Task)

	g.store.View(func(tx store.ReadTx) {
		for _, serviceID := range serviceIDs {
			tasks, err := store.FindTasks(tx, store.ByServiceID(serviceID))
			if err != nil {
				log.G(ctx).WithError(err).Errorf("global orchestrator: reconcileServices failed finding tasks for service %s", serviceID)
				continue
			}

			// a node may have completed this service
			nodeCompleted[serviceID] = make(map[string]struct{})
			// nodeID -> task list
			nodeTasks[serviceID] = make(map[string][]*api.Task)

			for _, t := range tasks {
				if isTaskRunning(t) {
					// Collect all running instances of this service
					nodeTasks[serviceID][t.NodeID] = append(nodeTasks[serviceID][t.NodeID], t)
				} else {
					// for finished tasks, check restartPolicy
					if isTaskCompleted(t, orchestrator.RestartCondition(t)) {
						nodeCompleted[serviceID][t.NodeID] = struct{}{}
					}
				}
			}
		}
	})

	_, err := g.store.Batch(func(batch *store.Batch) error {
		for _, serviceID := range serviceIDs {
			// collect update candidates per service, so slots from one
			// service are never passed to another service's updater
			var updateTasks []orchestrator.Slot
			if _, exists := nodeTasks[serviceID]; !exists {
				continue
			}

			service := g.globalServices[serviceID]

			for nodeID, node := range g.nodes {
				meetsConstraints := constraint.NodeMatches(service.constraints, node)
				ntasks := nodeTasks[serviceID][nodeID]
				delete(nodeTasks[serviceID], nodeID)

				// If the restart policy considers this node done with its
				// task, or the node no longer meets the constraints,
				// remove all of its running tasks.
				if _, exists := nodeCompleted[serviceID][nodeID]; exists || !meetsConstraints {
					g.removeTasks(ctx, batch, ntasks)
					continue
				}

				if node.Spec.Availability == api.NodeAvailabilityPause {
					// the node is paused, so we won't add or update
					// any tasks
					continue
				}

				// this node needs to run 1 copy of the task
				if len(ntasks) == 0 {
					g.addTask(ctx, batch, service.Service, nodeID)
				} else {
					updateTasks = append(updateTasks, ntasks)
				}
			}
			if len(updateTasks) > 0 {
				g.updater.Update(ctx, g.cluster, service.Service, updateTasks)
			}

			// Remove any tasks assigned to nodes not found in g.nodes.
			// These must be associated with nodes that are drained, or
			// nodes that no longer exist.
			for _, ntasks := range nodeTasks[serviceID] {
				g.removeTasks(ctx, batch, ntasks)
			}
		}
		return nil
	})
	if err != nil {
		log.G(ctx).WithError(err).Errorf("global orchestrator: reconcileServices transaction failed")
	}
}
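For orientation, the per-node branch order above (remove when completed or out of constraint, skip when paused, add when nothing is running, otherwise update) can be distilled into a small standalone sketch. The types and names below are hypothetical stand-ins for illustration, not the swarmkit API.

package main

import "fmt"

type action string

const (
	actionRemove action = "remove"
	actionSkip   action = "skip"
	actionAdd    action = "add"
	actionUpdate action = "update"
)

type node struct {
	id     string
	paused bool
}

// decide mirrors the branch order used by reconcileServices for one node:
// completed-or-unmatched nodes lose their tasks, paused nodes are left
// alone, nodes without a running task get one, and the rest are updated.
func decide(n node, meetsConstraints, completed bool, runningTasks int) action {
	if completed || !meetsConstraints {
		return actionRemove
	}
	if n.paused {
		return actionSkip
	}
	if runningTasks == 0 {
		return actionAdd
	}
	return actionUpdate
}

func main() {
	fmt.Println(decide(node{id: "n1"}, true, false, 0))              // add
	fmt.Println(decide(node{id: "n2", paused: true}, true, false, 1)) // skip
	fmt.Println(decide(node{id: "n3"}, false, false, 2))             // remove
}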
Example #2
// reconcileServicesOneNode checks the specified services on one node
func (g *Orchestrator) reconcileServicesOneNode(ctx context.Context, serviceIDs []string, nodeID string) {
	node, exists := g.nodes[nodeID]
	if !exists {
		return
	}

	// whether each service has completed on the node
	completed := make(map[string]bool)
	// tasks by service
	tasks := make(map[string][]*api.Task)

	var (
		tasksOnNode []*api.Task
		err         error
	)

	g.store.View(func(tx store.ReadTx) {
		tasksOnNode, err = store.FindTasks(tx, store.ByNodeID(nodeID))
	})
	if err != nil {
		log.G(ctx).WithError(err).Errorf("global orchestrator: reconcile failed finding tasks on node %s", nodeID)
		return
	}

	for _, serviceID := range serviceIDs {
		for _, t := range tasksOnNode {
			if t.ServiceID != serviceID {
				continue
			}
			if isTaskRunning(t) {
				tasks[serviceID] = append(tasks[serviceID], t)
			} else {
				if isTaskCompleted(t, orchestrator.RestartCondition(t)) {
					completed[serviceID] = true
				}
			}
		}
	}

	_, err = g.store.Batch(func(batch *store.Batch) error {
		for _, serviceID := range serviceIDs {
			service, exists := g.globalServices[serviceID]
			if !exists {
				continue
			}

			if !constraint.NodeMatches(service.constraints, node) {
				continue
			}

			// If the restart policy considers this node done with its task,
			// remove all of its running tasks.
			if completed[serviceID] {
				g.removeTasks(ctx, batch, tasks[serviceID])
				continue
			}

			if node.Spec.Availability == api.NodeAvailabilityPause {
				// the node is paused, so we won't add or update tasks
				continue
			}

			if len(tasks[serviceID]) == 0 {
				g.addTask(ctx, batch, service.Service, nodeID)
			} else {
				// If task is out of date, update it. This can happen
				// on node reconciliation if, for example, we pause a
				// node, update the service, and then activate the node
				// later.

				// We don't use g.updater here for two reasons:
				// - This is not a rolling update. Since it was not
				//   triggered directly by updating the service, it
				//   should not observe the rolling update parameters
				//   or show status in UpdateStatus.
				// - Calling Update cancels any current rolling updates
				//   for the service, such as one triggered by service
				//   reconciliation.

				var (
					dirtyTasks []*api.Task
					cleanTasks []*api.Task
				)

				for _, t := range tasks[serviceID] {
					if orchestrator.IsTaskDirty(service.Service, t) {
						dirtyTasks = append(dirtyTasks, t)
					} else {
						cleanTasks = append(cleanTasks, t)
					}
				}

				if len(cleanTasks) == 0 {
					g.addTask(ctx, batch, service.Service, nodeID)
				} else {
					dirtyTasks = append(dirtyTasks, cleanTasks[1:]...)
				}
				g.removeTasks(ctx, batch, dirtyTasks)
			}
		}
		return nil
	})
	if err != nil {
		log.G(ctx).WithError(err).Errorf("global orchestrator: reconcileServiceOneNode batch failed")
	}
}
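The clean/dirty handling above reduces to: drop every out-of-date task, keep a single up-to-date one, and only create a new task when none is up to date. Here is a minimal, self-contained sketch of that plan; the task type and its dirty flag are hypothetical stand-ins for orchestrator.IsTaskDirty against the current service spec.

package main

import "fmt"

type task struct {
	id    string
	dirty bool // true when the task no longer matches the service spec
}

// planNode returns the tasks to delete and whether a fresh task must be
// created: all dirty tasks go, all clean tasks except one go, and a new
// task is only needed when no clean task survives.
func planNode(tasks []task) (toRemove []task, needNew bool) {
	var dirty, clean []task
	for _, t := range tasks {
		if t.dirty {
			dirty = append(dirty, t)
		} else {
			clean = append(clean, t)
		}
	}
	if len(clean) == 0 {
		return dirty, true // nothing is up to date: remove everything, add one
	}
	// keep clean[0], drop the rest alongside the dirty ones
	return append(dirty, clean[1:]...), false
}

func main() {
	remove, needNew := planNode([]task{{id: "a", dirty: true}, {id: "b"}, {id: "c"}})
	fmt.Println(len(remove), needNew) // 2 false: "a" and "c" removed, "b" kept
}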
Example #3
func (r *Supervisor) shouldRestart(ctx context.Context, t *api.Task, service *api.Service) bool {
	// TODO(aluzzardi): This function should not depend on `service`.

	condition := orchestrator.RestartCondition(t)

	if condition != api.RestartOnAny &&
		(condition != api.RestartOnFailure || t.Status.State == api.TaskStateCompleted) {
		return false
	}

	if t.Spec.Restart == nil || t.Spec.Restart.MaxAttempts == 0 {
		return true
	}

	instanceTuple := instanceTuple{
		instance:  t.Slot,
		serviceID: t.ServiceID,
	}

	// Instance is not meaningful for "global" tasks, so they need to be
	// indexed by NodeID.
	if orchestrator.IsGlobalService(service) {
		instanceTuple.nodeID = t.NodeID
	}

	r.mu.Lock()
	defer r.mu.Unlock()

	restartInfo := r.history[instanceTuple]
	if restartInfo == nil {
		return true
	}

	if t.Spec.Restart.Window == nil || (t.Spec.Restart.Window.Seconds == 0 && t.Spec.Restart.Window.Nanos == 0) {
		return restartInfo.totalRestarts < t.Spec.Restart.MaxAttempts
	}

	if restartInfo.restartedInstances == nil {
		return true
	}

	window, err := ptypes.Duration(t.Spec.Restart.Window)
	if err != nil {
		log.G(ctx).WithError(err).Error("invalid restart lookback window")
		return restartInfo.totalRestarts < t.Spec.Restart.MaxAttempts
	}
	lookback := time.Now().Add(-window)

	var next *list.Element
	for e := restartInfo.restartedInstances.Front(); e != nil; e = next {
		next = e.Next()

		if e.Value.(restartedInstance).timestamp.After(lookback) {
			break
		}
		restartInfo.restartedInstances.Remove(e)
	}

	numRestarts := uint64(restartInfo.restartedInstances.Len())

	if numRestarts == 0 {
		restartInfo.restartedInstances = nil
	}

	return numRestarts < t.Spec.Restart.MaxAttempts
}
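The window-based part of shouldRestart amounts to pruning restart timestamps that fall outside the lookback window and comparing what remains against MaxAttempts. Below is a standalone sketch of that check using only container/list and time; the window and attempt values are illustrative, not swarmkit defaults.

package main

import (
	"container/list"
	"fmt"
	"time"
)

// withinBudget prunes restart timestamps older than the lookback window and
// reports whether another restart is still allowed under maxAttempts.
func withinBudget(history *list.List, window time.Duration, maxAttempts uint64, now time.Time) bool {
	lookback := now.Add(-window)
	var next *list.Element
	for e := history.Front(); e != nil; e = next {
		next = e.Next()
		if e.Value.(time.Time).After(lookback) {
			break // entries are ordered, so the rest are recent too
		}
		history.Remove(e)
	}
	return uint64(history.Len()) < maxAttempts
}

func main() {
	now := time.Now()
	history := list.New()
	history.PushBack(now.Add(-10 * time.Minute)) // old, will be pruned
	history.PushBack(now.Add(-30 * time.Second))
	history.PushBack(now.Add(-5 * time.Second))

	// two restarts remain inside a 1-minute window; a budget of 3 allows one more
	fmt.Println(withinBudget(history, time.Minute, 3, now)) // true
	fmt.Println(history.Len())                              // 2
}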