Example #1
0
// WaitForCluster waits until node observes that the cluster wide config is
// committed to raft. This ensures that we can see and serve informations
// related to the cluster.
func WaitForCluster(ctx context.Context, n *Node) (cluster *api.Cluster, err error) {
	watch, cancel := state.Watch(n.MemoryStore().WatchQueue(), state.EventCreateCluster{})
	defer cancel()

	var clusters []*api.Cluster
	n.MemoryStore().View(func(readTx store.ReadTx) {
		clusters, err = store.FindClusters(readTx, store.ByName(store.DefaultClusterName))
	})

	if err != nil {
		return nil, err
	}

	if len(clusters) == 1 {
		cluster = clusters[0]
	} else {
		select {
		case e := <-watch:
			cluster = e.(state.EventCreateCluster).Cluster
		case <-ctx.Done():
			return nil, ctx.Err()
		}
	}

	return cluster, nil
}
Example #2
0
func TestRemoveTask(t *testing.T) {
	store := store.NewMemoryStore(nil)
	assert.NotNil(t, store)
	defer store.Close()

	watch, cancel := state.Watch(store.WatchQueue() /*state.EventCreateTask{}, state.EventUpdateTask{}*/)
	defer cancel()

	SetupCluster(t, store)

	// get the task
	observedTask1 := watchTaskCreate(t, watch)
	assert.Equal(t, observedTask1.Status.State, api.TaskStateNew)
	assert.Equal(t, observedTask1.ServiceAnnotations.Name, "name1")
	assert.Equal(t, observedTask1.NodeID, "id1")

	// delete the task
	deleteTask(t, store, observedTask1)

	// the task should be recreated
	observedTask2 := watchTaskCreate(t, watch)
	assert.Equal(t, observedTask2.Status.State, api.TaskStateNew)
	assert.Equal(t, observedTask2.ServiceAnnotations.Name, "name1")
	assert.Equal(t, observedTask2.NodeID, "id1")
}
Example #3
0
// Run is the ConstraintEnforcer's main loop.
func (ce *ConstraintEnforcer) Run() {
	defer close(ce.doneChan)

	watcher, cancelWatch := state.Watch(ce.store.WatchQueue(), state.EventUpdateNode{})
	defer cancelWatch()

	var (
		nodes []*api.Node
		err   error
	)
	ce.store.View(func(readTx store.ReadTx) {
		nodes, err = store.FindNodes(readTx, store.All)
	})
	if err != nil {
		log.L.WithError(err).Error("failed to check nodes for noncompliant tasks")
	} else {
		for _, node := range nodes {
			ce.shutdownNoncompliantTasks(node)
		}
	}

	for {
		select {
		case event := <-watcher:
			node := event.(state.EventUpdateNode).Node
			ce.shutdownNoncompliantTasks(node)
		case <-ce.stopChan:
			return
		}
	}
}
Example #4
0
func TestGetRemoteSignedCertificateWithPending(t *testing.T) {
	tc := testutils.NewTestCA(t)
	defer tc.Stop()

	// Create a new CSR to be signed
	csr, _, err := ca.GenerateAndWriteNewKey(tc.Paths.Node)
	assert.NoError(t, err)

	updates, cancel := state.Watch(tc.MemoryStore.WatchQueue(), state.EventCreateNode{})
	defer cancel()

	completed := make(chan error)
	go func() {
		_, err := ca.GetRemoteSignedCertificate(context.Background(), csr, tc.WorkerToken, tc.RootCA.Pool, tc.Remotes, nil, nil)
		completed <- err
	}()

	event := <-updates
	node := event.(state.EventCreateNode).Node.Copy()

	// Directly update the status of the store
	err = tc.MemoryStore.Update(func(tx store.Tx) error {
		node.Certificate.Status.State = api.IssuanceStateIssued

		return store.UpdateNode(tx, node)
	})
	assert.NoError(t, err)

	// Make sure GetRemoteSignedCertificate didn't return an error
	assert.NoError(t, <-completed)
}
Example #5
0
func TestNodeAvailability(t *testing.T) {
	store := store.NewMemoryStore(nil)
	assert.NotNil(t, store)
	defer store.Close()

	SetupCluster(t, store)

	watch, cancel := state.Watch(store.WatchQueue())
	defer cancel()

	skipEvents(t, watch)

	node1.Status.State = api.NodeStatus_READY
	node1.Spec.Availability = api.NodeAvailabilityActive

	// set node1 to drain
	updateNodeAvailability(t, store, node1, api.NodeAvailabilityDrain)

	// task should be set to dead
	observedTask1 := watchShutdownTask(t, watch)
	assert.Equal(t, observedTask1.ServiceAnnotations.Name, "name1")
	assert.Equal(t, observedTask1.NodeID, "id1")

	// set node1 to active
	updateNodeAvailability(t, store, node1, api.NodeAvailabilityActive)
	// task should be added back
	observedTask2 := watchTaskCreate(t, watch)
	assert.Equal(t, observedTask2.Status.State, api.TaskStateNew)
	assert.Equal(t, observedTask2.ServiceAnnotations.Name, "name1")
	assert.Equal(t, observedTask2.NodeID, "id1")
}
Example #6
0
func TestSchedulerNoReadyNodes(t *testing.T) {
	ctx := context.Background()
	initialTask := &api.Task{
		ID:           "id1",
		DesiredState: api.TaskStateRunning,
		ServiceAnnotations: api.Annotations{
			Name: "name1",
		},
		Status: api.TaskStatus{
			State: api.TaskStatePending,
		},
	}

	s := store.NewMemoryStore(nil)
	assert.NotNil(t, s)
	defer s.Close()

	err := s.Update(func(tx store.Tx) error {
		// Add initial task
		assert.NoError(t, store.CreateTask(tx, initialTask))
		return nil
	})
	assert.NoError(t, err)

	scheduler := New(s)

	watch, cancel := state.Watch(s.WatchQueue(), state.EventUpdateTask{})
	defer cancel()

	go func() {
		assert.NoError(t, scheduler.Run(ctx))
	}()
	defer scheduler.Stop()

	failure := watchAssignmentFailure(t, watch)
	assert.Equal(t, "no suitable node", failure.Status.Message)

	err = s.Update(func(tx store.Tx) error {
		// Create a ready node. The task should get assigned to this
		// node.
		node := &api.Node{
			ID: "newnode",
			Spec: api.NodeSpec{
				Annotations: api.Annotations{
					Name: "newnode",
				},
			},
			Status: api.NodeStatus{
				State: api.NodeStatus_READY,
			},
		}
		assert.NoError(t, store.CreateNode(tx, node))
		return nil
	})
	assert.NoError(t, err)

	assignment := watchAssignment(t, watch)
	assert.Equal(t, "newnode", assignment.NodeID)
}
Example #7
0
func (u *Updater) updateTask(ctx context.Context, slot slot, updated *api.Task) error {
	// Kick off the watch before even creating the updated task. This is in order to avoid missing any event.
	taskUpdates, cancel := state.Watch(u.watchQueue, state.EventUpdateTask{
		Task:   &api.Task{ID: updated.ID},
		Checks: []state.TaskCheckFunc{state.TaskCheckID},
	})
	defer cancel()

	var delayStartCh <-chan struct{}
	// Atomically create the updated task and bring down the old one.
	_, err := u.store.Batch(func(batch *store.Batch) error {
		err := batch.Update(func(tx store.Tx) error {
			if err := store.CreateTask(tx, updated); err != nil {
				return err
			}
			return nil
		})
		if err != nil {
			return err
		}

		u.removeOldTasks(ctx, batch, slot)

		for _, t := range slot {
			if t.DesiredState == api.TaskStateRunning {
				// Wait for the old task to stop or time out, and then set the new one
				// to RUNNING.
				delayStartCh = u.restarts.DelayStart(ctx, nil, t, updated.ID, 0, true)
				break
			}
		}

		return nil

	})
	if err != nil {
		return err
	}

	if delayStartCh != nil {
		<-delayStartCh
	}

	// Wait for the new task to come up.
	// TODO(aluzzardi): Consider adding a timeout here.
	for {
		select {
		case e := <-taskUpdates:
			updated = e.(state.EventUpdateTask).Task
			if updated.Status.State >= api.TaskStateRunning {
				return nil
			}
		case <-u.stopChan:
			return nil
		}
	}
}
Example #8
0
func (u *Updater) updateTask(ctx context.Context, original, updated *api.Task) error {
	log.G(ctx).Debugf("replacing %s with %s", original.ID, updated.ID)
	// Kick off the watch before even creating the updated task. This is in order to avoid missing any event.
	taskUpdates, cancel := state.Watch(u.watchQueue, state.EventUpdateTask{
		Task:   &api.Task{ID: updated.ID},
		Checks: []state.TaskCheckFunc{state.TaskCheckID},
	})
	defer cancel()

	var delayStartCh <-chan struct{}
	// Atomically create the updated task and bring down the old one.
	err := u.store.Update(func(tx store.Tx) error {
		t := store.GetTask(tx, original.ID)
		if t == nil {
			return fmt.Errorf("task %s not found while trying to update it", original.ID)
		}
		if t.DesiredState > api.TaskStateRunning {
			return fmt.Errorf("task %s was already shut down when reached by updater", original.ID)
		}
		t.DesiredState = api.TaskStateShutdown
		if err := store.UpdateTask(tx, t); err != nil {
			return err
		}

		if err := store.CreateTask(tx, updated); err != nil {
			return err
		}

		// Wait for the old task to stop or time out, and then set the new one
		// to RUNNING.
		delayStartCh = u.restarts.DelayStart(ctx, tx, original, updated.ID, 0, true)

		return nil

	})
	if err != nil {
		return err
	}

	<-delayStartCh

	// Wait for the new task to come up.
	// TODO(aluzzardi): Consider adding a timeout here.
	for {
		select {
		case e := <-taskUpdates:
			updated = e.(state.EventUpdateTask).Task
			if updated.Status.State >= api.TaskStateRunning {
				return nil
			}
		case <-u.stopChan:
			return nil
		}
	}
}
Example #9
0
// NewTaskReaper creates a new TaskReaper.
func NewTaskReaper(store *store.MemoryStore) *TaskReaper {
	watcher, cancel := state.Watch(store.WatchQueue(), state.EventCreateTask{}, state.EventUpdateCluster{})

	return &TaskReaper{
		store:       store,
		watcher:     watcher,
		cancelWatch: cancel,
		dirty:       make(map[instanceTuple]struct{}),
		stopChan:    make(chan struct{}),
		doneChan:    make(chan struct{}),
	}
}
Example #10
0
func (s *subscription) Run(ctx context.Context) {
	s.ctx, s.cancel = context.WithCancel(ctx)

	if s.follow() {
		wq := s.store.WatchQueue()
		ch, cancel := state.Watch(wq, state.EventCreateTask{}, state.EventUpdateTask{})
		go func() {
			defer cancel()
			s.watch(ch)
		}()
	}

	s.match()
}
Example #11
0
func TestSetup(t *testing.T) {
	store := store.NewMemoryStore(nil)
	assert.NotNil(t, store)
	defer store.Close()

	watch, cancel := state.Watch(store.WatchQueue() /*state.EventCreateTask{}, state.EventUpdateTask{}*/)
	defer cancel()

	observedTask1 := SetupCluster(t, store, watch)

	assert.Equal(t, observedTask1.Status.State, api.TaskStateNew)
	assert.Equal(t, observedTask1.ServiceAnnotations.Name, "name1")
	assert.Equal(t, observedTask1.NodeID, "id1")
}
Example #12
0
func TestAddNode(t *testing.T) {
	store := store.NewMemoryStore(nil)
	assert.NotNil(t, store)
	defer store.Close()

	watch, cancel := state.Watch(store.WatchQueue())
	defer cancel()

	SetupCluster(t, store, watch)

	addNode(t, store, node2)
	observedTask2 := testutils.WatchTaskCreate(t, watch)
	assert.Equal(t, observedTask2.Status.State, api.TaskStateNew)
	assert.Equal(t, observedTask2.ServiceAnnotations.Name, "name1")
	assert.Equal(t, observedTask2.NodeID, "id2")
}
Example #13
0
func TestDeleteService(t *testing.T) {
	store := store.NewMemoryStore(nil)
	assert.NotNil(t, store)
	defer store.Close()

	watch, cancel := state.Watch(store.WatchQueue())
	defer cancel()

	SetupCluster(t, store, watch)

	deleteService(t, store, service1)
	// task should be deleted
	observedTask := testutils.WatchTaskDelete(t, watch)
	assert.Equal(t, observedTask.ServiceAnnotations.Name, "name1")
	assert.Equal(t, observedTask.NodeID, "id1")
}
Example #14
0
func TestDeleteNode(t *testing.T) {
	store := store.NewMemoryStore(nil)
	assert.NotNil(t, store)

	SetupCluster(t, store)

	watch, cancel := state.Watch(store.WatchQueue())
	defer cancel()

	skipEvents(t, watch)

	deleteNode(t, store, node1)
	// task should be set to dead
	observedTask := watchShutdownTask(t, watch)
	assert.Equal(t, observedTask.ServiceAnnotations.Name, "name1")
	assert.Equal(t, observedTask.NodeID, "id1")
}
Example #15
0
func TestAddService(t *testing.T) {
	store := store.NewMemoryStore(nil)
	assert.NotNil(t, store)

	SetupCluster(t, store)

	watch, cancel := state.Watch(store.WatchQueue())
	defer cancel()

	skipEvents(t, watch)

	addService(t, store, service2)
	observedTask := watchTaskCreate(t, watch)
	assert.Equal(t, observedTask.Status.State, api.TaskStateNew)
	assert.Equal(t, observedTask.ServiceAnnotations.Name, "name2")
	assert.True(t, observedTask.NodeID == "id1")
}
Example #16
0
// ViewAndWatch calls a callback which can observe the state of this
// MemoryStore. It also returns a channel that will return further events from
// this point so the snapshot can be kept up to date. The watch channel must be
// released with watch.StopWatch when it is no longer needed. The channel is
// guaranteed to get all events after the moment of the snapshot, and only
// those events.
func ViewAndWatch(store *MemoryStore, cb func(ReadTx) error, specifiers ...state.Event) (watch chan events.Event, cancel func(), err error) {
	// Using Update to lock the store and guarantee consistency between
	// the watcher and the the state seen by the callback. snapshotReadTx
	// exposes this Tx as a ReadTx so the callback can't modify it.
	err = store.Update(func(tx Tx) error {
		if err := cb(tx); err != nil {
			return err
		}
		watch, cancel = state.Watch(store.WatchQueue(), specifiers...)
		return nil
	})
	if watch != nil && err != nil {
		cancel()
		cancel = nil
		watch = nil
	}
	return
}
Example #17
0
func TestSchedulerFaultyNode(t *testing.T) {
	ctx := context.Background()

	taskTemplate := &api.Task{
		ServiceID:    "service1",
		DesiredState: api.TaskStateRunning,
		ServiceAnnotations: api.Annotations{
			Name: "name1",
		},
		Status: api.TaskStatus{
			State: api.TaskStatePending,
		},
	}

	node1 := &api.Node{
		ID: "id1",
		Spec: api.NodeSpec{
			Annotations: api.Annotations{
				Name: "id1",
			},
		},
		Status: api.NodeStatus{
			State: api.NodeStatus_READY,
		},
	}

	node2 := &api.Node{
		ID: "id2",
		Spec: api.NodeSpec{
			Annotations: api.Annotations{
				Name: "id2",
			},
		},
		Status: api.NodeStatus{
			State: api.NodeStatus_READY,
		},
	}

	s := store.NewMemoryStore(nil)
	assert.NotNil(t, s)
	defer s.Close()

	err := s.Update(func(tx store.Tx) error {
		// Add initial nodes, and one task assigned to node id1
		assert.NoError(t, store.CreateNode(tx, node1))
		assert.NoError(t, store.CreateNode(tx, node2))

		task1 := taskTemplate.Copy()
		task1.ID = "id1"
		task1.NodeID = "id1"
		task1.Status.State = api.TaskStateRunning
		assert.NoError(t, store.CreateTask(tx, task1))
		return nil
	})
	assert.NoError(t, err)

	scheduler := New(s)

	watch, cancel := state.Watch(s.WatchQueue(), state.EventUpdateTask{})
	defer cancel()

	go func() {
		assert.NoError(t, scheduler.Run(ctx))
	}()
	defer scheduler.Stop()

	for i := 0; i != 8; i++ {
		// Simulate a task failure cycle
		newTask := taskTemplate.Copy()
		newTask.ID = identity.NewID()

		err = s.Update(func(tx store.Tx) error {
			assert.NoError(t, store.CreateTask(tx, newTask))
			return nil
		})
		assert.NoError(t, err)

		assignment := watchAssignment(t, watch)
		assert.Equal(t, newTask.ID, assignment.ID)

		if i < 5 {
			// The first 5 attempts should be assigned to node id2 because
			// it has no replicas of the service.
			assert.Equal(t, "id2", assignment.NodeID)
		} else {
			// The next ones should be assigned to id1, since we'll
			// flag id2 as potentially faulty.
			assert.Equal(t, "id1", assignment.NodeID)
		}

		err = s.Update(func(tx store.Tx) error {
			newTask := store.GetTask(tx, newTask.ID)
			require.NotNil(t, newTask)
			newTask.Status.State = api.TaskStateFailed
			assert.NoError(t, store.UpdateTask(tx, newTask))
			return nil
		})
		assert.NoError(t, err)
	}
}
Example #18
0
func benchScheduler(b *testing.B, nodes, tasks int, networkConstraints bool) {
	ctx := context.Background()

	for iters := 0; iters < b.N; iters++ {
		b.StopTimer()
		s := store.NewMemoryStore(nil)
		scheduler := New(s)

		watch, cancel := state.Watch(s.WatchQueue(), state.EventUpdateTask{})

		go func() {
			_ = scheduler.Run(ctx)
		}()

		// Let the scheduler get started
		runtime.Gosched()

		_ = s.Update(func(tx store.Tx) error {
			// Create initial nodes and tasks
			for i := 0; i < nodes; i++ {
				n := &api.Node{
					ID: identity.NewID(),
					Spec: api.NodeSpec{
						Annotations: api.Annotations{
							Name:   "name" + strconv.Itoa(i),
							Labels: make(map[string]string),
						},
					},
					Status: api.NodeStatus{
						State: api.NodeStatus_READY,
					},
					Description: &api.NodeDescription{
						Engine: &api.EngineDescription{},
					},
				}
				// Give every third node a special network
				if i%3 == 0 {
					n.Description.Engine.Plugins = []api.PluginDescription{
						{
							Name: "network",
							Type: "Network",
						},
					}

				}
				err := store.CreateNode(tx, n)
				if err != nil {
					panic(err)
				}
			}
			for i := 0; i < tasks; i++ {
				id := "task" + strconv.Itoa(i)
				t := &api.Task{
					ID:           id,
					DesiredState: api.TaskStateRunning,
					ServiceAnnotations: api.Annotations{
						Name: id,
					},
					Status: api.TaskStatus{
						State: api.TaskStatePending,
					},
				}
				if networkConstraints {
					t.Networks = []*api.NetworkAttachment{
						{
							Network: &api.Network{
								DriverState: &api.Driver{
									Name: "network",
								},
							},
						},
					}
				}
				err := store.CreateTask(tx, t)
				if err != nil {
					panic(err)
				}
			}
			b.StartTimer()
			return nil
		})

		for i := 0; i != tasks; i++ {
			<-watch
		}

		scheduler.Stop()
		cancel()
		s.Close()
	}
}
Example #19
0
func TestDrain(t *testing.T) {
	ctx := context.Background()
	initialService := &api.Service{
		ID: "id1",
		Spec: api.ServiceSpec{
			Annotations: api.Annotations{
				Name: "name1",
			},
			Task: api.TaskSpec{
				Runtime: &api.TaskSpec_Container{
					Container: &api.ContainerSpec{},
				},
				Restart: &api.RestartPolicy{
					Condition: api.RestartOnNone,
				},
			},
			Mode: &api.ServiceSpec_Replicated{
				Replicated: &api.ReplicatedService{
					Replicas: 6,
				},
			},
		},
	}
	initialNodeSet := []*api.Node{
		{
			ID: "id1",
			Spec: api.NodeSpec{
				Annotations: api.Annotations{
					Name: "name1",
				},
				Availability: api.NodeAvailabilityActive,
			},
			Status: api.NodeStatus{
				State: api.NodeStatus_READY,
			},
		},
		{
			ID: "id2",
			Spec: api.NodeSpec{
				Annotations: api.Annotations{
					Name: "name2",
				},
				Availability: api.NodeAvailabilityActive,
			},
			Status: api.NodeStatus{
				State: api.NodeStatus_DOWN,
			},
		},
		// We should NOT kick out tasks on UNKNOWN nodes.
		{
			ID: "id3",
			Spec: api.NodeSpec{
				Annotations: api.Annotations{
					Name: "name3",
				},
				Availability: api.NodeAvailabilityActive,
			},
			Status: api.NodeStatus{
				State: api.NodeStatus_UNKNOWN,
			},
		},
		{
			ID: "id4",
			Spec: api.NodeSpec{
				Annotations: api.Annotations{
					Name: "name4",
				},
				Availability: api.NodeAvailabilityPause,
			},
			Status: api.NodeStatus{
				State: api.NodeStatus_READY,
			},
		},
		{
			ID: "id5",
			Spec: api.NodeSpec{
				Annotations: api.Annotations{
					Name: "name5",
				},
				Availability: api.NodeAvailabilityDrain,
			},
			Status: api.NodeStatus{
				State: api.NodeStatus_READY,
			},
		},
	}

	initialTaskSet := []*api.Task{
		// Task not assigned to any node
		{
			ID: "id0",
			Status: api.TaskStatus{
				State: api.TaskStateNew,
			},
			Slot: 1,
			ServiceAnnotations: api.Annotations{
				Name: "name0",
			},
			ServiceID: "id1",
		},
		// Tasks assigned to the nodes defined above
		{
			ID: "id1",
			Status: api.TaskStatus{
				State: api.TaskStateNew,
			},
			Slot: 2,
			ServiceAnnotations: api.Annotations{
				Name: "name1",
			},
			ServiceID: "id1",
			NodeID:    "id1",
		},
		{
			ID: "id2",
			Status: api.TaskStatus{
				State: api.TaskStateNew,
			},
			Slot: 3,
			ServiceAnnotations: api.Annotations{
				Name: "name2",
			},
			ServiceID: "id1",
			NodeID:    "id2",
		},
		{
			ID: "id3",
			Status: api.TaskStatus{
				State: api.TaskStateNew,
			},
			Slot: 4,
			ServiceAnnotations: api.Annotations{
				Name: "name3",
			},
			ServiceID: "id1",
			NodeID:    "id3",
		},
		{
			ID: "id4",
			Status: api.TaskStatus{
				State: api.TaskStateNew,
			},
			Slot: 5,
			ServiceAnnotations: api.Annotations{
				Name: "name4",
			},
			ServiceID: "id1",
			NodeID:    "id4",
		},
		{
			ID: "id5",
			Status: api.TaskStatus{
				State: api.TaskStateNew,
			},
			Slot: 6,
			ServiceAnnotations: api.Annotations{
				Name: "name5",
			},
			ServiceID: "id1",
			NodeID:    "id5",
		},
	}

	s := store.NewMemoryStore(nil)
	assert.NotNil(t, s)
	defer s.Close()

	err := s.Update(func(tx store.Tx) error {
		// Prepopulate service
		assert.NoError(t, store.CreateService(tx, initialService))
		// Prepoulate nodes
		for _, n := range initialNodeSet {
			assert.NoError(t, store.CreateNode(tx, n))
		}

		// Prepopulate tasks
		for _, task := range initialTaskSet {
			assert.NoError(t, store.CreateTask(tx, task))
		}
		return nil
	})
	assert.NoError(t, err)

	watch, cancel := state.Watch(s.WatchQueue(), state.EventUpdateTask{})
	defer cancel()

	orchestrator := NewReplicatedOrchestrator(s)
	defer orchestrator.Stop()

	go func() {
		assert.NoError(t, orchestrator.Run(ctx))
	}()

	// id2 and id5 should be killed immediately
	deletion1 := watchShutdownTask(t, watch)
	deletion2 := watchShutdownTask(t, watch)

	assert.Regexp(t, "id(2|5)", deletion1.ID)
	assert.Regexp(t, "id(2|5)", deletion1.NodeID)
	assert.Regexp(t, "id(2|5)", deletion2.ID)
	assert.Regexp(t, "id(2|5)", deletion2.NodeID)

	// Create a new task, assigned to node id2
	err = s.Update(func(tx store.Tx) error {
		task := initialTaskSet[2].Copy()
		task.ID = "newtask"
		task.NodeID = "id2"
		assert.NoError(t, store.CreateTask(tx, task))
		return nil
	})
	assert.NoError(t, err)

	deletion3 := watchShutdownTask(t, watch)
	assert.Equal(t, "newtask", deletion3.ID)
	assert.Equal(t, "id2", deletion3.NodeID)

	// Set node id4 to the DRAINED state
	err = s.Update(func(tx store.Tx) error {
		n := initialNodeSet[3].Copy()
		n.Spec.Availability = api.NodeAvailabilityDrain
		assert.NoError(t, store.UpdateNode(tx, n))
		return nil
	})
	assert.NoError(t, err)

	deletion4 := watchShutdownTask(t, watch)
	assert.Equal(t, "id4", deletion4.ID)
	assert.Equal(t, "id4", deletion4.NodeID)

	// Delete node id1
	err = s.Update(func(tx store.Tx) error {
		assert.NoError(t, store.DeleteNode(tx, "id1"))
		return nil
	})
	assert.NoError(t, err)

	deletion5 := watchShutdownTask(t, watch)
	assert.Equal(t, "id1", deletion5.ID)
	assert.Equal(t, "id1", deletion5.NodeID)
}
Example #20
0
func TestUpdaterRollback(t *testing.T) {
	ctx := context.Background()
	s := store.NewMemoryStore(nil)
	assert.NotNil(t, s)
	defer s.Close()

	orchestrator := NewReplicatedOrchestrator(s)
	defer orchestrator.Stop()

	var (
		failImage1 uint32
		failImage2 uint32
	)

	watchCreate, cancelCreate := state.Watch(s.WatchQueue(), state.EventCreateTask{})
	defer cancelCreate()

	watchServiceUpdate, cancelServiceUpdate := state.Watch(s.WatchQueue(), state.EventUpdateService{})
	defer cancelServiceUpdate()

	// Fail new tasks the updater tries to run
	watchUpdate, cancelUpdate := state.Watch(s.WatchQueue(), state.EventUpdateTask{})
	defer cancelUpdate()
	go func() {
		failedLast := false
		for {
			select {
			case e := <-watchUpdate:
				task := e.(state.EventUpdateTask).Task
				if task.DesiredState == task.Status.State {
					continue
				}
				if task.DesiredState == api.TaskStateRunning && task.Status.State != api.TaskStateFailed && task.Status.State != api.TaskStateRunning {
					err := s.Update(func(tx store.Tx) error {
						task = store.GetTask(tx, task.ID)
						// Never fail two image2 tasks in a row, so there's a mix of
						// failed and successful tasks for the rollback.
						if task.Spec.GetContainer().Image == "image1" && atomic.LoadUint32(&failImage1) == 1 {
							task.Status.State = api.TaskStateFailed
							failedLast = true
						} else if task.Spec.GetContainer().Image == "image2" && atomic.LoadUint32(&failImage2) == 1 && !failedLast {
							task.Status.State = api.TaskStateFailed
							failedLast = true
						} else {
							task.Status.State = task.DesiredState
							failedLast = false
						}
						return store.UpdateTask(tx, task)
					})
					assert.NoError(t, err)
				} else if task.DesiredState > api.TaskStateRunning {
					err := s.Update(func(tx store.Tx) error {
						task = store.GetTask(tx, task.ID)
						task.Status.State = task.DesiredState
						return store.UpdateTask(tx, task)
					})
					assert.NoError(t, err)
				}
			}
		}
	}()

	// Create a service with four replicas specified before the orchestrator
	// is started. This should result in two tasks when the orchestrator
	// starts up.
	err := s.Update(func(tx store.Tx) error {
		s1 := &api.Service{
			ID: "id1",
			Spec: api.ServiceSpec{
				Annotations: api.Annotations{
					Name: "name1",
				},
				Task: api.TaskSpec{
					Runtime: &api.TaskSpec_Container{
						Container: &api.ContainerSpec{
							Image: "image1",
						},
					},
					Restart: &api.RestartPolicy{
						Condition: api.RestartOnNone,
					},
				},
				Mode: &api.ServiceSpec_Replicated{
					Replicated: &api.ReplicatedService{
						Replicas: 4,
					},
				},
				Update: &api.UpdateConfig{
					FailureAction:   api.UpdateConfig_ROLLBACK,
					Parallelism:     1,
					Delay:           *ptypes.DurationProto(10 * time.Millisecond),
					Monitor:         ptypes.DurationProto(500 * time.Millisecond),
					MaxFailureRatio: 0.4,
				},
			},
		}

		assert.NoError(t, store.CreateService(tx, s1))
		return nil
	})
	assert.NoError(t, err)

	// Start the orchestrator.
	go func() {
		assert.NoError(t, orchestrator.Run(ctx))
	}()

	observedTask := testutils.WatchTaskCreate(t, watchCreate)
	assert.Equal(t, observedTask.Status.State, api.TaskStateNew)
	assert.Equal(t, observedTask.Spec.GetContainer().Image, "image1")

	observedTask = testutils.WatchTaskCreate(t, watchCreate)
	assert.Equal(t, observedTask.Status.State, api.TaskStateNew)
	assert.Equal(t, observedTask.Spec.GetContainer().Image, "image1")

	observedTask = testutils.WatchTaskCreate(t, watchCreate)
	assert.Equal(t, observedTask.Status.State, api.TaskStateNew)
	assert.Equal(t, observedTask.Spec.GetContainer().Image, "image1")

	observedTask = testutils.WatchTaskCreate(t, watchCreate)
	assert.Equal(t, observedTask.Status.State, api.TaskStateNew)
	assert.Equal(t, observedTask.Spec.GetContainer().Image, "image1")

	atomic.StoreUint32(&failImage2, 1)

	// Start a rolling update
	err = s.Update(func(tx store.Tx) error {
		s1 := store.GetService(tx, "id1")
		require.NotNil(t, s1)
		s1.PreviousSpec = s1.Spec.Copy()
		s1.UpdateStatus = nil
		s1.Spec.Task.GetContainer().Image = "image2"
		assert.NoError(t, store.UpdateService(tx, s1))
		return nil
	})
	assert.NoError(t, err)

	// Should see three tasks started, then a rollback

	observedTask = testutils.WatchTaskCreate(t, watchCreate)
	assert.Equal(t, observedTask.Status.State, api.TaskStateNew)
	assert.Equal(t, observedTask.Spec.GetContainer().Image, "image2")

	observedTask = testutils.WatchTaskCreate(t, watchCreate)
	assert.Equal(t, observedTask.Status.State, api.TaskStateNew)
	assert.Equal(t, observedTask.Spec.GetContainer().Image, "image2")

	observedTask = testutils.WatchTaskCreate(t, watchCreate)
	assert.Equal(t, observedTask.Status.State, api.TaskStateNew)
	assert.Equal(t, observedTask.Spec.GetContainer().Image, "image2")

	// Should get to the ROLLBACK_STARTED state
	for {
		e := <-watchServiceUpdate
		if e.(state.EventUpdateService).Service.UpdateStatus == nil {
			continue
		}
		if e.(state.EventUpdateService).Service.UpdateStatus.State == api.UpdateStatus_ROLLBACK_STARTED {
			break
		}
	}

	observedTask = testutils.WatchTaskCreate(t, watchCreate)
	assert.Equal(t, observedTask.Status.State, api.TaskStateNew)
	assert.Equal(t, observedTask.Spec.GetContainer().Image, "image1")

	observedTask = testutils.WatchTaskCreate(t, watchCreate)
	assert.Equal(t, observedTask.Status.State, api.TaskStateNew)
	assert.Equal(t, observedTask.Spec.GetContainer().Image, "image1")

	observedTask = testutils.WatchTaskCreate(t, watchCreate)
	assert.Equal(t, observedTask.Status.State, api.TaskStateNew)
	assert.Equal(t, observedTask.Spec.GetContainer().Image, "image1")

	// Should end up in ROLLBACK_COMPLETED state
	for {
		e := <-watchServiceUpdate
		if e.(state.EventUpdateService).Service.UpdateStatus.State == api.UpdateStatus_ROLLBACK_COMPLETED {
			break
		}
	}

	atomic.StoreUint32(&failImage1, 1)

	// Repeat the rolling update but this time fail the tasks that the
	// rollback creates. It should end up in ROLLBACK_PAUSED.
	err = s.Update(func(tx store.Tx) error {
		s1 := store.GetService(tx, "id1")
		require.NotNil(t, s1)
		s1.PreviousSpec = s1.Spec.Copy()
		s1.UpdateStatus = nil
		s1.Spec.Task.GetContainer().Image = "image2"
		assert.NoError(t, store.UpdateService(tx, s1))
		return nil
	})
	assert.NoError(t, err)

	// Should see three tasks started, then a rollback

	observedTask = testutils.WatchTaskCreate(t, watchCreate)
	assert.Equal(t, observedTask.Status.State, api.TaskStateNew)
	assert.Equal(t, observedTask.Spec.GetContainer().Image, "image2")

	observedTask = testutils.WatchTaskCreate(t, watchCreate)
	assert.Equal(t, observedTask.Status.State, api.TaskStateNew)
	assert.Equal(t, observedTask.Spec.GetContainer().Image, "image2")

	observedTask = testutils.WatchTaskCreate(t, watchCreate)
	assert.Equal(t, observedTask.Status.State, api.TaskStateNew)
	assert.Equal(t, observedTask.Spec.GetContainer().Image, "image2")

	// Should get to the ROLLBACK_STARTED state
	for {
		e := <-watchServiceUpdate
		if e.(state.EventUpdateService).Service.UpdateStatus == nil {
			continue
		}
		if e.(state.EventUpdateService).Service.UpdateStatus.State == api.UpdateStatus_ROLLBACK_STARTED {
			break
		}
	}

	observedTask = testutils.WatchTaskCreate(t, watchCreate)
	assert.Equal(t, observedTask.Status.State, api.TaskStateNew)
	assert.Equal(t, observedTask.Spec.GetContainer().Image, "image1")

	observedTask = testutils.WatchTaskCreate(t, watchCreate)
	assert.Equal(t, observedTask.Status.State, api.TaskStateNew)
	assert.Equal(t, observedTask.Spec.GetContainer().Image, "image1")

	observedTask = testutils.WatchTaskCreate(t, watchCreate)
	assert.Equal(t, observedTask.Status.State, api.TaskStateNew)
	assert.Equal(t, observedTask.Spec.GetContainer().Image, "image1")

	// Should end up in ROLLBACK_PAUSED state
	for {
		e := <-watchServiceUpdate
		if e.(state.EventUpdateService).Service.UpdateStatus.State == api.UpdateStatus_ROLLBACK_PAUSED {
			break
		}
	}
}
Example #21
0
// Run starts all allocator go-routines and waits for Stop to be called.
func (a *Allocator) Run(ctx context.Context) error {
	// Setup cancel context for all goroutines to use.
	ctx, cancel := context.WithCancel(ctx)
	var wg sync.WaitGroup

	defer func() {
		cancel()
		wg.Wait()
		close(a.doneChan)
	}()

	var actors []func() error
	watch, watchCancel := state.Watch(a.store.WatchQueue(),
		state.EventCreateNetwork{},
		state.EventDeleteNetwork{},
		state.EventCreateService{},
		state.EventUpdateService{},
		state.EventDeleteService{},
		state.EventCreateTask{},
		state.EventUpdateTask{},
		state.EventDeleteTask{},
		state.EventCreateNode{},
		state.EventUpdateNode{},
		state.EventDeleteNode{},
		state.EventCommit{},
	)

	for _, aa := range []allocActor{
		{
			ch:        watch,
			cancel:    watchCancel,
			taskVoter: networkVoter,
			init:      a.doNetworkInit,
			action:    a.doNetworkAlloc,
		},
	} {
		if aa.taskVoter != "" {
			a.registerToVote(aa.taskVoter)
		}

		// Copy the iterated value for variable capture.
		aaCopy := aa
		actor := func() error {
			wg.Add(1)
			defer wg.Done()

			// init might return an allocator specific context
			// which is a child of the passed in context to hold
			// allocator specific state
			if err := aaCopy.init(ctx); err != nil {
				// Stop the watches for this allocator
				// if we are failing in the init of
				// this allocator.
				aa.cancel()
				return err
			}

			wg.Add(1)
			go func() {
				defer wg.Done()
				a.run(ctx, aaCopy)
			}()
			return nil
		}

		actors = append(actors, actor)
	}

	for _, actor := range actors {
		if err := actor(); err != nil {
			return err
		}
	}

	<-a.stopChan
	return nil
}
Example #22
0
func TestScheduler(t *testing.T) {
	ctx := context.Background()
	initialNodeSet := []*api.Node{
		{
			ID: "id1",
			Spec: api.NodeSpec{
				Annotations: api.Annotations{
					Name: "name1",
				},
			},
			Status: api.NodeStatus{
				State: api.NodeStatus_READY,
			},
		},
		{
			ID: "id2",
			Spec: api.NodeSpec{
				Annotations: api.Annotations{
					Name: "name2",
				},
			},
			Status: api.NodeStatus{
				State: api.NodeStatus_READY,
			},
		},
		{
			ID: "id3",
			Spec: api.NodeSpec{
				Annotations: api.Annotations{
					Name: "name2",
				},
			},
			Status: api.NodeStatus{
				State: api.NodeStatus_READY,
			},
		},
	}

	initialTaskSet := []*api.Task{
		{
			ID:           "id1",
			DesiredState: api.TaskStateRunning,
			ServiceAnnotations: api.Annotations{
				Name: "name1",
			},

			Status: api.TaskStatus{
				State: api.TaskStateAssigned,
			},
			NodeID: initialNodeSet[0].ID,
		},
		{
			ID:           "id2",
			DesiredState: api.TaskStateRunning,
			ServiceAnnotations: api.Annotations{
				Name: "name2",
			},
			Status: api.TaskStatus{
				State: api.TaskStatePending,
			},
		},
		{
			ID:           "id3",
			DesiredState: api.TaskStateRunning,
			ServiceAnnotations: api.Annotations{
				Name: "name2",
			},
			Status: api.TaskStatus{
				State: api.TaskStatePending,
			},
		},
	}

	s := store.NewMemoryStore(nil)
	assert.NotNil(t, s)
	defer s.Close()

	err := s.Update(func(tx store.Tx) error {
		// Prepoulate nodes
		for _, n := range initialNodeSet {
			assert.NoError(t, store.CreateNode(tx, n))
		}

		// Prepopulate tasks
		for _, task := range initialTaskSet {
			assert.NoError(t, store.CreateTask(tx, task))
		}
		return nil
	})
	assert.NoError(t, err)

	scheduler := New(s)

	watch, cancel := state.Watch(s.WatchQueue(), state.EventUpdateTask{})
	defer cancel()

	go func() {
		assert.NoError(t, scheduler.Run(ctx))
	}()
	defer scheduler.Stop()

	assignment1 := watchAssignment(t, watch)
	// must assign to id2 or id3 since id1 already has a task
	assert.Regexp(t, assignment1.NodeID, "(id2|id3)")

	assignment2 := watchAssignment(t, watch)
	// must assign to id2 or id3 since id1 already has a task
	if assignment1.NodeID == "id2" {
		assert.Equal(t, "id3", assignment2.NodeID)
	} else {
		assert.Equal(t, "id2", assignment2.NodeID)
	}

	err = s.Update(func(tx store.Tx) error {
		// Update each node to make sure this doesn't mess up the
		// scheduler's state.
		for _, n := range initialNodeSet {
			assert.NoError(t, store.UpdateNode(tx, n))
		}
		return nil
	})
	assert.NoError(t, err)

	err = s.Update(func(tx store.Tx) error {
		// Delete the task associated with node 1 so it's now the most lightly
		// loaded node.
		assert.NoError(t, store.DeleteTask(tx, "id1"))

		// Create a new task. It should get assigned to id1.
		t4 := &api.Task{
			ID:           "id4",
			DesiredState: api.TaskStateRunning,
			ServiceAnnotations: api.Annotations{
				Name: "name4",
			},
			Status: api.TaskStatus{
				State: api.TaskStatePending,
			},
		}
		assert.NoError(t, store.CreateTask(tx, t4))
		return nil
	})
	assert.NoError(t, err)

	assignment3 := watchAssignment(t, watch)
	assert.Equal(t, "id1", assignment3.NodeID)

	// Update a task to make it unassigned. It should get assigned by the
	// scheduler.
	err = s.Update(func(tx store.Tx) error {
		// Remove assignment from task id4. It should get assigned
		// to node id1.
		t4 := &api.Task{
			ID:           "id4",
			DesiredState: api.TaskStateRunning,
			ServiceAnnotations: api.Annotations{
				Name: "name4",
			},
			Status: api.TaskStatus{
				State: api.TaskStatePending,
			},
		}
		assert.NoError(t, store.UpdateTask(tx, t4))
		return nil
	})
	assert.NoError(t, err)

	assignment4 := watchAssignment(t, watch)
	assert.Equal(t, "id1", assignment4.NodeID)

	err = s.Update(func(tx store.Tx) error {
		// Create a ready node, then remove it. No tasks should ever
		// be assigned to it.
		node := &api.Node{
			ID: "removednode",
			Spec: api.NodeSpec{
				Annotations: api.Annotations{
					Name: "removednode",
				},
			},
			Status: api.NodeStatus{
				State: api.NodeStatus_DOWN,
			},
		}
		assert.NoError(t, store.CreateNode(tx, node))
		assert.NoError(t, store.DeleteNode(tx, node.ID))

		// Create an unassigned task.
		task := &api.Task{
			ID:           "removednode",
			DesiredState: api.TaskStateRunning,
			ServiceAnnotations: api.Annotations{
				Name: "removednode",
			},
			Status: api.TaskStatus{
				State: api.TaskStatePending,
			},
		}
		assert.NoError(t, store.CreateTask(tx, task))
		return nil
	})
	assert.NoError(t, err)

	assignmentRemovedNode := watchAssignment(t, watch)
	assert.NotEqual(t, "removednode", assignmentRemovedNode.NodeID)

	err = s.Update(func(tx store.Tx) error {
		// Create a ready node. It should be used for the next
		// assignment.
		n4 := &api.Node{
			ID: "id4",
			Spec: api.NodeSpec{
				Annotations: api.Annotations{
					Name: "name4",
				},
			},
			Status: api.NodeStatus{
				State: api.NodeStatus_READY,
			},
		}
		assert.NoError(t, store.CreateNode(tx, n4))

		// Create an unassigned task.
		t5 := &api.Task{
			ID:           "id5",
			DesiredState: api.TaskStateRunning,
			ServiceAnnotations: api.Annotations{
				Name: "name5",
			},
			Status: api.TaskStatus{
				State: api.TaskStatePending,
			},
		}
		assert.NoError(t, store.CreateTask(tx, t5))
		return nil
	})
	assert.NoError(t, err)

	assignment5 := watchAssignment(t, watch)
	assert.Equal(t, "id4", assignment5.NodeID)

	err = s.Update(func(tx store.Tx) error {
		// Create a non-ready node. It should NOT be used for the next
		// assignment.
		n5 := &api.Node{
			ID: "id5",
			Spec: api.NodeSpec{
				Annotations: api.Annotations{
					Name: "name5",
				},
			},
			Status: api.NodeStatus{
				State: api.NodeStatus_DOWN,
			},
		}
		assert.NoError(t, store.CreateNode(tx, n5))

		// Create an unassigned task.
		t6 := &api.Task{
			ID:           "id6",
			DesiredState: api.TaskStateRunning,
			ServiceAnnotations: api.Annotations{
				Name: "name6",
			},
			Status: api.TaskStatus{
				State: api.TaskStatePending,
			},
		}
		assert.NoError(t, store.CreateTask(tx, t6))
		return nil
	})
	assert.NoError(t, err)

	assignment6 := watchAssignment(t, watch)
	assert.NotEqual(t, "id5", assignment6.NodeID)

	err = s.Update(func(tx store.Tx) error {
		// Update node id5 to put it in the READY state.
		n5 := &api.Node{
			ID: "id5",
			Spec: api.NodeSpec{
				Annotations: api.Annotations{
					Name: "name5",
				},
			},
			Status: api.NodeStatus{
				State: api.NodeStatus_READY,
			},
		}
		assert.NoError(t, store.UpdateNode(tx, n5))

		// Create an unassigned task. Should be assigned to the
		// now-ready node.
		t7 := &api.Task{
			ID:           "id7",
			DesiredState: api.TaskStateRunning,
			ServiceAnnotations: api.Annotations{
				Name: "name7",
			},
			Status: api.TaskStatus{
				State: api.TaskStatePending,
			},
		}
		assert.NoError(t, store.CreateTask(tx, t7))
		return nil
	})
	assert.NoError(t, err)

	assignment7 := watchAssignment(t, watch)
	assert.Equal(t, "id5", assignment7.NodeID)

	err = s.Update(func(tx store.Tx) error {
		// Create a ready node, then immediately take it down. The next
		// unassigned task should NOT be assigned to it.
		n6 := &api.Node{
			ID: "id6",
			Spec: api.NodeSpec{
				Annotations: api.Annotations{
					Name: "name6",
				},
			},
			Status: api.NodeStatus{
				State: api.NodeStatus_READY,
			},
		}
		assert.NoError(t, store.CreateNode(tx, n6))
		n6.Status.State = api.NodeStatus_DOWN
		assert.NoError(t, store.UpdateNode(tx, n6))

		// Create an unassigned task.
		t8 := &api.Task{
			ID:           "id8",
			DesiredState: api.TaskStateRunning,
			ServiceAnnotations: api.Annotations{
				Name: "name8",
			},
			Status: api.TaskStatus{
				State: api.TaskStatePending,
			},
		}
		assert.NoError(t, store.CreateTask(tx, t8))
		return nil
	})
	assert.NoError(t, err)

	assignment8 := watchAssignment(t, watch)
	assert.NotEqual(t, "id6", assignment8.NodeID)
}
Example #23
0
func TestAllocator(t *testing.T) {
	s := store.NewMemoryStore(nil)
	assert.NotNil(t, s)
	defer s.Close()

	a, err := New(s)
	assert.NoError(t, err)
	assert.NotNil(t, a)

	// Try adding some objects to store before allocator is started
	assert.NoError(t, s.Update(func(tx store.Tx) error {
		n1 := &api.Network{
			ID: "testID1",
			Spec: api.NetworkSpec{
				Annotations: api.Annotations{
					Name: "test1",
				},
			},
		}
		assert.NoError(t, store.CreateNetwork(tx, n1))

		s1 := &api.Service{
			ID: "testServiceID1",
			Spec: api.ServiceSpec{
				Annotations: api.Annotations{
					Name: "service1",
				},
				Task: api.TaskSpec{
					Networks: []*api.NetworkAttachmentConfig{
						{
							Target: "testID1",
						},
					},
				},
				Endpoint: &api.EndpointSpec{},
			},
		}
		assert.NoError(t, store.CreateService(tx, s1))

		t1 := &api.Task{
			ID: "testTaskID1",
			Status: api.TaskStatus{
				State: api.TaskStateNew,
			},
			Networks: []*api.NetworkAttachment{
				{
					Network: n1,
				},
			},
		}
		assert.NoError(t, store.CreateTask(tx, t1))
		return nil
	}))

	netWatch, cancel := state.Watch(s.WatchQueue(), state.EventUpdateNetwork{}, state.EventDeleteNetwork{})
	defer cancel()
	taskWatch, cancel := state.Watch(s.WatchQueue(), state.EventUpdateTask{}, state.EventDeleteTask{})
	defer cancel()
	serviceWatch, cancel := state.Watch(s.WatchQueue(), state.EventUpdateService{}, state.EventDeleteService{})
	defer cancel()

	// Start allocator
	go func() {
		assert.NoError(t, a.Run(context.Background()))
	}()

	// Now verify if we get network and tasks updated properly
	watchNetwork(t, netWatch, false, isValidNetwork)
	watchTask(t, s, taskWatch, false, isValidTask)
	watchService(t, serviceWatch, false, nil)

	// Add new networks/tasks/services after allocator is started.
	assert.NoError(t, s.Update(func(tx store.Tx) error {
		n2 := &api.Network{
			ID: "testID2",
			Spec: api.NetworkSpec{
				Annotations: api.Annotations{
					Name: "test2",
				},
			},
		}
		assert.NoError(t, store.CreateNetwork(tx, n2))
		return nil
	}))

	watchNetwork(t, netWatch, false, isValidNetwork)

	assert.NoError(t, s.Update(func(tx store.Tx) error {
		s2 := &api.Service{
			ID: "testServiceID2",
			Spec: api.ServiceSpec{
				Annotations: api.Annotations{
					Name: "service2",
				},
				Networks: []*api.NetworkAttachmentConfig{
					{
						Target: "testID2",
					},
				},
				Endpoint: &api.EndpointSpec{},
			},
		}
		assert.NoError(t, store.CreateService(tx, s2))
		return nil
	}))

	watchService(t, serviceWatch, false, nil)

	assert.NoError(t, s.Update(func(tx store.Tx) error {
		t2 := &api.Task{
			ID: "testTaskID2",
			Status: api.TaskStatus{
				State: api.TaskStateNew,
			},
			ServiceID:    "testServiceID2",
			DesiredState: api.TaskStateRunning,
		}
		assert.NoError(t, store.CreateTask(tx, t2))
		return nil
	}))

	watchTask(t, s, taskWatch, false, isValidTask)

	// Now try adding a task which depends on a network before adding the network.
	n3 := &api.Network{
		ID: "testID3",
		Spec: api.NetworkSpec{
			Annotations: api.Annotations{
				Name: "test3",
			},
		},
	}

	assert.NoError(t, s.Update(func(tx store.Tx) error {
		t3 := &api.Task{
			ID: "testTaskID3",
			Status: api.TaskStatus{
				State: api.TaskStateNew,
			},
			DesiredState: api.TaskStateRunning,
			Networks: []*api.NetworkAttachment{
				{
					Network: n3,
				},
			},
		}
		assert.NoError(t, store.CreateTask(tx, t3))
		return nil
	}))

	// Wait for a little bit of time before adding network just to
	// test network is not available while task allocation is
	// going through
	time.Sleep(10 * time.Millisecond)

	assert.NoError(t, s.Update(func(tx store.Tx) error {
		assert.NoError(t, store.CreateNetwork(tx, n3))
		return nil
	}))

	watchNetwork(t, netWatch, false, isValidNetwork)
	watchTask(t, s, taskWatch, false, isValidTask)

	assert.NoError(t, s.Update(func(tx store.Tx) error {
		assert.NoError(t, store.DeleteTask(tx, "testTaskID3"))
		return nil
	}))
	watchTask(t, s, taskWatch, false, isValidTask)

	assert.NoError(t, s.Update(func(tx store.Tx) error {
		t5 := &api.Task{
			ID: "testTaskID5",
			Spec: api.TaskSpec{
				Networks: []*api.NetworkAttachmentConfig{
					{
						Target: "testID2",
					},
				},
			},
			Status: api.TaskStatus{
				State: api.TaskStateNew,
			},
			DesiredState: api.TaskStateRunning,
			ServiceID:    "testServiceID2",
		}
		assert.NoError(t, store.CreateTask(tx, t5))
		return nil
	}))
	watchTask(t, s, taskWatch, false, isValidTask)

	assert.NoError(t, s.Update(func(tx store.Tx) error {
		assert.NoError(t, store.DeleteNetwork(tx, "testID3"))
		return nil
	}))
	watchNetwork(t, netWatch, false, isValidNetwork)

	assert.NoError(t, s.Update(func(tx store.Tx) error {
		assert.NoError(t, store.DeleteService(tx, "testServiceID2"))
		return nil
	}))
	watchService(t, serviceWatch, false, nil)

	// Try to create a task with no network attachments and test
	// that it moves to ALLOCATED state.
	assert.NoError(t, s.Update(func(tx store.Tx) error {
		t4 := &api.Task{
			ID: "testTaskID4",
			Status: api.TaskStatus{
				State: api.TaskStateNew,
			},
			DesiredState: api.TaskStateRunning,
		}
		assert.NoError(t, store.CreateTask(tx, t4))
		return nil
	}))
	watchTask(t, s, taskWatch, false, isValidTask)

	assert.NoError(t, s.Update(func(tx store.Tx) error {
		n2 := store.GetNetwork(tx, "testID2")
		require.NotEqual(t, nil, n2)
		assert.NoError(t, store.UpdateNetwork(tx, n2))
		return nil
	}))
	watchNetwork(t, netWatch, false, isValidNetwork)
	watchNetwork(t, netWatch, true, nil)

	// Try updating task which is already allocated
	assert.NoError(t, s.Update(func(tx store.Tx) error {
		t2 := store.GetTask(tx, "testTaskID2")
		require.NotEqual(t, nil, t2)
		assert.NoError(t, store.UpdateTask(tx, t2))
		return nil
	}))
	watchTask(t, s, taskWatch, false, isValidTask)
	watchTask(t, s, taskWatch, true, nil)

	// Try adding networks with conflicting network resources and
	// add task which attaches to a network which gets allocated
	// later and verify if task reconciles and moves to ALLOCATED.
	n4 := &api.Network{
		ID: "testID4",
		Spec: api.NetworkSpec{
			Annotations: api.Annotations{
				Name: "test4",
			},
			DriverConfig: &api.Driver{
				Name: "overlay",
				Options: map[string]string{
					"com.docker.network.driver.overlay.vxlanid_list": "328",
				},
			},
		},
	}

	n5 := n4.Copy()
	n5.ID = "testID5"
	n5.Spec.Annotations.Name = "test5"
	assert.NoError(t, s.Update(func(tx store.Tx) error {
		assert.NoError(t, store.CreateNetwork(tx, n4))
		return nil
	}))
	watchNetwork(t, netWatch, false, isValidNetwork)

	assert.NoError(t, s.Update(func(tx store.Tx) error {
		assert.NoError(t, store.CreateNetwork(tx, n5))
		return nil
	}))
	watchNetwork(t, netWatch, true, nil)

	assert.NoError(t, s.Update(func(tx store.Tx) error {
		t6 := &api.Task{
			ID: "testTaskID6",
			Status: api.TaskStatus{
				State: api.TaskStateNew,
			},
			DesiredState: api.TaskStateRunning,
			Networks: []*api.NetworkAttachment{
				{
					Network: n5,
				},
			},
		}
		assert.NoError(t, store.CreateTask(tx, t6))
		return nil
	}))
	watchTask(t, s, taskWatch, true, nil)

	// Now remove the conflicting network.
	assert.NoError(t, s.Update(func(tx store.Tx) error {
		assert.NoError(t, store.DeleteNetwork(tx, n4.ID))
		return nil
	}))
	watchNetwork(t, netWatch, false, isValidNetwork)
	watchTask(t, s, taskWatch, false, isValidTask)

	// Try adding services with conflicting port configs and add
	// task which is part of the service whose allocation hasn't
	// happened and when that happens later and verify if task
	// reconciles and moves to ALLOCATED.
	s3 := &api.Service{
		ID: "testServiceID3",
		Spec: api.ServiceSpec{
			Annotations: api.Annotations{
				Name: "service3",
			},
			Endpoint: &api.EndpointSpec{
				Ports: []*api.PortConfig{
					{
						Name:          "http",
						TargetPort:    80,
						PublishedPort: 8080,
					},
				},
			},
		},
	}

	s4 := s3.Copy()
	s4.ID = "testServiceID4"
	s4.Spec.Annotations.Name = "service4"
	assert.NoError(t, s.Update(func(tx store.Tx) error {
		assert.NoError(t, store.CreateService(tx, s3))
		return nil
	}))
	watchService(t, serviceWatch, false, nil)
	assert.NoError(t, s.Update(func(tx store.Tx) error {
		assert.NoError(t, store.CreateService(tx, s4))
		return nil
	}))
	watchService(t, serviceWatch, true, nil)

	assert.NoError(t, s.Update(func(tx store.Tx) error {
		t7 := &api.Task{
			ID: "testTaskID7",
			Status: api.TaskStatus{
				State: api.TaskStateNew,
			},
			ServiceID:    "testServiceID4",
			DesiredState: api.TaskStateRunning,
		}
		assert.NoError(t, store.CreateTask(tx, t7))
		return nil
	}))
	watchTask(t, s, taskWatch, true, nil)

	// Now remove the conflicting service.
	assert.NoError(t, s.Update(func(tx store.Tx) error {
		assert.NoError(t, store.DeleteService(tx, s3.ID))
		return nil
	}))
	watchService(t, serviceWatch, false, nil)
	watchTask(t, s, taskWatch, false, isValidTask)

	a.Stop()
}
Example #24
0
func TestHA(t *testing.T) {
	ctx := context.Background()
	initialNodeSet := []*api.Node{
		{
			ID: "id1",
			Status: api.NodeStatus{
				State: api.NodeStatus_READY,
			},
		},
		{
			ID: "id2",
			Status: api.NodeStatus{
				State: api.NodeStatus_READY,
			},
		},
		{
			ID: "id3",
			Status: api.NodeStatus{
				State: api.NodeStatus_READY,
			},
		},
		{
			ID: "id4",
			Status: api.NodeStatus{
				State: api.NodeStatus_READY,
			},
		},
		{
			ID: "id5",
			Status: api.NodeStatus{
				State: api.NodeStatus_READY,
			},
		},
	}

	taskTemplate1 := &api.Task{
		DesiredState: api.TaskStateRunning,
		ServiceID:    "service1",
		Spec: api.TaskSpec{
			Runtime: &api.TaskSpec_Container{
				Container: &api.ContainerSpec{
					Image: "v:1",
				},
			},
		},
		Status: api.TaskStatus{
			State: api.TaskStatePending,
		},
	}

	taskTemplate2 := &api.Task{
		DesiredState: api.TaskStateRunning,
		ServiceID:    "service2",
		Spec: api.TaskSpec{
			Runtime: &api.TaskSpec_Container{
				Container: &api.ContainerSpec{
					Image: "v:2",
				},
			},
		},
		Status: api.TaskStatus{
			State: api.TaskStatePending,
		},
	}

	s := store.NewMemoryStore(nil)
	assert.NotNil(t, s)
	defer s.Close()

	t1Instances := 18

	err := s.Update(func(tx store.Tx) error {
		// Prepoulate nodes
		for _, n := range initialNodeSet {
			assert.NoError(t, store.CreateNode(tx, n))
		}

		// Prepopulate tasks from template 1
		for i := 0; i != t1Instances; i++ {
			taskTemplate1.ID = fmt.Sprintf("t1id%d", i)
			assert.NoError(t, store.CreateTask(tx, taskTemplate1))
		}
		return nil
	})
	assert.NoError(t, err)

	scheduler := New(s)

	watch, cancel := state.Watch(s.WatchQueue(), state.EventUpdateTask{})
	defer cancel()

	go func() {
		assert.NoError(t, scheduler.Run(ctx))
	}()
	defer scheduler.Stop()

	t1Assignments := make(map[string]int)
	for i := 0; i != t1Instances; i++ {
		assignment := watchAssignment(t, watch)
		if !strings.HasPrefix(assignment.ID, "t1") {
			t.Fatal("got assignment for different kind of task")
		}
		t1Assignments[assignment.NodeID]++
	}

	assert.Len(t, t1Assignments, 5)

	nodesWith3T1Tasks := 0
	nodesWith4T1Tasks := 0
	for nodeID, taskCount := range t1Assignments {
		if taskCount == 3 {
			nodesWith3T1Tasks++
		} else if taskCount == 4 {
			nodesWith4T1Tasks++
		} else {
			t.Fatalf("unexpected number of tasks %d on node %s", taskCount, nodeID)
		}
	}

	assert.Equal(t, 3, nodesWith4T1Tasks)
	assert.Equal(t, 2, nodesWith3T1Tasks)

	t2Instances := 2

	// Add a new service with two instances. They should fill the nodes
	// that only have two tasks.
	err = s.Update(func(tx store.Tx) error {
		for i := 0; i != t2Instances; i++ {
			taskTemplate2.ID = fmt.Sprintf("t2id%d", i)
			assert.NoError(t, store.CreateTask(tx, taskTemplate2))
		}
		return nil
	})
	assert.NoError(t, err)

	t2Assignments := make(map[string]int)
	for i := 0; i != t2Instances; i++ {
		assignment := watchAssignment(t, watch)
		if !strings.HasPrefix(assignment.ID, "t2") {
			t.Fatal("got assignment for different kind of task")
		}
		t2Assignments[assignment.NodeID]++
	}

	assert.Len(t, t2Assignments, 2)

	for nodeID := range t2Assignments {
		assert.Equal(t, 3, t1Assignments[nodeID])
	}

	// Scale up service 1 to 21 tasks. It should cover the two nodes that
	// service 2 was assigned to, and also one other node.
	err = s.Update(func(tx store.Tx) error {
		for i := t1Instances; i != t1Instances+3; i++ {
			taskTemplate1.ID = fmt.Sprintf("t1id%d", i)
			assert.NoError(t, store.CreateTask(tx, taskTemplate1))
		}
		return nil
	})
	assert.NoError(t, err)

	var sharedNodes [2]string

	for i := 0; i != 3; i++ {
		assignment := watchAssignment(t, watch)
		if !strings.HasPrefix(assignment.ID, "t1") {
			t.Fatal("got assignment for different kind of task")
		}
		if t1Assignments[assignment.NodeID] == 5 {
			t.Fatal("more than one new task assigned to the same node")
		}
		t1Assignments[assignment.NodeID]++

		if t2Assignments[assignment.NodeID] != 0 {
			if sharedNodes[0] == "" {
				sharedNodes[0] = assignment.NodeID
			} else if sharedNodes[1] == "" {
				sharedNodes[1] = assignment.NodeID
			} else {
				t.Fatal("all three assignments went to nodes with service2 tasks")
			}
		}
	}

	assert.NotEmpty(t, sharedNodes[0])
	assert.NotEmpty(t, sharedNodes[1])
	assert.NotEqual(t, sharedNodes[0], sharedNodes[1])

	nodesWith4T1Tasks = 0
	nodesWith5T1Tasks := 0
	for nodeID, taskCount := range t1Assignments {
		if taskCount == 4 {
			nodesWith4T1Tasks++
		} else if taskCount == 5 {
			nodesWith5T1Tasks++
		} else {
			t.Fatalf("unexpected number of tasks %d on node %s", taskCount, nodeID)
		}
	}

	assert.Equal(t, 4, nodesWith4T1Tasks)
	assert.Equal(t, 1, nodesWith5T1Tasks)

	// Add another task from service2. It must not land on the node that
	// has 5 service1 tasks.
	err = s.Update(func(tx store.Tx) error {
		taskTemplate2.ID = "t2id4"
		assert.NoError(t, store.CreateTask(tx, taskTemplate2))
		return nil
	})
	assert.NoError(t, err)

	assignment := watchAssignment(t, watch)
	if assignment.ID != "t2id4" {
		t.Fatal("got assignment for different task")
	}

	if t2Assignments[assignment.NodeID] != 0 {
		t.Fatal("was scheduled on a node that already has a service2 task")
	}
	if t1Assignments[assignment.NodeID] == 5 {
		t.Fatal("was scheduled on the node that has the most service1 tasks")
	}
	t2Assignments[assignment.NodeID]++

	// Remove all tasks on node id1.
	err = s.Update(func(tx store.Tx) error {
		tasks, err := store.FindTasks(tx, store.ByNodeID("id1"))
		assert.NoError(t, err)
		for _, task := range tasks {
			assert.NoError(t, store.DeleteTask(tx, task.ID))
		}
		return nil
	})
	assert.NoError(t, err)

	t1Assignments["id1"] = 0
	t2Assignments["id1"] = 0

	// Add four instances of service1 and two instances of service2.
	// All instances of service1 should land on node "id1", and one
	// of the two service2 instances should as well.
	// Put these in a map to randomize the order in which they are
	// created.
	err = s.Update(func(tx store.Tx) error {
		tasksMap := make(map[string]*api.Task)
		for i := 22; i <= 25; i++ {
			taskTemplate1.ID = fmt.Sprintf("t1id%d", i)
			tasksMap[taskTemplate1.ID] = taskTemplate1.Copy()
		}
		for i := 5; i <= 6; i++ {
			taskTemplate2.ID = fmt.Sprintf("t2id%d", i)
			tasksMap[taskTemplate2.ID] = taskTemplate2.Copy()
		}
		for _, task := range tasksMap {
			assert.NoError(t, store.CreateTask(tx, task))
		}
		return nil
	})
	assert.NoError(t, err)

	for i := 0; i != 4+2; i++ {
		assignment := watchAssignment(t, watch)
		if strings.HasPrefix(assignment.ID, "t1") {
			t1Assignments[assignment.NodeID]++
		} else if strings.HasPrefix(assignment.ID, "t2") {
			t2Assignments[assignment.NodeID]++
		}
	}

	assert.Equal(t, 4, t1Assignments["id1"])
	assert.Equal(t, 1, t2Assignments["id1"])
}
Example #25
0
func TestUpdater(t *testing.T) {
	ctx := context.Background()
	s := store.NewMemoryStore(nil)
	assert.NotNil(t, s)

	// Move tasks to their desired state.
	watch, cancel := state.Watch(s.WatchQueue(), state.EventUpdateTask{})
	defer cancel()
	go func() {
		for {
			select {
			case e := <-watch:
				task := e.(state.EventUpdateTask).Task
				if task.Status.State == task.DesiredState {
					continue
				}
				err := s.Update(func(tx store.Tx) error {
					task = store.GetTask(tx, task.ID)
					task.Status.State = task.DesiredState
					return store.UpdateTask(tx, task)
				})
				assert.NoError(t, err)
			}
		}
	}()

	instances := 3
	cluster := &api.Cluster{
		// test cluster configuration propagation to task creation.
		Spec: api.ClusterSpec{
			Annotations: api.Annotations{
				Name: "default",
			},
		},
	}

	service := &api.Service{
		ID: "id1",
		Spec: api.ServiceSpec{
			Annotations: api.Annotations{
				Name: "name1",
			},
			Mode: &api.ServiceSpec_Replicated{
				Replicated: &api.ReplicatedService{
					Replicas: uint64(instances),
				},
			},
			Task: api.TaskSpec{
				Runtime: &api.TaskSpec_Container{
					Container: &api.ContainerSpec{
						Image: "v:1",
						// This won't apply in this test because we set the old tasks to DEAD.
						StopGracePeriod: ptypes.DurationProto(time.Hour),
					},
				},
			},
		},
	}

	err := s.Update(func(tx store.Tx) error {
		assert.NoError(t, store.CreateCluster(tx, cluster))
		assert.NoError(t, store.CreateService(tx, service))
		for i := 0; i < instances; i++ {
			assert.NoError(t, store.CreateTask(tx, newTask(cluster, service, uint64(i))))
		}
		return nil
	})
	assert.NoError(t, err)

	originalTasks := getRunnableServiceTasks(t, s, service)
	for _, task := range originalTasks {
		assert.Equal(t, "v:1", task.Spec.GetContainer().Image)
		assert.Nil(t, task.LogDriver) // should be left alone
	}

	service.Spec.Task.GetContainer().Image = "v:2"
	service.Spec.Task.LogDriver = &api.Driver{Name: "tasklogdriver"}
	updater := NewUpdater(s, NewRestartSupervisor(s))
	updater.Run(ctx, cluster, service, getRunnableServiceTasks(t, s, service))
	updatedTasks := getRunnableServiceTasks(t, s, service)
	for _, task := range updatedTasks {
		assert.Equal(t, "v:2", task.Spec.GetContainer().Image)
		assert.Equal(t, service.Spec.Task.LogDriver, task.LogDriver) // pick up from task
	}

	service.Spec.Task.GetContainer().Image = "v:3"
	cluster.Spec.DefaultLogDriver = &api.Driver{Name: "clusterlogdriver"} // make cluster default logdriver.
	service.Spec.Update = &api.UpdateConfig{
		Parallelism: 1,
	}
	updater = NewUpdater(s, NewRestartSupervisor(s))
	updater.Run(ctx, cluster, service, getRunnableServiceTasks(t, s, service))
	updatedTasks = getRunnableServiceTasks(t, s, service)
	for _, task := range updatedTasks {
		assert.Equal(t, "v:3", task.Spec.GetContainer().Image)
		assert.Equal(t, service.Spec.Task.LogDriver, task.LogDriver) // still pick up from task
	}

	service.Spec.Task.GetContainer().Image = "v:4"
	service.Spec.Task.LogDriver = nil // use cluster default now.
	service.Spec.Update = &api.UpdateConfig{
		Parallelism: 1,
		Delay:       *ptypes.DurationProto(10 * time.Millisecond),
	}
	updater = NewUpdater(s, NewRestartSupervisor(s))
	updater.Run(ctx, cluster, service, getRunnableServiceTasks(t, s, service))
	updatedTasks = getRunnableServiceTasks(t, s, service)
	for _, task := range updatedTasks {
		assert.Equal(t, "v:4", task.Spec.GetContainer().Image)
		assert.Equal(t, cluster.Spec.DefaultLogDriver, task.LogDriver) // pick up from cluster
	}
}
Example #26
0
func TestUpdaterStopGracePeriod(t *testing.T) {
	ctx := context.Background()
	s := store.NewMemoryStore(nil)
	assert.NotNil(t, s)

	// Move tasks to their desired state.
	watch, cancel := state.Watch(s.WatchQueue(), state.EventUpdateTask{})
	defer cancel()
	go func() {
		for {
			select {
			case e := <-watch:
				task := e.(state.EventUpdateTask).Task
				err := s.Update(func(tx store.Tx) error {
					task = store.GetTask(tx, task.ID)
					// Explicitly do not set task state to
					// DEAD to trigger StopGracePeriod
					if task.DesiredState == api.TaskStateRunning && task.Status.State != api.TaskStateRunning {
						task.Status.State = api.TaskStateRunning
						return store.UpdateTask(tx, task)
					}
					return nil
				})
				assert.NoError(t, err)
			}
		}
	}()

	var instances uint64 = 3
	service := &api.Service{
		ID: "id1",
		Spec: api.ServiceSpec{
			Annotations: api.Annotations{
				Name: "name1",
			},
			Task: api.TaskSpec{
				Runtime: &api.TaskSpec_Container{
					Container: &api.ContainerSpec{
						Image:           "v:1",
						StopGracePeriod: ptypes.DurationProto(100 * time.Millisecond),
					},
				},
			},
			Mode: &api.ServiceSpec_Replicated{
				Replicated: &api.ReplicatedService{
					Replicas: instances,
				},
			},
		},
	}

	err := s.Update(func(tx store.Tx) error {
		assert.NoError(t, store.CreateService(tx, service))
		for i := uint64(0); i < instances; i++ {
			task := newTask(nil, service, uint64(i))
			task.Status.State = api.TaskStateRunning
			assert.NoError(t, store.CreateTask(tx, task))
		}
		return nil
	})
	assert.NoError(t, err)

	originalTasks := getRunnableServiceTasks(t, s, service)
	for _, task := range originalTasks {
		assert.Equal(t, "v:1", task.Spec.GetContainer().Image)
	}

	before := time.Now()

	service.Spec.Task.GetContainer().Image = "v:2"
	updater := NewUpdater(s, NewRestartSupervisor(s))
	// Override the default (1 minute) to speed up the test.
	updater.restarts.taskTimeout = 100 * time.Millisecond
	updater.Run(ctx, nil, service, getRunnableServiceTasks(t, s, service))
	updatedTasks := getRunnableServiceTasks(t, s, service)
	for _, task := range updatedTasks {
		assert.Equal(t, "v:2", task.Spec.GetContainer().Image)
	}

	after := time.Now()

	// At least 100 ms should have elapsed. Only check the lower bound,
	// because the system may be slow and it could have taken longer.
	if after.Sub(before) < 100*time.Millisecond {
		t.Fatal("stop timeout should have elapsed")
	}
}
Example #27
0
func TestSchedulerResourceConstraintHA(t *testing.T) {
	// node 1 starts with 1 task, node 2 starts with 3 tasks.
	// however, node 1 only has enough memory to schedule one more task.

	ctx := context.Background()
	node1 := &api.Node{
		ID: "id1",
		Spec: api.NodeSpec{
			Annotations: api.Annotations{
				Name: "id1",
			},
		},
		Status: api.NodeStatus{
			State: api.NodeStatus_READY,
		},
		Description: &api.NodeDescription{
			Resources: &api.Resources{
				MemoryBytes: 1e9,
			},
		},
	}
	node2 := &api.Node{
		ID: "id2",
		Spec: api.NodeSpec{
			Annotations: api.Annotations{
				Name: "id2",
			},
		},
		Status: api.NodeStatus{
			State: api.NodeStatus_READY,
		},
		Description: &api.NodeDescription{
			Resources: &api.Resources{
				MemoryBytes: 1e11,
			},
		},
	}

	taskTemplate := &api.Task{
		DesiredState: api.TaskStateRunning,
		Spec: api.TaskSpec{
			Runtime: &api.TaskSpec_Container{
				Container: &api.ContainerSpec{},
			},
			Resources: &api.ResourceRequirements{
				Reservations: &api.Resources{
					MemoryBytes: 5e8,
				},
			},
		},
		ServiceAnnotations: api.Annotations{
			Name: "name1",
		},
		Status: api.TaskStatus{
			State: api.TaskStatePending,
		},
	}

	s := store.NewMemoryStore(nil)
	assert.NotNil(t, s)
	defer s.Close()

	err := s.Update(func(tx store.Tx) error {
		// Add initial node and task
		assert.NoError(t, store.CreateNode(tx, node1))
		assert.NoError(t, store.CreateNode(tx, node2))

		// preassigned tasks
		task1 := taskTemplate.Copy()
		task1.ID = "id1"
		task1.NodeID = "id1"
		task1.Status.State = api.TaskStateRunning
		assert.NoError(t, store.CreateTask(tx, task1))

		task2 := taskTemplate.Copy()
		task2.ID = "id2"
		task2.NodeID = "id2"
		task2.Status.State = api.TaskStateRunning
		assert.NoError(t, store.CreateTask(tx, task2))

		task3 := taskTemplate.Copy()
		task3.ID = "id3"
		task3.NodeID = "id2"
		task3.Status.State = api.TaskStateRunning
		assert.NoError(t, store.CreateTask(tx, task3))

		task4 := taskTemplate.Copy()
		task4.ID = "id4"
		task4.NodeID = "id2"
		task4.Status.State = api.TaskStateRunning
		assert.NoError(t, store.CreateTask(tx, task4))

		// tasks to assign
		task5 := taskTemplate.Copy()
		task5.ID = "id5"
		assert.NoError(t, store.CreateTask(tx, task5))

		task6 := taskTemplate.Copy()
		task6.ID = "id6"
		assert.NoError(t, store.CreateTask(tx, task6))

		return nil
	})
	assert.NoError(t, err)

	scheduler := New(s)

	watch, cancel := state.Watch(s.WatchQueue(), state.EventUpdateTask{})
	defer cancel()

	go func() {
		assert.NoError(t, scheduler.Run(ctx))
	}()
	defer scheduler.Stop()

	assignment1 := watchAssignment(t, watch)
	if assignment1.ID != "id5" && assignment1.ID != "id6" {
		t.Fatal("assignment for unexpected task")
	}
	assignment2 := watchAssignment(t, watch)
	if assignment1.ID == "id5" {
		assert.Equal(t, "id6", assignment2.ID)
	} else {
		assert.Equal(t, "id5", assignment2.ID)
	}

	if assignment1.NodeID == "id1" {
		assert.Equal(t, "id2", assignment2.NodeID)
	} else {
		assert.Equal(t, "id1", assignment2.NodeID)
	}
}
Example #28
0
// Run starts the update and returns only once its complete or cancelled.
func (u *Updater) Run(ctx context.Context, slots []slot) {
	defer close(u.doneChan)

	service := u.newService

	// If the update is in a PAUSED state, we should not do anything.
	if service.UpdateStatus != nil &&
		(service.UpdateStatus.State == api.UpdateStatus_PAUSED ||
			service.UpdateStatus.State == api.UpdateStatus_ROLLBACK_PAUSED) {
		return
	}

	var dirtySlots []slot
	for _, slot := range slots {
		if u.isSlotDirty(slot) {
			dirtySlots = append(dirtySlots, slot)
		}
	}
	// Abort immediately if all tasks are clean.
	if len(dirtySlots) == 0 {
		if service.UpdateStatus != nil &&
			(service.UpdateStatus.State == api.UpdateStatus_UPDATING ||
				service.UpdateStatus.State == api.UpdateStatus_ROLLBACK_STARTED) {
			u.completeUpdate(ctx, service.ID)
		}
		return
	}

	// If there's no update in progress, we are starting one.
	if service.UpdateStatus == nil {
		u.startUpdate(ctx, service.ID)
	}

	parallelism := 0
	if service.Spec.Update != nil {
		parallelism = int(service.Spec.Update.Parallelism)
	}
	if parallelism == 0 {
		// TODO(aluzzardi): We could try to optimize unlimited parallelism by performing updates in a single
		// goroutine using a batch transaction.
		parallelism = len(dirtySlots)
	}

	// Start the workers.
	slotQueue := make(chan slot)
	wg := sync.WaitGroup{}
	wg.Add(parallelism)
	for i := 0; i < parallelism; i++ {
		go func() {
			u.worker(ctx, slotQueue)
			wg.Done()
		}()
	}

	failureAction := api.UpdateConfig_PAUSE
	allowedFailureFraction := float32(0)
	monitoringPeriod := defaultMonitor

	if service.Spec.Update != nil {
		failureAction = service.Spec.Update.FailureAction
		allowedFailureFraction = service.Spec.Update.AllowedFailureFraction

		if service.Spec.Update.Monitor != nil {
			var err error
			monitoringPeriod, err = ptypes.Duration(service.Spec.Update.Monitor)
			if err != nil {
				monitoringPeriod = defaultMonitor
			}
		}
	}

	var failedTaskWatch chan events.Event

	if failureAction != api.UpdateConfig_CONTINUE {
		var cancelWatch func()
		failedTaskWatch, cancelWatch = state.Watch(
			u.store.WatchQueue(),
			state.EventUpdateTask{
				Task:   &api.Task{ServiceID: service.ID, Status: api.TaskStatus{State: api.TaskStateRunning}},
				Checks: []state.TaskCheckFunc{state.TaskCheckServiceID, state.TaskCheckStateGreaterThan},
			},
		)
		defer cancelWatch()
	}

	stopped := false
	failedTasks := make(map[string]struct{})
	totalFailures := 0

	failureTriggersAction := func(failedTask *api.Task) bool {
		// Ignore tasks we have already seen as failures.
		if _, found := failedTasks[failedTask.ID]; found {
			return false
		}

		// If this failed/completed task is one that we
		// created as part of this update, we should
		// follow the failure action.
		u.updatedTasksMu.Lock()
		startedAt, found := u.updatedTasks[failedTask.ID]
		u.updatedTasksMu.Unlock()

		if found && (startedAt.IsZero() || time.Since(startedAt) <= monitoringPeriod) {
			failedTasks[failedTask.ID] = struct{}{}
			totalFailures++
			if float32(totalFailures)/float32(len(dirtySlots)) > allowedFailureFraction {
				switch failureAction {
				case api.UpdateConfig_PAUSE:
					stopped = true
					message := fmt.Sprintf("update paused due to failure or early termination of task %s", failedTask.ID)
					u.pauseUpdate(ctx, service.ID, message)
					return true
				case api.UpdateConfig_ROLLBACK:
					// Never roll back a rollback
					if service.UpdateStatus != nil && service.UpdateStatus.State == api.UpdateStatus_ROLLBACK_STARTED {
						message := fmt.Sprintf("rollback paused due to failure or early termination of task %s", failedTask.ID)
						u.pauseUpdate(ctx, service.ID, message)
						return true
					}
					stopped = true
					message := fmt.Sprintf("update rolled back due to failure or early termination of task %s", failedTask.ID)
					u.rollbackUpdate(ctx, service.ID, message)
					return true
				}
			}
		}

		return false
	}

slotsLoop:
	for _, slot := range dirtySlots {
	retryLoop:
		for {
			// Wait for a worker to pick up the task or abort the update, whichever comes first.
			select {
			case <-u.stopChan:
				stopped = true
				break slotsLoop
			case ev := <-failedTaskWatch:
				if failureTriggersAction(ev.(state.EventUpdateTask).Task) {
					break slotsLoop
				}
			case slotQueue <- slot:
				break retryLoop
			}
		}
	}

	close(slotQueue)
	wg.Wait()

	if !stopped {
		// Keep watching for task failures for one more monitoringPeriod,
		// before declaring the update complete.
		doneMonitoring := time.After(monitoringPeriod)
	monitorLoop:
		for {
			select {
			case <-u.stopChan:
				stopped = true
				break monitorLoop
			case <-doneMonitoring:
				break monitorLoop
			case ev := <-failedTaskWatch:
				if failureTriggersAction(ev.(state.EventUpdateTask).Task) {
					break monitorLoop
				}
			}
		}
	}

	// TODO(aaronl): Potentially roll back the service if not enough tasks
	// have reached RUNNING by this point.

	if !stopped {
		u.completeUpdate(ctx, service.ID)
	}
}
Example #29
0
func (u *Updater) updateTask(ctx context.Context, slot slot, updated *api.Task) error {
	// Kick off the watch before even creating the updated task. This is in order to avoid missing any event.
	taskUpdates, cancel := state.Watch(u.watchQueue, state.EventUpdateTask{
		Task:   &api.Task{ID: updated.ID},
		Checks: []state.TaskCheckFunc{state.TaskCheckID},
	})
	defer cancel()

	// Create an empty entry for this task, so the updater knows a failure
	// should count towards the failure count. The timestamp is added
	// if/when the task reaches RUNNING.
	u.updatedTasksMu.Lock()
	u.updatedTasks[updated.ID] = time.Time{}
	u.updatedTasksMu.Unlock()

	var delayStartCh <-chan struct{}
	// Atomically create the updated task and bring down the old one.
	_, err := u.store.Batch(func(batch *store.Batch) error {
		oldTask, err := u.removeOldTasks(ctx, batch, slot)
		if err != nil {
			return err
		}

		err = batch.Update(func(tx store.Tx) error {
			if err := store.CreateTask(tx, updated); err != nil {
				return err
			}
			return nil
		})
		if err != nil {
			return err
		}

		delayStartCh = u.restarts.DelayStart(ctx, nil, oldTask, updated.ID, 0, true)

		return nil

	})
	if err != nil {
		return err
	}

	if delayStartCh != nil {
		<-delayStartCh
	}

	// Wait for the new task to come up.
	// TODO(aluzzardi): Consider adding a timeout here.
	for {
		select {
		case e := <-taskUpdates:
			updated = e.(state.EventUpdateTask).Task
			if updated.Status.State >= api.TaskStateRunning {
				u.updatedTasksMu.Lock()
				u.updatedTasks[updated.ID] = time.Now()
				u.updatedTasksMu.Unlock()
				return nil
			}
		case <-u.stopChan:
			return nil
		}
	}
}
Example #30
0
func TestSchedulerResourceConstraint(t *testing.T) {
	ctx := context.Background()
	// Create a ready node without enough memory to run the task.
	underprovisionedNode := &api.Node{
		ID: "underprovisioned",
		Spec: api.NodeSpec{
			Annotations: api.Annotations{
				Name: "underprovisioned",
			},
		},
		Status: api.NodeStatus{
			State: api.NodeStatus_READY,
		},
		Description: &api.NodeDescription{
			Resources: &api.Resources{
				NanoCPUs:    1e9,
				MemoryBytes: 1e9,
			},
		},
	}

	// Non-ready nodes that satisfy the constraints but shouldn't be used
	nonready1 := &api.Node{
		ID: "nonready1",
		Spec: api.NodeSpec{
			Annotations: api.Annotations{
				Name: "nonready1",
			},
		},
		Status: api.NodeStatus{
			State: api.NodeStatus_UNKNOWN,
		},
		Description: &api.NodeDescription{
			Resources: &api.Resources{
				NanoCPUs:    2e9,
				MemoryBytes: 2e9,
			},
		},
	}
	nonready2 := &api.Node{
		ID: "nonready2",
		Spec: api.NodeSpec{
			Annotations: api.Annotations{
				Name: "nonready2",
			},
		},
		Status: api.NodeStatus{
			State: api.NodeStatus_UNKNOWN,
		},
		Description: &api.NodeDescription{
			Resources: &api.Resources{
				NanoCPUs:    2e9,
				MemoryBytes: 2e9,
			},
		},
	}

	initialTask := &api.Task{
		ID:           "id1",
		DesiredState: api.TaskStateRunning,
		Spec: api.TaskSpec{
			Runtime: &api.TaskSpec_Container{
				Container: &api.ContainerSpec{},
			},
			Resources: &api.ResourceRequirements{
				Reservations: &api.Resources{
					MemoryBytes: 2e9,
				},
			},
		},
		ServiceAnnotations: api.Annotations{
			Name: "name1",
		},
		Status: api.TaskStatus{
			State: api.TaskStatePending,
		},
	}

	s := store.NewMemoryStore(nil)
	assert.NotNil(t, s)
	defer s.Close()

	err := s.Update(func(tx store.Tx) error {
		// Add initial node and task
		assert.NoError(t, store.CreateTask(tx, initialTask))
		assert.NoError(t, store.CreateNode(tx, underprovisionedNode))
		assert.NoError(t, store.CreateNode(tx, nonready1))
		assert.NoError(t, store.CreateNode(tx, nonready2))
		return nil
	})
	assert.NoError(t, err)

	scheduler := New(s)

	watch, cancel := state.Watch(s.WatchQueue(), state.EventUpdateTask{})
	defer cancel()

	go func() {
		assert.NoError(t, scheduler.Run(ctx))
	}()
	defer scheduler.Stop()

	failure := watchAssignmentFailure(t, watch)
	assert.Equal(t, "no suitable node (2 nodes not available for new tasks; insufficient resources on 1 node)", failure.Status.Message)

	err = s.Update(func(tx store.Tx) error {
		// Create a node with enough memory. The task should get
		// assigned to this node.
		node := &api.Node{
			ID: "bignode",
			Spec: api.NodeSpec{
				Annotations: api.Annotations{
					Name: "bignode",
				},
			},
			Description: &api.NodeDescription{
				Resources: &api.Resources{
					NanoCPUs:    4e9,
					MemoryBytes: 8e9,
				},
			},
			Status: api.NodeStatus{
				State: api.NodeStatus_READY,
			},
		}
		assert.NoError(t, store.CreateNode(tx, node))
		return nil
	})
	assert.NoError(t, err)

	assignment := watchAssignment(t, watch)
	assert.Equal(t, "bignode", assignment.NodeID)
}