Example #1
func (m *Manager) watchForKEKChanges(ctx context.Context) error {
	clusterID := m.config.SecurityConfig.ClientTLSCreds.Organization()
	clusterWatch, clusterWatchCancel, err := store.ViewAndWatch(m.raftNode.MemoryStore(),
		func(tx store.ReadTx) error {
			cluster := store.GetCluster(tx, clusterID)
			if cluster == nil {
				return fmt.Errorf("unable to get current cluster")
			}
			return m.updateKEK(ctx, cluster)
		},
		state.EventUpdateCluster{
			Cluster: &api.Cluster{ID: clusterID},
			Checks:  []state.ClusterCheckFunc{state.ClusterCheckID},
		},
	)
	if err != nil {
		return err
	}
	go func() {
		for {
			select {
			case event := <-clusterWatch:
				clusterEvent := event.(state.EventUpdateCluster)
				m.updateKEK(ctx, clusterEvent.Cluster)
			case <-ctx.Done():
				clusterWatchCancel()
				return
			}
		}
	}()
	return nil
}
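The function above follows a snapshot-then-watch shape: the callback reads the current state while the watch is registered, and a goroutine then consumes update events until the context is cancelled. Below is a minimal stdlib-only sketch of that shape; viewAndWatch and the update type are hypothetical stand-ins, not the SwarmKit store API.

package sketch

import (
	"context"
	"fmt"
)

type update struct{ payload string }

// viewAndWatch is a hypothetical stand-in: it runs the view callback against
// the current state and returns a channel of later updates plus a cancel
// function that unregisters the watch.
func viewAndWatch(view func() error) (<-chan update, func(), error) {
	if err := view(); err != nil {
		return nil, nil, err
	}
	return make(chan update), func() { /* unregister the watch */ }, nil
}

func watchForChanges(ctx context.Context) error {
	updates, cancel, err := viewAndWatch(func() error {
		// process the current snapshot here
		return nil
	})
	if err != nil {
		return err
	}
	go func() {
		defer cancel() // release the watch on exit
		for {
			select {
			case u := <-updates:
				fmt.Println("apply update:", u.payload)
			case <-ctx.Done():
				return
			}
		}
	}()
	return nil
}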
Example #2
// Run is roleManager's main loop.
func (rm *roleManager) Run() {
	defer close(rm.doneChan)

	var (
		nodes    []*api.Node
		ticker   *time.Ticker
		tickerCh <-chan time.Time
	)

	watcher, cancelWatch, err := store.ViewAndWatch(rm.store,
		func(readTx store.ReadTx) error {
			var err error
			nodes, err = store.FindNodes(readTx, store.All)
			return err
		},
		state.EventUpdateNode{})
	if cancelWatch != nil {
		defer cancelWatch()
	}

	if err != nil {
		log.L.WithError(err).Error("failed to check nodes for role changes")
	} else {
		for _, node := range nodes {
			rm.pending[node.ID] = node
			rm.reconcileRole(node)
		}
		if len(rm.pending) != 0 {
			ticker = time.NewTicker(roleReconcileInterval)
			tickerCh = ticker.C
		}
	}

	for {
		select {
		case event := <-watcher:
			node := event.(state.EventUpdateNode).Node
			rm.pending[node.ID] = node
			rm.reconcileRole(node)
			if len(rm.pending) != 0 && ticker == nil {
				ticker = time.NewTicker(roleReconcileInterval)
				tickerCh = ticker.C
			}
		case <-tickerCh:
			for _, node := range rm.pending {
				rm.reconcileRole(node)
			}
			if len(rm.pending) == 0 {
				ticker.Stop()
				ticker = nil
				tickerCh = nil
			}
		case <-rm.ctx.Done():
			if ticker != nil {
				ticker.Stop()
			}
			return
		}
	}
}
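The ticker handling above is a lazy-ticker idiom: the ticker exists only while there is pending work, and a nil tickerCh disables its select case otherwise. A small stdlib-only sketch of just that idiom, with illustrative names:

package sketch

import "time"

const retryInterval = time.Second

// reconcileLoop keeps a ticker alive only while there is pending work; a nil
// tickerCh means that select case never fires.
func reconcileLoop(work <-chan string, done <-chan struct{}) {
	pending := map[string]struct{}{}
	var (
		ticker   *time.Ticker
		tickerCh <-chan time.Time // nil until there is pending work
	)

	for {
		select {
		case id := <-work:
			pending[id] = struct{}{}
			if ticker == nil {
				ticker = time.NewTicker(retryInterval)
				tickerCh = ticker.C
			}
		case <-tickerCh:
			for id := range pending {
				// retry the work for id; in this sketch every retry succeeds
				delete(pending, id)
			}
			if len(pending) == 0 {
				ticker.Stop()
				ticker = nil
				tickerCh = nil
			}
		case <-done:
			if ticker != nil {
				ticker.Stop()
			}
			return
		}
	}
}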
Example #3
// Run is the scheduler event loop.
func (s *Scheduler) Run(ctx context.Context) error {
	defer close(s.doneChan)

	updates, cancel, err := store.ViewAndWatch(s.store, s.setupTasksList)
	if err != nil {
		log.G(ctx).WithError(err).Errorf("snapshot store update failed")
		return err
	}
	defer cancel()

	// Validate resources for preassigned tasks.
	// Do this before other tasks because preassigned tasks, such as
	// global service tasks, should start before other tasks.
	s.processPreassignedTasks(ctx)

	// Queue all unassigned tasks before processing changes.
	s.tick(ctx)

	pendingChanges := 0

	// Watch for changes.
	for {
		select {
		case event := <-updates:
			switch v := event.(type) {
			case state.EventCreateTask:
				pendingChanges += s.createTask(ctx, v.Task)
			case state.EventUpdateTask:
				pendingChanges += s.updateTask(ctx, v.Task)
			case state.EventDeleteTask:
				s.deleteTask(ctx, v.Task)
			case state.EventCreateNode:
				s.createOrUpdateNode(v.Node)
				pendingChanges++
			case state.EventUpdateNode:
				s.createOrUpdateNode(v.Node)
				pendingChanges++
			case state.EventDeleteNode:
				s.nodeHeap.remove(v.Node.ID)
			case state.EventCommit:
				if len(s.preassignedTasks) > 0 {
					s.processPreassignedTasks(ctx)
				}
				if pendingChanges > 0 {
					s.tick(ctx)
					pendingChanges = 0
				}
			}

		case <-s.stopChan:
			return nil
		}
	}
}
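The pendingChanges counter above defers the expensive scheduling pass until a commit event arrives, so a transaction touching many objects triggers one tick rather than one per object. A stdlib-only sketch of that commit-driven batching, with illustrative event types:

package sketch

type objectEvent struct{}
type commitEvent struct{}

// eventLoop counts individual object changes and runs the expensive pass
// (tick) only when a commit arrives and something actually changed.
func eventLoop(updates <-chan interface{}, stop <-chan struct{}, tick func()) {
	pendingChanges := 0
	for {
		select {
		case ev := <-updates:
			switch ev.(type) {
			case objectEvent:
				pendingChanges++ // record the change, defer the work
			case commitEvent:
				if pendingChanges > 0 {
					tick() // one scheduling pass per batch of changes
					pendingChanges = 0
				}
			}
		case <-stop:
			return
		}
	}
}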
Example #4
// Session is a stream which controls the agent connection.
// Each message contains a list of backup Managers with weights. There is
// also a special boolean field Disconnect which, if true, indicates that
// the node should reconnect to another Manager immediately.
func (d *Dispatcher) Session(r *api.SessionRequest, stream api.Dispatcher_SessionServer) error {
	ctx := stream.Context()
	nodeInfo, err := ca.RemoteNode(ctx)
	if err != nil {
		return err
	}
	nodeID := nodeInfo.NodeID

	if err := d.isRunningLocked(); err != nil {
		return err
	}

	// register the node.
	sessionID, err := d.register(stream.Context(), nodeID, r.Description)
	if err != nil {
		return err
	}

	fields := logrus.Fields{
		"node.id":      nodeID,
		"node.session": sessionID,
		"method":       "(*Dispatcher).Session",
	}
	if nodeInfo.ForwardedBy != nil {
		fields["forwarder.id"] = nodeInfo.ForwardedBy.NodeID
	}
	log := log.G(ctx).WithFields(fields)

	var nodeObj *api.Node
	nodeUpdates, cancel, err := store.ViewAndWatch(d.store, func(readTx store.ReadTx) error {
		nodeObj = store.GetNode(readTx, nodeID)
		return nil
	}, state.EventUpdateNode{Node: &api.Node{ID: nodeID},
		Checks: []state.NodeCheckFunc{state.NodeCheckID}},
	)
	if cancel != nil {
		defer cancel()
	}

	if err != nil {
		log.WithError(err).Error("ViewAndWatch Node failed")
	}

	if _, err = d.nodes.GetWithSession(nodeID, sessionID); err != nil {
		return err
	}

	if err := stream.Send(&api.SessionMessage{
		SessionID:            sessionID,
		Node:                 nodeObj,
		Managers:             d.getManagers(),
		NetworkBootstrapKeys: d.networkBootstrapKeys,
	}); err != nil {
		return err
	}

	managerUpdates, mgrCancel := d.mgrQueue.Watch()
	defer mgrCancel()
	keyMgrUpdates, keyMgrCancel := d.keyMgrQueue.Watch()
	defer keyMgrCancel()

	// disconnectNode is a helper that forcibly shuts down the connection.
	disconnectNode := func() error {
		// force disconnect by shutting down the stream.
		transportStream, ok := transport.StreamFromContext(stream.Context())
		if ok {
			// if we have the transport stream, we can signal a disconnect
			// in the client.
			if err := transportStream.ServerTransport().Close(); err != nil {
				log.WithError(err).Error("session end")
			}
		}

		nodeStatus := api.NodeStatus{State: api.NodeStatus_DISCONNECTED, Message: "node is currently trying to find new manager"}
		if err := d.nodeRemove(nodeID, nodeStatus); err != nil {
			log.WithError(err).Error("failed to remove node")
		}
		// still return an abort if the transport closure was ineffective.
		return grpc.Errorf(codes.Aborted, "node must disconnect")
	}

	for {
		// After each message send, we need to check that the node's sessionID
		// hasn't changed. If it has, we close the stream and make the node
		// re-register.
		node, err := d.nodes.GetWithSession(nodeID, sessionID)
		if err != nil {
			return err
		}

		var mgrs []*api.WeightedPeer

		var disconnect bool

		select {
		case ev := <-managerUpdates:
			mgrs = ev.([]*api.WeightedPeer)
		case ev := <-nodeUpdates:
			nodeObj = ev.(state.EventUpdateNode).Node
		case <-stream.Context().Done():
			return stream.Context().Err()
		case <-node.Disconnect:
			disconnect = true
		case <-d.ctx.Done():
			disconnect = true
		case <-keyMgrUpdates:
		}
		if mgrs == nil {
			mgrs = d.getManagers()
		}

		if err := stream.Send(&api.SessionMessage{
			SessionID:            sessionID,
			Node:                 nodeObj,
			Managers:             mgrs,
			NetworkBootstrapKeys: d.networkBootstrapKeys,
		}); err != nil {
			return err
		}
		if disconnect {
			return disconnectNode()
		}
	}
}
Example #5
// Tasks is a stream of task states for a node. Each message contains the full
// list of tasks which should be run on the node; if a task is not present in
// that list, it should be terminated.
func (d *Dispatcher) Tasks(r *api.TasksRequest, stream api.Dispatcher_TasksServer) error {
	nodeInfo, err := ca.RemoteNode(stream.Context())
	if err != nil {
		return err
	}
	nodeID := nodeInfo.NodeID

	if err := d.isRunningLocked(); err != nil {
		return err
	}

	fields := logrus.Fields{
		"node.id":      nodeID,
		"node.session": r.SessionID,
		"method":       "(*Dispatcher).Tasks",
	}
	if nodeInfo.ForwardedBy != nil {
		fields["forwarder.id"] = nodeInfo.ForwardedBy.NodeID
	}
	log.G(stream.Context()).WithFields(fields).Debugf("")

	if _, err = d.nodes.GetWithSession(nodeID, r.SessionID); err != nil {
		return err
	}

	tasksMap := make(map[string]*api.Task)
	nodeTasks, cancel, err := store.ViewAndWatch(
		d.store,
		func(readTx store.ReadTx) error {
			tasks, err := store.FindTasks(readTx, store.ByNodeID(nodeID))
			if err != nil {
				return err
			}
			for _, t := range tasks {
				tasksMap[t.ID] = t
			}
			return nil
		},
		state.EventCreateTask{Task: &api.Task{NodeID: nodeID},
			Checks: []state.TaskCheckFunc{state.TaskCheckNodeID}},
		state.EventUpdateTask{Task: &api.Task{NodeID: nodeID},
			Checks: []state.TaskCheckFunc{state.TaskCheckNodeID}},
		state.EventDeleteTask{Task: &api.Task{NodeID: nodeID},
			Checks: []state.TaskCheckFunc{state.TaskCheckNodeID}},
	)
	if err != nil {
		return err
	}
	defer cancel()

	for {
		if _, err := d.nodes.GetWithSession(nodeID, r.SessionID); err != nil {
			return err
		}

		var tasks []*api.Task
		for _, t := range tasksMap {
			// dispatcher only sends tasks that have been assigned to a node
			if t != nil && t.Status.State >= api.TaskStateAssigned {
				tasks = append(tasks, t)
			}
		}

		if err := stream.Send(&api.TasksMessage{Tasks: tasks}); err != nil {
			return err
		}

		select {
		case event := <-nodeTasks:
			switch v := event.(type) {
			case state.EventCreateTask:
				tasksMap[v.Task.ID] = v.Task
			case state.EventUpdateTask:
				tasksMap[v.Task.ID] = v.Task
			case state.EventDeleteTask:
				delete(tasksMap, v.Task.ID)
			}
		case <-stream.Context().Done():
			return stream.Context().Err()
		case <-d.ctx.Done():
			return d.ctx.Err()
		}
	}
}
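This version of Tasks resends the full task list after every single event; Examples #8 and #14 show the same loop with batching added. A stdlib-only sketch of the resend-everything shape, with an illustrative event type in place of the store events:

package sketch

import "context"

// taskEvent is an illustrative stand-in for the store's create/update/delete
// task events.
type taskEvent struct {
	id      string
	deleted bool
}

// streamSnapshots keeps a set of task IDs and resends the whole set after
// every change, mirroring the loop above.
func streamSnapshots(ctx context.Context, changes <-chan taskEvent, send func([]string) error) error {
	tasks := map[string]struct{}{}
	for {
		var list []string
		for id := range tasks {
			list = append(list, id)
		}
		if err := send(list); err != nil {
			return err
		}
		select {
		case ev := <-changes:
			if ev.deleted {
				delete(tasks, ev.id)
			} else {
				tasks[ev.id] = struct{}{}
			}
		case <-ctx.Done():
			return ctx.Err()
		}
	}
}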
Example #6
// Run runs dispatcher tasks which should be run on the leader dispatcher.
// The dispatcher can be stopped by cancelling ctx or calling Stop().
func (d *Dispatcher) Run(ctx context.Context) error {
	d.mu.Lock()
	if d.isRunning() {
		d.mu.Unlock()
		return fmt.Errorf("dispatcher is already running")
	}
	logger := log.G(ctx).WithField("module", "dispatcher")
	ctx = log.WithLogger(ctx, logger)
	if err := d.markNodesUnknown(ctx); err != nil {
		logger.Errorf(`failed to move all nodes to "unknown" state: %v`, err)
	}
	configWatcher, cancel, err := store.ViewAndWatch(
		d.store,
		func(readTx store.ReadTx) error {
			clusters, err := store.FindClusters(readTx, store.ByName(store.DefaultClusterName))
			if err != nil {
				return err
			}
			if err == nil && len(clusters) == 1 {
				heartbeatPeriod, err := ptypes.Duration(clusters[0].Spec.Dispatcher.HeartbeatPeriod)
				if err == nil && heartbeatPeriod > 0 {
					d.config.HeartbeatPeriod = heartbeatPeriod
				}
				if clusters[0].NetworkBootstrapKeys != nil {
					d.networkBootstrapKeys = clusters[0].NetworkBootstrapKeys
				}
			}
			return nil
		},
		state.EventUpdateCluster{},
	)
	if err != nil {
		d.mu.Unlock()
		return err
	}
	defer cancel()
	d.ctx, d.cancel = context.WithCancel(ctx)
	d.mu.Unlock()

	publishManagers := func() {
		mgrs := getWeightedPeers(d.cluster)
		sort.Sort(weightedPeerByNodeID(mgrs))
		d.mu.Lock()
		if reflect.DeepEqual(mgrs, d.lastSeenManagers) {
			d.mu.Unlock()
			return
		}
		d.lastSeenManagers = mgrs
		d.mu.Unlock()
		d.mgrQueue.Publish(mgrs)
	}

	publishManagers()
	publishTicker := time.NewTicker(1 * time.Second)
	defer publishTicker.Stop()

	batchTimer := time.NewTimer(maxBatchInterval)
	defer batchTimer.Stop()

	for {
		select {
		case <-publishTicker.C:
			publishManagers()
		case <-d.processTaskUpdatesTrigger:
			d.processTaskUpdates()
			batchTimer.Reset(maxBatchInterval)
		case <-batchTimer.C:
			d.processTaskUpdates()
			batchTimer.Reset(maxBatchInterval)
		case v := <-configWatcher:
			cluster := v.(state.EventUpdateCluster)
			d.mu.Lock()
			if cluster.Cluster.Spec.Dispatcher.HeartbeatPeriod != nil {
				// ignore error, since Spec has passed validation before
				heartbeatPeriod, _ := ptypes.Duration(cluster.Cluster.Spec.Dispatcher.HeartbeatPeriod)
				if heartbeatPeriod != d.config.HeartbeatPeriod {
					// only call d.nodes.updatePeriod when heartbeatPeriod changes
					d.config.HeartbeatPeriod = heartbeatPeriod
					d.nodes.updatePeriod(d.config.HeartbeatPeriod, d.config.HeartbeatEpsilon, d.config.GracePeriodMultiplier)
				}
			}
			d.networkBootstrapKeys = cluster.Cluster.NetworkBootstrapKeys
			d.mu.Unlock()
			d.keyMgrQueue.Publish(struct{}{})
		case <-d.ctx.Done():
			return nil
		}
	}
}
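publishManagers above recomputes the manager list on every tick but only notifies subscribers when the sorted list actually differs from the last published one. A small sketch of that compare-before-publish idiom; the peer and publisher types are illustrative, not the SwarmKit ones:

package sketch

import (
	"reflect"
	"sync"
)

type peer struct {
	Addr   string
	Weight int
}

type publisher struct {
	mu       sync.Mutex
	lastSeen []peer
}

// publishIfChanged compares the freshly computed list against the last
// published one (the caller is expected to sort it first so the comparison
// is order-independent) and only calls publish when something changed.
func (p *publisher) publishIfChanged(current []peer, publish func([]peer)) {
	p.mu.Lock()
	if reflect.DeepEqual(current, p.lastSeen) {
		p.mu.Unlock()
		return // nothing changed since the last publish
	}
	p.lastSeen = current
	p.mu.Unlock()
	publish(current)
}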
Example #7
// Assignments is a stream of assignments for a node. Each message contains
// either a full list of tasks and secrets for the node, or an incremental update.
func (d *Dispatcher) Assignments(r *api.AssignmentsRequest, stream api.Dispatcher_AssignmentsServer) error {
	nodeInfo, err := ca.RemoteNode(stream.Context())
	if err != nil {
		return err
	}
	nodeID := nodeInfo.NodeID

	dctx, err := d.isRunningLocked()
	if err != nil {
		return err
	}

	fields := logrus.Fields{
		"node.id":      nodeID,
		"node.session": r.SessionID,
		"method":       "(*Dispatcher).Assignments",
	}
	if nodeInfo.ForwardedBy != nil {
		fields["forwarder.id"] = nodeInfo.ForwardedBy.NodeID
	}
	log := log.G(stream.Context()).WithFields(fields)
	log.Debugf("")

	if _, err = d.nodes.GetWithSession(nodeID, r.SessionID); err != nil {
		return err
	}

	var (
		sequence  int64
		appliesTo string
		initial   api.AssignmentsMessage
	)
	tasksMap := make(map[string]*api.Task)
	tasksUsingSecret := make(map[string]map[string]struct{})

	sendMessage := func(msg api.AssignmentsMessage, assignmentType api.AssignmentsMessage_Type) error {
		sequence++
		msg.AppliesTo = appliesTo
		msg.ResultsIn = strconv.FormatInt(sequence, 10)
		appliesTo = msg.ResultsIn
		msg.Type = assignmentType

		if err := stream.Send(&msg); err != nil {
			return err
		}
		return nil
	}

	// returns a slice of new secrets to send down
	addSecretsForTask := func(readTx store.ReadTx, t *api.Task) []*api.Secret {
		container := t.Spec.GetContainer()
		if container == nil {
			return nil
		}
		var newSecrets []*api.Secret
		for _, secretRef := range container.Secrets {
			// Empty ID prefix will return all secrets. Bail if there is no SecretID
			if secretRef.SecretID == "" {
				log.Debugf("invalid secret reference")
				continue
			}
			secretID := secretRef.SecretID
			log := log.WithFields(logrus.Fields{
				"secret.id":   secretID,
				"secret.name": secretRef.SecretName,
			})

			if len(tasksUsingSecret[secretID]) == 0 {
				tasksUsingSecret[secretID] = make(map[string]struct{})

				secrets, err := store.FindSecrets(readTx, store.ByIDPrefix(secretID))
				if err != nil {
					log.WithError(err).Errorf("error retrieving secret")
					continue
				}
				if len(secrets) != 1 {
					log.Debugf("secret not found")
					continue
				}

				// If the secret was found and there was one result
				// (there should never be more than one because of the
				// uniqueness constraint), add this secret to our
				// initial set that we send down.
				newSecrets = append(newSecrets, secrets[0])
			}
			tasksUsingSecret[secretID][t.ID] = struct{}{}
		}

		return newSecrets
	}

	// TODO(aaronl): Also send node secrets that should be exposed to
	// this node.
	nodeTasks, cancel, err := store.ViewAndWatch(
		d.store,
		func(readTx store.ReadTx) error {
			tasks, err := store.FindTasks(readTx, store.ByNodeID(nodeID))
			if err != nil {
				return err
			}

			for _, t := range tasks {
				// We only care about tasks that are ASSIGNED or
				// higher. If the state is below ASSIGNED, the
				// task may not meet the constraints for this
				// node, so we have to be careful about sending
				// secrets associated with it.
				if t.Status.State < api.TaskStateAssigned {
					continue
				}

				tasksMap[t.ID] = t
				taskChange := &api.AssignmentChange{
					Assignment: &api.Assignment{
						Item: &api.Assignment_Task{
							Task: t,
						},
					},
					Action: api.AssignmentChange_AssignmentActionUpdate,
				}
				initial.Changes = append(initial.Changes, taskChange)
				// Only send secrets down if these tasks are in state <= RUNNING
				if t.Status.State <= api.TaskStateRunning {
					newSecrets := addSecretsForTask(readTx, t)
					for _, secret := range newSecrets {
						secretChange := &api.AssignmentChange{
							Assignment: &api.Assignment{
								Item: &api.Assignment_Secret{
									Secret: secret,
								},
							},
							Action: api.AssignmentChange_AssignmentActionUpdate,
						}

						initial.Changes = append(initial.Changes, secretChange)
					}
				}
			}
			return nil
		},
		state.EventUpdateTask{Task: &api.Task{NodeID: nodeID},
			Checks: []state.TaskCheckFunc{state.TaskCheckNodeID}},
		state.EventDeleteTask{Task: &api.Task{NodeID: nodeID},
			Checks: []state.TaskCheckFunc{state.TaskCheckNodeID}},
		state.EventUpdateSecret{},
		state.EventDeleteSecret{},
	)
	if err != nil {
		return err
	}
	defer cancel()

	if err := sendMessage(initial, api.AssignmentsMessage_COMPLETE); err != nil {
		return err
	}

	for {
		// Check for session expiration
		if _, err := d.nodes.GetWithSession(nodeID, r.SessionID); err != nil {
			return err
		}

		// bursty events should be processed in batches and sent out together
		var (
			update          api.AssignmentsMessage
			modificationCnt int
			batchingTimer   *time.Timer
			batchingTimeout <-chan time.Time
			updateTasks     = make(map[string]*api.Task)
			updateSecrets   = make(map[string]*api.Secret)
			removeTasks     = make(map[string]struct{})
			removeSecrets   = make(map[string]struct{})
		)

		oneModification := func() {
			modificationCnt++

			if batchingTimer != nil {
				batchingTimer.Reset(batchingWaitTime)
			} else {
				batchingTimer = time.NewTimer(batchingWaitTime)
				batchingTimeout = batchingTimer.C
			}
		}

		// Release the secrets references from this task
		releaseSecretsForTask := func(t *api.Task) bool {
			var modified bool
			container := t.Spec.GetContainer()
			if container == nil {
				return modified
			}

			for _, secretRef := range container.Secrets {
				secretID := secretRef.SecretID
				delete(tasksUsingSecret[secretID], t.ID)
				if len(tasksUsingSecret[secretID]) == 0 {
					// No tasks are using the secret anymore
					delete(tasksUsingSecret, secretID)
					removeSecrets[secretID] = struct{}{}
					modified = true
				}
			}

			return modified
		}

		// The batching loop waits for 50 ms after the most recent
		// change, or until modificationBatchLimit is reached. The
		// worst case latency is modificationBatchLimit * batchingWaitTime,
		// which is 10 seconds.
	batchingLoop:
		for modificationCnt < modificationBatchLimit {
			select {
			case event := <-nodeTasks:
				switch v := event.(type) {
				// We don't monitor EventCreateTask because tasks are
				// never created in the ASSIGNED state. First tasks are
				// created by the orchestrator, then the scheduler moves
				// them to ASSIGNED. If this ever changes, we will need
				// to monitor task creations as well.
				case state.EventUpdateTask:
					// We only care about tasks that are ASSIGNED or
					// higher.
					if v.Task.Status.State < api.TaskStateAssigned {
						continue
					}

					if oldTask, exists := tasksMap[v.Task.ID]; exists {
						// States ASSIGNED and below are set by the orchestrator/scheduler,
						// not the agent, so tasks in these states need to be sent to the
						// agent even if nothing else has changed.
						if equality.TasksEqualStable(oldTask, v.Task) && v.Task.Status.State > api.TaskStateAssigned {
							// this update should not trigger a task change for the agent
							tasksMap[v.Task.ID] = v.Task
							// If this task got updated to a final state, let's release
							// the secrets that are being used by the task
							if v.Task.Status.State > api.TaskStateRunning {
								// If releasing the secrets caused a secret to be
								// removed from an agent, mark one modification
								if releaseSecretsForTask(v.Task) {
									oneModification()
								}
							}
							continue
						}
					} else if v.Task.Status.State <= api.TaskStateRunning {
						// If this task wasn't part of the assignment set before, and it's <= RUNNING,
						// add the secrets it references to the secrets assignment.
						// Task states > RUNNING are worker-reported only; tasks are never
						// created in a > RUNNING state.
						var newSecrets []*api.Secret
						d.store.View(func(readTx store.ReadTx) {
							newSecrets = addSecretsForTask(readTx, v.Task)
						})
						for _, secret := range newSecrets {
							updateSecrets[secret.ID] = secret
						}
					}
					tasksMap[v.Task.ID] = v.Task
					updateTasks[v.Task.ID] = v.Task

					oneModification()
				case state.EventDeleteTask:
					if _, exists := tasksMap[v.Task.ID]; !exists {
						continue
					}

					removeTasks[v.Task.ID] = struct{}{}

					delete(tasksMap, v.Task.ID)

					// Release the secrets being used by this task
					// Ignoring the return here. We will always mark
					// this as a modification, since a task is being
					// removed.
					releaseSecretsForTask(v.Task)

					oneModification()
				// TODO(aaronl): For node secrets, we'll need to handle
				// EventCreateSecret.
				case state.EventUpdateSecret:
					if _, exists := tasksUsingSecret[v.Secret.ID]; !exists {
						continue
					}
					log.Debugf("Secret %s (ID: %d) was updated though it was still referenced by one or more tasks",
						v.Secret.Spec.Annotations.Name, v.Secret.ID)

				case state.EventDeleteSecret:
					if _, exists := tasksUsingSecret[v.Secret.ID]; !exists {
						continue
					}
					log.Debugf("Secret %s (ID: %d) was deleted though it was still referenced by one or more tasks",
						v.Secret.Spec.Annotations.Name, v.Secret.ID)
				}
			case <-batchingTimeout:
				break batchingLoop
			case <-stream.Context().Done():
				return stream.Context().Err()
			case <-dctx.Done():
				return dctx.Err()
			}
		}

		if batchingTimer != nil {
			batchingTimer.Stop()
		}

		if modificationCnt > 0 {
			for id, task := range updateTasks {
				if _, ok := removeTasks[id]; !ok {
					taskChange := &api.AssignmentChange{
						Assignment: &api.Assignment{
							Item: &api.Assignment_Task{
								Task: task,
							},
						},
						Action: api.AssignmentChange_AssignmentActionUpdate,
					}

					update.Changes = append(update.Changes, taskChange)
				}
			}
			for id, secret := range updateSecrets {
				// If, due to multiple updates, this secret is no longer in use,
				// don't send it down.
				if len(tasksUsingSecret[id]) == 0 {
					// delete this secret from the set of secrets to be updated
					// so that the removal loop below sees the current list
					delete(updateSecrets, id)
					continue
				}
				secretChange := &api.AssignmentChange{
					Assignment: &api.Assignment{
						Item: &api.Assignment_Secret{
							Secret: secret,
						},
					},
					Action: api.AssignmentChange_AssignmentActionUpdate,
				}

				update.Changes = append(update.Changes, secretChange)
			}
			for id := range removeTasks {
				taskChange := &api.AssignmentChange{
					Assignment: &api.Assignment{
						Item: &api.Assignment_Task{
							Task: &api.Task{ID: id},
						},
					},
					Action: api.AssignmentChange_AssignmentActionRemove,
				}

				update.Changes = append(update.Changes, taskChange)
			}
			for id := range removeSecrets {
				// If this secret is also being sent in the updated set,
				// don't also add it to the removed set.
				if _, ok := updateSecrets[id]; ok {
					continue
				}

				secretChange := &api.AssignmentChange{
					Assignment: &api.Assignment{
						Item: &api.Assignment_Secret{
							Secret: &api.Secret{ID: id},
						},
					},
					Action: api.AssignmentChange_AssignmentActionRemove,
				}

				update.Changes = append(update.Changes, secretChange)
			}

			if err := sendMessage(update, api.AssignmentsMessage_INCREMENTAL); err != nil {
				return err
			}
		}
	}
}
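Both the task and secret changes above are accumulated by the same batching loop: each modification resets a wait timer, and the batch is flushed either when no new event arrives for batchingWaitTime or when modificationBatchLimit changes have been collected. A stdlib-only sketch of that loop in isolation; the string payload is illustrative, and the constants follow the values spelled out in Example #14:

package sketch

import "time"

const (
	modificationBatchLimit = 200
	batchingWaitTime       = 50 * time.Millisecond
)

// collectBatch gathers changes until a quiet period of batchingWaitTime or
// until modificationBatchLimit changes have been seen, then returns the batch.
func collectBatch(events <-chan string, stop <-chan struct{}) []string {
	var (
		batch           []string
		modificationCnt int
		batchingTimer   *time.Timer
		batchingTimeout <-chan time.Time // nil until the first change arrives
	)

batchingLoop:
	for modificationCnt < modificationBatchLimit {
		select {
		case ev := <-events:
			batch = append(batch, ev)
			modificationCnt++
			if batchingTimer != nil {
				batchingTimer.Reset(batchingWaitTime)
			} else {
				batchingTimer = time.NewTimer(batchingWaitTime)
				batchingTimeout = batchingTimer.C
			}
		case <-batchingTimeout:
			break batchingLoop // quiet period: flush what we have
		case <-stop:
			break batchingLoop
		}
	}

	if batchingTimer != nil {
		batchingTimer.Stop()
	}
	return batch
}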
Example #8
// Tasks is a stream of task states for a node. Each message contains the full
// list of tasks which should be run on the node; if a task is not present in
// that list, it should be terminated.
func (d *Dispatcher) Tasks(r *api.TasksRequest, stream api.Dispatcher_TasksServer) error {
	nodeInfo, err := ca.RemoteNode(stream.Context())
	if err != nil {
		return err
	}
	nodeID := nodeInfo.NodeID

	dctx, err := d.isRunningLocked()
	if err != nil {
		return err
	}

	fields := logrus.Fields{
		"node.id":      nodeID,
		"node.session": r.SessionID,
		"method":       "(*Dispatcher).Tasks",
	}
	if nodeInfo.ForwardedBy != nil {
		fields["forwarder.id"] = nodeInfo.ForwardedBy.NodeID
	}
	log.G(stream.Context()).WithFields(fields).Debugf("")

	if _, err = d.nodes.GetWithSession(nodeID, r.SessionID); err != nil {
		return err
	}

	tasksMap := make(map[string]*api.Task)
	nodeTasks, cancel, err := store.ViewAndWatch(
		d.store,
		func(readTx store.ReadTx) error {
			tasks, err := store.FindTasks(readTx, store.ByNodeID(nodeID))
			if err != nil {
				return err
			}
			for _, t := range tasks {
				tasksMap[t.ID] = t
			}
			return nil
		},
		state.EventCreateTask{Task: &api.Task{NodeID: nodeID},
			Checks: []state.TaskCheckFunc{state.TaskCheckNodeID}},
		state.EventUpdateTask{Task: &api.Task{NodeID: nodeID},
			Checks: []state.TaskCheckFunc{state.TaskCheckNodeID}},
		state.EventDeleteTask{Task: &api.Task{NodeID: nodeID},
			Checks: []state.TaskCheckFunc{state.TaskCheckNodeID}},
	)
	if err != nil {
		return err
	}
	defer cancel()

	for {
		if _, err := d.nodes.GetWithSession(nodeID, r.SessionID); err != nil {
			return err
		}

		var tasks []*api.Task
		for _, t := range tasksMap {
			// dispatcher only sends tasks that have been assigned to a node
			if t != nil && t.Status.State >= api.TaskStateAssigned {
				tasks = append(tasks, t)
			}
		}

		if err := stream.Send(&api.TasksMessage{Tasks: tasks}); err != nil {
			return err
		}

		// bursty events should be processed in batches and sent out as a snapshot
		var (
			modificationCnt int
			batchingTimer   *time.Timer
			batchingTimeout <-chan time.Time
		)

	batchingLoop:
		for modificationCnt < modificationBatchLimit {
			select {
			case event := <-nodeTasks:
				switch v := event.(type) {
				case state.EventCreateTask:
					tasksMap[v.Task.ID] = v.Task
					modificationCnt++
				case state.EventUpdateTask:
					if oldTask, exists := tasksMap[v.Task.ID]; exists {
						// States ASSIGNED and below are set by the orchestrator/scheduler,
						// not the agent, so tasks in these states need to be sent to the
						// agent even if nothing else has changed.
						if equality.TasksEqualStable(oldTask, v.Task) && v.Task.Status.State > api.TaskStateAssigned {
							// this update should not trigger action at agent
							tasksMap[v.Task.ID] = v.Task
							continue
						}
					}
					tasksMap[v.Task.ID] = v.Task
					modificationCnt++
				case state.EventDeleteTask:
					delete(tasksMap, v.Task.ID)
					modificationCnt++
				}
				if batchingTimer != nil {
					batchingTimer.Reset(batchingWaitTime)
				} else {
					batchingTimer = time.NewTimer(batchingWaitTime)
					batchingTimeout = batchingTimer.C
				}
			case <-batchingTimeout:
				break batchingLoop
			case <-stream.Context().Done():
				return stream.Context().Err()
			case <-dctx.Done():
				return dctx.Err()
			}
		}

		if batchingTimer != nil {
			batchingTimer.Stop()
		}
	}
}
Example #9
// Run runs dispatcher tasks which should be run on the leader dispatcher.
// The dispatcher can be stopped by cancelling ctx or calling Stop().
func (d *Dispatcher) Run(ctx context.Context) error {
	d.mu.Lock()
	if d.isRunning() {
		d.mu.Unlock()
		return errors.New("dispatcher is already running")
	}
	ctx = log.WithModule(ctx, "dispatcher")
	if err := d.markNodesUnknown(ctx); err != nil {
		log.G(ctx).Errorf(`failed to move all nodes to "unknown" state: %v`, err)
	}
	configWatcher, cancel, err := store.ViewAndWatch(
		d.store,
		func(readTx store.ReadTx) error {
			clusters, err := store.FindClusters(readTx, store.ByName(store.DefaultClusterName))
			if err != nil {
				return err
			}
			if err == nil && len(clusters) == 1 {
				heartbeatPeriod, err := gogotypes.DurationFromProto(clusters[0].Spec.Dispatcher.HeartbeatPeriod)
				if err == nil && heartbeatPeriod > 0 {
					d.config.HeartbeatPeriod = heartbeatPeriod
				}
				if clusters[0].NetworkBootstrapKeys != nil {
					d.networkBootstrapKeys = clusters[0].NetworkBootstrapKeys
				}
			}
			return nil
		},
		state.EventUpdateCluster{},
	)
	if err != nil {
		d.mu.Unlock()
		return err
	}
	// set queues here to guarantee that Close will close them
	d.mgrQueue = watch.NewQueue()
	d.keyMgrQueue = watch.NewQueue()

	peerWatcher, peerCancel := d.cluster.SubscribePeers()
	defer peerCancel()
	d.lastSeenManagers = getWeightedPeers(d.cluster)

	defer cancel()
	d.ctx, d.cancel = context.WithCancel(ctx)
	ctx = d.ctx
	d.wg.Add(1)
	defer d.wg.Done()
	d.mu.Unlock()

	publishManagers := func(peers []*api.Peer) {
		var mgrs []*api.WeightedPeer
		for _, p := range peers {
			mgrs = append(mgrs, &api.WeightedPeer{
				Peer:   p,
				Weight: remotes.DefaultObservationWeight,
			})
		}
		d.mu.Lock()
		d.lastSeenManagers = mgrs
		d.mu.Unlock()
		d.mgrQueue.Publish(mgrs)
	}

	batchTimer := time.NewTimer(maxBatchInterval)
	defer batchTimer.Stop()

	for {
		select {
		case ev := <-peerWatcher:
			publishManagers(ev.([]*api.Peer))
		case <-d.processUpdatesTrigger:
			d.processUpdates(ctx)
			batchTimer.Reset(maxBatchInterval)
		case <-batchTimer.C:
			d.processUpdates(ctx)
			batchTimer.Reset(maxBatchInterval)
		case v := <-configWatcher:
			cluster := v.(state.EventUpdateCluster)
			d.mu.Lock()
			if cluster.Cluster.Spec.Dispatcher.HeartbeatPeriod != nil {
				// ignore error, since Spec has passed validation before
				heartbeatPeriod, _ := gogotypes.DurationFromProto(cluster.Cluster.Spec.Dispatcher.HeartbeatPeriod)
				if heartbeatPeriod != d.config.HeartbeatPeriod {
					// only call d.nodes.updatePeriod when heartbeatPeriod changes
					d.config.HeartbeatPeriod = heartbeatPeriod
					d.nodes.updatePeriod(d.config.HeartbeatPeriod, d.config.HeartbeatEpsilon, d.config.GracePeriodMultiplier)
				}
			}
			d.networkBootstrapKeys = cluster.Cluster.NetworkBootstrapKeys
			d.mu.Unlock()
			d.keyMgrQueue.Publish(cluster.Cluster.NetworkBootstrapKeys)
		case <-ctx.Done():
			return nil
		}
	}
}
Example #10
// NodeCertificateStatus returns the current issuance status of an issuance request identified by the nodeID
func (s *Server) NodeCertificateStatus(ctx context.Context, request *api.NodeCertificateStatusRequest) (*api.NodeCertificateStatusResponse, error) {
	if request.NodeID == "" {
		return nil, grpc.Errorf(codes.InvalidArgument, codes.InvalidArgument.String())
	}

	if err := s.addTask(); err != nil {
		return nil, err
	}
	defer s.doneTask()

	var node *api.Node

	event := state.EventUpdateNode{
		Node:   &api.Node{ID: request.NodeID},
		Checks: []state.NodeCheckFunc{state.NodeCheckID},
	}

	// Retrieve the current value of the certificate for this node ID, and create a watcher
	updates, cancel, err := store.ViewAndWatch(
		s.store,
		func(tx store.ReadTx) error {
			node = store.GetNode(tx, request.NodeID)
			return nil
		},
		event,
	)
	if err != nil {
		return nil, err
	}
	defer cancel()

	// This node ID doesn't exist
	if node == nil {
		return nil, grpc.Errorf(codes.NotFound, codes.NotFound.String())
	}

	log.G(ctx).WithFields(logrus.Fields{
		"node.id": node.ID,
		"status":  node.Certificate.Status,
		"method":  "NodeCertificateStatus",
	})

	// If this certificate has a final state, return it immediately (both pending and renew are transition states)
	if isFinalState(node.Certificate.Status) {
		return &api.NodeCertificateStatusResponse{
			Status:      &node.Certificate.Status,
			Certificate: &node.Certificate,
		}, nil
	}

	log.G(ctx).WithFields(logrus.Fields{
		"node.id": node.ID,
		"status":  node.Certificate.Status,
		"method":  "NodeCertificateStatus",
	}).Debugf("started watching for certificate updates")

	// Certificate is Pending or in an Unknown state, let's wait for changes.
	for {
		select {
		case event := <-updates:
			switch v := event.(type) {
			case state.EventUpdateNode:
				// We got an update on the certificate record. If the status is a final state,
				// return the certificate.
				if isFinalState(v.Node.Certificate.Status) {
					cert := v.Node.Certificate.Copy()
					return &api.NodeCertificateStatusResponse{
						Status:      &cert.Status,
						Certificate: cert,
					}, nil
				}
			}
		case <-ctx.Done():
			return nil, ctx.Err()
		case <-s.ctx.Done():
			return nil, s.ctx.Err()
		}
	}
}
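NodeCertificateStatus relies on ViewAndWatch reading the current node and registering the watch together, so the check-then-wait that follows cannot miss an update. A minimal stdlib-only sketch of the check-then-wait part, with an illustrative status type and isFinal predicate:

package sketch

import "context"

type status int

// isFinal is an illustrative predicate; any non-zero status counts as final
// in this sketch.
func isFinal(s status) bool { return s > 0 }

// waitForFinal returns immediately if the current status is already final,
// and otherwise blocks on the watch channel until a final update arrives or
// the context is cancelled.
func waitForFinal(ctx context.Context, current status, updates <-chan status) (status, error) {
	if isFinal(current) {
		return current, nil // nothing to wait for
	}
	for {
		select {
		case s := <-updates:
			if isFinal(s) {
				return s, nil
			}
		case <-ctx.Done():
			return 0, ctx.Err()
		}
	}
}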
Example #11
// Run runs the CA signer main loop.
// The CA signer can be stopped with cancelling ctx or calling Stop().
func (s *Server) Run(ctx context.Context) error {
	s.mu.Lock()
	if s.isRunning() {
		s.mu.Unlock()
		return fmt.Errorf("CA signer is already running")
	}
	s.wg.Add(1)
	s.mu.Unlock()

	defer s.wg.Done()
	logger := log.G(ctx).WithField("module", "ca")
	ctx = log.WithLogger(ctx, logger)

	// Run() should never be called twice, but just in case, we're
	// attempting to close the started channel in a safe way
	select {
	case <-s.started:
		return fmt.Errorf("CA server cannot be started more than once")
	default:
		close(s.started)
	}

	// Retrieve the channels to keep track of changes in the cluster
	// Retrieve all the currently registered nodes
	var nodes []*api.Node
	updates, cancel, err := store.ViewAndWatch(
		s.store,
		func(readTx store.ReadTx) error {
			clusters, err := store.FindClusters(readTx, store.ByName(store.DefaultClusterName))
			if err != nil {
				return err
			}
			if len(clusters) != 1 {
				return fmt.Errorf("could not find cluster object")
			}
			s.updateCluster(ctx, clusters[0])

			nodes, err = store.FindNodes(readTx, store.All)
			return err
		},
		state.EventCreateNode{},
		state.EventUpdateNode{},
		state.EventUpdateCluster{},
	)

	// Do this after updateCluster has been called, so isRunning never
	// returns true without joinTokens being set correctly.
	s.mu.Lock()
	s.ctx, s.cancel = context.WithCancel(ctx)
	s.mu.Unlock()

	if err != nil {
		log.G(ctx).WithFields(logrus.Fields{
			"method": "(*Server).Run",
		}).WithError(err).Errorf("snapshot store view failed")
		return err
	}
	defer cancel()

	// We might have missed some updates if there was a leader election,
	// so let's pick up the slack.
	if err := s.reconcileNodeCertificates(ctx, nodes); err != nil {
		// We don't return here because that means the Run loop would
		// never run. Log an error instead.
		log.G(ctx).WithFields(logrus.Fields{
			"method": "(*Server).Run",
		}).WithError(err).Errorf("error attempting to reconcile certificates")
	}

	// Watch for new nodes being created, new nodes being updated, and changes
	// to the cluster
	for {
		select {
		case event := <-updates:
			switch v := event.(type) {
			case state.EventCreateNode:
				s.evaluateAndSignNodeCert(ctx, v.Node)
			case state.EventUpdateNode:
				// If this certificate is already at a final state
				// no need to evaluate and sign it.
				if !isFinalState(v.Node.Certificate.Status) {
					s.evaluateAndSignNodeCert(ctx, v.Node)
				}
			case state.EventUpdateCluster:
				s.updateCluster(ctx, v.Cluster)
			}

		case <-ctx.Done():
			return ctx.Err()
		case <-s.ctx.Done():
			return nil
		}
	}
}
Example #12
// Run runs the CA signer main loop.
// The CA signer can be stopped with cancelling ctx or calling Stop().
func (s *Server) Run(ctx context.Context) error {
	s.mu.Lock()
	if s.isRunning() {
		s.mu.Unlock()
		return errors.New("CA signer is already running")
	}
	s.wg.Add(1)
	s.mu.Unlock()

	defer s.wg.Done()
	ctx = log.WithModule(ctx, "ca")

	// Retrieve the channels to keep track of changes in the cluster
	// Retrieve all the currently registered nodes
	var nodes []*api.Node
	updates, cancel, err := store.ViewAndWatch(
		s.store,
		func(readTx store.ReadTx) error {
			clusters, err := store.FindClusters(readTx, store.ByName(store.DefaultClusterName))
			if err != nil {
				return err
			}
			if len(clusters) != 1 {
				return errors.New("could not find cluster object")
			}
			s.updateCluster(ctx, clusters[0])

			nodes, err = store.FindNodes(readTx, store.All)
			return err
		},
		state.EventCreateNode{},
		state.EventUpdateNode{},
		state.EventUpdateCluster{},
	)

	// Do this after updateCluster has been called, so isRunning never
	// returns true without joinTokens being set correctly.
	s.mu.Lock()
	s.ctx, s.cancel = context.WithCancel(ctx)
	s.mu.Unlock()
	close(s.started)

	if err != nil {
		log.G(ctx).WithFields(logrus.Fields{
			"method": "(*Server).Run",
		}).WithError(err).Errorf("snapshot store view failed")
		return err
	}
	defer cancel()

	// We might have missed some updates if there was a leader election,
	// so let's pick up the slack.
	if err := s.reconcileNodeCertificates(ctx, nodes); err != nil {
		// We don't return here because that means the Run loop would
		// never run. Log an error instead.
		log.G(ctx).WithFields(logrus.Fields{
			"method": "(*Server).Run",
		}).WithError(err).Errorf("error attempting to reconcile certificates")
	}

	ticker := time.NewTicker(s.reconciliationRetryInterval)
	defer ticker.Stop()

	// Watch for new nodes being created, new nodes being updated, and changes
	// to the cluster
	for {
		select {
		case event := <-updates:
			switch v := event.(type) {
			case state.EventCreateNode:
				s.evaluateAndSignNodeCert(ctx, v.Node)
			case state.EventUpdateNode:
				// If this certificate is already at a final state
				// no need to evaluate and sign it.
				if !isFinalState(v.Node.Certificate.Status) {
					s.evaluateAndSignNodeCert(ctx, v.Node)
				}
			case state.EventUpdateCluster:
				s.updateCluster(ctx, v.Cluster)
			}
		case <-ticker.C:
			for _, node := range s.pending {
				if err := s.evaluateAndSignNodeCert(ctx, node); err != nil {
					// If this sign operation did not succeed, the rest are
					// unlikely to. Yield so that we don't hammer an external CA.
					// Since the map iteration order is randomized, there is no
					// risk of getting stuck on a problematic CSR.
					break
				}
			}
		case <-ctx.Done():
			return ctx.Err()
		case <-s.ctx.Done():
			return nil
		}
	}
}
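The ticker branch above retries pending CSRs but stops at the first failure so an unavailable external CA is not hammered, relying on Go's randomized map iteration to avoid always retrying the same problematic entry first. A tiny sketch of that retry pass; the map of attempt functions is illustrative:

package sketch

// retryPending retries each pending attempt once per tick, bailing out on
// the first failure; randomized map iteration means a single bad entry does
// not permanently block the others. In this sketch successful attempts are
// dropped from the set, which the caller would otherwise do elsewhere.
func retryPending(pending map[string]func() error) {
	for id, attempt := range pending {
		if err := attempt(); err != nil {
			// The signer is probably unavailable; wait for the next tick.
			break
		}
		delete(pending, id)
	}
}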
Example #13
// Run is the scheduler event loop.
func (s *Scheduler) Run(ctx context.Context) error {
	defer close(s.doneChan)

	updates, cancel, err := store.ViewAndWatch(s.store, s.setupTasksList)
	if err != nil {
		log.G(ctx).WithError(err).Errorf("snapshot store update failed")
		return err
	}
	defer cancel()

	// Validate resources for preassigned tasks.
	// Do this before other tasks because preassigned tasks, such as
	// global service tasks, should start before other tasks.
	s.processPreassignedTasks(ctx)

	// Queue all unassigned tasks before processing changes.
	s.tick(ctx)

	const (
		// commitDebounceGap is the amount of time to wait between
		// commit events to debounce them.
		commitDebounceGap = 50 * time.Millisecond
		// maxLatency is a time limit on the debouncing.
		maxLatency = time.Second
	)
	var (
		debouncingStarted     time.Time
		commitDebounceTimer   *time.Timer
		commitDebounceTimeout <-chan time.Time
	)

	pendingChanges := 0

	schedule := func() {
		if len(s.preassignedTasks) > 0 {
			s.processPreassignedTasks(ctx)
		}
		if pendingChanges > 0 {
			s.tick(ctx)
			pendingChanges = 0
		}
	}

	// Watch for changes.
	for {
		select {
		case event := <-updates:
			switch v := event.(type) {
			case state.EventCreateTask:
				pendingChanges += s.createTask(ctx, v.Task)
			case state.EventUpdateTask:
				pendingChanges += s.updateTask(ctx, v.Task)
			case state.EventDeleteTask:
				s.deleteTask(ctx, v.Task)
			case state.EventCreateNode:
				s.createOrUpdateNode(v.Node)
				pendingChanges++
			case state.EventUpdateNode:
				s.createOrUpdateNode(v.Node)
				pendingChanges++
			case state.EventDeleteNode:
				s.nodeSet.remove(v.Node.ID)
			case state.EventCommit:
				if commitDebounceTimer != nil {
					if time.Since(debouncingStarted) > maxLatency {
						commitDebounceTimer.Stop()
						commitDebounceTimer = nil
						commitDebounceTimeout = nil
						schedule()
					} else {
						commitDebounceTimer.Reset(commitDebounceGap)
					}
				} else {
					commitDebounceTimer = time.NewTimer(commitDebounceGap)
					commitDebounceTimeout = commitDebounceTimer.C
					debouncingStarted = time.Now()
				}
			}
		case <-commitDebounceTimeout:
			schedule()
			commitDebounceTimer = nil
			commitDebounceTimeout = nil
		case <-s.stopChan:
			return nil
		}
	}
}
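The commit handling above debounces scheduling: every commit pushes the flush out by commitDebounceGap, but once maxLatency has elapsed since debouncing started the flush is forced, so a steady commit stream cannot starve the scheduler. A stdlib-only sketch of that debounce, using the same constants as the example:

package sketch

import "time"

const (
	commitDebounceGap = 50 * time.Millisecond
	maxLatency        = time.Second
)

// debounceCommits delays flush until commits go quiet for commitDebounceGap,
// but forces a flush once maxLatency has elapsed since debouncing started.
func debounceCommits(commits <-chan struct{}, stop <-chan struct{}, flush func()) {
	var (
		debouncingStarted     time.Time
		commitDebounceTimer   *time.Timer
		commitDebounceTimeout <-chan time.Time
	)

	for {
		select {
		case <-commits:
			if commitDebounceTimer != nil {
				if time.Since(debouncingStarted) > maxLatency {
					commitDebounceTimer.Stop()
					commitDebounceTimer = nil
					commitDebounceTimeout = nil
					flush()
				} else {
					commitDebounceTimer.Reset(commitDebounceGap)
				}
			} else {
				commitDebounceTimer = time.NewTimer(commitDebounceGap)
				commitDebounceTimeout = commitDebounceTimer.C
				debouncingStarted = time.Now()
			}
		case <-commitDebounceTimeout:
			flush()
			commitDebounceTimer = nil
			commitDebounceTimeout = nil
		case <-stop:
			if commitDebounceTimer != nil {
				commitDebounceTimer.Stop()
			}
			return
		}
	}
}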
Example #14
// Assignments is a stream of assignments for a node. Each message contains
// either a full list of tasks and secrets for the node, or an incremental update.
func (d *Dispatcher) Assignments(r *api.AssignmentsRequest, stream api.Dispatcher_AssignmentsServer) error {
	nodeInfo, err := ca.RemoteNode(stream.Context())
	if err != nil {
		return err
	}
	nodeID := nodeInfo.NodeID

	if err := d.isRunningLocked(); err != nil {
		return err
	}

	fields := logrus.Fields{
		"node.id":      nodeID,
		"node.session": r.SessionID,
		"method":       "(*Dispatcher).Assignments",
	}
	if nodeInfo.ForwardedBy != nil {
		fields["forwarder.id"] = nodeInfo.ForwardedBy.NodeID
	}
	log := log.G(stream.Context()).WithFields(fields)
	log.Debugf("")

	if _, err = d.nodes.GetWithSession(nodeID, r.SessionID); err != nil {
		return err
	}

	var (
		sequence  int64
		appliesTo string
		initial   api.AssignmentsMessage
	)
	tasksMap := make(map[string]*api.Task)

	sendMessage := func(msg api.AssignmentsMessage, assignmentType api.AssignmentsMessage_Type) error {
		sequence++
		msg.AppliesTo = appliesTo
		msg.ResultsIn = strconv.FormatInt(sequence, 10)
		appliesTo = msg.ResultsIn
		msg.Type = assignmentType

		if err := stream.Send(&msg); err != nil {
			return err
		}
		return nil
	}

	// TODO(aaronl): Also send node secrets that should be exposed to
	// this node.
	nodeTasks, cancel, err := store.ViewAndWatch(
		d.store,
		func(readTx store.ReadTx) error {
			tasks, err := store.FindTasks(readTx, store.ByNodeID(nodeID))
			if err != nil {
				return err
			}

			for _, t := range tasks {
				// We only care about tasks that are ASSIGNED or
				// higher. If the state is below ASSIGNED, the
				// task may not meet the constraints for this
				// node, so we have to be careful about sending
				// secrets associated with it.
				if t.Status.State < api.TaskStateAssigned {
					continue
				}

				tasksMap[t.ID] = t
				initial.UpdateTasks = append(initial.UpdateTasks, t)
			}
			return nil
		},
		state.EventUpdateTask{Task: &api.Task{NodeID: nodeID},
			Checks: []state.TaskCheckFunc{state.TaskCheckNodeID}},
		state.EventDeleteTask{Task: &api.Task{NodeID: nodeID},
			Checks: []state.TaskCheckFunc{state.TaskCheckNodeID}},
	)
	if err != nil {
		return err
	}
	defer cancel()

	if err := sendMessage(initial, api.AssignmentsMessage_COMPLETE); err != nil {
		return err
	}

	for {
		// Check for session expiration
		if _, err := d.nodes.GetWithSession(nodeID, r.SessionID); err != nil {
			return err
		}

		// bursty events should be processed in batches and sent out together
		var (
			update          api.AssignmentsMessage
			modificationCnt int
			batchingTimer   *time.Timer
			batchingTimeout <-chan time.Time
			updateTasks     = make(map[string]*api.Task)
			removeTasks     = make(map[string]struct{})
		)

		oneModification := func() {
			modificationCnt++

			if batchingTimer != nil {
				batchingTimer.Reset(batchingWaitTime)
			} else {
				batchingTimer = time.NewTimer(batchingWaitTime)
				batchingTimeout = batchingTimer.C
			}
		}

		// The batching loop waits for 50 ms after the most recent
		// change, or until modificationBatchLimit is reached. The
		// worst case latency is modificationBatchLimit * batchingWaitTime,
		// which is 10 seconds.
	batchingLoop:
		for modificationCnt < modificationBatchLimit {
			select {
			case event := <-nodeTasks:
				switch v := event.(type) {
				// We don't monitor EventCreateTask because tasks are
				// never created in the ASSIGNED state. First tasks are
				// created by the orchestrator, then the scheduler moves
				// them to ASSIGNED. If this ever changes, we will need
				// to monitor task creations as well.
				case state.EventUpdateTask:
					// We only care about tasks that are ASSIGNED or
					// higher.
					if v.Task.Status.State < api.TaskStateAssigned {
						continue
					}

					if oldTask, exists := tasksMap[v.Task.ID]; exists {
						// States ASSIGNED and below are set by the orchestrator/scheduler,
						// not the agent, so tasks in these states need to be sent to the
						// agent even if nothing else has changed.
						if equality.TasksEqualStable(oldTask, v.Task) && v.Task.Status.State > api.TaskStateAssigned {
							// this update should not trigger a task change for the agent
							tasksMap[v.Task.ID] = v.Task
							continue
						}
					}
					tasksMap[v.Task.ID] = v.Task
					updateTasks[v.Task.ID] = v.Task

					oneModification()
				case state.EventDeleteTask:

					if _, exists := tasksMap[v.Task.ID]; !exists {
						continue
					}

					removeTasks[v.Task.ID] = struct{}{}

					delete(tasksMap, v.Task.ID)

					oneModification()
				}
			case <-batchingTimeout:
				break batchingLoop
			case <-stream.Context().Done():
				return stream.Context().Err()
			case <-d.ctx.Done():
				return d.ctx.Err()
			}
		}

		if batchingTimer != nil {
			batchingTimer.Stop()
		}

		if modificationCnt > 0 {
			for id, task := range updateTasks {
				if _, ok := removeTasks[id]; !ok {
					update.UpdateTasks = append(update.UpdateTasks, task)
				}
			}
			for id := range removeTasks {
				update.RemoveTasks = append(update.RemoveTasks, id)
			}
			if err := sendMessage(update, api.AssignmentsMessage_INCREMENTAL); err != nil {
				return err
			}
		}
	}
}
// Tasks is a stream of task states for a node. Each message contains the full
// list of tasks which should be run on the node; if a task is not present in
// that list, it should be terminated.
func (d *Dispatcher) Tasks(r *api.TasksRequest, stream api.Dispatcher_TasksServer) error {
	nodeInfo, err := ca.RemoteNode(stream.Context())
	if err != nil {
		return err
	}
	nodeID := nodeInfo.NodeID

	if err := d.isRunningLocked(); err != nil {
		return err
	}

	fields := logrus.Fields{
		"node.id":      nodeID,
		"node.session": r.SessionID,
		"method":       "(*Dispatcher).Tasks",
	}
	if nodeInfo.ForwardedBy != nil {
		fields["forwarder.id"] = nodeInfo.ForwardedBy.NodeID
	}
	log.G(stream.Context()).WithFields(fields).Debugf("")

	if _, err = d.nodes.GetWithSession(nodeID, r.SessionID); err != nil {
		return err
	}

	tasksMap := make(map[string]*api.Task)
	nodeTasks, cancel, err := store.ViewAndWatch(
		d.store,
		func(readTx store.ReadTx) error {
			tasks, err := store.FindTasks(readTx, store.ByNodeID(nodeID))
			if err != nil {
				return err
			}
			for _, t := range tasks {
				tasksMap[t.ID] = t
			}
			return nil
		},
		state.EventCreateTask{Task: &api.Task{NodeID: nodeID},
			Checks: []state.TaskCheckFunc{state.TaskCheckNodeID}},
		state.EventUpdateTask{Task: &api.Task{NodeID: nodeID},
			Checks: []state.TaskCheckFunc{state.TaskCheckNodeID}},
		state.EventDeleteTask{Task: &api.Task{NodeID: nodeID},
			Checks: []state.TaskCheckFunc{state.TaskCheckNodeID}},
	)
	if err != nil {
		return err
	}
	defer cancel()

	for {
		if _, err := d.nodes.GetWithSession(nodeID, r.SessionID); err != nil {
			return err
		}

		var tasks []*api.Task
		for _, t := range tasksMap {
			// dispatcher only sends tasks that have been assigned to a node
			if t != nil && t.Status.State >= api.TaskStateAssigned {
				tasks = append(tasks, t)
			}
		}

		if err := stream.Send(&api.TasksMessage{Tasks: tasks}); err != nil {
			return err
		}

		// bursty events should be processed in batches and sent out as a snapshot
		const modificationBatchLimit = 200
		const eventPausedGap = 50 * time.Millisecond
		var modificationCnt int
		// eventPaused is true when there have been modifications
		// but the next event has not arrived within eventPausedGap
		eventPaused := false

		for modificationCnt < modificationBatchLimit && !eventPaused {
			select {
			case event := <-nodeTasks:
				switch v := event.(type) {
				case state.EventCreateTask:
					tasksMap[v.Task.ID] = v.Task
					modificationCnt++
				case state.EventUpdateTask:
					if oldTask, exists := tasksMap[v.Task.ID]; exists {
						if equality.TasksEqualStable(oldTask, v.Task) {
							// this update should not trigger action at agent
							tasksMap[v.Task.ID] = v.Task
							continue
						}
					}
					tasksMap[v.Task.ID] = v.Task
					modificationCnt++
				case state.EventDeleteTask:
					delete(tasksMap, v.Task.ID)
					modificationCnt++
				}
			case <-time.After(eventPausedGap):
				if modificationCnt > 0 {
					eventPaused = true
				}
			case <-stream.Context().Done():
				return stream.Context().Err()
			case <-d.ctx.Done():
				return d.ctx.Err()
			}
		}
	}
}
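This Tasks variant batches with time.After rather than a reusable Timer, and only ends a batch on a quiet gap if at least one modification has been seen. A stdlib-only sketch of that variant; the string payload is illustrative and the constants match the ones defined in the example:

package sketch

import "time"

const (
	modificationBatchLimit = 200
	eventPausedGap         = 50 * time.Millisecond
)

// collectUntilPause gathers changes until either the batch limit is hit or
// no new event arrives for eventPausedGap after at least one change.
func collectUntilPause(events <-chan string, stop <-chan struct{}) []string {
	var batch []string
	eventPaused := false

	for len(batch) < modificationBatchLimit && !eventPaused {
		select {
		case ev := <-events:
			batch = append(batch, ev)
		case <-time.After(eventPausedGap):
			if len(batch) > 0 {
				eventPaused = true // quiet gap after changes: end the batch
			}
		case <-stop:
			return batch
		}
	}
	return batch
}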