Beispiel #1
0
func (d *Dispatcher) markNodesUnknown(ctx context.Context) error {
	log := log.G(ctx).WithField("method", "(*Dispatcher).markNodesUnknown")
	var nodes []*api.Node
	var err error
	d.store.View(func(tx store.ReadTx) {
		nodes, err = store.FindNodes(tx, store.All)
	})
	if err != nil {
		return fmt.Errorf("failed to get list of nodes: %v", err)
	}
	_, err = d.store.Batch(func(batch *store.Batch) error {
		for _, n := range nodes {
			err := batch.Update(func(tx store.Tx) error {
				// check if node is still here
				node := store.GetNode(tx, n.ID)
				if node == nil {
					return nil
				}
				// do not try to resurrect down nodes
				if node.Status.State == api.NodeStatus_DOWN {
					return nil
				}
				node.Status = api.NodeStatus{
					State:   api.NodeStatus_UNKNOWN,
					Message: `Node moved to "unknown" state due to leadership change in cluster`,
				}
				nodeID := node.ID

				expireFunc := func() {
					log := log.WithField("node", nodeID)
					nodeStatus := api.NodeStatus{State: api.NodeStatus_DOWN, Message: `heartbeat failure for node in "unknown" state`}
					log.Debugf("heartbeat expiration for unknown node")
					if err := d.nodeRemove(nodeID, nodeStatus); err != nil {
						log.WithError(err).Errorf(`failed deregistering node after heartbeat expiration for node in "unknown" state`)
					}
				}
				if err := d.nodes.AddUnknown(node, expireFunc); err != nil {
					return fmt.Errorf(`adding node in "unknown" state to node store failed: %v`, err)
				}
				if err := store.UpdateNode(tx, node); err != nil {
					return fmt.Errorf("update failed %v", err)
				}
				return nil
			})
			if err != nil {
				log.WithField("node", n.ID).WithError(err).Errorf(`failed to move node to "unknown" state`)
			}
		}
		return nil
	})
	return err
}
Beispiel #2
0
func (d *Dispatcher) processTaskUpdates() {
	d.taskUpdatesLock.Lock()
	if len(d.taskUpdates) == 0 {
		d.taskUpdatesLock.Unlock()
		return
	}
	taskUpdates := d.taskUpdates
	d.taskUpdates = make(map[string]*api.TaskStatus)
	d.taskUpdatesLock.Unlock()

	log := log.G(d.ctx).WithFields(logrus.Fields{
		"method": "(*Dispatcher).processTaskUpdates",
	})

	_, err := d.store.Batch(func(batch *store.Batch) error {
		for taskID, status := range taskUpdates {
			err := batch.Update(func(tx store.Tx) error {
				logger := log.WithField("task.id", taskID)
				task := store.GetTask(tx, taskID)
				if task == nil {
					logger.Errorf("task unavailable")
					return nil
				}

				logger = logger.WithField("state.transition", fmt.Sprintf("%v->%v", task.Status.State, status.State))

				if task.Status == *status {
					logger.Debug("task status identical, ignoring")
					return nil
				}

				if task.Status.State > status.State {
					logger.Debug("task status invalid transition")
					return nil
				}

				task.Status = *status
				if err := store.UpdateTask(tx, task); err != nil {
					logger.WithError(err).Error("failed to update task status")
					return nil
				}
				logger.Debug("task status updated")
				return nil
			})
			if err != nil {
				log.WithError(err).Error("dispatcher transaction failed")
			}
		}
		return nil
	})
	if err != nil {
		log.WithError(err).Error("dispatcher batch failed")
	}
}
Beispiel #3
0
func newPeer(id uint64, addr string, tr *Transport) (*peer, error) {
	cc, err := tr.dial(addr)
	if err != nil {
		return nil, errors.Wrapf(err, "failed to create conn for %x with addr %s", id, addr)
	}
	ctx, cancel := context.WithCancel(tr.ctx)
	ctx = log.WithField(ctx, "peer_id", fmt.Sprintf("%x", id))
	p := &peer{
		id:     id,
		addr:   addr,
		cc:     cc,
		tr:     tr,
		ctx:    ctx,
		cancel: cancel,
		msgc:   make(chan raftpb.Message, 4096),
		done:   make(chan struct{}),
	}
	go p.run(ctx)
	return p, nil
}
Beispiel #4
0
// UpdateTaskStatus updates status of task. Node should send such updates
// on every status change of its tasks.
func (d *Dispatcher) UpdateTaskStatus(ctx context.Context, r *api.UpdateTaskStatusRequest) (*api.UpdateTaskStatusResponse, error) {
	nodeInfo, err := ca.RemoteNode(ctx)
	if err != nil {
		return nil, err
	}
	nodeID := nodeInfo.NodeID
	fields := logrus.Fields{
		"node.id":      nodeID,
		"node.session": r.SessionID,
		"method":       "(*Dispatcher).UpdateTaskStatus",
	}
	if nodeInfo.ForwardedBy != nil {
		fields["forwarder.id"] = nodeInfo.ForwardedBy.NodeID
	}
	log := log.G(ctx).WithFields(fields)

	if err := d.isRunningLocked(); err != nil {
		return nil, err
	}

	if _, err := d.nodes.GetWithSession(nodeID, r.SessionID); err != nil {
		return nil, err
	}

	// Validate task updates
	for _, u := range r.Updates {
		if u.Status == nil {
			log.WithField("task.id", u.TaskID).Warn("task report has nil status")
			continue
		}

		var t *api.Task
		d.store.View(func(tx store.ReadTx) {
			t = store.GetTask(tx, u.TaskID)
		})
		if t == nil {
			log.WithField("task.id", u.TaskID).Warn("cannot find target task in store")
			continue
		}

		if t.NodeID != nodeID {
			err := grpc.Errorf(codes.PermissionDenied, "cannot update a task not assigned this node")
			log.WithField("task.id", u.TaskID).Error(err)
			return nil, err
		}
	}

	d.taskUpdatesLock.Lock()
	// Enqueue task updates
	for _, u := range r.Updates {
		if u.Status == nil {
			continue
		}
		d.taskUpdates[u.TaskID] = u.Status
	}

	numUpdates := len(d.taskUpdates)
	d.taskUpdatesLock.Unlock()

	if numUpdates >= maxBatchItems {
		d.processTaskUpdatesTrigger <- struct{}{}
	}
	return nil, nil
}
Beispiel #5
0
func (d *Dispatcher) processUpdates(ctx context.Context) {
	var (
		taskUpdates map[string]*api.TaskStatus
		nodeUpdates map[string]nodeUpdate
	)
	d.taskUpdatesLock.Lock()
	if len(d.taskUpdates) != 0 {
		taskUpdates = d.taskUpdates
		d.taskUpdates = make(map[string]*api.TaskStatus)
	}
	d.taskUpdatesLock.Unlock()

	d.nodeUpdatesLock.Lock()
	if len(d.nodeUpdates) != 0 {
		nodeUpdates = d.nodeUpdates
		d.nodeUpdates = make(map[string]nodeUpdate)
	}
	d.nodeUpdatesLock.Unlock()

	if len(taskUpdates) == 0 && len(nodeUpdates) == 0 {
		return
	}

	log := log.G(ctx).WithFields(logrus.Fields{
		"method": "(*Dispatcher).processUpdates",
	})

	_, err := d.store.Batch(func(batch *store.Batch) error {
		for taskID, status := range taskUpdates {
			err := batch.Update(func(tx store.Tx) error {
				logger := log.WithField("task.id", taskID)
				task := store.GetTask(tx, taskID)
				if task == nil {
					logger.Errorf("task unavailable")
					return nil
				}

				logger = logger.WithField("state.transition", fmt.Sprintf("%v->%v", task.Status.State, status.State))

				if task.Status == *status {
					logger.Debug("task status identical, ignoring")
					return nil
				}

				if task.Status.State > status.State {
					logger.Debug("task status invalid transition")
					return nil
				}

				task.Status = *status
				if err := store.UpdateTask(tx, task); err != nil {
					logger.WithError(err).Error("failed to update task status")
					return nil
				}
				logger.Debug("task status updated")
				return nil
			})
			if err != nil {
				log.WithError(err).Error("dispatcher task update transaction failed")
			}
		}

		for nodeID, nodeUpdate := range nodeUpdates {
			err := batch.Update(func(tx store.Tx) error {
				logger := log.WithField("node.id", nodeID)
				node := store.GetNode(tx, nodeID)
				if node == nil {
					logger.Errorf("node unavailable")
					return nil
				}

				if nodeUpdate.status != nil {
					node.Status.State = nodeUpdate.status.State
					node.Status.Message = nodeUpdate.status.Message
					if nodeUpdate.status.Addr != "" {
						node.Status.Addr = nodeUpdate.status.Addr
					}
				}
				if nodeUpdate.description != nil {
					node.Description = nodeUpdate.description
				}

				if err := store.UpdateNode(tx, node); err != nil {
					logger.WithError(err).Error("failed to update node status")
					return nil
				}
				logger.Debug("node status updated")
				return nil
			})
			if err != nil {
				log.WithError(err).Error("dispatcher node update transaction failed")
			}
		}

		return nil
	})
	if err != nil {
		log.WithError(err).Error("dispatcher batch failed")
	}

	d.processUpdatesCond.Broadcast()
}
Beispiel #6
0
func (d *Dispatcher) markNodesUnknown(ctx context.Context) error {
	log := log.G(ctx).WithField("method", "(*Dispatcher).markNodesUnknown")
	var nodes []*api.Node
	var err error
	d.store.View(func(tx store.ReadTx) {
		nodes, err = store.FindNodes(tx, store.All)
	})
	if err != nil {
		return errors.Wrap(err, "failed to get list of nodes")
	}
	_, err = d.store.Batch(func(batch *store.Batch) error {
		for _, n := range nodes {
			err := batch.Update(func(tx store.Tx) error {
				// check if node is still here
				node := store.GetNode(tx, n.ID)
				if node == nil {
					return nil
				}
				// do not try to resurrect down nodes
				if node.Status.State == api.NodeStatus_DOWN {
					nodeCopy := node
					expireFunc := func() {
						if err := d.moveTasksToOrphaned(nodeCopy.ID); err != nil {
							log.WithError(err).Error(`failed to move all tasks to "ORPHANED" state`)
						}

						d.downNodes.Delete(nodeCopy.ID)
					}

					d.downNodes.Add(nodeCopy, expireFunc)
					return nil
				}

				node.Status.State = api.NodeStatus_UNKNOWN
				node.Status.Message = `Node moved to "unknown" state due to leadership change in cluster`

				nodeID := node.ID

				expireFunc := func() {
					log := log.WithField("node", nodeID)
					log.Debugf("heartbeat expiration for unknown node")
					if err := d.markNodeNotReady(nodeID, api.NodeStatus_DOWN, `heartbeat failure for node in "unknown" state`); err != nil {
						log.WithError(err).Errorf(`failed deregistering node after heartbeat expiration for node in "unknown" state`)
					}
				}
				if err := d.nodes.AddUnknown(node, expireFunc); err != nil {
					return errors.Wrap(err, `adding node in "unknown" state to node store failed`)
				}
				if err := store.UpdateNode(tx, node); err != nil {
					return errors.Wrap(err, "update failed")
				}
				return nil
			})
			if err != nil {
				log.WithField("node", n.ID).WithError(err).Errorf(`failed to move node to "unknown" state`)
			}
		}
		return nil
	})
	return err
}
Beispiel #7
0
// ListenSubscriptions returns a stream of matching subscriptions for the current node
func (lb *LogBroker) ListenSubscriptions(request *api.ListenSubscriptionsRequest, stream api.LogBroker_ListenSubscriptionsServer) error {
	remote, err := ca.RemoteNode(stream.Context())
	if err != nil {
		return err
	}

	lb.nodeConnected(remote.NodeID)
	defer lb.nodeDisconnected(remote.NodeID)

	log := log.G(stream.Context()).WithFields(
		logrus.Fields{
			"method": "(*LogBroker).ListenSubscriptions",
			"node":   remote.NodeID,
		},
	)
	subscriptions, subscriptionCh, subscriptionCancel := lb.watchSubscriptions(remote.NodeID)
	defer subscriptionCancel()

	log.Debug("node registered")

	activeSubscriptions := make(map[string]*subscription)
	defer func() {
		// If the worker quits, mark all active subscriptions as finished.
		for _, subscription := range activeSubscriptions {
			subscription.Done(remote.NodeID, fmt.Errorf("node %s disconnected unexpectedly", remote.NodeID))
		}
	}()

	// Start by sending down all active subscriptions.
	for _, subscription := range subscriptions {
		select {
		case <-stream.Context().Done():
			return stream.Context().Err()
		case <-lb.pctx.Done():
			return nil
		default:
		}

		if err := stream.Send(subscription.message); err != nil {
			log.Error(err)
			return err
		}
		activeSubscriptions[subscription.message.ID] = subscription
	}

	// Send down new subscriptions.
	for {
		select {
		case v := <-subscriptionCh:
			subscription := v.(*subscription)

			if subscription.message.Close {
				log.WithField("subscription.id", subscription.message.ID).Debug("subscription closed")
				delete(activeSubscriptions, subscription.message.ID)
			} else {
				// Avoid sending down the same subscription multiple times
				if _, ok := activeSubscriptions[subscription.message.ID]; ok {
					continue
				}
				activeSubscriptions[subscription.message.ID] = subscription
				log.WithField("subscription.id", subscription.message.ID).Debug("subscription added")
			}
			if err := stream.Send(subscription.message); err != nil {
				log.Error(err)
				return err
			}
		case <-stream.Context().Done():
			return stream.Context().Err()
		case <-lb.pctx.Done():
			return nil
		}
	}
}