func (d *Dispatcher) markNodesUnknown(ctx context.Context) error { log := log.G(ctx).WithField("method", "(*Dispatcher).markNodesUnknown") var nodes []*api.Node var err error d.store.View(func(tx store.ReadTx) { nodes, err = store.FindNodes(tx, store.All) }) if err != nil { return fmt.Errorf("failed to get list of nodes: %v", err) } _, err = d.store.Batch(func(batch *store.Batch) error { for _, n := range nodes { err := batch.Update(func(tx store.Tx) error { // check if node is still here node := store.GetNode(tx, n.ID) if node == nil { return nil } // do not try to resurrect down nodes if node.Status.State == api.NodeStatus_DOWN { return nil } node.Status = api.NodeStatus{ State: api.NodeStatus_UNKNOWN, Message: `Node moved to "unknown" state due to leadership change in cluster`, } nodeID := node.ID expireFunc := func() { log := log.WithField("node", nodeID) nodeStatus := api.NodeStatus{State: api.NodeStatus_DOWN, Message: `heartbeat failure for node in "unknown" state`} log.Debugf("heartbeat expiration for unknown node") if err := d.nodeRemove(nodeID, nodeStatus); err != nil { log.WithError(err).Errorf(`failed deregistering node after heartbeat expiration for node in "unknown" state`) } } if err := d.nodes.AddUnknown(node, expireFunc); err != nil { return fmt.Errorf(`adding node in "unknown" state to node store failed: %v`, err) } if err := store.UpdateNode(tx, node); err != nil { return fmt.Errorf("update failed %v", err) } return nil }) if err != nil { log.WithField("node", n.ID).WithError(err).Errorf(`failed to move node to "unknown" state`) } } return nil }) return err }
func (d *Dispatcher) processTaskUpdates() { d.taskUpdatesLock.Lock() if len(d.taskUpdates) == 0 { d.taskUpdatesLock.Unlock() return } taskUpdates := d.taskUpdates d.taskUpdates = make(map[string]*api.TaskStatus) d.taskUpdatesLock.Unlock() log := log.G(d.ctx).WithFields(logrus.Fields{ "method": "(*Dispatcher).processTaskUpdates", }) _, err := d.store.Batch(func(batch *store.Batch) error { for taskID, status := range taskUpdates { err := batch.Update(func(tx store.Tx) error { logger := log.WithField("task.id", taskID) task := store.GetTask(tx, taskID) if task == nil { logger.Errorf("task unavailable") return nil } logger = logger.WithField("state.transition", fmt.Sprintf("%v->%v", task.Status.State, status.State)) if task.Status == *status { logger.Debug("task status identical, ignoring") return nil } if task.Status.State > status.State { logger.Debug("task status invalid transition") return nil } task.Status = *status if err := store.UpdateTask(tx, task); err != nil { logger.WithError(err).Error("failed to update task status") return nil } logger.Debug("task status updated") return nil }) if err != nil { log.WithError(err).Error("dispatcher transaction failed") } } return nil }) if err != nil { log.WithError(err).Error("dispatcher batch failed") } }
func newPeer(id uint64, addr string, tr *Transport) (*peer, error) { cc, err := tr.dial(addr) if err != nil { return nil, errors.Wrapf(err, "failed to create conn for %x with addr %s", id, addr) } ctx, cancel := context.WithCancel(tr.ctx) ctx = log.WithField(ctx, "peer_id", fmt.Sprintf("%x", id)) p := &peer{ id: id, addr: addr, cc: cc, tr: tr, ctx: ctx, cancel: cancel, msgc: make(chan raftpb.Message, 4096), done: make(chan struct{}), } go p.run(ctx) return p, nil }
// UpdateTaskStatus updates status of task. Node should send such updates // on every status change of its tasks. func (d *Dispatcher) UpdateTaskStatus(ctx context.Context, r *api.UpdateTaskStatusRequest) (*api.UpdateTaskStatusResponse, error) { nodeInfo, err := ca.RemoteNode(ctx) if err != nil { return nil, err } nodeID := nodeInfo.NodeID fields := logrus.Fields{ "node.id": nodeID, "node.session": r.SessionID, "method": "(*Dispatcher).UpdateTaskStatus", } if nodeInfo.ForwardedBy != nil { fields["forwarder.id"] = nodeInfo.ForwardedBy.NodeID } log := log.G(ctx).WithFields(fields) if err := d.isRunningLocked(); err != nil { return nil, err } if _, err := d.nodes.GetWithSession(nodeID, r.SessionID); err != nil { return nil, err } // Validate task updates for _, u := range r.Updates { if u.Status == nil { log.WithField("task.id", u.TaskID).Warn("task report has nil status") continue } var t *api.Task d.store.View(func(tx store.ReadTx) { t = store.GetTask(tx, u.TaskID) }) if t == nil { log.WithField("task.id", u.TaskID).Warn("cannot find target task in store") continue } if t.NodeID != nodeID { err := grpc.Errorf(codes.PermissionDenied, "cannot update a task not assigned this node") log.WithField("task.id", u.TaskID).Error(err) return nil, err } } d.taskUpdatesLock.Lock() // Enqueue task updates for _, u := range r.Updates { if u.Status == nil { continue } d.taskUpdates[u.TaskID] = u.Status } numUpdates := len(d.taskUpdates) d.taskUpdatesLock.Unlock() if numUpdates >= maxBatchItems { d.processTaskUpdatesTrigger <- struct{}{} } return nil, nil }
func (d *Dispatcher) processUpdates(ctx context.Context) { var ( taskUpdates map[string]*api.TaskStatus nodeUpdates map[string]nodeUpdate ) d.taskUpdatesLock.Lock() if len(d.taskUpdates) != 0 { taskUpdates = d.taskUpdates d.taskUpdates = make(map[string]*api.TaskStatus) } d.taskUpdatesLock.Unlock() d.nodeUpdatesLock.Lock() if len(d.nodeUpdates) != 0 { nodeUpdates = d.nodeUpdates d.nodeUpdates = make(map[string]nodeUpdate) } d.nodeUpdatesLock.Unlock() if len(taskUpdates) == 0 && len(nodeUpdates) == 0 { return } log := log.G(ctx).WithFields(logrus.Fields{ "method": "(*Dispatcher).processUpdates", }) _, err := d.store.Batch(func(batch *store.Batch) error { for taskID, status := range taskUpdates { err := batch.Update(func(tx store.Tx) error { logger := log.WithField("task.id", taskID) task := store.GetTask(tx, taskID) if task == nil { logger.Errorf("task unavailable") return nil } logger = logger.WithField("state.transition", fmt.Sprintf("%v->%v", task.Status.State, status.State)) if task.Status == *status { logger.Debug("task status identical, ignoring") return nil } if task.Status.State > status.State { logger.Debug("task status invalid transition") return nil } task.Status = *status if err := store.UpdateTask(tx, task); err != nil { logger.WithError(err).Error("failed to update task status") return nil } logger.Debug("task status updated") return nil }) if err != nil { log.WithError(err).Error("dispatcher task update transaction failed") } } for nodeID, nodeUpdate := range nodeUpdates { err := batch.Update(func(tx store.Tx) error { logger := log.WithField("node.id", nodeID) node := store.GetNode(tx, nodeID) if node == nil { logger.Errorf("node unavailable") return nil } if nodeUpdate.status != nil { node.Status.State = nodeUpdate.status.State node.Status.Message = nodeUpdate.status.Message if nodeUpdate.status.Addr != "" { node.Status.Addr = nodeUpdate.status.Addr } } if nodeUpdate.description != nil { node.Description = nodeUpdate.description } if err := store.UpdateNode(tx, node); err != nil { logger.WithError(err).Error("failed to update node status") return nil } logger.Debug("node status updated") return nil }) if err != nil { log.WithError(err).Error("dispatcher node update transaction failed") } } return nil }) if err != nil { log.WithError(err).Error("dispatcher batch failed") } d.processUpdatesCond.Broadcast() }
func (d *Dispatcher) markNodesUnknown(ctx context.Context) error { log := log.G(ctx).WithField("method", "(*Dispatcher).markNodesUnknown") var nodes []*api.Node var err error d.store.View(func(tx store.ReadTx) { nodes, err = store.FindNodes(tx, store.All) }) if err != nil { return errors.Wrap(err, "failed to get list of nodes") } _, err = d.store.Batch(func(batch *store.Batch) error { for _, n := range nodes { err := batch.Update(func(tx store.Tx) error { // check if node is still here node := store.GetNode(tx, n.ID) if node == nil { return nil } // do not try to resurrect down nodes if node.Status.State == api.NodeStatus_DOWN { nodeCopy := node expireFunc := func() { if err := d.moveTasksToOrphaned(nodeCopy.ID); err != nil { log.WithError(err).Error(`failed to move all tasks to "ORPHANED" state`) } d.downNodes.Delete(nodeCopy.ID) } d.downNodes.Add(nodeCopy, expireFunc) return nil } node.Status.State = api.NodeStatus_UNKNOWN node.Status.Message = `Node moved to "unknown" state due to leadership change in cluster` nodeID := node.ID expireFunc := func() { log := log.WithField("node", nodeID) log.Debugf("heartbeat expiration for unknown node") if err := d.markNodeNotReady(nodeID, api.NodeStatus_DOWN, `heartbeat failure for node in "unknown" state`); err != nil { log.WithError(err).Errorf(`failed deregistering node after heartbeat expiration for node in "unknown" state`) } } if err := d.nodes.AddUnknown(node, expireFunc); err != nil { return errors.Wrap(err, `adding node in "unknown" state to node store failed`) } if err := store.UpdateNode(tx, node); err != nil { return errors.Wrap(err, "update failed") } return nil }) if err != nil { log.WithField("node", n.ID).WithError(err).Errorf(`failed to move node to "unknown" state`) } } return nil }) return err }
// ListenSubscriptions returns a stream of matching subscriptions for the current node func (lb *LogBroker) ListenSubscriptions(request *api.ListenSubscriptionsRequest, stream api.LogBroker_ListenSubscriptionsServer) error { remote, err := ca.RemoteNode(stream.Context()) if err != nil { return err } lb.nodeConnected(remote.NodeID) defer lb.nodeDisconnected(remote.NodeID) log := log.G(stream.Context()).WithFields( logrus.Fields{ "method": "(*LogBroker).ListenSubscriptions", "node": remote.NodeID, }, ) subscriptions, subscriptionCh, subscriptionCancel := lb.watchSubscriptions(remote.NodeID) defer subscriptionCancel() log.Debug("node registered") activeSubscriptions := make(map[string]*subscription) defer func() { // If the worker quits, mark all active subscriptions as finished. for _, subscription := range activeSubscriptions { subscription.Done(remote.NodeID, fmt.Errorf("node %s disconnected unexpectedly", remote.NodeID)) } }() // Start by sending down all active subscriptions. for _, subscription := range subscriptions { select { case <-stream.Context().Done(): return stream.Context().Err() case <-lb.pctx.Done(): return nil default: } if err := stream.Send(subscription.message); err != nil { log.Error(err) return err } activeSubscriptions[subscription.message.ID] = subscription } // Send down new subscriptions. for { select { case v := <-subscriptionCh: subscription := v.(*subscription) if subscription.message.Close { log.WithField("subscription.id", subscription.message.ID).Debug("subscription closed") delete(activeSubscriptions, subscription.message.ID) } else { // Avoid sending down the same subscription multiple times if _, ok := activeSubscriptions[subscription.message.ID]; ok { continue } activeSubscriptions[subscription.message.ID] = subscription log.WithField("subscription.id", subscription.message.ID).Debug("subscription added") } if err := stream.Send(subscription.message); err != nil { log.Error(err) return err } case <-stream.Context().Done(): return stream.Context().Err() case <-lb.pctx.Done(): return nil } } }