// Load retrieves the given task's state from etcd or stores and returns // Runnable if no state exists. func (s *stateStore) Load(task metafora.Task) (*statemachine.State, error) { const notrecursive = false const nosort = false resp, err := s.c.Get(path.Join(s.path, task.ID()), notrecursive, nosort) if err != nil { if ee, ok := err.(*etcd.EtcdError); ok && ee.ErrorCode == EcodeKeyNotFound { metafora.Infof("task=%q has no existing state, default to Runnable", task.ID()) state := &statemachine.State{Code: statemachine.Runnable} if err := s.Store(task, state); err != nil { return nil, err } return state, nil } // Non-404 error, fail return nil, err } // Unmarshal state from key state := statemachine.State{} if err := json.Unmarshal([]byte(resp.Node.Value), &state); err != nil { return nil, err } return &state, nil }
func (c *cmdrListener) sendMsg(resp *etcd.Response) (index uint64, ok bool) { // Delete/Expire events shouldn't be processed if releaseActions[resp.Action] { return resp.Node.ModifiedIndex + 1, true } // Remove command so it's not processed twice cadresp, err := c.cli.CompareAndDelete(resp.Node.Key, resp.Node.Value, 0) if err != nil { if ee, ok := err.(*etcd.EtcdError); ok && ee.ErrorCode == EcodeCompareFailed { metafora.Infof("Received successive commands; attempting to retrieve the latest: %v", err) return resp.Node.ModifiedIndex + 1, true } metafora.Errorf("Error deleting command %s: %s - sending error to stateful handler: %v", c.path, resp.Node.Value, err) c.sendErr(err) return 0, false } msg := &statemachine.Message{} if err := json.Unmarshal([]byte(resp.Node.Value), msg); err != nil { metafora.Errorf("Error unmarshalling command from %s - sending error to stateful handler: %v", c.path, err) c.sendErr(err) return 0, false } select { case c.commands <- msg: return cadresp.Node.ModifiedIndex + 1, true case <-c.stop: return 0, false } }
// execute non-terminal states func (s *stateMachine) exec(state *State) *Message { switch state.Code { case Runnable: // Runnable passes control to the stateful handler return run(s.h, s.task, s.cmds) case Paused: // Paused until a message arrives return <-s.cmds case Sleeping: // Sleeping until the specified time (or a message) if state.Until == nil { metafora.Warnf("task=%q told to sleep without a time. Resuming.", s.task.ID()) return RunMessage() } dur := state.Until.Sub(time.Now()) metafora.Infof("task=%q sleeping for %s", s.task.ID(), dur) timer := time.NewTimer(dur) select { case <-timer.C: return RunMessage() case msg := <-s.cmds: timer.Stop() // Checkpoint & Release are special cases that shouldn't affect sleep // time, so maintain it across the state transition if msg.Code == Checkpoint || msg.Code == Release { msg.Until = state.Until } return msg } case Fault: // Special case where we potentially trim the current state to keep // errors from growing without bound. var msg *Message msg, state.Errors = s.errHandler(s.task, state.Errors) return msg default: panic("invalid state: " + state.String()) } }
// Run the state machine enabled handler. Loads the initial state and passes // control to the internal stateful handler passing commands from the command // listener into the handler's commands chan. func (s *stateMachine) Run() (done bool) { // Multiplex external (Stop) messages and internal ones s.cmds = make(chan *Message) go func() { for { select { case m := <-s.cl.Receive(): if !m.Valid() { metafora.Warnf("Ignoring invalid command: %q", m) continue } select { case s.cmds <- m: case <-s.stopped: return } case <-s.stopped: return } } }() // Stop the command listener and internal message multiplexer when Run exits defer func() { s.cl.Stop() s.stop() }() tid := s.task.ID() // Load the initial state state, err := s.ss.Load(s.task) if err != nil { // A failure to load the state for a task is *fatal* - the task will be // unscheduled and requires operator intervention to reschedule. metafora.Errorf("task=%q could not load initial state. Marking done! Error: %v", tid, err) return true } if state == nil { // Note to StateStore implementors: This should not happen! Either state or // err must be non-nil. This code is simply to prevent a nil pointer panic. metafora.Errorf("statestore %T returned nil state and err for task=%q - unscheduling") return true } if state.Code.Terminal() { metafora.Warnf("task=%q in terminal state %s - exiting.", tid, state.Code) return true } s.setState(state) // for introspection/debugging // Main Statemachine Loop done = false for { // Enter State metafora.Debugf("task=%q in state %s", tid, state.Code) msg := s.exec(state) // Apply Message newstate, ok := apply(state, msg) if !ok { metafora.Warnf("task=%q Invalid state transition=%q returned by task. Old state=%q", tid, msg.Code, state.Code) msg = ErrorMessage(err) if newstate, ok = apply(state, msg); !ok { metafora.Errorf("task=%q Unable to transition to error state! Exiting with state=%q", tid, state.Code) return state.Code.Terminal() } } metafora.Infof("task=%q transitioning %s --> %s --> %s", tid, state, msg, newstate) // Save state if err := s.ss.Store(s.task, newstate); err != nil { metafora.Errorf("task=%q Unable to persist state=%q. Unscheduling.", tid, newstate.Code) return true } // Set next state and loop if non-terminal state = newstate // Expose the state for introspection s.setState(state) // Exit and unschedule task on terminal state. if state.Code.Terminal() { return true } // Release messages indicate the task should exit but not unschedule. if msg.Code == Release { return false } // Alternatively Stop() may have been called but the handler may not have // returned the Release message. Always exit if we've been told to Stop() // even if the handler has returned a different Message. select { case <-s.stopped: return false default: } } }
func main() { mlvl := metafora.LogLevelInfo hostname, _ := os.Hostname() peers := flag.String("etcd", "http://127.0.0.1:2379", "comma delimited etcd peer list") namespace := flag.String("namespace", "koalemos", "metafora namespace") name := flag.String("name", hostname, "node name or empty for automatic") loglvl := flag.String("log", mlvl.String(), "set log level: [debug], info, warn, error") flag.Parse() hosts := strings.Split(*peers, ",") etcdc := etcd.NewClient(hosts) switch strings.ToLower(*loglvl) { case "debug": mlvl = metafora.LogLevelDebug case "info": mlvl = metafora.LogLevelInfo case "warn": mlvl = metafora.LogLevelWarn case "error": mlvl = metafora.LogLevelError default: metafora.Warnf("Invalid log level %q - using %s", *loglvl, mlvl) } metafora.SetLogLevel(mlvl) conf := m_etcd.NewConfig(*name, *namespace, hosts) // Replace NewTask func with one that returns a *koalemos.Task conf.NewTaskFunc = func(id, value string) metafora.Task { t := koalemos.NewTask(id) if value == "" { return t } if err := json.Unmarshal([]byte(value), t); err != nil { metafora.Errorf("Unable to unmarshal task %s: %v", t.ID(), err) return nil } return t } hfunc := makeHandlerFunc(etcdc) ec, err := m_etcd.NewEtcdCoordinator(conf) if err != nil { metafora.Errorf("Error creating etcd coordinator: %v", err) } bal := m_etcd.NewFairBalancer(conf) c, err := metafora.NewConsumer(ec, hfunc, bal) if err != nil { metafora.Errorf("Error creating consumer: %v", err) os.Exit(2) } metafora.Infof( "Starting koalsmosd with etcd=%s; namespace=%s; name=%s; loglvl=%s", *peers, conf.Namespace, conf.Name, mlvl) consumerRunning := make(chan struct{}) go func() { defer close(consumerRunning) c.Run() }() sigC := make(chan os.Signal, 1) signal.Notify(sigC, os.Interrupt, os.Kill, syscall.SIGTERM) select { case s := <-sigC: metafora.Infof("Received signal %s, shutting down", s) case <-consumerRunning: metafora.Warn("Consumer exited. Shutting down.") } c.Shutdown() metafora.Info("Shutdown") }