// Load retrieves the given task's state from etcd or stores and returns // Runnable if no state exists. func (s *stateStore) Load(task metafora.Task) (*statemachine.State, error) { const notrecursive = false const nosort = false resp, err := s.c.Get(path.Join(s.path, task.ID()), notrecursive, nosort) if err != nil { if ee, ok := err.(*etcd.EtcdError); ok && ee.ErrorCode == EcodeKeyNotFound { metafora.Infof("task=%q has no existing state, default to Runnable", task.ID()) state := &statemachine.State{Code: statemachine.Runnable} if err := s.Store(task, state); err != nil { return nil, err } return state, nil } // Non-404 error, fail return nil, err } // Unmarshal state from key state := statemachine.State{} if err := json.Unmarshal([]byte(resp.Node.Value), &state); err != nil { return nil, err } return &state, nil }
func (s *StateStore) Load(task metafora.Task) (*statemachine.State, error) { s.mu.RLock() defer s.mu.RUnlock() state, ok := s.store[task.ID()] if !ok { return &statemachine.State{Code: statemachine.Runnable}, nil } return state, nil }
// Store taskID's state in etcd overwriting any prior state. func (s *stateStore) Store(task metafora.Task, state *statemachine.State) error { buf, err := json.Marshal(state) if err != nil { return err } _, err = s.c.Set(path.Join(s.path, task.ID()), string(buf), foreverTTL) return err }
func (s *StateStore) Store(task metafora.Task, state *statemachine.State) error { s.mu.Lock() s.store[task.ID()] = state s.mu.Unlock() stored := StateChanged{TaskID: task.ID(), State: state} select { case s.Stored <- stored: default: } return nil }
// SubmitTask creates a new task in etcd func (mc *mclient) SubmitTask(task metafora.Task) error { fullpath := path.Join(mc.tskPath(task.ID()), PropsKey) buf, err := json.Marshal(task) if err != nil { return err } if _, err := mc.etcd.Create(fullpath, string(buf), foreverTTL); err != nil { return err } metafora.Debugf("task %s submitted: %s", task.ID(), fullpath) return nil }
// NewCommandListener makes a statemachine.CommandListener implementation // backed by etcd. The namespace should be the same as the coordinator as // commands use a separate path within a namespace than tasks or nodes. func NewCommandListener(task metafora.Task, namespace string, c *etcd.Client) statemachine.CommandListener { if namespace[0] != '/' { namespace = "/" + namespace } cl := &cmdrListener{ path: path.Join(namespace, commandPath, task.ID()), cli: c, commands: make(chan *statemachine.Message), mu: &sync.Mutex{}, stop: make(chan bool), } go cl.watcher() return cl }
func run(f StatefulHandler, task metafora.Task, cmd <-chan *Message) (m *Message) { defer func() { if r := recover(); r != nil { stackBuf := make([]byte, 6000) stackBufLen := runtime.Stack(stackBuf, false) stackTraceStr := string(stackBuf[0:stackBufLen]) metafora.Errorf("task=%q Run method panic()d! Applying Error message. Panic: %v\nStack: %s", task.ID(), r, stackTraceStr) m = &Message{Code: Error, Err: fmt.Errorf("panic: %v\nstack: %s", r, stackTraceStr)} } }() // Defensive code to give handlers a *copy* of the command chan. That way if // a handler keeps receiving on the command chan in a goroutine past the // handler's lifetime it doesn't intercept commands intended for the // statemachine. internalcmd := make(chan *Message) stopped := make(chan struct{}) go func() { for { select { case c := <-cmd: internalcmd <- c case <-stopped: return } } }() defer close(stopped) return f(task, internalcmd) }
// add attempts to claim task by creating its owner node in etcd and, when
// the claim succeeds, starts a goroutine that keeps refreshing the claim.
//
// It returns false when the owner node cannot be created (already claimed or
// an unexpected error), when the task directory cannot be read back after the
// claim, or when the claim turns out to have resurrected a task that was
// already done. It returns true once the task has been registered in m.tasks
// and the refresher goroutine has been started.
func (m *taskManager) add(task metafora.Task) bool {
	tid := task.ID()
	// Attempt to claim the node
	key, value := m.ownerNode(tid)
	resp, err := m.client.Create(key, value, m.ttl)
	if err != nil {
		// Only an "already exists" etcd error is expected here; anything else
		// is logged as an error. Either way the claim failed.
		etcdErr, ok := err.(*etcd.EtcdError)
		if !ok || etcdErr.ErrorCode != EcodeNodeExist {
			metafora.Errorf("Claim of %s failed with an unexpected error: %v", key, err)
		} else {
			metafora.Debugf("Claim of %s failed, already claimed", key)
		}
		return false
	}

	// Remember when the claim key was created for the resurrection check below.
	index := resp.Node.CreatedIndex

	// lytics/metafora#124 - the successful create above may have resurrected a
	// deleted (done) task. Compare the CreatedIndex of the directory with the
	// CreatedIndex of the claim key, if they're equal this claim resurrected a
	// done task and should cleanup.
	resp, err = m.client.Get(m.taskPath(tid), unsorted, notrecursive)
	if err != nil {
		// Erroring here is BAD as we may have resurrected a done task, and because
		// of this failure there's no way to tell. The claim will eventually
		// timeout and the task will get reclaimed.
		metafora.Errorf("Error retrieving task path %q after claiming %q: %v", m.taskPath(tid), tid, err)
		return false
	}
	if resp.Node.CreatedIndex == index {
		metafora.Debugf("Task %s resurrected due to claim/done race. Re-deleting.", tid)
		if _, err = m.client.Delete(m.taskPath(tid), recursive); err != nil {
			// This is as bad as it gets. We *know* we resurrected a task, but we
			// failed to re-delete it.
			metafora.Errorf("Task %s was resurrected and could not be removed! %s should be manually removed. Error: %v", tid, m.taskPath(tid), err)
		}
		// Regardless of whether or not the delete succeeded, never treat
		// resurrected tasks as claimed.
		return false
	}

	// Claim successful, start the refresher
	metafora.Debugf("Claim successful: %s", key)
	// done and release are the signals the refresher goroutine waits on;
	// finished is closed by the goroutine when it exits.
	done := make(chan struct{})
	release := make(chan struct{})
	finished := make(chan struct{})
	m.taskL.Lock()
	m.tasks[tid] = taskStates{done: done, release: release, finished: finished}
	m.taskL.Unlock()

	metafora.Debugf("Starting claim refresher for task %s", tid)
	go func() {
		// On exit, always unregister the task and then signal completion.
		defer func() {
			m.taskL.Lock()
			delete(m.tasks, tid)
			m.taskL.Unlock()
			close(finished)
		}()

		for {
			select {
			case <-time.After(m.interval):
				// Try to refresh the claim node (0 index means compare by value)
				if _, err := m.client.CompareAndSwap(key, value, m.ttl, value, 0); err != nil {
					metafora.Errorf("Error trying to update task %s ttl: %v", tid, err)
					// Report the task as lost since the claim could not be refreshed.
					m.ctx.Lost(task)
					// On errors, don't even try to Delete as we're in a bad state
					return
				}
			case <-done:
				// Task is done: remove its whole directory, not just the claim.
				metafora.Debugf("Deleting directory for task %s as it's done.", tid)
				const recursive = true
				if _, err := m.client.Delete(m.taskPath(tid), recursive); err != nil {
					metafora.Errorf("Error deleting task %s while stopping: %v", tid, err)
				}
				return
			case <-release:
				metafora.Debugf("Deleting claim for task %s as it's released.", tid)
				// Not done, releasing; just delete the claim node
				if _, err := m.client.CompareAndDelete(key, value, 0); err != nil {
					metafora.Warnf("Error releasing task %s while stopping: %v", tid, err)
				}
				return
			}
		}
	}()
	return true
}
// Done deletes the task. func (ec *EtcdCoordinator) Done(task metafora.Task) { const done = true ec.taskManager.remove(task.ID(), done) }
// Release deletes the claim file. func (ec *EtcdCoordinator) Release(task metafora.Task) { const done = false ec.taskManager.remove(task.ID(), done) }