Beispiel #1
0
// Load retrieves the given task's state from etcd or stores and returns
// Runnable if no state exists.
func (s *stateStore) Load(task metafora.Task) (*statemachine.State, error) {
	const notrecursive = false
	const nosort = false
	resp, err := s.c.Get(path.Join(s.path, task.ID()), notrecursive, nosort)
	if err != nil {
		if ee, ok := err.(*etcd.EtcdError); ok && ee.ErrorCode == EcodeKeyNotFound {
			metafora.Infof("task=%q has no existing state, default to Runnable", task.ID())
			state := &statemachine.State{Code: statemachine.Runnable}
			if err := s.Store(task, state); err != nil {
				return nil, err
			}
			return state, nil
		}

		// Non-404 error, fail
		return nil, err
	}

	// Unmarshal state from key
	state := statemachine.State{}
	if err := json.Unmarshal([]byte(resp.Node.Value), &state); err != nil {
		return nil, err
	}
	return &state, nil
}
Beispiel #2
0
func (s *StateStore) Load(task metafora.Task) (*statemachine.State, error) {
	s.mu.RLock()
	defer s.mu.RUnlock()
	state, ok := s.store[task.ID()]
	if !ok {
		return &statemachine.State{Code: statemachine.Runnable}, nil
	}
	return state, nil
}
Beispiel #3
0
// Store taskID's state in etcd overwriting any prior state.
func (s *stateStore) Store(task metafora.Task, state *statemachine.State) error {
	buf, err := json.Marshal(state)
	if err != nil {
		return err
	}

	_, err = s.c.Set(path.Join(s.path, task.ID()), string(buf), foreverTTL)
	return err
}
Beispiel #4
0
func (s *StateStore) Store(task metafora.Task, state *statemachine.State) error {
	s.mu.Lock()
	s.store[task.ID()] = state
	s.mu.Unlock()
	stored := StateChanged{TaskID: task.ID(), State: state}
	select {
	case s.Stored <- stored:
	default:
	}
	return nil
}
Beispiel #5
0
// SubmitTask creates a new task in etcd
func (mc *mclient) SubmitTask(task metafora.Task) error {
	fullpath := path.Join(mc.tskPath(task.ID()), PropsKey)
	buf, err := json.Marshal(task)
	if err != nil {
		return err
	}
	if _, err := mc.etcd.Create(fullpath, string(buf), foreverTTL); err != nil {
		return err
	}
	metafora.Debugf("task %s submitted: %s", task.ID(), fullpath)
	return nil
}
Beispiel #6
0
// NewCommandListener makes a statemachine.CommandListener implementation
// backed by etcd. The namespace should be the same as the coordinator as
// commands use a separate path within a namespace than tasks or nodes.
func NewCommandListener(task metafora.Task, namespace string, c *etcd.Client) statemachine.CommandListener {
	if namespace[0] != '/' {
		namespace = "/" + namespace
	}
	cl := &cmdrListener{
		path:     path.Join(namespace, commandPath, task.ID()),
		cli:      c,
		commands: make(chan *statemachine.Message),
		mu:       &sync.Mutex{},
		stop:     make(chan bool),
	}
	go cl.watcher()
	return cl
}
Beispiel #7
0
func run(f StatefulHandler, task metafora.Task, cmd <-chan *Message) (m *Message) {
	defer func() {
		if r := recover(); r != nil {
			stackBuf := make([]byte, 6000)
			stackBufLen := runtime.Stack(stackBuf, false)
			stackTraceStr := string(stackBuf[0:stackBufLen])
			metafora.Errorf("task=%q Run method panic()d! Applying Error message. Panic: %v\nStack: %s", task.ID(), r, stackTraceStr)
			m = &Message{Code: Error, Err: fmt.Errorf("panic: %v\nstack: %s", r, stackTraceStr)}
		}
	}()

	// Defensive code to give handlers a *copy* of the command chan. That way if
	// a handler keeps receiving on the command chan in a goroutine past the
	// handler's lifetime it doesn't intercept commands intended for the
	// statemachine.
	internalcmd := make(chan *Message)
	stopped := make(chan struct{})
	go func() {
		for {
			select {
			case c := <-cmd:
				internalcmd <- c
			case <-stopped:
				return
			}
		}
	}()
	defer close(stopped)

	return f(task, internalcmd)
}
Beispiel #8
0
// add starts refreshing a given key+value pair for a task asynchronously.
func (m *taskManager) add(task metafora.Task) bool {
	tid := task.ID()
	// Attempt to claim the node
	key, value := m.ownerNode(tid)
	resp, err := m.client.Create(key, value, m.ttl)
	if err != nil {
		etcdErr, ok := err.(*etcd.EtcdError)
		if !ok || etcdErr.ErrorCode != EcodeNodeExist {
			metafora.Errorf("Claim of %s failed with an unexpected error: %v", key, err)
		} else {
			metafora.Debugf("Claim of %s failed, already claimed", key)
		}
		return false
	}

	index := resp.Node.CreatedIndex

	// lytics/metafora#124 - the successful create above may have resurrected a
	// deleted (done) task. Compare the CreatedIndex of the directory with the
	// CreatedIndex of the claim key, if they're equal this claim ressurected a
	// done task and should cleanup.
	resp, err = m.client.Get(m.taskPath(tid), unsorted, notrecursive)
	if err != nil {
		// Erroring here is BAD as we may have resurrected a done task, and because
		// of this failure there's no way to tell. The claim will eventually
		// timeout and the task will get reclaimed.
		metafora.Errorf("Error retrieving task path %q after claiming %q: %v", m.taskPath(tid), tid, err)
		return false
	}

	if resp.Node.CreatedIndex == index {
		metafora.Debugf("Task %s resurrected due to claim/done race. Re-deleting.", tid)
		if _, err = m.client.Delete(m.taskPath(tid), recursive); err != nil {
			// This is as bad as it gets. We *know* we resurrected a task, but we
			// failed to re-delete it.
			metafora.Errorf("Task %s was resurrected and could not be removed! %s should be manually removed. Error: %v",
				tid, m.taskPath(tid), err)
		}

		// Regardless of whether or not the delete succeeded, never treat
		// resurrected tasks as claimed.
		return false
	}

	// Claim successful, start the refresher
	metafora.Debugf("Claim successful: %s", key)
	done := make(chan struct{})
	release := make(chan struct{})
	finished := make(chan struct{})
	m.taskL.Lock()
	m.tasks[tid] = taskStates{done: done, release: release, finished: finished}
	m.taskL.Unlock()

	metafora.Debugf("Starting claim refresher for task %s", tid)
	go func() {
		defer func() {
			m.taskL.Lock()
			delete(m.tasks, tid)
			m.taskL.Unlock()
			close(finished)
		}()

		for {
			select {
			case <-time.After(m.interval):
				// Try to refresh the claim node (0 index means compare by value)
				if _, err := m.client.CompareAndSwap(key, value, m.ttl, value, 0); err != nil {
					metafora.Errorf("Error trying to update task %s ttl: %v", tid, err)
					m.ctx.Lost(task)
					// On errors, don't even try to Delete as we're in a bad state
					return
				}
			case <-done:
				metafora.Debugf("Deleting directory for task %s as it's done.", tid)
				const recursive = true
				if _, err := m.client.Delete(m.taskPath(tid), recursive); err != nil {
					metafora.Errorf("Error deleting task %s while stopping: %v", tid, err)
				}
				return
			case <-release:
				metafora.Debugf("Deleting claim for task %s as it's released.", tid)
				// Not done, releasing; just delete the claim node
				if _, err := m.client.CompareAndDelete(key, value, 0); err != nil {
					metafora.Warnf("Error releasing task %s while stopping: %v", tid, err)
				}
				return
			}
		}
	}()
	return true
}
Beispiel #9
0
// Done deletes the task.
func (ec *EtcdCoordinator) Done(task metafora.Task) {
	const done = true
	ec.taskManager.remove(task.ID(), done)
}
Beispiel #10
0
// Release deletes the claim file.
func (ec *EtcdCoordinator) Release(task metafora.Task) {
	const done = false
	ec.taskManager.remove(task.ID(), done)
}