예제 #1
// TestInitializeReplication checks that InitializeReplication(false) returns
// no error and that, afterwards, the intent lock of every node in testNodes
// is held with testLockMessage.
// NOTE(review): this excerpt seems to have lost lines (closing braces, and the
// statement that the "Make the kv store..." comment introduces) — confirm
// against the upstream file before relying on it.
func TestInitializeReplication(t *testing.T) {
	replicator, store, server := testReplicatorAndServer(t)
	defer server.Stop()

	// Make the kv store look like preparer is installed on test nodes

	// err being nil ensures that checking preparers and locking the hosts
	// succeeded
	replication, _, err := replicator.InitializeReplication(false)
	if err != nil {
		t.Fatalf("Error initializing replication: %s", err)
	// Cancel the replication when the test returns.
	defer replication.Cancel()

	// Confirm that the appropriate kv keys have been locked
	for _, node := range testNodes {
		// Lock key derived from this node's intent path for the test pod.
		lockPath := kp.LockPath(kp.IntentPath(node, testPodId))
		lockHolder, _, err := store.LockHolder(lockPath)
		if err != nil {
			t.Fatalf("Unexpected error checking for lock holder: %s", err)

		// The holder reported by the store must be the test's lock message.
		if lockHolder != testLockMessage {
			t.Errorf("Expected lock holder for key '%s' to be '%s', was '%s'", lockPath, testLockMessage, lockHolder)
예제 #2
// TestInitializeReplicationCanOverrideLocks checks that
// InitializeReplication(true) succeeds even though a competing lock is
// already held on the intent key of the first test node.
// NOTE(review): the excerpt ends right after the error check and is missing
// closing braces; `replication` is not used in the visible lines — confirm
// against the upstream file.
func TestInitializeReplicationCanOverrideLocks(t *testing.T) {
	replicator, store, server := testReplicatorAndServer(t)
	defer server.Stop()

	// This makes it look like the preparers are installed on the hosts
	// we're deploying to
	for _, node := range testNodes {
		key := fmt.Sprintf("reality/%s/p2-preparer", node)
		server.SetKV(key, []byte(testPreparerManifest))

	// Claim a lock on a host so that InitializeReplication has a competing
	// lock to override
	lock, _, err := store.NewLock("competing lock", nil)
	if err != nil {
		t.Fatalf("Unable to set up competing lock: %s", err)
	defer lock.Destroy()
	// Only the first test node's intent key is locked by the competitor.
	lockPath := kp.LockPath(kp.IntentPath(testNodes[0], testPodId))
	err = lock.Lock(lockPath)
	if err != nil {
		t.Fatalf("Unable to set up competing lock: %s", err)

	// Passing true is expected to let initialization succeed despite the
	// competing lock.
	replication, _, err := replicator.InitializeReplication(true)
	if err != nil {
		t.Fatalf("Expected InitializeReplication to override competing lock, but error occured: %s", err)
예제 #3
// LockHosts attempts to claim a lock on every host being deployed to.
// If overrideLock is true, it will destroy any session holding any of the
// keys we wish to lock.
// It returns the first error reported by r.lock, or nil once every node in
// r.Nodes has been processed.
// NOTE(review): closing braces are missing from this excerpt.
func (r Replicator) LockHosts(lock kp.Lock, overrideLock bool) error {
	for _, host := range r.Nodes {
		// One lock key per host, derived from the host and the manifest ID.
		lockPath := kp.LockPath(host, r.Manifest.ID())
		err := r.lock(lock, lockPath, overrideLock)

		// Stop at the first host that cannot be locked.
		if err != nil {
			return err
	return nil
예제 #4
파일: store.go 프로젝트: tomzhang/p2
// Lock tries to acquire the lock key built from the roll path of id, using
// session both as the stored value and as the owning session of the KV pair.
// It returns the success flag reported by Acquire; an Acquire error is
// wrapped with consulutil.NewKVError and returned with false.
// NOTE(review): closing braces are missing from this excerpt.
func (s consulStore) Lock(id rcf.ID, session string) (bool, error) {
	key := kp.LockPath(kp.RollPath(id.String()))
	success, _, err := s.kv.Acquire(&api.KVPair{
		Key:     key,
		Value:   []byte(session),
		Session: session,
	}, nil)
	if err != nil {
		return false, consulutil.NewKVError("acquire", key, err)
	return success, nil
예제 #5
파일: farm.go 프로젝트: tomzhang/p2
// releaseChild releases one child: it logs the release, removes id from
// rlf.children and, if the farm currently holds a lock, tries to unlock the
// lock key built from the update's roll path. An unlock failure is only
// logged as a warning.
// NOTE(review): the original comment said "close one child" but nothing in the
// visible lines closes anything, and closing braces are missing — the excerpt
// has presumably lost lines; confirm against the upstream file.
func (rlf *Farm) releaseChild(id fields.ID) {
	rlf.logger.WithField("ru", id).Infoln("Releasing update")
	delete(rlf.children, id)

	// if our lock is active, attempt to gracefully release it
	if rlf.lock != nil {
		err := rlf.lock.Unlock(kp.LockPath(kp.RollPath(id.String())))
		if err != nil {
			// The error value itself is not included in the log entry.
			rlf.logger.WithField("ru", id).Warnln("Could not release update lock")
예제 #6
파일: farm.go 프로젝트: tomzhang/p2
// releaseChild releases one child: it logs the release, removes id from
// rcf.children and, if the farm currently holds a lock, tries to unlock the
// lock key built from the replication controller's RC path. An unlock failure
// is only logged as a warning.
// NOTE(review): the original comment said "close one child" but nothing in the
// visible lines closes anything, and closing braces are missing — the excerpt
// has presumably lost lines; confirm against the upstream file.
func (rcf *Farm) releaseChild(id fields.ID) {
	rcf.logger.WithField("rc", id).Infoln("Releasing replication controller")
	delete(rcf.children, id)

	// if our lock is active, attempt to gracefully release it on this rc
	if rcf.lock != nil {
		err := rcf.lock.Unlock(kp.LockPath(kp.RCPath(id.String())))
		if err != nil {
			// The error value itself is not included in the log entry.
			rcf.logger.WithField("rc", id).Warnln("Could not release replication controller lock")
예제 #7
파일: replication.go 프로젝트: tomzhang/p2
// lockHosts attempts to claim a lock on every host being deployed to.
// If overrideLock is true, it will destroy any session holding any of the
// keys we wish to lock.
// A new lock is created from the store with lockMessage; each node's intent
// path for the manifest is then locked with it. The first failure is returned
// as-is. After all nodes are locked, renewal errors are handled in a
// background goroutine and nil is returned.
// NOTE(review): closing braces are missing from this excerpt.
func (r replication) lockHosts(overrideLock bool, lockMessage string) error {
	lock, renewalErrCh, err := r.store.NewLock(lockMessage, nil)
	if err != nil {
		return err

	for _, host := range r.nodes {
		lockPath := kp.LockPath(kp.IntentPath(host, r.manifest.ID()))
		err := r.lock(lock, lockPath, overrideLock)

		// Stop at the first host that cannot be locked.
		if err != nil {
			return err
	// Only reached in the visible flow once no lock attempt returned an error.
	go r.handleRenewalErrors(lock, renewalErrCh)

	return nil
예제 #8
func TestInitializeReplicationFailsIfLockExists(t *testing.T) {
	replicator, store, server := testReplicatorAndServer(t)
	defer server.Stop()

	// This makes it look like the preparers are installed on the hosts
	// we're deploying to
	for _, node := range testNodes {
		key := fmt.Sprintf("reality/%s/p2-preparer", node)
		server.SetKV(key, []byte(testPreparerManifest))

	// Claim a lock on a host and verify that InitializeReplication fails
	lock, _, err := store.NewLock("competing lock", nil)
	if err != nil {
		t.Fatalf("Unable to set up competing lock: %s", err)
	defer lock.Destroy()
	lockPath := kp.LockPath(kp.IntentPath(testNodes[0], testPodId))
	err = lock.Lock(lockPath)
	if err != nil {
		t.Fatalf("Unable to set up competing lock: %s", err)

	_, _, err = replicator.InitializeReplication(false)
	if err == nil {
		t.Fatalf("Expected error due to competing lock, but no error occurred")

	matched, err := regexp.MatchString("already held", err.Error())
	if err != nil {
		t.Fatalf("Unable to compare error message to expected string")

	if !matched {
		t.Fatalf("Expected error message to be related to a lock already being held, but was %s", err.Error())
예제 #9
// TestStopsIfLockDestroyed builds a replication by hand so that lock renewals
// can be triggered through triggerRenewalCh, locks the intent path of every
// test node, waits for the first node to be deployed, then destroys the lock
// holder and expects the next renewal to end the replication. Finally it
// checks that the first node's reality matches the manifest and the second
// node's does not.
// NOTE(review): this excerpt has clearly lost lines (closing braces, the body
// of the `go func() {` at the "imitatePreparers" section, whatever signals
// doneCh and firstNodeDeployed, and the statement the "Make the kv store..."
// comment introduces) — confirm against the upstream file.
func TestStopsIfLockDestroyed(t *testing.T) {
	active := 1
	store, server := makeStore(t)
	defer server.Stop()

	healthChecker, resultsCh := channelHealthChecker(testNodes, t)
	threshold := health.Passing
	manifest := basicManifest()

	// Make the kv store look like preparer is installed on test nodes

	// Create the replication manually for this test so we can trigger lock
	// renewals on a faster interval (to keep test short)
	errCh := make(chan error)
	replication := &replication{
		active:    active,
		nodes:     testNodes,
		store:     store,
		manifest:  manifest,
		health:    healthChecker,
		threshold: threshold,
		logger:    basicLogger(),
		errCh:     errCh,
		replicationCancelledCh: make(chan struct{}),
		replicationDoneCh:      make(chan struct{}),
		quitCh:                 make(chan struct{}),

	// Renewals only happen when the test sends on triggerRenewalCh.
	triggerRenewalCh := make(chan time.Time)
	lock, renewalErrCh, err := store.NewLock(testLockMessage, triggerRenewalCh)
	if err != nil {
		t.Fatalf("Unable to create initial replication lock: %s", err)

	for _, host := range testNodes {
		lockPath := kp.LockPath(kp.IntentPath(host, manifest.ID()))
		err := replication.lock(lock, lockPath, false)

		if err != nil {
			t.Fatalf("Unable to perform initial replication lock: %s", err)
	go replication.handleRenewalErrors(lock, renewalErrCh)

	doneCh := make(chan struct{})

	// Watch errCh: the first value received must be a non-nil fatal error, and
	// it must arrive within five seconds.
	// NOTE(review): t.Fatalf is called from a spawned goroutine here; the
	// testing package requires FailNow/Fatalf to be called from the goroutine
	// running the test — consider t.Errorf plus signalling the main goroutine.
	go func() {
		select {
		case err := <-errCh:
			if err == nil || !IsFatalError(err) {
				t.Fatalf("Should have seen a fatal lock renewal error before replication finished")
		case <-time.After(5 * time.Second):
			t.Fatalf("Did not get expected lock renewal error within timeout")
	imitatePreparers(server, doneCh)

	go func() {

	// Report healthy for one node, and unhealthy for the rest so
	// replication cannot finish without interruption
	for i, node := range testNodes {
		if i == 0 {
			// First node: keep publishing a Passing result.
			go func(node string) {
				for {
					select {
					case resultsCh[node] <- health.Result{
						ID:     testPodId,
						Status: health.Passing,
					case <-doneCh:
					time.Sleep(500 * time.Millisecond)
		} else {
			// Every other node: keep publishing a Critical result.
			go func(node string) {
				for {
					select {
					case resultsCh[node] <- health.Result{
						ID:     testPodId,
						Status: health.Critical,
					case <-doneCh:
					time.Sleep(500 * time.Millisecond)

	// Wait for the first node to be deployed
	firstNodeDeployed := make(chan struct{})
	manifestBytes, err := manifest.Marshal()
	if err != nil {
		t.Fatalf("Unable to get bytes from manifest: %s", err)
	// Poll the first node's reality key every 10ms until it equals the
	// marshalled manifest.
	go func() {
		realityKey := fmt.Sprintf("reality/%s/%s", testNodes[0], testPodId)
		for range time.Tick(10 * time.Millisecond) {
			if bytes.Equal(server.GetKV(realityKey), manifestBytes) {

	select {
	case <-time.After(5 * time.Second):
		t.Fatalf("Took too long for first node to be deployed")
	case <-firstNodeDeployed:

	// Trigger some lock renewals, confirm that replication is still going (doneCh not closed)
	for i := 0; i < 3; i++ {
		select {
		case triggerRenewalCh <- time.Now():
		case <-doneCh:
			t.Fatalf("Replication ended prematurely (lock couldn't be renewed but wasn't destroyed yet)")
		case <-time.After(1 * time.Second):
			t.Fatalf("Test timed out triggering a lock renewal")

	// Destroy lock holder so the next renewal will fail
	lockPath := kp.LockPath(kp.IntentPath(testNodes[0], manifest.ID()))
	_, id, err := store.LockHolder(lockPath)
	if err != nil {
		t.Fatalf("Unable to determine lock holder in order to destroy the lock: %s", err)

	err = store.DestroyLockHolder(id)
	if err != nil {
		t.Fatalf("Unable to destroy lock holder")

	// Trigger one more renewal which should cause replication to stop
	select {
	case triggerRenewalCh <- time.Now():
	case <-time.After(1 * time.Second):
		t.Fatalf("Test timed out triggering a lock renewal")
	case <-doneCh:
		t.Fatalf("Replication ended prematurely")

	// Replication must now finish (doneCh) within five seconds.
	select {
	case <-time.After(5 * time.Second):
		t.Fatalf("Took too long for replication to end after lock cancellation")
	case <-doneCh:

	// One node should have been updated because active == 1, the other
	// should not have been because health never passed
	realityBytes := server.GetKV(fmt.Sprintf("reality/%s/%s", testNodes[0], testPodId))

	if !bytes.Equal(realityBytes, manifestBytes) {
		t.Fatalf("Expected reality for %s to be %s: was %s", testNodes[0], string(manifestBytes), string(realityBytes))

	realityBytes = server.GetKV(fmt.Sprintf("reality/%s/%s", testNodes[1], testPodId))
	if bytes.Equal(realityBytes, manifestBytes) {
		t.Fatalf("The second node shouldn't have been deployed to but it was")
예제 #10
파일: run_update.go 프로젝트: tomzhang/p2
// lockPath returns the lock key an update uses for the replication controller
// id: the lock path of the RC path for id with an extra "update" component.
// NOTE(review): the closing brace is missing from this excerpt.
func (u update) lockPath(id rcf.ID) string {
	// RUs want to lock the RCs they're mutating, but this lock is separate
	// from the RC lock (which is held by the rc.WatchDesires goroutine), so the
	// key being locked is different
	return kp.LockPath(kp.RCPath(id.String(), "update"))
예제 #11
파일: farm.go 프로젝트: tomzhang/p2
// Start is a blocking function that monitors Consul for updates. The Farm will
// attempt to claim updates as they appear and, if successful, will start
// goroutines for those updates to do their job. Closing the quit channel will
// cause this function to return, releasing all locks it holds.
// Start is not safe for concurrent execution. Do not execute multiple
// concurrent instances of Start.
// NOTE(review): this excerpt has lost lines — closing braces, the START_LOOP
// label that `continue START_LOOP` refers to, and apparently the statements
// that end each branch (e.g. whatever returns after a halt request or skips
// to the next item) — confirm against the upstream file before relying on it.
func (rlf *Farm) Start(quit <-chan struct{}) {
	// subQuit stops the watch when Start returns.
	subQuit := make(chan struct{})
	defer close(subQuit)
	rlWatch, rlErr := rlf.rls.Watch(subQuit)

	for {
		select {
		case <-quit:
			rlf.logger.NoFields().Infoln("Halt requested, releasing updates")
		case session := <-rlf.sessions:
			if session == "" {
				// our session has expired, we must assume our locked children
				// have all been released and that someone else may have
				// claimed them by now
				rlf.logger.NoFields().Errorln("Session expired, releasing updates")
				rlf.lock = nil
			} else {
				// a new session has been acquired - only happens after an
				// expiration message, so len(children)==0
				rlf.logger.WithField("session", session).Infoln("Acquired new session")
				lock := rlf.kps.NewUnmanagedLock(session, "")
				rlf.lock = &lock
				// TODO: restart the watch so that you get updates right away?
		case err := <-rlErr:
			rlf.logger.WithError(err).Errorln("Could not read consul updates")
		case rlFields := <-rlWatch:
			rlf.logger.WithField("n", len(rlFields)).Debugln("Received update update")
			if rlf.lock == nil {
				// we can't claim new nodes because our session is invalidated.
				// raise an error and ignore this update
				rlf.logger.NoFields().Warnln("Received update update, but do not have session to acquire locks")

			// track which children were found in the returned set
			foundChildren := make(map[fields.ID]struct{})
			for _, rlField := range rlFields {
				rlLogger := rlf.logger.SubLogger(logrus.Fields{
					"ru": rlField.NewRC,
				// Look up the update's new RC so its pod ID can be added to
				// the logger fields.
				rcField, err := rlf.rcs.Get(rlField.NewRC)
				if err != nil {
					rlLogger.WithError(err).Errorln("Could not read new RC")
				rlLogger = rlLogger.SubLogger(logrus.Fields{
					"pod": rcField.Manifest.ID(),
				if _, ok := rlf.children[rlField.NewRC]; ok {
					// this one is already ours, skip
					rlLogger.NoFields().Debugln("Got update already owned by self")
					foundChildren[rlField.NewRC] = struct{}{}

				// Try to take the lock key built from this update's roll path.
				err = rlf.lock.Lock(kp.LockPath(kp.RollPath(rlField.NewRC.String())))
				if _, ok := err.(kp.AlreadyLockedError); ok {
					// someone else must have gotten it first - log and move to
					// the next one
					rlLogger.NoFields().Debugln("Lock on update was denied")
				} else if err != nil {
					rlLogger.NoFields().Errorln("Got error while locking update - session may be expired")
					// stop processing this update and go back to the select
					// chances are this error is a network problem or session
					// expiry, and all the others in this update would also fail
					continue START_LOOP

				// at this point the ru is ours, time to spin it up
				rlLogger.NoFields().Infoln("Acquired lock on new update, spawning")

				newChild := rlf.factory.New(rlField, rlLogger, *rlf.lock)
				childQuit := make(chan struct{})
				rlf.children[rlField.NewRC] = childRU{ru: newChild, quit: childQuit}
				foundChildren[rlField.NewRC] = struct{}{}

				// Run the child in its own goroutine; the update ID is passed
				// as an argument rather than captured.
				go func(id fields.ID) {
					if !newChild.Run(childQuit) {
						// returned false, farm must have asked us to quit
					// our lock on this RU won't be released until it's deleted,
					// so if we fail to delete it, we have to retry
					for err := rlf.rls.Delete(id); err != nil; err = rlf.rls.Delete(id) {
						rlLogger.WithError(err).Errorln("Could not delete update")
						time.Sleep(1 * time.Second)
				}(rlField.NewRC) // do not close over rlField, it's a loop variable

			// now remove any children that were not found in the result set
			rlf.logger.NoFields().Debugln("Pruning updates that have disappeared")
			for id := range rlf.children {
				if _, ok := foundChildren[id]; !ok {
예제 #12
파일: farm.go 프로젝트: tomzhang/p2
// Start is a blocking function that monitors Consul for replication controllers.
// The Farm will attempt to claim replication controllers as they appear and,
// if successful, will start goroutines for those replication controllers to do
// their job. Closing the quit channel will cause this function to return,
// releasing all locks it holds.
// Start is not safe for concurrent execution. Do not execute multiple
// concurrent instances of Start.
// NOTE(review): this excerpt has lost lines — closing braces, the START_LOOP
// label that `continue START_LOOP` refers to, the arguments of the `New(`
// call, and apparently the statements that end each branch — confirm against
// the upstream file before relying on it.
func (rcf *Farm) Start(quit <-chan struct{}) {
	// subQuit stops the watch when Start returns.
	subQuit := make(chan struct{})
	defer close(subQuit)
	rcWatch, rcErr := rcf.rcStore.WatchNew(subQuit)

	for {
		select {
		case <-quit:
			rcf.logger.NoFields().Infoln("Halt requested, releasing replication controllers")
		case session := <-rcf.sessions:
			if session == "" {
				// our session has expired, we must assume our locked children
				// have all been released and that someone else may have
				// claimed them by now
				rcf.logger.NoFields().Errorln("Session expired, releasing replication controllers")
				rcf.lock = nil
			} else {
				// a new session has been acquired - only happens after an
				// expiration message, so len(children)==0
				rcf.logger.WithField("session", session).Infoln("Acquired new session")
				lock := rcf.kpStore.NewUnmanagedLock(session, "")
				rcf.lock = &lock
				// TODO: restart the watch so that you get updates right away?
		case err := <-rcErr:
			rcf.logger.WithError(err).Errorln("Could not read consul replication controllers")
		case rcFields := <-rcWatch:
			rcf.logger.WithField("n", len(rcFields)).Debugln("Received replication controller update")
			if rcf.lock == nil {
				// we can't claim new nodes because our session is invalidated.
				// raise an error and ignore this update
				rcf.logger.NoFields().Warnln("Received replication controller update, but do not have session to acquire locks")

			// track which children were found in the returned set
			foundChildren := make(map[fields.ID]struct{})
			for _, rcField := range rcFields {
				rcLogger := rcf.logger.SubLogger(logrus.Fields{
					"rc":  rcField.ID,
					"pod": rcField.Manifest.ID(),
				if _, ok := rcf.children[rcField.ID]; ok {
					// this one is already ours, skip
					rcLogger.NoFields().Debugln("Got replication controller already owned by self")
					foundChildren[rcField.ID] = struct{}{}

				// Try to take the lock key built from this RC's path.
				err := rcf.lock.Lock(kp.LockPath(kp.RCPath(rcField.ID.String())))
				if _, ok := err.(kp.AlreadyLockedError); ok {
					// someone else must have gotten it first - log and move to
					// the next one
					rcLogger.NoFields().Debugln("Lock on replication controller was denied")
				} else if err != nil {
					rcLogger.NoFields().Errorln("Got error while locking replication controller - session may be expired")
					// stop processing this update and go back to the select
					// chances are this error is a network problem or session
					// expiry, and all the others in this update would also fail
					continue START_LOOP

				// at this point the rc is ours, time to spin it up
				rcLogger.NoFields().Infoln("Acquired lock on new replication controller, spawning")

				newChild := New(
				childQuit := make(chan struct{})
				rcf.children[rcField.ID] = childRC{rc: newChild, quit: childQuit}
				foundChildren[rcField.ID] = struct{}{}

				// Drain the child's error channel in its own goroutine,
				// logging every error it yields.
				go func() {
					// disabled-ness is handled in watchdesires
					for err := range newChild.WatchDesires(childQuit) {
						rcLogger.WithError(err).Errorln("Got error in replication controller loop")

			// now remove any children that were not found in the result set
			rcf.logger.NoFields().Debugln("Pruning replication controllers that have disappeared")
			for id := range rcf.children {
				if _, ok := foundChildren[id]; !ok {