Example #1
func TestInitializeReplication(t *testing.T) {
	replicator, store, server := testReplicatorAndServer(t)
	defer server.Stop()

	// Make the kv store look like preparer is installed on test nodes
	setupPreparers(server)

	// err being nil ensures that checking preparers and locking the hosts
	// succeeded
	replication, _, err := replicator.InitializeReplication(false)
	if err != nil {
		t.Fatalf("Error initializing replication: %s", err)
	}
	defer replication.Cancel()

	// Confirm that the appropriate kv keys have been locked
	for _, node := range testNodes {
		lockPath := kp.LockPath(kp.IntentPath(node, testPodId))
		lockHolder, _, err := store.LockHolder(lockPath)
		if err != nil {
			t.Fatalf("Unexpected error checking for lock holder: %s", err)
		}

		if lockHolder != testLockMessage {
			t.Errorf("Expected lock holder for key '%s' to be '%s', was '%s'", lockPath, testLockMessage, lockHolder)
		}
	}
}
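The assertions in this test depend on the key layout produced by kp.LockPath and kp.IntentPath, which none of these examples define. A minimal sketch of what those helpers might look like, assuming the lock tree simply mirrors the source key under a "lock/" prefix (the real kp package may build these paths differently):

package kp // sketch only, not the actual package

import "path"

// LockPath is a hypothetical sketch: the calls in these examples suggest it
// joins its arguments into a key under a parallel "lock/" tree. This is an
// assumption, not the real implementation.
func LockPath(parts ...string) string {
	return path.Join(append([]string{"lock"}, parts...)...)
}

// IntentPath is likewise a sketch, assuming intent manifests are stored under
// "intent/<node>/<pod>".
func IntentPath(node, podID string) string {
	return path.Join("intent", node, podID)
}

Under that assumption, the key checked above would be lock/intent/<node>/<podID>.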
Example #2
func TestInitializeReplicationCanOverrideLocks(t *testing.T) {
	replicator, store, server := testReplicatorAndServer(t)
	defer server.Stop()

	// This makes it look like the preparers are installed on the hosts
	// we're deploying to
	for _, node := range testNodes {
		key := fmt.Sprintf("reality/%s/p2-preparer", node)
		server.SetKV(key, []byte(testPreparerManifest))
	}

	// Claim a lock on a host and verify that InitializeReplication fails
	lock, _, err := store.NewLock("competing lock", nil)
	if err != nil {
		t.Fatalf("Unable to set up competing lock: %s", err)
	}
	defer lock.Destroy()
	lockPath := kp.LockPath(kp.IntentPath(testNodes[0], testPodId))
	err = lock.Lock(lockPath)
	if err != nil {
		t.Fatalf("Unable to set up competing lock: %s", err)
	}

	replication, _, err := replicator.InitializeReplication(true)
	if err != nil {
		t.Fatalf("Expected InitializeReplication to override competing lock, but error occured: %s", err)
	}
	replication.Cancel()
}
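Example #8 below exercises the complementary case: the same competing lock, but with InitializeReplication called with overrideLock set to false, which is expected to fail with an "already held" error.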
Example #3
// LockHosts attempts to claim a lock on every host being deployed to. If
// overrideLock is true, any session holding one of the keys we wish to lock
// will be destroyed.
func (r Replicator) LockHosts(lock kp.Lock, overrideLock bool) error {
	for _, host := range r.Nodes {
		lockPath := kp.LockPath(host, r.Manifest.ID())
		err := r.lock(lock, lockPath, overrideLock)

		if err != nil {
			return err
		}
	}
	return nil
}
Example #4
func (s consulStore) Lock(id rcf.ID, session string) (bool, error) {
	key := kp.LockPath(kp.RollPath(id.String()))
	success, _, err := s.kv.Acquire(&api.KVPair{
		Key:     key,
		Value:   []byte(session),
		Session: session,
	}, nil)
	if err != nil {
		return false, consulutil.NewKVError("acquire", key, err)
	}
	return success, nil
}
Example #5
// close one child
func (rlf *Farm) releaseChild(id fields.ID) {
	rlf.logger.WithField("ru", id).Infoln("Releasing update")
	close(rlf.children[id].quit)
	delete(rlf.children, id)

	// if our lock is active, attempt to gracefully release it
	if rlf.lock != nil {
		err := rlf.lock.Unlock(kp.LockPath(kp.RollPath(id.String())))
		if err != nil {
			rlf.logger.WithField("ru", id).Warnln("Could not release update lock")
		}
	}
}
Example #6
// close one child
func (rcf *Farm) releaseChild(id fields.ID) {
	rcf.logger.WithField("rc", id).Infoln("Releasing replication controller")
	close(rcf.children[id].quit)
	delete(rcf.children, id)

	// if our lock is active, attempt to gracefully release it on this rc
	if rcf.lock != nil {
		err := rcf.lock.Unlock(kp.LockPath(kp.RCPath(id.String())))
		if err != nil {
			rcf.logger.WithField("rc", id).Warnln("Could not release replication controller lock")
		}
	}
}
Example #7
// lockHosts attempts to claim a lock on every host being deployed to. If
// overrideLock is true, any session holding one of the keys we wish to lock
// will be destroyed.
func (r replication) lockHosts(overrideLock bool, lockMessage string) error {
	lock, renewalErrCh, err := r.store.NewLock(lockMessage, nil)
	if err != nil {
		return err
	}

	for _, host := range r.nodes {
		lockPath := kp.LockPath(kp.IntentPath(host, r.manifest.ID()))
		err := r.lock(lock, lockPath, overrideLock)

		if err != nil {
			return err
		}
	}
	go r.handleRenewalErrors(lock, renewalErrCh)

	return nil
}
Example #8
func TestInitializeReplicationFailsIfLockExists(t *testing.T) {
	replicator, store, server := testReplicatorAndServer(t)
	defer server.Stop()

	// This makes it look like the preparers are installed on the hosts
	// we're deploying to
	for _, node := range testNodes {
		key := fmt.Sprintf("reality/%s/p2-preparer", node)
		server.SetKV(key, []byte(testPreparerManifest))
	}

	// Claim a lock on a host and verify that InitializeReplication fails
	lock, _, err := store.NewLock("competing lock", nil)
	if err != nil {
		t.Fatalf("Unable to set up competing lock: %s", err)
	}
	defer lock.Destroy()
	lockPath := kp.LockPath(kp.IntentPath(testNodes[0], testPodId))
	err = lock.Lock(lockPath)
	if err != nil {
		t.Fatalf("Unable to set up competing lock: %s", err)
	}

	_, _, err = replicator.InitializeReplication(false)
	if err == nil {
		t.Fatalf("Expected error due to competing lock, but no error occurred")
	}

	matched, err := regexp.MatchString("already held", err.Error())
	if err != nil {
		t.Fatalf("Unable to compare error message to expected string")
	}

	if !matched {
		t.Fatalf("Expected error message to be related to a lock already being held, but was %s", err.Error())
	}
}
Example #9
func TestStopsIfLockDestroyed(t *testing.T) {
	active := 1
	store, server := makeStore(t)
	defer server.Stop()

	healthChecker, resultsCh := channelHealthChecker(testNodes, t)
	threshold := health.Passing
	manifest := basicManifest()

	// Make the kv store look like preparer is installed on test nodes
	setupPreparers(server)

	// Create the replication manually for this test so we can trigger lock
	// renewals on a faster interval (to keep test short)
	errCh := make(chan error)
	replication := &replication{
		active:    active,
		nodes:     testNodes,
		store:     store,
		manifest:  manifest,
		health:    healthChecker,
		threshold: threshold,
		logger:    basicLogger(),
		errCh:     errCh,
		replicationCancelledCh: make(chan struct{}),
		replicationDoneCh:      make(chan struct{}),
		quitCh:                 make(chan struct{}),
	}

	triggerRenewalCh := make(chan time.Time)
	lock, renewalErrCh, err := store.NewLock(testLockMessage, triggerRenewalCh)
	if err != nil {
		t.Fatalf("Unable to create initial replication lock: %s", err)
	}

	for _, host := range testNodes {
		lockPath := kp.LockPath(kp.IntentPath(host, manifest.ID()))
		err := replication.lock(lock, lockPath, false)

		if err != nil {
			t.Fatalf("Unable to perform initial replication lock: %s", err)
		}
	}
	go replication.handleRenewalErrors(lock, renewalErrCh)

	doneCh := make(chan struct{})

	go func() {
		select {
		case err := <-errCh:
			if err == nil || !IsFatalError(err) {
				t.Fatalf("Should have seen a fatal lock renewal error before replication finished")
			}
		case <-time.After(5 * time.Second):
			t.Fatalf("Did not get expected lock renewal error within timeout")
		}
	}()
	imitatePreparers(server, doneCh)

	go func() {
		replication.Enact()
		close(doneCh)
	}()

	// Report healthy for one node, and unhealthy for the rest so
	// replication cannot finish without interruption
	for i, node := range testNodes {
		if i == 0 {
			go func(node string) {
				for {
					select {
					case resultsCh[node] <- health.Result{
						ID:     testPodId,
						Status: health.Passing,
					}:
					case <-doneCh:
						return
					}
					time.Sleep(500 * time.Millisecond)
				}
			}(node)
		} else {
			go func(node string) {
				for {
					select {
					case resultsCh[node] <- health.Result{
						ID:     testPodId,
						Status: health.Critical,
					}:
					case <-doneCh:
						return
					}
					time.Sleep(500 * time.Millisecond)
				}
			}(node)
		}
	}

	// Wait for the first node to be deployed
	firstNodeDeployed := make(chan struct{})
	manifestBytes, err := manifest.Marshal()
	if err != nil {
		t.Fatalf("Unable to get bytes from manifest: %s", err)
	}
	go func() {
		realityKey := fmt.Sprintf("reality/%s/%s", testNodes[0], testPodId)
		for range time.Tick(10 * time.Millisecond) {
			if bytes.Equal(server.GetKV(realityKey), manifestBytes) {
				close(firstNodeDeployed)
				return
			}
		}
	}()

	select {
	case <-time.After(5 * time.Second):
		t.Fatalf("Took too long for first node to be deployed")
	case <-firstNodeDeployed:
	}

	// Trigger some lock renewals, confirm that replication is still going (doneCh not closed)
	for i := 0; i < 3; i++ {
		select {
		case triggerRenewalCh <- time.Now():
		case <-doneCh:
			t.Fatalf("Replication ended prematurely (lock couldn't be renewed but wasn't destroyed yet)")
		case <-time.After(1 * time.Second):
			t.Fatalf("Test timed out triggering a lock renewal")
		}
	}

	// Destroy lock holder so the next renewal will fail
	lockPath := kp.LockPath(kp.IntentPath(testNodes[0], manifest.ID()))
	_, id, err := store.LockHolder(lockPath)
	if err != nil {
		t.Fatalf("Unable to determine lock holder in order to destroy the lock: %s", err)
	}

	err = store.DestroyLockHolder(id)
	if err != nil {
		t.Fatalf("Unable to destroy lock holder")
	}

	// Trigger one more renewal which should cause replication to stop
	select {
	case triggerRenewalCh <- time.Now():
	case <-time.After(1 * time.Second):
		t.Fatalf("Test timed out triggering a lock renewal")
	case <-doneCh:
		t.Fatalf("Replication ended prematurely")
	}

	select {
	case <-time.After(5 * time.Second):
		t.Fatalf("Took too long for replication to end after lock cancellation")
	case <-doneCh:
	}

	// One node should have been updated because active == 1, the other
	// should not have been because health never passed
	realityBytes := server.GetKV(fmt.Sprintf("reality/%s/%s", testNodes[0], testPodId))

	if !bytes.Equal(realityBytes, manifestBytes) {
		t.Fatalf("Expected reality for %s to be %s: was %s", testNodes[0], string(manifestBytes), string(realityBytes))
	}

	realityBytes = server.GetKV(fmt.Sprintf("reality/%s/%s", testNodes[1], testPodId))
	if bytes.Equal(realityBytes, manifestBytes) {
		t.Fatalf("The second node shouldn't have been deployed to but it was")
	}
}
Example #10
func (u update) lockPath(id rcf.ID) string {
	// RUs want to lock the RCs they're mutating, but this lock is separate
	// from the RC lock (which is held by the rc.WatchDesires goroutine), so the
	// key being locked is different
	return kp.LockPath(kp.RCPath(id.String(), "update"))
}
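Given the path helpers sketched after Example #1, the separation described in the comment comes down to two distinct keys. The following hypothetical helper (not part of the source) illustrates them side by side:

// lockKeys is a hypothetical illustration of the two separate locks: the RC
// lock held by the rc.WatchDesires goroutine, and the "update" lock an RU
// takes on the RC it is mutating.
func lockKeys(id rcf.ID) (rcLock, updateLock string) {
	rcLock = kp.LockPath(kp.RCPath(id.String()))
	updateLock = kp.LockPath(kp.RCPath(id.String(), "update"))
	return rcLock, updateLock
}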
Example #11
// Start is a blocking function that monitors Consul for updates. The Farm will
// attempt to claim updates as they appear and, if successful, will start
// goroutines for those updates to do their job. Closing the quit channel will
// cause this function to return, releasing all locks it holds.
//
// Start is not safe for concurrent execution. Do not execute multiple
// concurrent instances of Start.
func (rlf *Farm) Start(quit <-chan struct{}) {
	subQuit := make(chan struct{})
	defer close(subQuit)
	rlWatch, rlErr := rlf.rls.Watch(subQuit)

START_LOOP:
	for {
		select {
		case <-quit:
			rlf.logger.NoFields().Infoln("Halt requested, releasing updates")
			rlf.releaseChildren()
			return
		case session := <-rlf.sessions:
			if session == "" {
				// our session has expired, we must assume our locked children
				// have all been released and that someone else may have
				// claimed them by now
				rlf.logger.NoFields().Errorln("Session expired, releasing updates")
				rlf.lock = nil
				rlf.releaseChildren()
			} else {
				// a new session has been acquired - only happens after an
				// expiration message, so len(children)==0
				rlf.logger.WithField("session", session).Infoln("Acquired new session")
				lock := rlf.kps.NewUnmanagedLock(session, "")
				rlf.lock = &lock
				// TODO: restart the watch so that you get updates right away?
			}
		case err := <-rlErr:
			rlf.logger.WithError(err).Errorln("Could not read consul updates")
		case rlFields := <-rlWatch:
			rlf.logger.WithField("n", len(rlFields)).Debugln("Received update update")
			if rlf.lock == nil {
				// we can't claim new nodes because our session is invalidated.
				// raise an error and ignore this update
				rlf.logger.NoFields().Warnln("Received update update, but do not have session to acquire locks")
				continue
			}

			// track which children were found in the returned set
			foundChildren := make(map[fields.ID]struct{})
			for _, rlField := range rlFields {
				rlLogger := rlf.logger.SubLogger(logrus.Fields{
					"ru": rlField.NewRC,
				})
				rcField, err := rlf.rcs.Get(rlField.NewRC)
				if err != nil {
					rlLogger.WithError(err).Errorln("Could not read new RC")
					continue
				}
				rlLogger = rlLogger.SubLogger(logrus.Fields{
					"pod": rcField.Manifest.ID(),
				})
				if _, ok := rlf.children[rlField.NewRC]; ok {
					// this one is already ours, skip
					rlLogger.NoFields().Debugln("Got update already owned by self")
					foundChildren[rlField.NewRC] = struct{}{}
					continue
				}

				err = rlf.lock.Lock(kp.LockPath(kp.RollPath(rlField.NewRC.String())))
				if _, ok := err.(kp.AlreadyLockedError); ok {
					// someone else must have gotten it first - log and move to
					// the next one
					rlLogger.NoFields().Debugln("Lock on update was denied")
					continue
				} else if err != nil {
					rlLogger.NoFields().Errorln("Got error while locking update - session may be expired")
					// stop processing this update and go back to the select
					// chances are this error is a network problem or session
					// expiry, and all the others in this update would also fail
					continue START_LOOP
				}

				// at this point the ru is ours, time to spin it up
				rlLogger.NoFields().Infoln("Acquired lock on new update, spawning")

				newChild := rlf.factory.New(rlField, rlLogger, *rlf.lock)
				childQuit := make(chan struct{})
				rlf.children[rlField.NewRC] = childRU{ru: newChild, quit: childQuit}
				foundChildren[rlField.NewRC] = struct{}{}

				go func(id fields.ID) {
					if !newChild.Run(childQuit) {
						// returned false, farm must have asked us to quit
						return
					}
					// our lock on this RU won't be released until it's deleted,
					// so if we fail to delete it, we have to retry
					for err := rlf.rls.Delete(id); err != nil; err = rlf.rls.Delete(id) {
						rlLogger.WithError(err).Errorln("Could not delete update")
						time.Sleep(1 * time.Second)
					}
				}(rlField.NewRC) // do not close over rlField, it's a loop variable
			}

			// now remove any children that were not found in the result set
			rlf.logger.NoFields().Debugln("Pruning updates that have disappeared")
			for id := range rlf.children {
				if _, ok := foundChildren[id]; !ok {
					rlf.releaseChild(id)
				}
			}
		}
	}
}
Example #12
// Start is a blocking function that monitors Consul for replication controllers.
// The Farm will attempt to claim replication controllers as they appear and,
// if successful, will start goroutines for those replication controllers to do
// their job. Closing the quit channel will cause this function to return,
// releasing all locks it holds.
//
// Start is not safe for concurrent execution. Do not execute multiple
// concurrent instances of Start.
func (rcf *Farm) Start(quit <-chan struct{}) {
	subQuit := make(chan struct{})
	defer close(subQuit)
	rcWatch, rcErr := rcf.rcStore.WatchNew(subQuit)

START_LOOP:
	for {
		select {
		case <-quit:
			rcf.logger.NoFields().Infoln("Halt requested, releasing replication controllers")
			rcf.releaseChildren()
			return
		case session := <-rcf.sessions:
			if session == "" {
				// our session has expired, we must assume our locked children
				// have all been released and that someone else may have
				// claimed them by now
				rcf.logger.NoFields().Errorln("Session expired, releasing replication controllers")
				rcf.lock = nil
				rcf.releaseChildren()
			} else {
				// a new session has been acquired - only happens after an
				// expiration message, so len(children)==0
				rcf.logger.WithField("session", session).Infoln("Acquired new session")
				lock := rcf.kpStore.NewUnmanagedLock(session, "")
				rcf.lock = &lock
				// TODO: restart the watch so that you get updates right away?
			}
		case err := <-rcErr:
			rcf.logger.WithError(err).Errorln("Could not read consul replication controllers")
		case rcFields := <-rcWatch:
			rcf.logger.WithField("n", len(rcFields)).Debugln("Received replication controller update")
			if rcf.lock == nil {
				// we can't claim new nodes because our session is invalidated.
				// raise an error and ignore this update
				rcf.logger.NoFields().Warnln("Received replication controller update, but do not have session to acquire locks")
				continue
			}

			// track which children were found in the returned set
			foundChildren := make(map[fields.ID]struct{})
			for _, rcField := range rcFields {
				rcLogger := rcf.logger.SubLogger(logrus.Fields{
					"rc":  rcField.ID,
					"pod": rcField.Manifest.ID(),
				})
				if _, ok := rcf.children[rcField.ID]; ok {
					// this one is already ours, skip
					rcLogger.NoFields().Debugln("Got replication controller already owned by self")
					foundChildren[rcField.ID] = struct{}{}
					continue
				}

				err := rcf.lock.Lock(kp.LockPath(kp.RCPath(rcField.ID.String())))
				if _, ok := err.(kp.AlreadyLockedError); ok {
					// someone else must have gotten it first - log and move to
					// the next one
					rcLogger.NoFields().Debugln("Lock on replication controller was denied")
					continue
				} else if err != nil {
					rcLogger.NoFields().Errorln("Got error while locking replication controller - session may be expired")
					// stop processing this update and go back to the select
					// chances are this error is a network problem or session
					// expiry, and all the others in this update would also fail
					continue START_LOOP
				}

				// at this point the rc is ours, time to spin it up
				rcLogger.NoFields().Infoln("Acquired lock on new replication controller, spawning")

				newChild := New(
					rcField,
					rcf.kpStore,
					rcf.rcStore,
					rcf.scheduler,
					rcf.labeler,
					rcLogger,
				)
				childQuit := make(chan struct{})
				rcf.children[rcField.ID] = childRC{rc: newChild, quit: childQuit}
				foundChildren[rcField.ID] = struct{}{}

				go func() {
					// disabled-ness is handled in watchdesires
					for err := range newChild.WatchDesires(childQuit) {
						rcLogger.WithError(err).Errorln("Got error in replication controller loop")
					}
				}()
			}

			// now remove any children that were not found in the result set
			rcf.logger.NoFields().Debugln("Pruning replication controllers that have disappeared")
			for id := range rcf.children {
				if _, ok := foundChildren[id]; !ok {
					rcf.releaseChild(id)
				}
			}
		}
	}
}
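releaseChildren is called from both Start loops but does not appear among these examples; given releaseChild (Examples #5 and #6), it is presumably just a loop over the farm's children map. A hedged sketch:

// releaseChildren is a hypothetical sketch, assuming the method simply
// releases every child the farm currently tracks; the real implementation
// may differ.
func (rcf *Farm) releaseChildren() {
	for id := range rcf.children {
		rcf.releaseChild(id)
	}
}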