func TestInitializeReplication(t *testing.T) {
	replicator, store, server := testReplicatorAndServer(t)
	defer server.Stop()

	// Make the kv store look like preparer is installed on test nodes
	setupPreparers(server)

	// err being nil ensures that checking preparers and locking the hosts
	// succeeded
	replication, _, err := replicator.InitializeReplication(false)
	if err != nil {
		t.Fatalf("Error initializing replication: %s", err)
	}
	defer replication.Cancel()

	// Confirm that the appropriate kv keys have been locked
	for _, node := range testNodes {
		lockPath := kp.LockPath(kp.IntentPath(node, testPodId))
		lockHolder, _, err := store.LockHolder(lockPath)
		if err != nil {
			t.Fatalf("Unexpected error checking for lock holder: %s", err)
		}

		if lockHolder != testLockMessage {
			t.Errorf("Expected lock holder for key '%s' to be '%s', was '%s'", lockPath, testLockMessage, lockHolder)
		}
	}
}
func TestInitializeReplicationCanOverrideLocks(t *testing.T) {
	replicator, store, server := testReplicatorAndServer(t)
	defer server.Stop()

	// This makes it look like the preparers are installed on the hosts
	// we're deploying to
	for _, node := range testNodes {
		key := fmt.Sprintf("reality/%s/p2-preparer", node)
		server.SetKV(key, []byte(testPreparerManifest))
	}

	// Claim a lock on a host and verify that InitializeReplication still
	// succeeds when overrideLock is true
	lock, _, err := store.NewLock("competing lock", nil)
	if err != nil {
		t.Fatalf("Unable to set up competing lock: %s", err)
	}
	defer lock.Destroy()

	lockPath := kp.LockPath(kp.IntentPath(testNodes[0], testPodId))
	err = lock.Lock(lockPath)
	if err != nil {
		t.Fatalf("Unable to set up competing lock: %s", err)
	}

	replication, _, err := replicator.InitializeReplication(true)
	if err != nil {
		t.Fatalf("Expected InitializeReplication to override competing lock, but error occurred: %s", err)
	}
	replication.Cancel()
}
// Attempts to claim a lock on every host being deployed to.
// If overrideLock is true, destroys any session holding any of the keys we
// wish to lock.
func (r Replicator) LockHosts(lock kp.Lock, overrideLock bool) error {
	for _, host := range r.Nodes {
		lockPath := kp.LockPath(host, r.Manifest.ID())
		err := r.lock(lock, lockPath, overrideLock)
		if err != nil {
			return err
		}
	}
	return nil
}
func (s consulStore) Lock(id rcf.ID, session string) (bool, error) {
	key := kp.LockPath(kp.RollPath(id.String()))
	success, _, err := s.kv.Acquire(&api.KVPair{
		Key:     key,
		Value:   []byte(session),
		Session: session,
	}, nil)
	if err != nil {
		return false, consulutil.NewKVError("acquire", key, err)
	}
	return success, nil
}
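// Consul's KV API pairs Acquire with Release, which gives up a session's claim
// on a key. The store above doesn't show a release method; the following is a
// minimal sketch of what one might look like (the method name and its presence
// on consulStore are assumptions, included only for illustration):
func (s consulStore) Unlock(id rcf.ID, session string) (bool, error) {
	key := kp.LockPath(kp.RollPath(id.String()))
	// Release only succeeds if the key is currently held by this session.
	success, _, err := s.kv.Release(&api.KVPair{
		Key:     key,
		Value:   []byte(session),
		Session: session,
	}, nil)
	if err != nil {
		return false, consulutil.NewKVError("release", key, err)
	}
	return success, nil
}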
// close one child
func (rlf *Farm) releaseChild(id fields.ID) {
	rlf.logger.WithField("ru", id).Infoln("Releasing update")
	close(rlf.children[id].quit)
	delete(rlf.children, id)

	// if our lock is active, attempt to gracefully release it
	if rlf.lock != nil {
		err := rlf.lock.Unlock(kp.LockPath(kp.RollPath(id.String())))
		if err != nil {
			rlf.logger.WithField("ru", id).Warnln("Could not release update lock")
		}
	}
}
// close one child
func (rcf *Farm) releaseChild(id fields.ID) {
	rcf.logger.WithField("rc", id).Infoln("Releasing replication controller")
	close(rcf.children[id].quit)
	delete(rcf.children, id)

	// if our lock is active, attempt to gracefully release it on this rc
	if rcf.lock != nil {
		err := rcf.lock.Unlock(kp.LockPath(kp.RCPath(id.String())))
		if err != nil {
			rcf.logger.WithField("rc", id).Warnln("Could not release replication controller lock")
		}
	}
}
// Attempts to claim a lock on every host being deployed to.
// If overrideLock is true, destroys any session holding any of the keys we
// wish to lock.
func (r replication) lockHosts(overrideLock bool, lockMessage string) error {
	lock, renewalErrCh, err := r.store.NewLock(lockMessage, nil)
	if err != nil {
		return err
	}

	for _, host := range r.nodes {
		lockPath := kp.LockPath(kp.IntentPath(host, r.manifest.ID()))
		err := r.lock(lock, lockPath, overrideLock)
		if err != nil {
			return err
		}
	}
	go r.handleRenewalErrors(lock, renewalErrCh)
	return nil
}
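// The lock helper called above isn't shown in this section. The tests here
// exercise both its paths (override and non-override), so a minimal sketch is
// included below for orientation. It is an assumption, not the actual
// implementation: it presumes lock.Lock returns kp.AlreadyLockedError on
// contention and that the store exposes LockHolder/DestroyLockHolder as used
// in TestStopsIfLockDestroyed.
func (r replication) lock(lock kp.Lock, lockPath string, overrideLock bool) error {
	err := lock.Lock(lockPath)
	if _, ok := err.(kp.AlreadyLockedError); ok {
		holder, id, holderErr := r.store.LockHolder(lockPath)
		if holderErr != nil {
			return fmt.Errorf("lock on %s is already held, and the holder could not be determined: %s", lockPath, holderErr)
		}
		if !overrideLock {
			return fmt.Errorf("lock on %s is already held by %q", lockPath, holder)
		}
		// Destroy the competing session, then try once more without
		// overriding again so we can't loop forever.
		if destroyErr := r.store.DestroyLockHolder(id); destroyErr != nil {
			return fmt.Errorf("unable to destroy the holder (%q) of the lock on %s: %s", holder, lockPath, destroyErr)
		}
		return r.lock(lock, lockPath, false)
	}
	return err
}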
func TestInitializeReplicationFailsIfLockExists(t *testing.T) {
	replicator, store, server := testReplicatorAndServer(t)
	defer server.Stop()

	// This makes it look like the preparers are installed on the hosts
	// we're deploying to
	for _, node := range testNodes {
		key := fmt.Sprintf("reality/%s/p2-preparer", node)
		server.SetKV(key, []byte(testPreparerManifest))
	}

	// Claim a lock on a host and verify that InitializeReplication fails
	lock, _, err := store.NewLock("competing lock", nil)
	if err != nil {
		t.Fatalf("Unable to set up competing lock: %s", err)
	}
	defer lock.Destroy()

	lockPath := kp.LockPath(kp.IntentPath(testNodes[0], testPodId))
	err = lock.Lock(lockPath)
	if err != nil {
		t.Fatalf("Unable to set up competing lock: %s", err)
	}

	_, _, err = replicator.InitializeReplication(false)
	if err == nil {
		t.Fatalf("Expected error due to competing lock, but no error occurred")
	}

	matched, err := regexp.MatchString("already held", err.Error())
	if err != nil {
		t.Fatalf("Unable to compare error message to expected string")
	}

	if !matched {
		t.Fatalf("Expected error message to be related to a lock already being held, but was %s", err.Error())
	}
}
func TestStopsIfLockDestroyed(t *testing.T) {
	active := 1
	store, server := makeStore(t)
	defer server.Stop()

	healthChecker, resultsCh := channelHealthChecker(testNodes, t)
	threshold := health.Passing
	manifest := basicManifest()

	// Make the kv store look like preparer is installed on test nodes
	setupPreparers(server)

	// Create the replication manually for this test so we can trigger lock
	// renewals on a faster interval (to keep test short)
	errCh := make(chan error)
	replication := &replication{
		active:                 active,
		nodes:                  testNodes,
		store:                  store,
		manifest:               manifest,
		health:                 healthChecker,
		threshold:              threshold,
		logger:                 basicLogger(),
		errCh:                  errCh,
		replicationCancelledCh: make(chan struct{}),
		replicationDoneCh:      make(chan struct{}),
		quitCh:                 make(chan struct{}),
	}

	triggerRenewalCh := make(chan time.Time)
	lock, renewalErrCh, err := store.NewLock(testLockMessage, triggerRenewalCh)
	if err != nil {
		t.Fatalf("Unable to create initial replication lock: %s", err)
	}

	for _, host := range testNodes {
		lockPath := kp.LockPath(kp.IntentPath(host, manifest.ID()))
		err := replication.lock(lock, lockPath, false)
		if err != nil {
			t.Fatalf("Unable to perform initial replication lock: %s", err)
		}
	}

	go replication.handleRenewalErrors(lock, renewalErrCh)

	doneCh := make(chan struct{})

	go func() {
		select {
		case err := <-errCh:
			if err == nil || !IsFatalError(err) {
				t.Fatalf("Should have seen a fatal lock renewal error before replication finished")
			}
		case <-time.After(5 * time.Second):
			t.Fatalf("Did not get expected lock renewal error within timeout")
		}
	}()

	imitatePreparers(server, doneCh)

	go func() {
		replication.Enact()
		close(doneCh)
	}()

	// Report healthy for one node, and unhealthy for the rest so
	// replication cannot finish without interruption
	for i, node := range testNodes {
		if i == 0 {
			go func(node string) {
				for {
					select {
					case resultsCh[node] <- health.Result{
						ID:     testPodId,
						Status: health.Passing,
					}:
					case <-doneCh:
						return
					}
					time.Sleep(500 * time.Millisecond)
				}
			}(node)
		} else {
			go func(node string) {
				for {
					select {
					case resultsCh[node] <- health.Result{
						ID:     testPodId,
						Status: health.Critical,
					}:
					case <-doneCh:
						return
					}
					time.Sleep(500 * time.Millisecond)
				}
			}(node)
		}
	}

	// Wait for the first node to be deployed
	firstNodeDeployed := make(chan struct{})
	manifestBytes, err := manifest.Marshal()
	if err != nil {
		t.Fatalf("Unable to get bytes from manifest: %s", err)
	}
	go func() {
		realityKey := fmt.Sprintf("reality/%s/%s", testNodes[0], testPodId)
		for range time.Tick(10 * time.Millisecond) {
			if bytes.Equal(server.GetKV(realityKey), manifestBytes) {
				close(firstNodeDeployed)
				return
			}
		}
	}()

	select {
	case <-time.After(5 * time.Second):
		t.Fatalf("Took too long for first node to be deployed")
	case <-firstNodeDeployed:
	}

	// Trigger some lock renewals, confirm that replication is still going (doneCh not closed)
	for i := 0; i < 3; i++ {
		select {
		case triggerRenewalCh <- time.Now():
		case <-doneCh:
			t.Fatalf("Replication ended prematurely (lock couldn't be renewed but wasn't destroyed yet)")
		case <-time.After(1 * time.Second):
			t.Fatalf("Test timed out triggering a lock renewal")
		}
	}

	// Destroy lock holder so the next renewal will fail
	lockPath := kp.LockPath(kp.IntentPath(testNodes[0], manifest.ID()))
	_, id, err := store.LockHolder(lockPath)
	if err != nil {
		t.Fatalf("Unable to determine lock holder in order to destroy the lock: %s", err)
	}

	err = store.DestroyLockHolder(id)
	if err != nil {
		t.Fatalf("Unable to destroy lock holder")
	}

	// Trigger one more renewal which should cause replication to stop
	select {
	case triggerRenewalCh <- time.Now():
	case <-time.After(1 * time.Second):
		t.Fatalf("Test timed out triggering a lock renewal")
	case <-doneCh:
		t.Fatalf("Replication ended prematurely")
	}

	select {
	case <-time.After(5 * time.Second):
		t.Fatalf("Took too long for replication to end after lock cancellation")
	case <-doneCh:
	}

	// One node should have been updated because active == 1, the other
	// should not have been because health never passed
	realityBytes := server.GetKV(fmt.Sprintf("reality/%s/%s", testNodes[0], testPodId))
	if !bytes.Equal(realityBytes, manifestBytes) {
		t.Fatalf("Expected reality for %s to be %s: was %s", testNodes[0], string(manifestBytes), string(realityBytes))
	}

	realityBytes = server.GetKV(fmt.Sprintf("reality/%s/%s", testNodes[1], testPodId))
	if bytes.Equal(realityBytes, manifestBytes) {
		t.Fatalf("The second node shouldn't have been deployed to but it was")
	}
}
func (u update) lockPath(id rcf.ID) string {
	// RUs want to lock the RCs they're mutating, but this lock is separate
	// from the RC lock (which is held by the rc.WatchDesires goroutine), so the
	// key being locked is different
	return kp.LockPath(kp.RCPath(id.String(), "update"))
}
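// For comparison, the two lock keys in play for a single replication
// controller (a sketch; the exact rendered paths depend on kp.LockPath and
// kp.RCPath):
//
//	rcLock := kp.LockPath(kp.RCPath(id.String()))           // held by the RC farm (see its Start below)
//	ruLock := kp.LockPath(kp.RCPath(id.String(), "update")) // held by a rolling update via lockPath above
//
// Because the keys differ, the RC farm and a rolling update can each hold
// their own lock against the same RC without contending with one another.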
// Start is a blocking function that monitors Consul for updates. The Farm will
// attempt to claim updates as they appear and, if successful, will start
// goroutines for those updates to do their job. Closing the quit channel will
// cause this function to return, releasing all locks it holds.
//
// Start is not safe for concurrent execution. Do not execute multiple
// concurrent instances of Start.
func (rlf *Farm) Start(quit <-chan struct{}) {
	subQuit := make(chan struct{})
	defer close(subQuit)
	rlWatch, rlErr := rlf.rls.Watch(subQuit)

START_LOOP:
	for {
		select {
		case <-quit:
			rlf.logger.NoFields().Infoln("Halt requested, releasing updates")
			rlf.releaseChildren()
			return
		case session := <-rlf.sessions:
			if session == "" {
				// our session has expired, we must assume our locked children
				// have all been released and that someone else may have
				// claimed them by now
				rlf.logger.NoFields().Errorln("Session expired, releasing updates")
				rlf.lock = nil
				rlf.releaseChildren()
			} else {
				// a new session has been acquired - only happens after an
				// expiration message, so len(children)==0
				rlf.logger.WithField("session", session).Infoln("Acquired new session")
				lock := rlf.kps.NewUnmanagedLock(session, "")
				rlf.lock = &lock
				// TODO: restart the watch so that you get updates right away?
			}
		case err := <-rlErr:
			rlf.logger.WithError(err).Errorln("Could not read consul updates")
		case rlFields := <-rlWatch:
			rlf.logger.WithField("n", len(rlFields)).Debugln("Received update update")
			if rlf.lock == nil {
				// we can't claim new nodes because our session is invalidated.
				// raise an error and ignore this update
				rlf.logger.NoFields().Warnln("Received update update, but do not have session to acquire locks")
				continue
			}

			// track which children were found in the returned set
			foundChildren := make(map[fields.ID]struct{})
			for _, rlField := range rlFields {
				rlLogger := rlf.logger.SubLogger(logrus.Fields{
					"ru": rlField.NewRC,
				})
				rcField, err := rlf.rcs.Get(rlField.NewRC)
				if err != nil {
					rlLogger.WithError(err).Errorln("Could not read new RC")
					continue
				}

				rlLogger = rlLogger.SubLogger(logrus.Fields{
					"pod": rcField.Manifest.ID(),
				})
				if _, ok := rlf.children[rlField.NewRC]; ok {
					// this one is already ours, skip
					rlLogger.NoFields().Debugln("Got update already owned by self")
					foundChildren[rlField.NewRC] = struct{}{}
					continue
				}

				err = rlf.lock.Lock(kp.LockPath(kp.RollPath(rlField.NewRC.String())))
				if _, ok := err.(kp.AlreadyLockedError); ok {
					// someone else must have gotten it first - log and move to
					// the next one
					rlLogger.NoFields().Debugln("Lock on update was denied")
					continue
				} else if err != nil {
					rlLogger.NoFields().Errorln("Got error while locking update - session may be expired")
					// stop processing this update and go back to the select
					// chances are this error is a network problem or session
					// expiry, and all the others in this update would also fail
					continue START_LOOP
				}

				// at this point the ru is ours, time to spin it up
				rlLogger.NoFields().Infoln("Acquired lock on new update, spawning")
				newChild := rlf.factory.New(rlField, rlLogger, *rlf.lock)
				childQuit := make(chan struct{})
				rlf.children[rlField.NewRC] = childRU{ru: newChild, quit: childQuit}
				foundChildren[rlField.NewRC] = struct{}{}

				go func(id fields.ID) {
					if !newChild.Run(childQuit) {
						// returned false, farm must have asked us to quit
						return
					}
					// our lock on this RU won't be released until it's deleted,
					// so if we fail to delete it, we have to retry
					for err := rlf.rls.Delete(id); err != nil; err = rlf.rls.Delete(id) {
						rlLogger.WithError(err).Errorln("Could not delete update")
						time.Sleep(1 * time.Second)
					}
				}(rlField.NewRC) // do not close over rlField, it's a loop variable
			}

			// now remove any children that were not found in the result set
			rlf.logger.NoFields().Debugln("Pruning updates that have disappeared")
			for id := range rlf.children {
				if _, ok := foundChildren[id]; !ok {
					rlf.releaseChild(id)
				}
			}
		}
	}
}
// Start is a blocking function that monitors Consul for replication controllers.
// The Farm will attempt to claim replication controllers as they appear and,
// if successful, will start goroutines for those replication controllers to do
// their job. Closing the quit channel will cause this function to return,
// releasing all locks it holds.
//
// Start is not safe for concurrent execution. Do not execute multiple
// concurrent instances of Start.
func (rcf *Farm) Start(quit <-chan struct{}) {
	subQuit := make(chan struct{})
	defer close(subQuit)
	rcWatch, rcErr := rcf.rcStore.WatchNew(subQuit)

START_LOOP:
	for {
		select {
		case <-quit:
			rcf.logger.NoFields().Infoln("Halt requested, releasing replication controllers")
			rcf.releaseChildren()
			return
		case session := <-rcf.sessions:
			if session == "" {
				// our session has expired, we must assume our locked children
				// have all been released and that someone else may have
				// claimed them by now
				rcf.logger.NoFields().Errorln("Session expired, releasing replication controllers")
				rcf.lock = nil
				rcf.releaseChildren()
			} else {
				// a new session has been acquired - only happens after an
				// expiration message, so len(children)==0
				rcf.logger.WithField("session", session).Infoln("Acquired new session")
				lock := rcf.kpStore.NewUnmanagedLock(session, "")
				rcf.lock = &lock
				// TODO: restart the watch so that you get updates right away?
			}
		case err := <-rcErr:
			rcf.logger.WithError(err).Errorln("Could not read consul replication controllers")
		case rcFields := <-rcWatch:
			rcf.logger.WithField("n", len(rcFields)).Debugln("Received replication controller update")
			if rcf.lock == nil {
				// we can't claim new nodes because our session is invalidated.
				// raise an error and ignore this update
				rcf.logger.NoFields().Warnln("Received replication controller update, but do not have session to acquire locks")
				continue
			}

			// track which children were found in the returned set
			foundChildren := make(map[fields.ID]struct{})
			for _, rcField := range rcFields {
				rcLogger := rcf.logger.SubLogger(logrus.Fields{
					"rc":  rcField.ID,
					"pod": rcField.Manifest.ID(),
				})
				if _, ok := rcf.children[rcField.ID]; ok {
					// this one is already ours, skip
					rcLogger.NoFields().Debugln("Got replication controller already owned by self")
					foundChildren[rcField.ID] = struct{}{}
					continue
				}

				err := rcf.lock.Lock(kp.LockPath(kp.RCPath(rcField.ID.String())))
				if _, ok := err.(kp.AlreadyLockedError); ok {
					// someone else must have gotten it first - log and move to
					// the next one
					rcLogger.NoFields().Debugln("Lock on replication controller was denied")
					continue
				} else if err != nil {
					rcLogger.NoFields().Errorln("Got error while locking replication controller - session may be expired")
					// stop processing this update and go back to the select
					// chances are this error is a network problem or session
					// expiry, and all the others in this update would also fail
					continue START_LOOP
				}

				// at this point the rc is ours, time to spin it up
				rcLogger.NoFields().Infoln("Acquired lock on new replication controller, spawning")
				newChild := New(
					rcField,
					rcf.kpStore,
					rcf.rcStore,
					rcf.scheduler,
					rcf.labeler,
					rcLogger,
				)
				childQuit := make(chan struct{})
				rcf.children[rcField.ID] = childRC{rc: newChild, quit: childQuit}
				foundChildren[rcField.ID] = struct{}{}

				go func() {
					// disabled-ness is handled in watchdesires
					for err := range newChild.WatchDesires(childQuit) {
						rcLogger.WithError(err).Errorln("Got error in replication controller loop")
					}
				}()
			}

			// now remove any children that were not found in the result set
			rcf.logger.NoFields().Debugln("Pruning replication controllers that have disappeared")
			for id := range rcf.children {
				if _, ok := foundChildren[id]; !ok {
					rcf.releaseChild(id)
				}
			}
		}
	}
}