// waitForFleetdSocket returns if /var/run/fleet.sock exists, periodically // checking for states. func waitForFleetdSocket(cluster platform.Cluster, m0 platform.Member) (err error) { _, err = util.WaitForState( func() bool { stdout, _, _ := cluster.MemberCommand(m0, "test -S /var/run/fleet.sock && echo 1") if strings.TrimSpace(stdout) == "" { fmt.Errorf("Fleetd is not fully started, retrying...") return false } return true }, ) if err != nil { return fmt.Errorf("Fleetd socket not found: %v", err) } return nil }
// WaitForNUnitFiles runs fleetctl list-unit-files to verify the actual number of units // matched with the given expected number. It periodically runs list-unit-files // waiting until list-unit-files actually shows the expected units. func (nc *nspawnCluster) WaitForNUnitFiles(m Member, expectedUnits int) (map[string][]util.UnitFileState, error) { var nUnits int retStates := make(map[string][]util.UnitFileState) checkListUnitFiles := func() bool { outListUnitFiles, _, err := nc.Fleetctl(m, "list-unit-files", "--no-legend", "--full", "--fields", "unit,dstate,state") if err != nil { return false } // NOTE: There's no need to check if outListUnits is expected to be empty, // because ParseUnitFileStates() implicitly filters out such cases. // However, in case of ParseUnitFileStates() going away, we should not // forget about such special cases. units := strings.Split(strings.TrimSpace(outListUnitFiles), "\n") allStates := util.ParseUnitFileStates(units) nUnits = len(allStates) if nUnits != expectedUnits { // retry until number of units matched return false } for _, state := range allStates { name := state.Name if _, ok := retStates[name]; !ok { retStates[name] = []util.UnitFileState{} } retStates[name] = append(retStates[name], state) } return true } timeout, err := util.WaitForState(checkListUnitFiles) if err != nil { return nil, fmt.Errorf("failed to find %d units within %v (last found: %d)", expectedUnits, timeout, nUnits) } return retStates, nil }
// waitForReloadConfig returns if a message "Reloading configuration" exists // in the journal, periodically checking for the journal up to the timeout. func waitForReloadConfig(cluster platform.Cluster, m0 platform.Member) (err error) { _, err = util.WaitForState( func() bool { // NOTE: journalctl should run just simply like "journalctl -u fleet", // without being piped with grep. Doing // "journalctl -u fleet | grep \"Reloading configuration\"" is racy // in a subtle way, so that it sometimes fails only on semaphoreci. // - dpark 20160408 stdout, _, _ := cluster.MemberCommand(m0, "sudo", "journalctl --priority=info _PID=$(pidof fleetd)") journalfleet := strings.TrimSpace(stdout) if !strings.Contains(journalfleet, "Reloading configuration") { fmt.Errorf("Fleetd is not fully reconfigured, retrying... entire fleet journal:\n%v", journalfleet) return false } return true }, ) if err != nil { return fmt.Errorf("Reloading configuration log not found: %v", err) } return nil }
func (nc *nspawnCluster) WaitForNActiveUnits(m Member, count int) (map[string][]util.UnitState, error) { var nactive int states := make(map[string][]util.UnitState) timeout, err := util.WaitForState( func() bool { stdout, _, err := nc.Fleetctl(m, "list-units", "--no-legend", "--full", "--fields", "unit,active,machine") stdout = strings.TrimSpace(stdout) if err != nil { return false } lines := strings.Split(stdout, "\n") allStates := util.ParseUnitStates(lines) active := util.FilterActiveUnits(allStates) nactive = len(active) if nactive != count { return false } for _, state := range active { name := state.Name if _, ok := states[name]; !ok { states[name] = []util.UnitState{} } states[name] = append(states[name], state) } return true }, ) if err != nil { return nil, fmt.Errorf("failed to find %d active units within %v (last found: %d)", count, timeout, nactive) } return states, nil }
// TestScheduleOneWayConflict starts a unit declaring a conflict with a
// yet-to-be-scheduled unit, verifies only the conflicting unit becomes
// active, then destroys it and verifies the other unit gets scheduled
// in its place.
func TestScheduleOneWayConflict(t *testing.T) {
	cluster, err := platform.NewNspawnCluster("smoke")
	if err != nil {
		t.Fatal(err)
	}
	defer cluster.Destroy(t)

	// Start with a simple three-node cluster
	// NOTE(review): only 1 member is created below despite the comment
	// above mentioning three nodes — confirm which is intended.
	members, err := platform.CreateNClusterMembers(cluster, 1)
	if err != nil {
		t.Fatal(err)
	}
	m0 := members[0]
	if _, err := cluster.WaitForNMachines(m0, 1); err != nil {
		t.Fatal(err)
	}

	// Start a unit that conflicts with a yet-to-be-scheduled unit
	name := "fixtures/units/conflicts-with-hello.service"
	if stdout, stderr, err := cluster.Fleetctl(m0, "start", "--no-block", name); err != nil {
		t.Fatalf("Failed starting unit %s: \nstdout: %s\nstderr: %s\nerr: %v", name, stdout, stderr, err)
	}

	active, err := cluster.WaitForNActiveUnits(m0, 1)
	if err != nil {
		t.Fatal(err)
	}
	states, err := util.ActiveToSingleStates(active)
	if err != nil {
		t.Fatal(err)
	}

	// Start a unit that has not defined conflicts
	name = "fixtures/units/hello.service"
	if stdout, stderr, err := cluster.Fleetctl(m0, "start", "--no-block", name); err != nil {
		t.Fatalf("Failed starting unit %s: \nstdout: %s\nstderr: %s\nerr: %v", name, stdout, stderr, err)
	}

	// Both units should show up, but only conflicts-with-hello.service
	// should report ACTIVE
	stdout, _, err := cluster.Fleetctl(m0, "list-unit-files", "--no-legend")
	if err != nil {
		t.Fatalf("Failed to run list-unit-files: %v", err)
	}
	units := strings.Split(strings.TrimSpace(stdout), "\n")
	if len(units) != 2 {
		t.Fatalf("Did not find two units in cluster: \n%s", stdout)
	}
	active, err = cluster.WaitForNActiveUnits(m0, 1)
	if err != nil {
		t.Fatal(err)
	}
	states, err = util.ActiveToSingleStates(active)
	if err != nil {
		t.Fatal(err)
	}
	for unit := range states {
		if unit != "conflicts-with-hello.service" {
			t.Error("Incorrect unit started:", unit)
		}
	}

	// Destroying the conflicting unit should allow the other to start
	name = "conflicts-with-hello.service"
	if _, _, err := cluster.Fleetctl(m0, "destroy", name); err != nil {
		t.Fatalf("Failed destroying %s", name)
	}

	// NOTE: we need to sleep here shortly to avoid occasional errors of
	// conflicts-with-hello.service being rescheduled even after being destroyed.
	// In that case, the conflicts unit remains active, while the original
	// hello.service remains inactive. Then the test TestScheduleOneWayConflict
	// fails at the end with a message "Incorrect unit started".
	// This error seems to occur frequently when enable_grpc turned on.
	// - dpark 20160615
	time.Sleep(1 * time.Second)

	// Wait for the destroyed unit to actually disappear
	timeout, err := util.WaitForState(
		func() bool {
			stdout, _, err := cluster.Fleetctl(m0, "list-units", "--no-legend", "--full", "--fields", "unit,active,machine")
			if err != nil {
				return false
			}
			lines := strings.Split(strings.TrimSpace(stdout), "\n")
			states := util.ParseUnitStates(lines)
			// Keep polling as long as the destroyed unit still shows up.
			for _, state := range states {
				if state.Name == name {
					return false
				}
			}
			return true
		},
	)
	if err != nil {
		t.Fatalf("Destroyed unit %s not gone within %v", name, timeout)
	}

	// The previously-conflicted unit should now be the single active unit.
	active, err = cluster.WaitForNActiveUnits(m0, 1)
	if err != nil {
		t.Fatal(err)
	}
	states, err = util.ActiveToSingleStates(active)
	if err != nil {
		t.Fatal(err)
	}
	for unit := range states {
		if unit != "hello.service" {
			t.Error("Incorrect unit started:", unit)
		}
	}
}
// Check that units states do not change on loss of connectivity to etcd.
//
// Note: this only tests the behaviour of the disconnected node;
// but not the reaction of the rest of the cluster,
// nor reconciliation after connectivity is restored.
func TestSingleNodeConnectivityLoss(t *testing.T) {
	cluster, err := platform.NewNspawnCluster("smoke")
	if err != nil {
		t.Fatal(err)
	}
	defer cluster.Destroy(t)

	m0, err := cluster.CreateMember()
	if err != nil {
		t.Fatal(err)
	}
	_, err = cluster.WaitForNMachines(m0, 1)
	if err != nil {
		t.Fatal(err)
	}

	// Set up some units.
	// Maps each fleet unit-file state name to the fleetctl command that
	// produces it and the fleet/systemd states expected afterwards.
	stateMapping := map[string]struct {
		command          []string
		runState         string
		systemdFileState string
		systemdState     []string
	}{
		"inactive": {[]string{"submit"}, "", "", nil},
		"loaded":   {[]string{"load", "--no-block"}, "inactive", "linked-runtime", nil},
		"launched": {[]string{"start", "--no-block"}, "active", "linked-runtime", []string{"loaded", "active", "running"}},
	}
	createUnits := map[string][]string{}
	expectedUnitFiles := map[string]string{}
	expectedUnitStates := map[string]string{}
	expectedSystemdFiles := map[string]string{}
	expectedSystemdStates := map[string][]string{}
	// Build one unit per (service template, state) pair, e.g. "single@loaded.service".
	for _, service := range []string{"single", "global"} {
		for state, mapping := range stateMapping {
			unitName := fmt.Sprintf("%s@%s.service", service, state)
			unitPath := fmt.Sprintf("fixtures/units/%s", unitName)
			// NOTE(review): append on the shared mapping.command slice relies
			// on the literal slices having cap == len (so append reallocates);
			// safe as written, but fragile if the literals change.
			createUnits[unitName] = append(mapping.command, unitPath)
			expectedUnitFiles[unitName] = state
			if mapping.runState != "" {
				expectedUnitStates[unitName] = mapping.runState
			}
			if mapping.systemdFileState != "" {
				expectedSystemdFiles[unitName] = mapping.systemdFileState
			}
			if mapping.systemdState != nil {
				expectedSystemdStates[unitName] = mapping.systemdState
			}
		}
	}
	for name, command := range createUnits {
		stdout, stderr, err := cluster.Fleetctl(m0, command...)
		if err != nil {
			t.Fatalf("Failed creating unit %s: %v\nstdout: %s\nstderr:%s", name, err, stdout, stderr)
		}
	}

	// checkExpectedStates compares fleet's view (unit files, then unit
	// states) against the expectations built above, returning the first
	// mismatching expected/actual pair for diagnostics.
	checkExpectedStates := func() (isExpected bool, expected, actual map[string]string) {
		// First check unit files.
		// These shouldn't change at all after initial submit -- but better safe than sorry...
		stdout, _, err := cluster.Fleetctl(m0, "list-unit-files", "--no-legend", "--full", "--fields", "unit,dstate")
		if err != nil {
			t.Errorf("Failed listing unit files: %v", err)
		}
		stdout = strings.TrimSpace(stdout)
		lines := strings.Split(stdout, "\n")
		actualUnitFiles := map[string]string{}
		if stdout != "" {
			for _, line := range lines {
				cols := strings.Fields(line)
				actualUnitFiles[cols[0]] = cols[1]
			}
		}
		if !reflect.DeepEqual(actualUnitFiles, expectedUnitFiles) {
			return false, expectedUnitFiles, actualUnitFiles
		}

		// Now check the actual unit states.
		stdout, _, err = cluster.Fleetctl(m0, "list-units", "--no-legend", "--full", "--fields", "unit,active")
		if err != nil {
			t.Errorf("Failed listing units: %v", err)
		}
		stdout = strings.TrimSpace(stdout)
		lines = strings.Split(stdout, "\n")
		actualUnitStates := map[string]string{}
		if stdout != "" {
			for _, line := range lines {
				cols := strings.Fields(line)
				actualUnitStates[cols[0]] = cols[1]
			}
		}
		return reflect.DeepEqual(actualUnitStates, expectedUnitStates), expectedUnitStates, actualUnitStates
	}

	// Wait for initial state being reached.
	timeout, err := util.WaitForState(
		func() bool { isExpected, _, _ := checkExpectedStates(); return isExpected },
	)
	if err != nil {
		t.Fatalf("Failed to reach expected initial state within %v.", timeout)
	}

	// Cut connection to etcd.
	//
	// We use REJECT here, so fleet knows immediately that it's disconnected, rather than waiting for a timeout.
	if _, err = cluster.MemberCommand(m0, "sudo", "iptables", "-I", "OUTPUT", "-p", "tcp", "-m", "multiport", "--dports=2379,4001", "-j", "REJECT"); err != nil {
		t.Fatal(err)
	}

	// Wait long enough to be reasonably confident that no more state changes will happen.
	ttl, _ := time.ParseDuration(util.FleetTTL)
	agentReconcileInterval := 5 * time.Second
	slack := 2 * time.Second
	time.Sleep(ttl + agentReconcileInterval + slack)

	// Check unit state after connection loss.
	//
	// Note: we cannot use fleetctl to check the state here,
	// as fleet is not available to give us this information...
	// We have to go deeper, and try to obtain the information from systemd directly.
	actualSystemdFiles := map[string]string{}
	var stdout string
	for name, _ := range expectedSystemdFiles {
		// Note: the := here deliberately shadows the outer stdout.
		stdout, _ := cluster.MemberCommand(m0, "systemctl", "is-enabled", name)
		// do not check for error, as systemctl is-enabled returns exit status 1 for linked-runtime.
		stdout = strings.TrimSpace(stdout)
		if stdout == "" {
			continue
		}
		actualSystemdFiles[name] = strings.Split(stdout, "\n")[0]
	}
	if !reflect.DeepEqual(actualSystemdFiles, expectedSystemdFiles) {
		t.Fatalf("Units files not in expected state after losing connectivity.\nExpected: %v\nActual: %v", expectedSystemdFiles, actualSystemdFiles)
	}
	stdout, err = cluster.MemberCommand(m0, "systemctl", "list-units", "-t", "service", "--no-legend", "single@*.service", "global@*.service")
	if err != nil {
		t.Fatalf("Failed to retrieve systemd unit states: %v", err)
	}
	stdout = strings.TrimSpace(stdout)
	actualSystemdStates := map[string][]string{}
	if stdout != "" {
		for _, line := range strings.Split(stdout, "\n") {
			fields := strings.Fields(line)
			// Columns 1-3 are the load/active/sub states of the unit.
			actualSystemdStates[fields[0]] = fields[1:4]
		}
	}
	if !reflect.DeepEqual(actualSystemdStates, expectedSystemdStates) {
		t.Fatalf("Units not in expected state after losing connectivity.\nExpected: %v\nActual: %v", expectedSystemdStates, actualSystemdStates)
	}

	// Restore etcd connection.
	if _, err = cluster.MemberCommand(m0, "sudo", "iptables", "-D", "OUTPUT", "-p", "tcp", "-m", "multiport", "--dports=2379,4001", "-j", "REJECT"); err != nil {
		t.Fatal(err)
	}

	// Again, wait long enough to be reasonably confident that no more state changes will happen.
	//
	// Here this should cover the time for fleet to realise connectivity is back,
	// and for the Agent to complete the second run after reconnection.
	//
	// (Unlike for the first run immediately after connectivity is back, by the time of the second run,
	// Engine leadership and Engine reconciliation should have been sorted out,
	// and everything should be back to normal...)
	time.Sleep(ttl + agentReconcileInterval + slack)

	// Check state after reconnect.
	var expected, actual map[string]string
	var isExpected bool
	timeout, err = util.WaitForState(
		func() bool { isExpected, expected, actual = checkExpectedStates(); return isExpected },
	)
	if err != nil {
		t.Fatalf("Failed to reach expected initial state within %v.", timeout)
	}
	if !isExpected {
		t.Fatalf("Units not in expected state after restoring connectivity.\nExpected: %v\nActual: %v", expected, actual)
	}

	// Additionally check the logs of all active units for possible temporary state flapping.
	stdout, err = cluster.MemberCommand(m0, "journalctl", "_PID=1")
	if err != nil {
		t.Fatalf("Failed to retrieve journal: %v", err)
	}
	if strings.Contains(stdout, "Stopping single@") || strings.Contains(stdout, "Stopping global@") {
		t.Fatalf("Units were unexpectedly stopped at some point:\n%s", stdout)
	}
}
// TestScheduleOneWayConflict starts a unit declaring a conflict with a
// yet-to-be-scheduled unit, verifies only the conflicting unit becomes
// active, then destroys it and verifies the other unit gets scheduled
// in its place.
//
// NOTE(review): this appears to duplicate an earlier definition of
// TestScheduleOneWayConflict in this chunk (differing only by a sleep
// after destroy); two same-named functions cannot coexist in one
// package — these are likely from different files or revisions. Verify.
func TestScheduleOneWayConflict(t *testing.T) {
	cluster, err := platform.NewNspawnCluster("smoke")
	if err != nil {
		t.Fatal(err)
	}
	defer cluster.Destroy(t)

	// Start with a simple three-node cluster
	// NOTE(review): only 1 member is created below despite the comment
	// above mentioning three nodes — confirm which is intended.
	members, err := platform.CreateNClusterMembers(cluster, 1)
	if err != nil {
		t.Fatal(err)
	}
	m0 := members[0]
	if _, err := cluster.WaitForNMachines(m0, 1); err != nil {
		t.Fatal(err)
	}

	// Start a unit that conflicts with a yet-to-be-scheduled unit
	name := "fixtures/units/conflicts-with-hello.service"
	if stdout, stderr, err := cluster.Fleetctl(m0, "start", "--no-block", name); err != nil {
		t.Fatalf("Failed starting unit %s: \nstdout: %s\nstderr: %s\nerr: %v", name, stdout, stderr, err)
	}

	active, err := cluster.WaitForNActiveUnits(m0, 1)
	if err != nil {
		t.Fatal(err)
	}
	states, err := util.ActiveToSingleStates(active)
	if err != nil {
		t.Fatal(err)
	}

	// Start a unit that has not defined conflicts
	name = "fixtures/units/hello.service"
	if stdout, stderr, err := cluster.Fleetctl(m0, "start", "--no-block", name); err != nil {
		t.Fatalf("Failed starting unit %s: \nstdout: %s\nstderr: %s\nerr: %v", name, stdout, stderr, err)
	}

	// Both units should show up, but only conflicts-with-hello.service
	// should report ACTIVE
	stdout, _, err := cluster.Fleetctl(m0, "list-unit-files", "--no-legend")
	if err != nil {
		t.Fatalf("Failed to run list-unit-files: %v", err)
	}
	units := strings.Split(strings.TrimSpace(stdout), "\n")
	if len(units) != 2 {
		t.Fatalf("Did not find two units in cluster: \n%s", stdout)
	}
	active, err = cluster.WaitForNActiveUnits(m0, 1)
	if err != nil {
		t.Fatal(err)
	}
	states, err = util.ActiveToSingleStates(active)
	if err != nil {
		t.Fatal(err)
	}
	for unit := range states {
		if unit != "conflicts-with-hello.service" {
			t.Error("Incorrect unit started:", unit)
		}
	}

	// Destroying the conflicting unit should allow the other to start
	name = "conflicts-with-hello.service"
	if _, _, err := cluster.Fleetctl(m0, "destroy", name); err != nil {
		t.Fatalf("Failed destroying %s", name)
	}

	// Wait for the destroyed unit to actually disappear
	timeout, err := util.WaitForState(
		func() bool {
			stdout, _, err := cluster.Fleetctl(m0, "list-units", "--no-legend", "--full", "--fields", "unit,active,machine")
			if err != nil {
				return false
			}
			lines := strings.Split(strings.TrimSpace(stdout), "\n")
			states := util.ParseUnitStates(lines)
			// Keep polling as long as the destroyed unit still shows up.
			for _, state := range states {
				if state.Name == name {
					return false
				}
			}
			return true
		},
	)
	if err != nil {
		t.Fatalf("Destroyed unit %s not gone within %v", name, timeout)
	}

	// The previously-conflicted unit should now be the single active unit.
	active, err = cluster.WaitForNActiveUnits(m0, 1)
	if err != nil {
		t.Fatal(err)
	}
	states, err = util.ActiveToSingleStates(active)
	if err != nil {
		t.Fatal(err)
	}
	for unit := range states {
		if unit != "hello.service" {
			t.Error("Incorrect unit started:", unit)
		}
	}
}
// TestReplaceSerialization tests if the ExecStartPre of the new version
// of the unit when it replaces the old one is executed after
// ExecStopPost of the old version.
// This test is to make sure that two versions of the same unit will not
// conflict with each other, that the directives are always serialized,
// and it tries its best to avoid the following scenarios:
// https://github.com/coreos/fleet/issues/1000
// https://github.com/systemd/systemd/issues/518
// Now we can't guarantee that that behaviour will not be triggered by
// another external operation, but at least from the Unit replace
// feature context we try to avoid it.
func TestReplaceSerialization(t *testing.T) {
	cluster, err := platform.NewNspawnCluster("smoke")
	if err != nil {
		t.Fatal(err)
	}
	defer cluster.Destroy(t)

	m, err := cluster.CreateMember()
	if err != nil {
		t.Fatal(err)
	}
	_, err = cluster.WaitForNMachines(m, 1)
	if err != nil {
		t.Fatal(err)
	}

	// Commands baked into the old/new unit versions: the old one writes a
	// sync marker, the new one tests for the marker file's existence.
	tmpSyncFile := "/tmp/fleetSyncReplaceFile"
	syncOld := "echo 'sync'"
	syncNew := fmt.Sprintf("test -f %s", tmpSyncFile)
	tmpSyncService := "/tmp/replace-sync.service"
	syncService := "fixtures/units/replace-sync.service"

	stdout, stderr, err := cluster.Fleetctl(m, "start", syncService)
	if err != nil {
		t.Fatalf("Unable to start unit: \nstdout: %s\nstderr: %s\nerr: %v", stdout, stderr, err)
	}
	_, err = cluster.WaitForNActiveUnits(m, 1)
	if err != nil {
		t.Fatal(err)
	}

	// replace the unit content, make sure that:
	// It shows up and it did 'test -f /tmp/fleetSyncReplaceFile' correctly
	err = util.GenNewFleetService(tmpSyncService, syncService, syncNew, syncOld)
	if err != nil {
		t.Fatalf("Failed to generate a temp fleet service: %v", err)
	}
	stdout, stderr, err = cluster.Fleetctl(m, "start", "--replace", tmpSyncService)
	if err != nil {
		t.Fatalf("Failed to replace unit: \nstdout: %s\nstderr: %s\nerr: %v", stdout, stderr, err)
	}
	_, err = cluster.WaitForNActiveUnits(m, 1)
	if err != nil {
		t.Fatalf("Did not find 1 unit in cluster, unit replace failed: %v", err)
	}

	// Wait for the sync file, if the sync file is not created then
	// the previous unit failed, if it's created we continue. Here
	// the new version of the unit is probably already running and
	// the ExecStartPre is running at the same time, if it failed
	// then we probably will catch it later when we check its status
	tmpService := path.Base(tmpSyncService)
	timeout, err := util.WaitForState(
		func() bool {
			_, err = cluster.MemberCommand(m, syncNew)
			if err != nil {
				return false
			}
			return true
		},
	)
	if err != nil {
		t.Fatalf("Failed to check if file %s exists within %v", tmpSyncFile, timeout)
	}

	// The replaced unit should report as active...
	timeout, err = util.WaitForState(
		func() bool {
			stdout, _ = cluster.MemberCommand(m, "systemctl", "show", "--property=ActiveState", tmpService)
			if strings.TrimSpace(stdout) != "ActiveState=active" {
				return false
			}
			return true
		},
	)
	if err != nil {
		t.Fatalf("%s unit not reported as active within %v", tmpService, timeout)
	}

	// ...and its last run result should be success (i.e. the ExecStartPre
	// 'test -f' of the new version did not fail).
	timeout, err = util.WaitForState(
		func() bool {
			stdout, _ = cluster.MemberCommand(m, "systemctl", "show", "--property=Result", tmpService)
			if strings.TrimSpace(stdout) != "Result=success" {
				return false
			}
			return true
		},
	)
	if err != nil {
		t.Fatalf("Result for %s unit not reported as success withing %v", tmpService, timeout)
	}

	// Best-effort cleanup of the generated artifacts; errors ignored.
	os.Remove(tmpSyncFile)
	os.Remove(tmpSyncService)
}
// Load service and discovery units and test whether discovery unit adds itself as a dependency for the service.
func TestInstallUnit(t *testing.T) {
	cluster, err := platform.NewNspawnCluster("smoke")
	if err != nil {
		t.Fatal(err)
	}
	defer cluster.Destroy(t)

	// Start with a two-nodes cluster
	members, err := platform.CreateNClusterMembers(cluster, 2)
	if err != nil {
		t.Fatal(err)
	}
	m0 := members[0]
	_, err = cluster.WaitForNMachines(m0, 2)
	if err != nil {
		t.Fatal(err)
	}

	// Load unit files
	stdout, stderr, err := cluster.Fleetctl(m0, "load", "fixtures/units/hello.service", "fixtures/units/discovery.service")
	if err != nil {
		t.Fatalf("Failed loading unit files: \nstdout: %s\nstderr: %s\nerr: %v", stdout, stderr, err)
	}

	// checkState reports whether discovery.service's ActiveState (queried
	// via remote systemctl on whichever machine runs it) equals match.
	checkState := func(match string) bool {
		stdout, _, err := cluster.Fleetctl(m0, "--strict-host-key-checking=false", "ssh", "discovery.service", "systemctl show --property=ActiveState discovery.service")
		if err != nil {
			t.Logf("Failed getting info using remote systemctl: %v", err)
		}
		stdout = strings.TrimSpace(stdout)
		return stdout == fmt.Sprintf("ActiveState=%s", match)
	}

	// Verify that discovery.service unit is loaded but not started
	timeout, err := util.WaitForState(func() bool { return checkState("inactive") })
	if err != nil {
		t.Fatalf("discovery.service unit is not reported as inactive within %v: %v", timeout, err)
	}

	// Start hello.service unit
	stdout, stderr, err = cluster.Fleetctl(m0, "start", "fixtures/units/hello.service")
	if err != nil {
		t.Fatalf("Failed starting unit: \nstdout: %s\nstderr: %s\nerr: %v", stdout, stderr, err)
	}

	// Verify that discovery.service unit was started
	timeout, err = util.WaitForState(func() bool { return checkState("active") })
	if err != nil {
		t.Fatalf("discovery.service unit is not reported as active within %v:\n%v", timeout, err)
	}

	// Stop hello.service unit
	stdout, stderr, err = cluster.Fleetctl(m0, "stop", "fixtures/units/hello.service")
	if err != nil {
		t.Fatalf("Failed stopping unit: \nstdout: %s\nstderr: %s\nerr: %v", stdout, stderr, err)
	}

	// Verify that discovery.service unit was stopped
	timeout, err = util.WaitForState(func() bool { return checkState("inactive") })
	if err != nil {
		t.Fatalf("discovery.service unit is not reported as inactive within %v:\n%v", timeout, err)
	}
}
// TestScheduleReplace starts 3 units, followed by starting another unit
// that replaces the 1st unit. Then it verifies that the original unit
// got rescheduled on a different machine.
func TestScheduleReplace(t *testing.T) {
	cluster, err := platform.NewNspawnCluster("smoke")
	if err != nil {
		t.Fatal(err)
	}
	defer cluster.Destroy(t)

	members, err := platform.CreateNClusterMembers(cluster, 2)
	if err != nil {
		t.Fatal(err)
	}
	m0 := members[0]
	m1 := members[1]
	if _, err := cluster.WaitForNMachines(m0, 2); err != nil {
		t.Fatal(err)
	}

	// Start 3 units without Replaces, replace.0.service on m0, while both 1 and 2 on m1.
	// That's possible as replace.2.service has an option "MachineOf=replace.1.service".
	uNames := []string{
		"fixtures/units/replace.0.service",
		"fixtures/units/replace.1.service",
		"fixtures/units/replace.2.service",
		"fixtures/units/replace-kick0.service",
	}
	if stdout, stderr, err := cluster.Fleetctl(m0, "start", "--no-block", uNames[0]); err != nil {
		t.Fatalf("Failed starting unit %s: \nstdout: %s\nstderr: %s\nerr: %v", uNames[0], stdout, stderr, err)
	}
	if stdout, stderr, err := cluster.Fleetctl(m1, "start", "--no-block", uNames[1]); err != nil {
		t.Fatalf("Failed starting unit %s: \nstdout: %s\nstderr: %s\nerr: %v", uNames[1], stdout, stderr, err)
	}
	if stdout, stderr, err := cluster.Fleetctl(m1, "start", "--no-block", uNames[2]); err != nil {
		t.Fatalf("Failed starting unit %s: \nstdout: %s\nstderr: %s\nerr: %v", uNames[2], stdout, stderr, err)
	}

	active, err := cluster.WaitForNActiveUnits(m0, 3)
	if err != nil {
		t.Fatal(err)
	}
	states, err := util.ActiveToSingleStates(active)
	if err != nil {
		t.Fatal(err)
	}
	// Remember which machine ran replace.0.service before the replacement.
	oldMach := states[path.Base(uNames[0])].Machine

	// Start a unit replace-kick0.service that replaces replace.0.service
	// Then the kick0 unit will be scheduled to m0, as m0 is least loaded than m1.
	// So it's possible to trigger a situation where kick0 could kick the original unit 0.
	if stdout, stderr, err := cluster.Fleetctl(m0, "start", "--no-block", uNames[3]); err != nil {
		t.Fatalf("Failed starting unit %s: \nstdout: %s\nstderr: %s\nerr: %v", uNames[3], stdout, stderr, err)
	}

	// Here we need to wait up to 15 seconds, to avoid races, because the unit state
	// publisher could otherwise report unit states with old machine IDs to registry.
	checkReplacedMachines := func() bool {
		// Check that 4 units show up
		nUnits := 4
		stdout, stderr, err := cluster.Fleetctl(m0, "list-unit-files", "--no-legend")
		if err != nil {
			t.Logf("Failed to run list-unit-files:\nstdout: %s\nstderr: %s\nerr: %v", stdout, stderr, err)
			return false
		}
		units := strings.Split(strings.TrimSpace(stdout), "\n")
		if len(units) != nUnits {
			t.Logf("Did not find two units in cluster: \n%s", stdout)
			return false
		}
		active, err = cluster.WaitForNActiveUnits(m0, nUnits)
		if err != nil {
			t.Log(err)
			return false
		}
		states, err = util.ActiveToSingleStates(active)
		if err != nil {
			t.Log(err)
			return false
		}

		// Check that replace.0.service is located on a different machine from
		// that of replace-kick0.service.
		uNameBase := make([]string, nUnits)
		machs := make([]string, nUnits)
		for i, uName := range uNames {
			uNameBase[i] = path.Base(uName)
			machs[i] = states[uNameBase[i]].Machine
		}
		if machs[0] == machs[3] {
			t.Logf("machine for %s is %s, the same as that of %s.", uNameBase[0], machs[0], uNameBase[3])
			return false
		}
		// The replacement unit should have taken over the original's machine...
		if machs[3] != oldMach {
			t.Logf("machine for %s is %s, different from old machine %s.", uNameBase[3], machs[3], oldMach)
			return false
		}
		// ...and the original unit must have moved off it.
		if machs[0] == oldMach {
			t.Logf("machine for %s is %s, the same as that of %s.", uNameBase[0], machs[0], oldMach)
			return false
		}
		return true
	}
	if timeout, err := util.WaitForState(checkReplacedMachines); err != nil {
		t.Fatalf("Cannot verify replaced units within %v\nerr: %v", timeout, err)
	}
}