func getServerTime(server, networkPassword string) (timeResult, util.ServerStatus, error) { start := time.Now() status, err := util.GetServerStatus(server, networkPassword) return timeResult{ Start: start, End: time.Now(), Result: status.CurrentTime, }, status, err }
func main() { defer glog.Flush() glog.CopyStandardLogTo("INFO") flag.Parse() if strings.TrimSpace(*network) == "" { log.Fatalf("You need to specify -network") } binaryHash := fileHash(*binaryPath) glog.Infof("binaryHash = %s", binaryHash) servers := util.ResolveNetwork(*network) log.Printf("Checking network health\n") if statuses, err := util.EnsureNetworkHealthy(servers, *networkPassword); err != nil { log.Fatalf("Aborting upgrade for safety: %v", err) } else { if allNodesUpdated(statuses, binaryHash) { log.Printf("All nodes are already running the requested version.\n") return } } log.Printf("Restarting %q nodes until their binary hash is %s\n", *network, binaryHash) for rtry := 0; rtry < 5; rtry++ { for _, server := range servers { var statuses map[string]util.ServerStatus var err error started := time.Now() for time.Since(started) < *networkHealthTimeout { statuses, err = util.EnsureNetworkHealthy(servers, *networkPassword) if err != nil { log.Printf("Network is not healthy: %v\n", err) time.Sleep(1 * time.Second) continue } log.Printf("Network became healthy.\n") break } if err != nil { log.Fatalf("Network did not become healthy within %v, aborting. (reason: %v)\n", *networkHealthTimeout, err) } if statuses[server].ExecutableHash == binaryHash { if allNodesUpdated(statuses, binaryHash) { log.Printf("All done!\n") return } log.Printf("Skipping %q which is already running the requested version\n", server) continue } lastApplied := statuses[server].AppliedIndex log.Printf("Killing node %q\n", server) if err := quit(server); err != nil { log.Printf("%v\n", err) } for htry := 0; htry < 60; htry++ { time.Sleep(1 * time.Second) current, err := util.GetServerStatus(server, *networkPassword) if err != nil { log.Printf("Node unhealthy: %v\n", err) continue } if current.ExecutableHash != binaryHash { log.Printf("Node %q came up with hash %s instead of %s?!\n", server, current.ExecutableHash, binaryHash) break } if current.AppliedIndex < lastApplied { log.Printf("Node %q has not yet applied all messages it saw before, waiting (got %d, want ≥ %d)\n", server, current.AppliedIndex, lastApplied) continue } log.Printf("Node %q was upgraded and is healthy again\n", server) break } } } log.Printf("All done!\n") }
func TestMessageOfDeath(t *testing.T) { tempdir, err := ioutil.TempDir("", "robustirc-message-of-death-") if err != nil { t.Fatalf("Could not create tempdir: %v", err) } l, err := localnet.NewLocalnet(-1, tempdir) if err != nil { t.Fatalf("Could not start local RobustIRC network: %v", err) } defer l.Kill(true) l.EnablePanicCommand = "1" // For each of the nodes, start a goroutine that verifies that the node crashes, then start it again var wg sync.WaitGroup for i := 0; i < 3; i++ { cmd, tempdir, addr := l.StartIRCServer(i == 0) wg.Add(1) go func(cmd *exec.Cmd, tempdir string, addr string) { defer wg.Done() terminated := make(chan error) skipped := make(chan bool) go func() { terminated <- cmd.Wait() }() go func() { // Poll messages of death counter. parser := &prometheus_text.Parser{} for { time.Sleep(50 * time.Millisecond) req, err := http.NewRequest("GET", addr+"metrics", nil) if err != nil { continue } resp, err := l.Httpclient.Do(req) if err != nil { continue } if resp.StatusCode != http.StatusOK { continue } metrics, err := parser.TextToMetricFamilies(resp.Body) if err != nil { continue } applied, ok := metrics["applied_messages"] if !ok { continue } for _, m := range applied.GetMetric() { for _, labelpair := range m.GetLabel() { if labelpair.GetName() == "type" && labelpair.GetValue() == types.RobustType(types.RobustMessageOfDeath).String() { if m.GetCounter().GetValue() > 0 { skipped <- true } } } } } }() // Wait for the server to either crash or skip a message of death. select { case <-terminated: t.Logf("Node terminated (as expected)") case <-skipped: t.Logf("Node skipped message of death") } // Run restart.sh for that node. rcmd := exec.Command(filepath.Join(tempdir, "restart.sh")) if err := rcmd.Start(); err != nil { t.Errorf("Cannot restart node: %v", err) return } l.RecordResource("pid", strconv.Itoa(cmd.Process.Pid)) // Ensure the node comes back up. started := time.Now() for time.Since(started) < 10*time.Second { if _, err := util.GetServerStatus(addr, l.NetworkPassword); err != nil { t.Logf("Node %s unhealthy: %v", addr, err) time.Sleep(1 * time.Second) continue } t.Logf("Node %s became healthy", addr) return } t.Errorf("Node did not become healthy within 10s") }(cmd, tempdir, addr) } // Connect and send the PANIC message. session, err := robustsession.Create(strings.Join(l.Servers(), ","), filepath.Join(tempdir, "cert.pem")) if err != nil { t.Fatalf("Could not create robustsession: %v", err) } foundjoin := make(chan bool) go func() { for msg := range session.Messages { if !strings.HasPrefix(msg, ":mod!1@") || !strings.HasSuffix(msg, " JOIN :#mod") { continue } select { case foundjoin <- true: default: t.Errorf("Found JOIN too early (channel write blocks)") } } }() go func() { for err := range session.Errors { t.Errorf("RobustSession error: %v", err) } }() session.PostMessage("NICK mod") session.PostMessage("USER 1 2 3 4") session.PostMessage("PANIC") t.Logf("Message of death sent") wg.Wait() healthy := false for try := 0; try < 5; try++ { if l.Healthy() { healthy = true break } time.Sleep(1 * time.Second) } if !healthy { t.Fatalf("Expected recovery, but not all nodes are healthy") } // Verify sending a JOIN now results in an output message. session.PostMessage("JOIN #mod") select { case <-foundjoin: t.Logf("JOIN reply received, network progressing") case <-time.After(5 * time.Second): t.Errorf("Timeout waiting for JOIN message") } }