// EnsureNetworkHealthy returns nil when all of the following is true: // • all nodes are reachable // • all nodes return the same leader // • all nodes are either follower or leader (i.e. not candidate/initializing) // • all follower nodes were recently contacted by the leader func EnsureNetworkHealthy(servers []string, networkPassword string) (map[string]ServerStatus, error) { var leader string statuses, err := CollectStatuses(servers, networkPassword) if err != nil { return statuses, err } for _, status := range statuses { // No error checking since this was _parsed_ from JSON, so it must be valid. pretty, _ := json.MarshalIndent(status, "", " ") glog.Infof("%s\n", pretty) if status.State != "Leader" && status.State != "Follower" { return statuses, fmt.Errorf("Server %q in state %q, need Leader or Follower", status.Server, status.State) } if leader == "" { leader = status.Leader } else if leader != status.Leader { return statuses, fmt.Errorf("Server %q thinks %q is leader, others think %q is leader", status.Server, status.Leader, leader) } if status.State == "Follower" && time.Since(status.LastContact) > 2*time.Second { return statuses, fmt.Errorf("Server %q was last contacted by the leader at %v, which is over 2 seconds ago", status.Server, status.LastContact) } } if leader == "" { return statuses, fmt.Errorf("There is no leader currently") } return statuses, nil }
func main() { defer glog.Flush() glog.CopyStandardLogTo("INFO") flag.Parse() if strings.TrimSpace(*network) == "" { log.Fatalf("You need to specify -network") } binaryHash := fileHash(*binaryPath) glog.Infof("binaryHash = %s", binaryHash) servers := util.ResolveNetwork(*network) log.Printf("Checking network health\n") if statuses, err := util.EnsureNetworkHealthy(servers, *networkPassword); err != nil { log.Fatalf("Aborting upgrade for safety: %v", err) } else { if allNodesUpdated(statuses, binaryHash) { log.Printf("All nodes are already running the requested version.\n") return } } log.Printf("Restarting %q nodes until their binary hash is %s\n", *network, binaryHash) for rtry := 0; rtry < 5; rtry++ { for _, server := range servers { var statuses map[string]util.ServerStatus var err error started := time.Now() for time.Since(started) < *networkHealthTimeout { statuses, err = util.EnsureNetworkHealthy(servers, *networkPassword) if err != nil { log.Printf("Network is not healthy: %v\n", err) time.Sleep(1 * time.Second) continue } log.Printf("Network became healthy.\n") break } if err != nil { log.Fatalf("Network did not become healthy within %v, aborting. (reason: %v)\n", *networkHealthTimeout, err) } if statuses[server].ExecutableHash == binaryHash { if allNodesUpdated(statuses, binaryHash) { log.Printf("All done!\n") return } log.Printf("Skipping %q which is already running the requested version\n", server) continue } lastApplied := statuses[server].AppliedIndex log.Printf("Killing node %q\n", server) if err := quit(server); err != nil { log.Printf("%v\n", err) } for htry := 0; htry < 60; htry++ { time.Sleep(1 * time.Second) current, err := util.GetServerStatus(server, *networkPassword) if err != nil { log.Printf("Node unhealthy: %v\n", err) continue } if current.ExecutableHash != binaryHash { log.Printf("Node %q came up with hash %s instead of %s?!\n", server, current.ExecutableHash, binaryHash) break } if current.AppliedIndex < lastApplied { log.Printf("Node %q has not yet applied all messages it saw before, waiting (got %d, want ≥ %d)\n", server, current.AppliedIndex, lastApplied) continue } log.Printf("Node %q was upgraded and is healthy again\n", server) break } } } log.Printf("All done!\n") }