// exitOnRecover is used to circumvent the recover handler that net/http
// installs. We need to exit in order to get restarted by the init
// system/supervisor and get into a clean state again.
func exitOnRecover() {
	if r := recover(); r != nil {
		// This mimics go/src/net/http/server.go.
		const size = 64 << 10
		buf := make([]byte, size)
		buf = buf[:runtime.Stack(buf, false)]
		glog.Errorf("http: panic serving: %v\n%s", r, buf)
		glog.Flush()
		os.Exit(1)
	}
}
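
// The exitOnRecoverHandler/exitOnRecoverHandleFunc/exitOnRecoverHandle
// wrappers used in main() below are not part of this section. They
// presumably just arrange for exitOnRecover to run (via defer) around each
// handler; a minimal sketch under that assumption, for the plain
// http.Handler case — the actual implementation may differ:
//
//	func exitOnRecoverHandler(h http.Handler) http.Handler {
//		return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
//			defer exitOnRecover()
//			h.ServeHTTP(w, r)
//		})
//	}
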
func TestMain(m *testing.M) {
	defer glog.Flush()

	flag.Parse()

	tempdir, err := ioutil.TempDir("", "robustirc-test-raftdir-")
	if err != nil {
		log.Fatal(err)
	}
	raftDir = &tempdir

	// TODO: cleanup tmp-outputstream and permanent-compaction*

	os.Exit(m.Run())
}
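
// fileHash (used in main below) is not shown in this section. Whatever it
// does, it must produce the same value that servers report as
// ExecutableHash; a sketch assuming that value is a hex-encoded SHA-256 of
// the binary's contents (needs crypto/sha256, io, fmt):
//
//	func fileHash(path string) string {
//		f, err := os.Open(path)
//		if err != nil {
//			log.Fatal(err)
//		}
//		defer f.Close()
//		h := sha256.New()
//		if _, err := io.Copy(h, f); err != nil {
//			log.Fatal(err)
//		}
//		return fmt.Sprintf("%x", h.Sum(nil))
//	}
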
func main() {
	defer glog.Flush()
	glog.CopyStandardLogTo("INFO")
	flag.Parse()

	if strings.TrimSpace(*network) == "" {
		log.Fatalf("You need to specify -network")
	}

	binaryHash := fileHash(*binaryPath)
	glog.Infof("binaryHash = %s", binaryHash)

	servers := util.ResolveNetwork(*network)

	log.Printf("Checking network health\n")
	if statuses, err := util.EnsureNetworkHealthy(servers, *networkPassword); err != nil {
		log.Fatalf("Aborting upgrade for safety: %v", err)
	} else {
		if allNodesUpdated(statuses, binaryHash) {
			log.Printf("All nodes are already running the requested version.\n")
			return
		}
	}

	log.Printf("Restarting %q nodes until their binary hash is %s\n", *network, binaryHash)

	for rtry := 0; rtry < 5; rtry++ {
		for _, server := range servers {
			var statuses map[string]util.ServerStatus
			var err error
			started := time.Now()
			for time.Since(started) < *networkHealthTimeout {
				statuses, err = util.EnsureNetworkHealthy(servers, *networkPassword)
				if err != nil {
					log.Printf("Network is not healthy: %v\n", err)
					time.Sleep(1 * time.Second)
					continue
				}
				log.Printf("Network became healthy.\n")
				break
			}
			if err != nil {
				log.Fatalf("Network did not become healthy within %v, aborting. (reason: %v)\n", *networkHealthTimeout, err)
			}

			if statuses[server].ExecutableHash == binaryHash {
				if allNodesUpdated(statuses, binaryHash) {
					log.Printf("All done!\n")
					return
				}
				log.Printf("Skipping %q which is already running the requested version\n", server)
				continue
			}

			lastApplied := statuses[server].AppliedIndex

			log.Printf("Killing node %q\n", server)
			if err := quit(server); err != nil {
				log.Printf("%v\n", err)
			}

			for htry := 0; htry < 60; htry++ {
				time.Sleep(1 * time.Second)
				current, err := util.GetServerStatus(server, *networkPassword)
				if err != nil {
					log.Printf("Node unhealthy: %v\n", err)
					continue
				}
				if current.ExecutableHash != binaryHash {
					log.Printf("Node %q came up with hash %s instead of %s?!\n", server, current.ExecutableHash, binaryHash)
					break
				}
				if current.AppliedIndex < lastApplied {
					log.Printf("Node %q has not yet applied all messages it saw before, waiting (got %d, want ≥ %d)\n", server, current.AppliedIndex, lastApplied)
					continue
				}
				log.Printf("Node %q was upgraded and is healthy again\n", server)
				break
			}
		}
	}

	log.Printf("All done!\n")
}
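
// The allNodesUpdated and quit helpers used above are also not part of this
// section. allNodesUpdated presumably just checks every node's reported
// ExecutableHash against the desired hash; a minimal sketch under that
// assumption:
//
//	func allNodesUpdated(statuses map[string]util.ServerStatus, binaryHash string) bool {
//		for _, status := range statuses {
//			if status.ExecutableHash != binaryHash {
//				return false
//			}
//		}
//		return true
//	}
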
func main() {
	flag.Usage = func() {
		// It is unfortunate that we need to re-implement flag.PrintDefaults(),
		// but I cannot see any other way to achieve the grouping of flags.
		fmt.Fprintf(os.Stderr, "RobustIRC server (= node)\n")
		fmt.Fprintf(os.Stderr, "\n")
		fmt.Fprintf(os.Stderr, "The following flags are REQUIRED:\n")
		printDefault(flag.Lookup("network_name"))
		printDefault(flag.Lookup("network_password"))
		printDefault(flag.Lookup("peer_addr"))
		printDefault(flag.Lookup("tls_cert_path"))
		printDefault(flag.Lookup("tls_key_path"))
		fmt.Fprintf(os.Stderr, "\n")
		fmt.Fprintf(os.Stderr, "The following flags are only relevant when bootstrapping the network (once):\n")
		printDefault(flag.Lookup("join"))
		printDefault(flag.Lookup("singlenode"))
		fmt.Fprintf(os.Stderr, "\n")
		fmt.Fprintf(os.Stderr, "The following flags are optional:\n")
		printDefault(flag.Lookup("dump_canary_state"))
		printDefault(flag.Lookup("dump_heap_profile"))
		printDefault(flag.Lookup("canary_compaction_start"))
		printDefault(flag.Lookup("listen"))
		printDefault(flag.Lookup("raftdir"))
		printDefault(flag.Lookup("tls_ca_file"))
		printDefault(flag.Lookup("version"))
		fmt.Fprintf(os.Stderr, "\n")
		fmt.Fprintf(os.Stderr, "The following flags are optional and provided by glog:\n")
		printDefault(flag.Lookup("alsologtostderr"))
		printDefault(flag.Lookup("log_backtrace_at"))
		printDefault(flag.Lookup("log_dir"))
		printDefault(flag.Lookup("log_total_bytes"))
		printDefault(flag.Lookup("logtostderr"))
		printDefault(flag.Lookup("stderrthreshold"))
		printDefault(flag.Lookup("v"))
		printDefault(flag.Lookup("vmodule"))
	}
	flag.Parse()

	// Store logs in -raftdir, unless otherwise specified.
	if flag.Lookup("log_dir").Value.String() == "" {
		flag.Set("log_dir", *raftDir)
	}
	defer glog.Flush()
	glog.MaxSize = 64 * 1024 * 1024
	glog.CopyStandardLogTo("INFO")

	log.Printf("RobustIRC %s\n", Version)
	if *version {
		return
	}

	if _, err := os.Stat(filepath.Join(*raftDir, "deletestate")); err == nil {
		if err := os.RemoveAll(*raftDir); err != nil {
			log.Fatal(err)
		}
		if err := os.Mkdir(*raftDir, 0700); err != nil {
			log.Fatal(err)
		}
		log.Printf("Deleted %q because %q existed\n", *raftDir, filepath.Join(*raftDir, "deletestate"))
	}

	if err := outputstream.DeleteOldDatabases(*raftDir); err != nil {
		log.Fatalf("Could not delete old outputstream databases: %v\n", err)
	}

	if err := deleteOldCompactionDatabases(*raftDir); err != nil {
		glog.Errorf("Could not delete old compaction databases: %v (ignoring)\n", err)
	}

	log.Printf("Initializing RobustIRC…\n")

	if *networkPassword == "" {
		*networkPassword = os.Getenv("ROBUSTIRC_NETWORK_PASSWORD")
	}
	if *networkPassword == "" {
		log.Fatalf("-network_password not set. You MUST protect your network.\n")
	}
	digest := sha1.New()
	digest.Write([]byte(*networkPassword))
	passwordHash := "{SHA}" + base64.StdEncoding.EncodeToString(digest.Sum(nil))

	if *network == "" {
		log.Fatalf("-network_name not set, but required.\n")
	}

	if *peerAddr == "" {
		log.Printf("-peer_addr not set, initializing to %q. Make sure %q is a host:port string that other raft nodes can connect to!\n", *listen, *listen)
		*peerAddr = *listen
	}

	ircServer = ircserver.NewIRCServer(*raftDir, *network, time.Now())

	transport := rafthttp.NewHTTPTransport(
		*peerAddr,
		// Not deadlined, otherwise snapshot installments fail.
		robusthttp.Client(*networkPassword, false),
		nil,
		"")

	peerStore = raft.NewJSONPeers(*raftDir, transport)

	if *join == "" && !*singleNode {
		peers, err := peerStore.Peers()
		if err != nil {
			log.Fatal(err.Error())
		}
		if len(peers) == 0 {
			if !*timesafeguard.DisableTimesafeguard {
				log.Fatalf("No peers known and -join not specified. Joining the network is not safe because timesafeguard cannot be called.\n")
			}
		} else {
			if len(peers) == 1 && peers[0] == *peerAddr {
				// To prevent crashlooping too frequently in case the init
				// system directly restarts our process.
				time.Sleep(10 * time.Second)
				log.Fatalf("Only known peer is myself (%q), implying this node was removed from the network. Please kill the process and remove the data.\n", *peerAddr)
			}
			if err := timesafeguard.SynchronizedWithNetwork(*peerAddr, peers, *networkPassword); err != nil {
				log.Fatal(err.Error())
			}
		}
	}

	var p []string

	config := raft.DefaultConfig()
	config.Logger = log.New(glog.LogBridgeFor("INFO"), "", log.Lshortfile)
	if *singleNode {
		config.EnableSingleNode = true
	}

	// Keep 5 snapshots in *raftDir/snapshots, log to stderr.
	fss, err := raft.NewFileSnapshotStore(*raftDir, 5, nil)
	if err != nil {
		log.Fatal(err)
	}

	// How often to check whether a snapshot should be taken. The check is
	// cheap, and the default value far too high for networks with a high
	// number of messages/s.
	// At the same time, it is important that we don’t check too early,
	// otherwise recovering from the most recent snapshot doesn’t work because
	// after recovering, a new snapshot (over the 0 committed messages) will be
	// taken immediately, effectively overwriting the result of the snapshot
	// recovery.
	config.SnapshotInterval = 300 * time.Second

	// Batch as many messages as possible into a single appendEntries RPC.
	// There is no downside to setting this too high.
	config.MaxAppendEntries = 1024

	// It could be that the heartbeat goroutine is not scheduled for a while,
	// so relax the default of 500ms.
	config.LeaderLeaseTimeout = timesafeguard.ElectionTimeout
	config.HeartbeatTimeout = timesafeguard.ElectionTimeout
	config.ElectionTimeout = timesafeguard.ElectionTimeout

	// We use prometheus, so hook up the metrics package (used by raft) to
	// prometheus as well.
	sink, err := metrics_prometheus.NewPrometheusSink()
	if err != nil {
		log.Fatal(err)
	}
	metrics.NewGlobal(metrics.DefaultConfig("raftmetrics"), sink)

	bootstrapping := *singleNode || *join != ""
	logStore, err := raft_store.NewLevelDBStore(filepath.Join(*raftDir, "raftlog"), bootstrapping)
	if err != nil {
		log.Fatal(err)
	}
	ircStore, err = raft_store.NewLevelDBStore(filepath.Join(*raftDir, "irclog"), bootstrapping)
	if err != nil {
		log.Fatal(err)
	}
	fsm := &FSM{
		store:             logStore,
		ircstore:          ircStore,
		lastSnapshotState: make(map[uint64][]byte),
	}
	logcache, err := raft.NewLogCache(config.MaxAppendEntries, logStore)
	if err != nil {
		log.Fatal(err)
	}

	node, err = raft.NewRaft(config, fsm, logcache, logStore, fss, peerStore, transport)
	if err != nil {
		log.Fatal(err)
	}

	if *dumpCanaryState != "" {
		canary(fsm, *dumpCanaryState)
		if *dumpHeapProfile != "" {
			debug.FreeOSMemory()
			f, err := os.Create(*dumpHeapProfile)
			if err != nil {
				log.Fatal(err)
			}
			defer f.Close()
			pprof.WriteHeapProfile(f)
		}
		return
	}

	go func() {
		for {
			secondsInState.WithLabelValues(node.State().String()).Inc()
			time.Sleep(1 * time.Second)
		}
	}()

	privaterouter := httprouter.New()
	privaterouter.Handler("GET", "/", exitOnRecoverHandleFunc(handleStatus))
	privaterouter.Handler("GET", "/irclog", exitOnRecoverHandleFunc(handleIrclog))
	privaterouter.Handler("POST", "/raft/*rest", exitOnRecoverHandler(transport))
	privaterouter.Handler("POST", "/join", exitOnRecoverHandleFunc(handleJoin))
	privaterouter.Handler("POST", "/part", exitOnRecoverHandleFunc(handlePart))
	privaterouter.Handler("GET", "/snapshot", exitOnRecoverHandleFunc(handleSnapshot))
	privaterouter.Handler("GET", "/leader", exitOnRecoverHandleFunc(handleLeader))
	privaterouter.Handler("POST", "/quit", exitOnRecoverHandleFunc(handleQuit))
	privaterouter.Handler("GET", "/config", exitOnRecoverHandleFunc(handleGetConfig))
	privaterouter.Handler("POST", "/config", exitOnRecoverHandleFunc(handlePostConfig))
	privaterouter.Handler("GET", "/metrics", exitOnRecoverHandler(prometheus.Handler()))

	publicrouter := httprouter.New()
	publicrouter.Handle("POST", "/robustirc/v1/:sessionid", exitOnRecoverHandle(handleCreateSession))
	publicrouter.Handle("POST", "/robustirc/v1/:sessionid/message", exitOnRecoverHandle(handlePostMessage))
	publicrouter.Handle("GET", "/robustirc/v1/:sessionid/messages", exitOnRecoverHandle(handleGetMessages))
	publicrouter.Handle("DELETE", "/robustirc/v1/:sessionid", exitOnRecoverHandle(handleDeleteSession))

	a := auth.NewBasicAuthenticator("robustirc", func(user, realm string) string {
		if user == "robustirc" {
			return passwordHash
		}
		return ""
	})

	http.Handle("/robustirc/", publicrouter)
	http.Handle("/", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if username := a.CheckAuth(r); username == "" {
			a.RequireAuth(w, r)
		} else {
			privaterouter.ServeHTTP(w, r)
		}
	}))

	srv := http.Server{Addr: *listen}
	if err := http2.ConfigureServer(&srv, nil); err != nil {
		log.Fatal(err)
	}

	// Manually create the net.TCPListener so that joinMaster() does not run
	// into connection refused errors (the master will try to contact the
	// node before acknowledging the join).
	srv.TLSConfig.Certificates = make([]tls.Certificate, 1)
	srv.TLSConfig.Certificates[0], err = tls.LoadX509KeyPair(*tlsCertPath, *tlsKeyPath)
	if err != nil {
		log.Fatal(err)
	}

	ln, err := net.Listen("tcp", *listen)
	if err != nil {
		log.Fatal(err)
	}
	tlsListener := tls.NewListener(tcpKeepAliveListener{ln.(*net.TCPListener)}, srv.TLSConfig)
	go srv.Serve(tlsListener)

	log.Printf("RobustIRC listening on %q. For status, see %s\n",
		*peerAddr,
		fmt.Sprintf("https://robustirc:%s@%s/", *networkPassword, *peerAddr))

	if *join != "" {
		if err := timesafeguard.SynchronizedWithMasterAndNetwork(*peerAddr, *join, *networkPassword); err != nil {
			log.Fatal(err.Error())
		}

		p = joinMaster(*join, peerStore)
		// TODO(secure): properly handle joins on the server-side where the joining node is already in the network.
	}

	if len(p) > 0 {
		node.SetPeers(p)
	}

	expireSessionsTimer := time.After(expireSessionsInterval)
	secondTicker := time.Tick(1 * time.Second)
	for {
		select {
		case <-secondTicker:
			if node.State() == raft.Shutdown {
				log.Fatal("Node removed from the network (in raft state shutdown), terminating.")
			}

		case <-expireSessionsTimer:
			expireSessionsTimer = time.After(expireSessionsInterval)

			// Race conditions (a node becoming a leader or ceasing to be the
			// leader shortly before/after this runs) are okay, since the timer
			// is triggered often enough on every node so that it will
			// eventually run on the leader.
			if node.State() != raft.Leader {
				continue
			}

			applyMu.Lock()
			for _, msg := range ircServer.ExpireSessions() {
				// Cannot fail, no user input.
				msgbytes, _ := json.Marshal(msg)
				f := node.Apply(msgbytes, 10*time.Second)
				if err := f.Error(); err != nil {
					log.Printf("Apply(): %v\n", err)
					break
				}
			}
			applyMu.Unlock()
		}
	}
}
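
// tcpKeepAliveListener (wrapped around the TLS listener above) is not
// defined in this section. net/http uses an equivalent unexported type; a
// minimal sketch assuming the same behavior (enable TCP keep-alives on every
// accepted connection so dead peers get detected):
//
//	type tcpKeepAliveListener struct {
//		*net.TCPListener
//	}
//
//	func (ln tcpKeepAliveListener) Accept() (net.Conn, error) {
//		tc, err := ln.AcceptTCP()
//		if err != nil {
//			return nil, err
//		}
//		tc.SetKeepAlive(true)
//		tc.SetKeepAlivePeriod(3 * time.Minute)
//		return tc, nil
//	}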