func applyAndCompact(t *testing.T, input []string) []string {
	tempdir, err := ioutil.TempDir("", "robust-test-")
	if err != nil {
		t.Fatalf("ioutil.TempDir: %v", err)
	}
	defer os.RemoveAll(tempdir)

	logstore, err := raft_store.NewLevelDBStore(filepath.Join(tempdir, "raftlog"), false)
	if err != nil {
		t.Fatalf("Unexpected error in NewLevelDBStore: %v", err)
	}
	ircstore, err := raft_store.NewLevelDBStore(filepath.Join(tempdir, "irclog"), false)
	if err != nil {
		t.Fatalf("Unexpected error in NewLevelDBStore: %v", err)
	}
	fsm := FSM{logstore, ircstore}

	var logs []*raft.Log
	for _, msg := range input {
		logs = append(logs, &raft.Log{
			Type:  raft.LogCommand,
			Index: uint64(len(logs) + 1),
			Data:  []byte(msg),
		})
	}

	if err := logstore.StoreLogs(logs); err != nil {
		t.Fatalf("Unexpected error in store.StoreLogs: %v", err)
	}
	for _, log := range logs {
		fsm.Apply(log)
	}

	rawsnap, err := fsm.Snapshot()
	if err != nil {
		t.Fatalf("Unexpected error in fsm.Snapshot: %v", err)
	}
	s := rawsnap.(*robustSnapshot)
	sink := inMemorySink{}
	s.Persist(&sink)

	dec := json.NewDecoder(&sink.b)
	var output []string
	for {
		var l raft.Log
		if err := dec.Decode(&l); err != nil {
			if err == io.EOF {
				break
			}
			t.Fatalf("Unexpected error in json.Decode: %v", err)
		}
		output = append(output, string(l.Data))
	}

	return output
}
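// applyAndCompact persists the snapshot into an inMemorySink, which is not
// defined in this section. The following is a minimal sketch of such a
// raft.SnapshotSink, assumed to do nothing but buffer the persisted JSON in
// memory; only the field name b is taken from its use above, everything else
// is a hypothetical illustration, not the project's actual type.
type inMemorySink struct {
	b bytes.Buffer // buffers everything Persist() writes
}

func (s *inMemorySink) Write(p []byte) (n int, err error) { return s.b.Write(p) }
func (s *inMemorySink) Close() error                      { return nil }
func (s *inMemorySink) ID() string                        { return "in-memory" }
func (s *inMemorySink) Cancel() error                     { return nil }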
func createIrcServer(tempdir string) (*raft_store.LevelDBStore, *raft_store.LevelDBStore, FSM, error) {
	ircServer = ircserver.NewIRCServer("", "testnetwork", time.Now())
	flag.Set("raftdir", tempdir)

	logstore, err := raft_store.NewLevelDBStore(filepath.Join(tempdir, "raftlog"), false)
	if err != nil {
		return nil, nil, FSM{}, err
	}
	ircstore, err := raft_store.NewLevelDBStore(filepath.Join(tempdir, "irclog"), false)
	if err != nil {
		return nil, nil, FSM{}, err
	}
	fsm := FSM{store: logstore, ircstore: ircstore}
	return logstore, ircstore, fsm, nil
}
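// A short sketch of how createIrcServer might be called from a test, assuming
// the same ioutil.TempDir setup used elsewhere in this section. The test name
// is made up for illustration; this is not part of the original test suite.
func TestIrcServerSetup(t *testing.T) {
	tempdir, err := ioutil.TempDir("", "robust-test-")
	if err != nil {
		t.Fatalf("ioutil.TempDir: %v", err)
	}
	defer os.RemoveAll(tempdir)

	_, _, fsm, err := createIrcServer(tempdir)
	if err != nil {
		t.Fatalf("createIrcServer: %v", err)
	}
	// fsm is now backed by fresh LevelDB stores under tempdir and can be fed
	// raft.Log entries via fsm.Apply, as the other tests do.
	_ = fsm
}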
func main() {
	flag.Usage = func() {
		// It is unfortunate that we need to re-implement flag.PrintDefaults(),
		// but I cannot see any other way to achieve the grouping of flags.
		fmt.Fprintf(os.Stderr, "RobustIRC server (= node)\n")
		fmt.Fprintf(os.Stderr, "\n")
		fmt.Fprintf(os.Stderr, "The following flags are REQUIRED:\n")
		printDefault(flag.Lookup("network_name"))
		printDefault(flag.Lookup("network_password"))
		printDefault(flag.Lookup("peer_addr"))
		printDefault(flag.Lookup("tls_cert_path"))
		printDefault(flag.Lookup("tls_key_path"))
		fmt.Fprintf(os.Stderr, "\n")
		fmt.Fprintf(os.Stderr, "The following flags are only relevant when bootstrapping the network (once):\n")
		printDefault(flag.Lookup("join"))
		printDefault(flag.Lookup("singlenode"))
		fmt.Fprintf(os.Stderr, "\n")
		fmt.Fprintf(os.Stderr, "The following flags are optional:\n")
		printDefault(flag.Lookup("dump_canary_state"))
		printDefault(flag.Lookup("dump_heap_profile"))
		printDefault(flag.Lookup("canary_compaction_start"))
		printDefault(flag.Lookup("listen"))
		printDefault(flag.Lookup("raftdir"))
		printDefault(flag.Lookup("tls_ca_file"))
		printDefault(flag.Lookup("version"))
		fmt.Fprintf(os.Stderr, "\n")
		fmt.Fprintf(os.Stderr, "The following flags are optional and provided by glog:\n")
		printDefault(flag.Lookup("alsologtostderr"))
		printDefault(flag.Lookup("log_backtrace_at"))
		printDefault(flag.Lookup("log_dir"))
		printDefault(flag.Lookup("log_total_bytes"))
		printDefault(flag.Lookup("logtostderr"))
		printDefault(flag.Lookup("stderrthreshold"))
		printDefault(flag.Lookup("v"))
		printDefault(flag.Lookup("vmodule"))
	}
	flag.Parse()

	// Store logs in -raftdir, unless otherwise specified.
	if flag.Lookup("log_dir").Value.String() == "" {
		flag.Set("log_dir", *raftDir)
	}

	defer glog.Flush()
	glog.MaxSize = 64 * 1024 * 1024
	glog.CopyStandardLogTo("INFO")

	log.Printf("RobustIRC %s\n", Version)
	if *version {
		return
	}

	if _, err := os.Stat(filepath.Join(*raftDir, "deletestate")); err == nil {
		if err := os.RemoveAll(*raftDir); err != nil {
			log.Fatal(err)
		}
		if err := os.Mkdir(*raftDir, 0700); err != nil {
			log.Fatal(err)
		}
		log.Printf("Deleted %q because %q existed\n", *raftDir, filepath.Join(*raftDir, "deletestate"))
	}

	if err := outputstream.DeleteOldDatabases(*raftDir); err != nil {
		log.Fatalf("Could not delete old outputstream databases: %v\n", err)
	}

	if err := deleteOldCompactionDatabases(*raftDir); err != nil {
		glog.Errorf("Could not delete old compaction databases: %v (ignoring)\n", err)
	}

	log.Printf("Initializing RobustIRC…\n")

	if *networkPassword == "" {
		*networkPassword = os.Getenv("ROBUSTIRC_NETWORK_PASSWORD")
	}
	if *networkPassword == "" {
		log.Fatalf("-network_password not set. You MUST protect your network.\n")
	}
	digest := sha1.New()
	digest.Write([]byte(*networkPassword))
	passwordHash := "{SHA}" + base64.StdEncoding.EncodeToString(digest.Sum(nil))

	if *network == "" {
		log.Fatalf("-network_name not set, but required.\n")
	}

	if *peerAddr == "" {
		log.Printf("-peer_addr not set, initializing to %q. Make sure %q is a host:port string that other raft nodes can connect to!\n",
			*listen, *listen)
		*peerAddr = *listen
	}

	ircServer = ircserver.NewIRCServer(*raftDir, *network, time.Now())

	transport := rafthttp.NewHTTPTransport(
		*peerAddr,
		// Not deadlined, otherwise snapshot installments fail.
		robusthttp.Client(*networkPassword, false),
		nil,
		"")

	peerStore = raft.NewJSONPeers(*raftDir, transport)

	if *join == "" && !*singleNode {
		peers, err := peerStore.Peers()
		if err != nil {
			log.Fatal(err.Error())
		}
		if len(peers) == 0 {
			if !*timesafeguard.DisableTimesafeguard {
				log.Fatalf("No peers known and -join not specified. Joining the network is not safe because timesafeguard cannot be called.\n")
			}
		} else {
			if len(peers) == 1 && peers[0] == *peerAddr {
				// To prevent crashlooping too frequently in case the init
				// system directly restarts our process.
				time.Sleep(10 * time.Second)
				log.Fatalf("Only known peer is myself (%q), implying this node was removed from the network. Please kill the process and remove the data.\n", *peerAddr)
			}
			if err := timesafeguard.SynchronizedWithNetwork(*peerAddr, peers, *networkPassword); err != nil {
				log.Fatal(err.Error())
			}
		}
	}

	var p []string

	config := raft.DefaultConfig()
	config.Logger = log.New(glog.LogBridgeFor("INFO"), "", log.Lshortfile)
	if *singleNode {
		config.EnableSingleNode = true
	}

	// Keep 5 snapshots in *raftDir/snapshots, log to stderr.
	fss, err := raft.NewFileSnapshotStore(*raftDir, 5, nil)
	if err != nil {
		log.Fatal(err)
	}

	// How often to check whether a snapshot should be taken. The check is
	// cheap, and the default value far too high for networks with a high
	// number of messages/s.
	// At the same time, it is important that we don’t check too early,
	// otherwise recovering from the most recent snapshot doesn’t work because
	// after recovering, a new snapshot (over the 0 committed messages) will be
	// taken immediately, effectively overwriting the result of the snapshot
	// recovery.
	config.SnapshotInterval = 300 * time.Second

	// Batch as many messages as possible into a single appendEntries RPC.
	// There is no downside to setting this too high.
	config.MaxAppendEntries = 1024

	// It could be that the heartbeat goroutine is not scheduled for a while,
	// so relax the default of 500ms.
	config.LeaderLeaseTimeout = timesafeguard.ElectionTimeout
	config.HeartbeatTimeout = timesafeguard.ElectionTimeout
	config.ElectionTimeout = timesafeguard.ElectionTimeout

	// We use prometheus, so hook up the metrics package (used by raft) to
	// prometheus as well.
	sink, err := metrics_prometheus.NewPrometheusSink()
	if err != nil {
		log.Fatal(err)
	}
	metrics.NewGlobal(metrics.DefaultConfig("raftmetrics"), sink)

	bootstrapping := *singleNode || *join != ""
	logStore, err := raft_store.NewLevelDBStore(filepath.Join(*raftDir, "raftlog"), bootstrapping)
	if err != nil {
		log.Fatal(err)
	}
	ircStore, err = raft_store.NewLevelDBStore(filepath.Join(*raftDir, "irclog"), bootstrapping)
	if err != nil {
		log.Fatal(err)
	}
	fsm := &FSM{
		store:             logStore,
		ircstore:          ircStore,
		lastSnapshotState: make(map[uint64][]byte),
	}

	logcache, err := raft.NewLogCache(config.MaxAppendEntries, logStore)
	if err != nil {
		log.Fatal(err)
	}

	node, err = raft.NewRaft(config, fsm, logcache, logStore, fss, peerStore, transport)
	if err != nil {
		log.Fatal(err)
	}

	if *dumpCanaryState != "" {
		canary(fsm, *dumpCanaryState)
		if *dumpHeapProfile != "" {
			debug.FreeOSMemory()
			f, err := os.Create(*dumpHeapProfile)
			if err != nil {
				log.Fatal(err)
			}
			defer f.Close()
			pprof.WriteHeapProfile(f)
		}
		return
	}

	go func() {
		for {
			secondsInState.WithLabelValues(node.State().String()).Inc()
			time.Sleep(1 * time.Second)
		}
	}()

	privaterouter := httprouter.New()
	privaterouter.Handler("GET", "/", exitOnRecoverHandleFunc(handleStatus))
	privaterouter.Handler("GET", "/irclog", exitOnRecoverHandleFunc(handleIrclog))
	privaterouter.Handler("POST", "/raft/*rest", exitOnRecoverHandler(transport))
	privaterouter.Handler("POST", "/join", exitOnRecoverHandleFunc(handleJoin))
	privaterouter.Handler("POST", "/part", exitOnRecoverHandleFunc(handlePart))
	privaterouter.Handler("GET", "/snapshot", exitOnRecoverHandleFunc(handleSnapshot))
	privaterouter.Handler("GET", "/leader", exitOnRecoverHandleFunc(handleLeader))
	privaterouter.Handler("POST", "/quit", exitOnRecoverHandleFunc(handleQuit))
	privaterouter.Handler("GET", "/config", exitOnRecoverHandleFunc(handleGetConfig))
	privaterouter.Handler("POST", "/config", exitOnRecoverHandleFunc(handlePostConfig))
	privaterouter.Handler("GET", "/metrics", exitOnRecoverHandler(prometheus.Handler()))

	publicrouter := httprouter.New()
	publicrouter.Handle("POST", "/robustirc/v1/:sessionid", exitOnRecoverHandle(handleCreateSession))
	publicrouter.Handle("POST", "/robustirc/v1/:sessionid/message", exitOnRecoverHandle(handlePostMessage))
	publicrouter.Handle("GET", "/robustirc/v1/:sessionid/messages", exitOnRecoverHandle(handleGetMessages))
	publicrouter.Handle("DELETE", "/robustirc/v1/:sessionid", exitOnRecoverHandle(handleDeleteSession))

	a := auth.NewBasicAuthenticator("robustirc", func(user, realm string) string {
		if user == "robustirc" {
			return passwordHash
		}
		return ""
	})

	http.Handle("/robustirc/", publicrouter)
	http.Handle("/", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if username := a.CheckAuth(r); username == "" {
			a.RequireAuth(w, r)
		} else {
			privaterouter.ServeHTTP(w, r)
		}
	}))

	srv := http.Server{Addr: *listen}
	if err := http2.ConfigureServer(&srv, nil); err != nil {
		log.Fatal(err)
	}

	// Manually create the net.TCPListener so that joinMaster() does not run
	// into connection refused errors (the master will try to contact the
	// node before acknowledging the join).
	srv.TLSConfig.Certificates = make([]tls.Certificate, 1)
	srv.TLSConfig.Certificates[0], err = tls.LoadX509KeyPair(*tlsCertPath, *tlsKeyPath)
	if err != nil {
		log.Fatal(err)
	}

	ln, err := net.Listen("tcp", *listen)
	if err != nil {
		log.Fatal(err)
	}
	tlsListener := tls.NewListener(tcpKeepAliveListener{ln.(*net.TCPListener)}, srv.TLSConfig)
	go srv.Serve(tlsListener)

	log.Printf("RobustIRC listening on %q. For status, see %s\n",
		*peerAddr,
		fmt.Sprintf("https://*****:*****@%s/", *peerAddr))

	if *join != "" {
		if err := timesafeguard.SynchronizedWithMasterAndNetwork(*peerAddr, *join, *networkPassword); err != nil {
			log.Fatal(err.Error())
		}

		p = joinMaster(*join, peerStore)
		// TODO(secure): properly handle joins on the server-side where the joining node is already in the network.
	}

	if len(p) > 0 {
		node.SetPeers(p)
	}

	expireSessionsTimer := time.After(expireSessionsInterval)
	secondTicker := time.Tick(1 * time.Second)
	for {
		select {
		case <-secondTicker:
			if node.State() == raft.Shutdown {
				log.Fatal("Node removed from the network (in raft state shutdown), terminating.")
			}

		case <-expireSessionsTimer:
			expireSessionsTimer = time.After(expireSessionsInterval)

			// Race conditions (a node becoming a leader or ceasing to be the
			// leader shortly before/after this runs) are okay, since the timer
			// is triggered often enough on every node so that it will
			// eventually run on the leader.
			if node.State() != raft.Leader {
				continue
			}

			applyMu.Lock()
			for _, msg := range ircServer.ExpireSessions() {
				// Cannot fail, no user input.
				msgbytes, _ := json.Marshal(msg)
				f := node.Apply(msgbytes, 10*time.Second)
				if err := f.Error(); err != nil {
					log.Printf("Apply(): %v\n", err)
					break
				}
			}
			applyMu.Unlock()
		}
	}
}
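// main() references a few helpers that are defined elsewhere in the package:
// printDefault (flag usage output), tcpKeepAliveListener (the TLS listener
// wrapper), and the exitOnRecover* handler wrappers. The sketches below are
// assumptions about their shape, not the actual implementations; the listener
// wrapper follows the well-known tcpKeepAliveListener pattern from net/http's
// source, and the panic wrapper is assumed to turn a handler panic into a
// clean process exit.

// printDefault is assumed to print a single flag in the same
// "-name=default: usage" style as flag.PrintDefaults (sketch; the real helper
// may quote string defaults differently).
func printDefault(f *flag.Flag) {
	fmt.Fprintf(os.Stderr, "  -%s=%s: %s\n", f.Name, f.DefValue, f.Usage)
}

// tcpKeepAliveListener mirrors the helper of the same name in net/http's
// source: it enables TCP keep-alives on accepted connections so that dead
// peers are eventually detected (sketch, assuming the standard pattern).
type tcpKeepAliveListener struct {
	*net.TCPListener
}

func (ln tcpKeepAliveListener) Accept() (net.Conn, error) {
	tc, err := ln.AcceptTCP()
	if err != nil {
		return nil, err
	}
	tc.SetKeepAlive(true)
	tc.SetKeepAlivePeriod(3 * time.Minute)
	return tc, nil
}

// exitOnRecoverHandleFunc is assumed to wrap a handler so that a panic is
// logged and the process exits instead of continuing in an undefined state
// (hypothetical sketch; the real wrapper may behave differently).
func exitOnRecoverHandleFunc(h http.HandlerFunc) http.Handler {
	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		defer func() {
			if rec := recover(); rec != nil {
				glog.Errorf("panic in HTTP handler: %v", rec)
				glog.Flush()
				os.Exit(1)
			}
		}()
		h(w, r)
	})
}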
// TestCompaction does a full snapshot, persists it to disk, restores it and
// makes sure the state matches expectations. The other test functions directly
// test what should be compacted.
func TestCompaction(t *testing.T) {
	ircServer = ircserver.NewIRCServer("", "testnetwork", time.Now())

	tempdir, err := ioutil.TempDir("", "robust-test-")
	if err != nil {
		t.Fatalf("ioutil.TempDir: %v", err)
	}
	defer os.RemoveAll(tempdir)
	flag.Set("raftdir", tempdir)

	logstore, err := raft_store.NewLevelDBStore(filepath.Join(tempdir, "raftlog"), false)
	if err != nil {
		t.Fatalf("Unexpected error in NewLevelDBStore: %v", err)
	}
	ircstore, err := raft_store.NewLevelDBStore(filepath.Join(tempdir, "irclog"), false)
	if err != nil {
		t.Fatalf("Unexpected error in NewLevelDBStore: %v", err)
	}
	fsm := FSM{logstore, ircstore}

	var logs []*raft.Log
	logs = appendLog(logs, `{"Id": {"Id": 1}, "Type": 0, "Data": "auth"}`)
	logs = appendLog(logs, `{"Id": {"Id": 2}, "Session": {"Id": 1}, "Type": 2, "Data": "NICK sECuRE"}`)
	logs = appendLog(logs, `{"Id": {"Id": 3}, "Session": {"Id": 1}, "Type": 2, "Data": "USER blah 0 * :Michael Stapelberg"}`)
	logs = appendLog(logs, `{"Id": {"Id": 4}, "Session": {"Id": 1}, "Type": 2, "Data": "NICK secure_"}`)
	logs = appendLog(logs, `{"Id": {"Id": 5}, "Session": {"Id": 1}, "Type": 2, "Data": "JOIN #chaos-hd"}`)
	logs = appendLog(logs, `{"Id": {"Id": 6}, "Session": {"Id": 1}, "Type": 2, "Data": "JOIN #i3"}`)
	logs = appendLog(logs, `{"Id": {"Id": 7}, "Session": {"Id": 1}, "Type": 2, "Data": "PRIVMSG #chaos-hd :heya"}`)
	logs = appendLog(logs, `{"Id": {"Id": 8}, "Session": {"Id": 1}, "Type": 2, "Data": "PRIVMSG #chaos-hd :newer message"}`)
	logs = appendLog(logs, `{"Id": {"Id": 9}, "Session": {"Id": 1}, "Type": 2, "Data": "PART #i3"}`)

	// These messages are too new to be compacted.
	nowID := time.Now().UnixNano()
	logs = appendLog(logs, `{"Id": {"Id": `+strconv.FormatInt(nowID, 10)+`}, "Session": {"Id": 1}, "Type": 2, "Data": "PART #chaos-hd"}`)
	nowID++
	logs = appendLog(logs, `{"Id": {"Id": `+strconv.FormatInt(nowID, 10)+`}, "Session": {"Id": 1}, "Type": 2, "Data": "JOIN #chaos-hd"}`)

	if err := logstore.StoreLogs(logs); err != nil {
		t.Fatalf("Unexpected error in store.StoreLogs: %v", err)
	}
	for _, log := range logs {
		fsm.Apply(log)
	}

	verifyEndState(t)

	snapshot, err := fsm.Snapshot()
	if err != nil {
		t.Fatalf("Unexpected error in fsm.Snapshot(): %v", err)
	}

	robustsnap, ok := snapshot.(*robustSnapshot)
	if !ok {
		t.Fatalf("fsm.Snapshot() return value is not a robustSnapshot")
	}
	if robustsnap.lastIndex != uint64(len(logs)) {
		t.Fatalf("snapshot does not retain the last message, got: %d, want: %d", robustsnap.lastIndex, len(logs))
	}

	fss, err := raft.NewFileSnapshotStore(tempdir, 5, nil)
	if err != nil {
		t.Fatalf("%v", err)
	}

	sink, err := fss.Create(uint64(len(logs)), 1, []byte{})
	if err != nil {
		t.Fatalf("fss.Create: %v", err)
	}

	if err := snapshot.Persist(sink); err != nil {
		t.Fatalf("Unexpected error in snapshot.Persist(): %v", err)
	}

	snapshots, err := fss.List()
	if err != nil {
		t.Fatalf("fss.List(): %v", err)
	}
	if len(snapshots) != 1 {
		t.Fatalf("len(snapshots): got %d, want 1", len(snapshots))
	}
	_, readcloser, err := fss.Open(snapshots[0].ID)
	if err != nil {
		t.Fatalf("fss.Open(%s): %v", snapshots[0].ID, err)
	}

	if err := fsm.Restore(readcloser); err != nil {
		t.Fatalf("fsm.Restore(): %v", err)
	}

	first, _ := ircstore.FirstIndex()
	last, _ := ircstore.LastIndex()
	if last-first >= uint64(len(logs)) {
		t.Fatalf("Compaction did not decrease log size. got: %d, want: < %d", last-first, len(logs))
	}

	verifyEndState(t)
}
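// TestCompaction builds its input through an appendLog helper that is not
// shown in this section. Assuming it mirrors how applyAndCompact constructs
// raft.Log entries above, a sketch might look like this (hypothetical, not
// necessarily the actual helper):
func appendLog(logs []*raft.Log, msg string) []*raft.Log {
	return append(logs, &raft.Log{
		Type:  raft.LogCommand,
		Index: uint64(len(logs) + 1),
		Data:  []byte(msg),
	})
}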
// TestCompaction does a full snapshot, persists it to disk, restores it and
// makes sure the state matches expectations. The other test functions directly
// test what should be compacted.
func TestCompaction(t *testing.T) {
	ircServer = ircserver.NewIRCServer("", "testnetwork", time.Now())

	tempdir, err := ioutil.TempDir("", "robust-test-")
	if err != nil {
		t.Fatalf("ioutil.TempDir: %v", err)
	}
	defer os.RemoveAll(tempdir)
	flag.Set("raftdir", tempdir)

	logstore, err := raft_store.NewLevelDBStore(filepath.Join(tempdir, "raftlog"), false)
	if err != nil {
		t.Fatalf("Unexpected error in NewLevelDBStore: %v", err)
	}
	ircstore, err := raft_store.NewLevelDBStore(filepath.Join(tempdir, "irclog"), false)
	if err != nil {
		t.Fatalf("Unexpected error in NewLevelDBStore: %v", err)
	}
	fsm := FSM{
		store:             logstore,
		ircstore:          ircstore,
		lastSnapshotState: make(map[uint64][]byte),
	}

	var logs []*raft.Log
	logs = appendLog(logs, `{"Id": {"Id": 1}, "Type": 0, "Data": "auth"}`)
	logs = appendLog(logs, `{"Id": {"Id": 2}, "Session": {"Id": 1}, "Type": 2, "Data": "NICK sECuRE"}`)
	logs = appendLog(logs, `{"Id": {"Id": 3}, "Session": {"Id": 1}, "Type": 2, "Data": "USER blah 0 * :Michael Stapelberg"}`)
	logs = appendLog(logs, `{"Id": {"Id": 4}, "Session": {"Id": 1}, "Type": 2, "Data": "NICK secure_"}`)
	logs = appendLog(logs, `{"Id": {"Id": 5}, "Session": {"Id": 1}, "Type": 2, "Data": "JOIN #chaos-hd"}`)
	logs = appendLog(logs, `{"Id": {"Id": 6}, "Session": {"Id": 1}, "Type": 2, "Data": "JOIN #i3"}`)
	logs = appendLog(logs, `{"Id": {"Id": 7}, "Session": {"Id": 1}, "Type": 2, "Data": "PRIVMSG #chaos-hd :heya"}`)
	logs = appendLog(logs, `{"Id": {"Id": 8}, "Session": {"Id": 1}, "Type": 2, "Data": "PRIVMSG #chaos-hd :newer message"}`)
	logs = appendLog(logs, `{"Id": {"Id": 9}, "Session": {"Id": 1}, "Type": 2, "Data": "PART #i3"}`)

	// These messages are too new to be compacted.
	nowID := time.Now().UnixNano()
	logs = appendLog(logs, `{"Id": {"Id": `+strconv.FormatInt(nowID, 10)+`}, "Session": {"Id": 1}, "Type": 2, "Data": "PART #chaos-hd"}`)
	nowID++
	logs = appendLog(logs, `{"Id": {"Id": `+strconv.FormatInt(nowID, 10)+`}, "Session": {"Id": 1}, "Type": 2, "Data": "JOIN #chaos-hd"}`)

	if err := logstore.StoreLogs(logs); err != nil {
		t.Fatalf("Unexpected error in store.StoreLogs: %v", err)
	}
	for _, log := range logs {
		fsm.Apply(log)
	}

	verifyEndState(t)

	fss, err := raft.NewFileSnapshotStore(tempdir, 5, nil)
	if err != nil {
		t.Fatalf("%v", err)
	}

	// Snapshot twice so that we know state is carried over from one
	// snapshot to the next.
	if err := snapshot(&fsm, fss, uint64(len(logs))); err != nil {
		t.Fatal(err)
	}
	if err := snapshot(&fsm, fss, uint64(len(logs))); err != nil {
		t.Fatal(err)
	}

	ircServer = ircserver.NewIRCServer("", "testnetwork", time.Now())

	if err := restore(&fsm, fss, uint64(len(logs))); err != nil {
		t.Fatal(err)
	}

	verifyEndState(t)

	// Restore() a fresh FSM, then take another snapshot, restore it
	// and verify the end state. This covers the code path where the
	// previous snapshot was not done in the same process run.
	ircstore = fsm.ircstore
	fsm = FSM{
		store:             logstore,
		ircstore:          ircstore,
		lastSnapshotState: make(map[uint64][]byte),
	}
	if err := restore(&fsm, fss, uint64(len(logs))); err != nil {
		t.Fatal(err)
	}
	if err := snapshot(&fsm, fss, uint64(len(logs))); err != nil {
		t.Fatal(err)
	}

	ircServer = ircserver.NewIRCServer("", "testnetwork", time.Now())

	if err := restore(&fsm, fss, uint64(len(logs))); err != nil {
		t.Fatal(err)
	}

	verifyEndState(t)
}
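// The second TestCompaction variant delegates to snapshot and restore helpers
// that do not appear in this section. Under the assumption that they wrap the
// fss.Create/Persist and fss.List/Open/fsm.Restore sequence used by the first
// variant, they could look roughly like the following sketches (hypothetical;
// the real helpers may do additional checks):
func snapshot(fsm *FSM, fss raft.SnapshotStore, lastIndex uint64) error {
	snap, err := fsm.Snapshot()
	if err != nil {
		return err
	}
	sink, err := fss.Create(lastIndex, 1, []byte{})
	if err != nil {
		return err
	}
	// Persist is expected to call sink.Close() itself (hashicorp/raft
	// convention), as in the first TestCompaction variant above.
	return snap.Persist(sink)
}

func restore(fsm *FSM, fss raft.SnapshotStore, lastIndex uint64) error {
	snapshots, err := fss.List()
	if err != nil {
		return err
	}
	if len(snapshots) == 0 {
		return fmt.Errorf("no snapshots found")
	}
	// Open the most recent snapshot and feed it back into the FSM; the real
	// helper might also verify that its index matches lastIndex.
	_, readcloser, err := fss.Open(snapshots[0].ID)
	if err != nil {
		return err
	}
	defer readcloser.Close()
	return fsm.Restore(readcloser)
}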