Example #1
// applyAndCompact feeds the input messages through a fresh FSM, takes a
// snapshot (which compacts the log) and returns the messages that survive
// compaction, in order.
func applyAndCompact(t *testing.T, input []string) []string {
	tempdir, err := ioutil.TempDir("", "robust-test-")
	if err != nil {
		t.Fatalf("ioutil.TempDir: %v", err)
	}
	defer os.RemoveAll(tempdir)

	logstore, err := raft_store.NewLevelDBStore(filepath.Join(tempdir, "raftlog"), false)
	if err != nil {
		t.Fatalf("Unexpected error in NewLevelDBStore: %v", err)
	}
	ircstore, err := raft_store.NewLevelDBStore(filepath.Join(tempdir, "irclog"), false)
	if err != nil {
		t.Fatalf("Unexpected error in NewLevelDBStore: %v", err)
	}
	fsm := FSM{logstore, ircstore}

	var logs []*raft.Log
	for _, msg := range input {
		logs = append(logs, &raft.Log{
			Type:  raft.LogCommand,
			Index: uint64(len(logs) + 1),
			Data:  []byte(msg),
		})
	}

	if err := logstore.StoreLogs(logs); err != nil {
		t.Fatalf("Unexpected error in store.StoreLogs: %v", err)
	}

	for _, log := range logs {
		fsm.Apply(log)
	}

	rawsnap, err := fsm.Snapshot()
	if err != nil {
		t.Fatalf("Unexpected error in fsm.Snapshot: %v", err)
	}
	s := rawsnap.(*robustSnapshot)
	sink := inMemorySink{}
	if err := s.Persist(&sink); err != nil {
		t.Fatalf("Unexpected error in snapshot.Persist: %v", err)
	}

	dec := json.NewDecoder(&sink.b)
	var output []string
	for {
		var l raft.Log
		if err := dec.Decode(&l); err != nil {
			if err == io.EOF {
				break
			}
			t.Fatalf("Unexpected error in json.Decode: %v", err)
		}
		output = append(output, string(l.Data))
	}

	return output
}
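The helper above persists into an inMemorySink, which is not part of this listing. A minimal sketch of such a sink, assuming it only needs to satisfy raft.SnapshotSink and buffer the written JSON in memory (the field name b is taken from the json.NewDecoder call above):

// inMemorySink is a minimal in-memory raft.SnapshotSink for tests. Sketch
// under the assumption that Persist only needs an io.WriteCloser plus the
// ID/Cancel methods of the raft.SnapshotSink interface.
type inMemorySink struct {
	b bytes.Buffer
}

func (s *inMemorySink) Write(p []byte) (int, error) { return s.b.Write(p) }
func (s *inMemorySink) Close() error                { return nil }
func (s *inMemorySink) ID() string                  { return "in-memory" }
func (s *inMemorySink) Cancel() error               { return nil }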
Example #2
// createIrcServer initializes the global ircServer and returns LevelDB-backed
// raft log and irclog stores (rooted in tempdir) together with an FSM that
// uses them.
func createIrcServer(tempdir string) (*raft_store.LevelDBStore, *raft_store.LevelDBStore, FSM, error) {
	ircServer = ircserver.NewIRCServer("", "testnetwork", time.Now())
	flag.Set("raftdir", tempdir)

	logstore, err := raft_store.NewLevelDBStore(filepath.Join(tempdir, "raftlog"), false)
	if err != nil {
		return nil, nil, FSM{}, err
	}
	ircstore, err := raft_store.NewLevelDBStore(filepath.Join(tempdir, "irclog"), false)
	if err != nil {
		return nil, nil, FSM{}, err
	}
	fsm := FSM{store: logstore, ircstore: ircstore}
	return logstore, ircstore, fsm, nil
}
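A hedged usage sketch of this helper in a test, assuming the caller owns the temporary directory and removes it when done (the test name is illustrative only):

func TestUsesCreateIrcServer(t *testing.T) {
	tempdir, err := ioutil.TempDir("", "robust-test-")
	if err != nil {
		t.Fatalf("ioutil.TempDir: %v", err)
	}
	defer os.RemoveAll(tempdir)

	// logstore and ircstore back the FSM; keep references so they stay
	// reachable for the duration of the test.
	logstore, ircstore, fsm, err := createIrcServer(tempdir)
	if err != nil {
		t.Fatalf("createIrcServer: %v", err)
	}
	_, _, _ = logstore, ircstore, fsm // apply logs, snapshot, restore, …
}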
Example #3
func main() {
	flag.Usage = func() {
		// It is unfortunate that we need to re-implement flag.PrintDefaults(),
		// but I cannot see any other way to achieve the grouping of flags.
		fmt.Fprintf(os.Stderr, "RobustIRC server (= node)\n")
		fmt.Fprintf(os.Stderr, "\n")
		fmt.Fprintf(os.Stderr, "The following flags are REQUIRED:\n")
		printDefault(flag.Lookup("network_name"))
		printDefault(flag.Lookup("network_password"))
		printDefault(flag.Lookup("peer_addr"))
		printDefault(flag.Lookup("tls_cert_path"))
		printDefault(flag.Lookup("tls_key_path"))
		fmt.Fprintf(os.Stderr, "\n")
		fmt.Fprintf(os.Stderr, "The following flags are only relevant when bootstrapping the network (once):\n")
		printDefault(flag.Lookup("join"))
		printDefault(flag.Lookup("singlenode"))
		fmt.Fprintf(os.Stderr, "\n")
		fmt.Fprintf(os.Stderr, "The following flags are optional:\n")
		printDefault(flag.Lookup("dump_canary_state"))
		printDefault(flag.Lookup("dump_heap_profile"))
		printDefault(flag.Lookup("canary_compaction_start"))
		printDefault(flag.Lookup("listen"))
		printDefault(flag.Lookup("raftdir"))
		printDefault(flag.Lookup("tls_ca_file"))
		printDefault(flag.Lookup("version"))
		fmt.Fprintf(os.Stderr, "\n")
		fmt.Fprintf(os.Stderr, "The following flags are optional and provided by glog:\n")
		printDefault(flag.Lookup("alsologtostderr"))
		printDefault(flag.Lookup("log_backtrace_at"))
		printDefault(flag.Lookup("log_dir"))
		printDefault(flag.Lookup("log_total_bytes"))
		printDefault(flag.Lookup("logtostderr"))
		printDefault(flag.Lookup("stderrthreshold"))
		printDefault(flag.Lookup("v"))
		printDefault(flag.Lookup("vmodule"))
	}
	flag.Parse()

	// Store logs in -raftdir, unless otherwise specified.
	if flag.Lookup("log_dir").Value.String() == "" {
		flag.Set("log_dir", *raftDir)
	}

	defer glog.Flush()
	glog.MaxSize = 64 * 1024 * 1024
	glog.CopyStandardLogTo("INFO")

	log.Printf("RobustIRC %s\n", Version)
	if *version {
		return
	}

	if _, err := os.Stat(filepath.Join(*raftDir, "deletestate")); err == nil {
		if err := os.RemoveAll(*raftDir); err != nil {
			log.Fatal(err)
		}
		if err := os.Mkdir(*raftDir, 0700); err != nil {
			log.Fatal(err)
		}
		log.Printf("Deleted %q because %q existed\n", *raftDir, filepath.Join(*raftDir, "deletestate"))
	}

	if err := outputstream.DeleteOldDatabases(*raftDir); err != nil {
		log.Fatalf("Could not delete old outputstream databases: %v\n", err)
	}

	if err := deleteOldCompactionDatabases(*raftDir); err != nil {
		glog.Errorf("Could not delete old compaction databases: %v (ignoring)\n", err)
	}

	log.Printf("Initializing RobustIRC…\n")

	if *networkPassword == "" {
		*networkPassword = os.Getenv("ROBUSTIRC_NETWORK_PASSWORD")
	}
	if *networkPassword == "" {
		log.Fatalf("-network_password not set. You MUST protect your network.\n")
	}
	digest := sha1.New()
	digest.Write([]byte(*networkPassword))
	passwordHash := "{SHA}" + base64.StdEncoding.EncodeToString(digest.Sum(nil))

	if *network == "" {
		log.Fatalf("-network_name not set, but required.\n")
	}

	if *peerAddr == "" {
		log.Printf("-peer_addr not set, initializing to %q. Make sure %q is a host:port string that other raft nodes can connect to!\n", *listen, *listen)
		*peerAddr = *listen
	}

	ircServer = ircserver.NewIRCServer(*raftDir, *network, time.Now())

	transport := rafthttp.NewHTTPTransport(
		*peerAddr,
		// Not deadlined, otherwise snapshot installations fail.
		robusthttp.Client(*networkPassword, false),
		nil,
		"")

	peerStore = raft.NewJSONPeers(*raftDir, transport)

	if *join == "" && !*singleNode {
		peers, err := peerStore.Peers()
		if err != nil {
			log.Fatal(err.Error())
		}
		if len(peers) == 0 {
			if !*timesafeguard.DisableTimesafeguard {
				log.Fatalf("No peers known and -join not specified. Joining the network is not safe because timesafeguard cannot be called.\n")
			}
		} else {
			if len(peers) == 1 && peers[0] == *peerAddr {
				// To prevent crashlooping too frequently in case the init system directly restarts our process.
				time.Sleep(10 * time.Second)
				log.Fatalf("Only known peer is myself (%q), implying this node was removed from the network. Please kill the process and remove the data.\n", *peerAddr)
			}
			if err := timesafeguard.SynchronizedWithNetwork(*peerAddr, peers, *networkPassword); err != nil {
				log.Fatal(err.Error())
			}
		}
	}

	var p []string

	config := raft.DefaultConfig()
	config.Logger = log.New(glog.LogBridgeFor("INFO"), "", log.Lshortfile)
	if *singleNode {
		config.EnableSingleNode = true
	}

	// Keep 5 snapshots in *raftDir/snapshots, log to stderr.
	fss, err := raft.NewFileSnapshotStore(*raftDir, 5, nil)
	if err != nil {
		log.Fatal(err)
	}

	// How often to check whether a snapshot should be taken. The check is
	// cheap, and the default value far too high for networks with a high
	// number of messages/s.
	// At the same time, it is important that we don’t check too early,
	// otherwise recovering from the most recent snapshot doesn’t work because
	// after recovering, a new snapshot (over the 0 committed messages) will be
	// taken immediately, effectively overwriting the result of the snapshot
	// recovery.
	config.SnapshotInterval = 300 * time.Second

	// Batch as many messages as possible into a single appendEntries RPC.
	// There is no downside to setting this too high.
	config.MaxAppendEntries = 1024

	// It could be that the heartbeat goroutine is not scheduled for a while,
	// so relax the default of 500ms.
	config.LeaderLeaseTimeout = timesafeguard.ElectionTimeout
	config.HeartbeatTimeout = timesafeguard.ElectionTimeout
	config.ElectionTimeout = timesafeguard.ElectionTimeout

	// We use prometheus, so hook up the metrics package (used by raft) to
	// prometheus as well.
	sink, err := metrics_prometheus.NewPrometheusSink()
	if err != nil {
		log.Fatal(err)
	}
	metrics.NewGlobal(metrics.DefaultConfig("raftmetrics"), sink)

	bootstrapping := *singleNode || *join != ""
	logStore, err := raft_store.NewLevelDBStore(filepath.Join(*raftDir, "raftlog"), bootstrapping)
	if err != nil {
		log.Fatal(err)
	}
	ircStore, err = raft_store.NewLevelDBStore(filepath.Join(*raftDir, "irclog"), bootstrapping)
	if err != nil {
		log.Fatal(err)
	}
	fsm := &FSM{
		store:             logStore,
		ircstore:          ircStore,
		lastSnapshotState: make(map[uint64][]byte),
	}
	logcache, err := raft.NewLogCache(config.MaxAppendEntries, logStore)
	if err != nil {
		log.Fatal(err)
	}

	node, err = raft.NewRaft(config, fsm, logcache, logStore, fss, peerStore, transport)
	if err != nil {
		log.Fatal(err)
	}

	if *dumpCanaryState != "" {
		canary(fsm, *dumpCanaryState)
		if *dumpHeapProfile != "" {
			debug.FreeOSMemory()
			f, err := os.Create(*dumpHeapProfile)
			if err != nil {
				log.Fatal(err)
			}
			defer f.Close()
			pprof.WriteHeapProfile(f)
		}
		return
	}

	go func() {
		for {
			secondsInState.WithLabelValues(node.State().String()).Inc()
			time.Sleep(1 * time.Second)
		}
	}()

	privaterouter := httprouter.New()
	privaterouter.Handler("GET", "/", exitOnRecoverHandleFunc(handleStatus))
	privaterouter.Handler("GET", "/irclog", exitOnRecoverHandleFunc(handleIrclog))
	privaterouter.Handler("POST", "/raft/*rest", exitOnRecoverHandler(transport))
	privaterouter.Handler("POST", "/join", exitOnRecoverHandleFunc(handleJoin))
	privaterouter.Handler("POST", "/part", exitOnRecoverHandleFunc(handlePart))
	privaterouter.Handler("GET", "/snapshot", exitOnRecoverHandleFunc(handleSnapshot))
	privaterouter.Handler("GET", "/leader", exitOnRecoverHandleFunc(handleLeader))
	privaterouter.Handler("POST", "/quit", exitOnRecoverHandleFunc(handleQuit))
	privaterouter.Handler("GET", "/config", exitOnRecoverHandleFunc(handleGetConfig))
	privaterouter.Handler("POST", "/config", exitOnRecoverHandleFunc(handlePostConfig))
	privaterouter.Handler("GET", "/metrics", exitOnRecoverHandler(prometheus.Handler()))

	publicrouter := httprouter.New()
	publicrouter.Handle("POST", "/robustirc/v1/:sessionid", exitOnRecoverHandle(handleCreateSession))
	publicrouter.Handle("POST", "/robustirc/v1/:sessionid/message", exitOnRecoverHandle(handlePostMessage))
	publicrouter.Handle("GET", "/robustirc/v1/:sessionid/messages", exitOnRecoverHandle(handleGetMessages))
	publicrouter.Handle("DELETE", "/robustirc/v1/:sessionid", exitOnRecoverHandle(handleDeleteSession))

	a := auth.NewBasicAuthenticator("robustirc", func(user, realm string) string {
		if user == "robustirc" {
			return passwordHash
		}
		return ""
	})

	http.Handle("/robustirc/", publicrouter)

	http.Handle("/", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if username := a.CheckAuth(r); username == "" {
			a.RequireAuth(w, r)
		} else {
			privaterouter.ServeHTTP(w, r)
		}
	}))

	srv := http.Server{Addr: *listen}
	if err := http2.ConfigureServer(&srv, nil); err != nil {
		log.Fatal(err)
	}

	// Manually create the net.TCPListener so that joinMaster() does not run
	// into connection refused errors (the master will try to contact the
	// node before acknowledging the join).
	srv.TLSConfig.Certificates = make([]tls.Certificate, 1)
	srv.TLSConfig.Certificates[0], err = tls.LoadX509KeyPair(*tlsCertPath, *tlsKeyPath)
	if err != nil {
		log.Fatal(err)
	}

	ln, err := net.Listen("tcp", *listen)
	if err != nil {
		log.Fatal(err)
	}

	tlsListener := tls.NewListener(tcpKeepAliveListener{ln.(*net.TCPListener)}, srv.TLSConfig)
	go srv.Serve(tlsListener)

	log.Printf("RobustIRC listening on %q. For status, see %s\n",
		*peerAddr,
		fmt.Sprintf("https://*****:*****@%s/", *networkPassword, *peerAddr))

	if *join != "" {
		if err := timesafeguard.SynchronizedWithMasterAndNetwork(*peerAddr, *join, *networkPassword); err != nil {
			log.Fatal(err.Error())
		}

		p = joinMaster(*join, peerStore)
		// TODO(secure): properly handle joins on the server-side where the joining node is already in the network.
	}

	if len(p) > 0 {
		node.SetPeers(p)
	}

	expireSessionsTimer := time.After(expireSessionsInterval)
	secondTicker := time.Tick(1 * time.Second)
	for {
		select {
		case <-secondTicker:
			if node.State() == raft.Shutdown {
				log.Fatal("Node removed from the network (in raft state shutdown), terminating.")
			}
		case <-expireSessionsTimer:
			expireSessionsTimer = time.After(expireSessionsInterval)

			// Race conditions (a node becoming a leader or ceasing to be the
			// leader shortly before/after this runs) are okay, since the timer
			// is triggered often enough on every node so that it will
			// eventually run on the leader.
			if node.State() != raft.Leader {
				continue
			}

			applyMu.Lock()
			for _, msg := range ircServer.ExpireSessions() {
				// Cannot fail, no user input.
				msgbytes, _ := json.Marshal(msg)
				f := node.Apply(msgbytes, 10*time.Second)
				if err := f.Error(); err != nil {
					log.Printf("Apply(): %v\n", err)
					break
				}
			}
			applyMu.Unlock()
		}
	}
}
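main() wraps the accepted TCP listener in a tcpKeepAliveListener (see the tls.NewListener call above), a type that is not part of this listing. A sketch of the usual pattern, modelled on the unexported helper of the same name in net/http; the 3 minute keep-alive period is an assumption:

// tcpKeepAliveListener wraps a *net.TCPListener so that accepted connections
// have TCP keep-alives enabled, preventing dead peers from leaving half-open
// connections around.
type tcpKeepAliveListener struct {
	*net.TCPListener
}

func (ln tcpKeepAliveListener) Accept() (net.Conn, error) {
	tc, err := ln.AcceptTCP()
	if err != nil {
		return nil, err
	}
	tc.SetKeepAlive(true)
	tc.SetKeepAlivePeriod(3 * time.Minute)
	return tc, nil
}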
Example #4
// TestCompaction does a full snapshot, persists it to disk, restores it and
// makes sure the state matches expectations. The other test functions directly
// test what should be compacted.
func TestCompaction(t *testing.T) {
	ircServer = ircserver.NewIRCServer("", "testnetwork", time.Now())

	tempdir, err := ioutil.TempDir("", "robust-test-")
	if err != nil {
		t.Fatalf("ioutil.TempDir: %v", err)
	}
	defer os.RemoveAll(tempdir)

	flag.Set("raftdir", tempdir)

	logstore, err := raft_store.NewLevelDBStore(filepath.Join(tempdir, "raftlog"), false)
	if err != nil {
		t.Fatalf("Unexpected error in NewLevelDBStore: %v", err)
	}
	ircstore, err := raft_store.NewLevelDBStore(filepath.Join(tempdir, "irclog"), false)
	if err != nil {
		t.Fatalf("Unexpected error in NewLevelDBStore: %v", err)
	}
	fsm := FSM{logstore, ircstore}

	var logs []*raft.Log
	logs = appendLog(logs, `{"Id": {"Id": 1}, "Type": 0, "Data": "auth"}`)
	logs = appendLog(logs, `{"Id": {"Id": 2}, "Session": {"Id": 1}, "Type": 2, "Data": "NICK sECuRE"}`)
	logs = appendLog(logs, `{"Id": {"Id": 3}, "Session": {"Id": 1}, "Type": 2, "Data": "USER blah 0 * :Michael Stapelberg"}`)
	logs = appendLog(logs, `{"Id": {"Id": 4}, "Session": {"Id": 1}, "Type": 2, "Data": "NICK secure_"}`)
	logs = appendLog(logs, `{"Id": {"Id": 5}, "Session": {"Id": 1}, "Type": 2, "Data": "JOIN #chaos-hd"}`)
	logs = appendLog(logs, `{"Id": {"Id": 6}, "Session": {"Id": 1}, "Type": 2, "Data": "JOIN #i3"}`)
	logs = appendLog(logs, `{"Id": {"Id": 7}, "Session": {"Id": 1}, "Type": 2, "Data": "PRIVMSG #chaos-hd :heya"}`)
	logs = appendLog(logs, `{"Id": {"Id": 8}, "Session": {"Id": 1}, "Type": 2, "Data": "PRIVMSG #chaos-hd :newer message"}`)
	logs = appendLog(logs, `{"Id": {"Id": 9}, "Session": {"Id": 1}, "Type": 2, "Data": "PART #i3"}`)

	// These messages are too new to be compacted.
	nowID := time.Now().UnixNano()
	logs = appendLog(logs, `{"Id": {"Id": `+strconv.FormatInt(nowID, 10)+`}, "Session": {"Id": 1}, "Type": 2, "Data": "PART #chaos-hd"}`)
	nowID++
	logs = appendLog(logs, `{"Id": {"Id": `+strconv.FormatInt(nowID, 10)+`}, "Session": {"Id": 1}, "Type": 2, "Data": "JOIN #chaos-hd"}`)

	if err := logstore.StoreLogs(logs); err != nil {
		t.Fatalf("Unexpected error in store.StoreLogs: %v", err)
	}
	for _, log := range logs {
		fsm.Apply(log)
	}

	verifyEndState(t)

	snapshot, err := fsm.Snapshot()
	if err != nil {
		t.Fatalf("Unexpected error in fsm.Snapshot(): %v", err)
	}

	robustsnap, ok := snapshot.(*robustSnapshot)
	if !ok {
		t.Fatalf("fsm.Snapshot() return value is not a robustSnapshot")
	}
	if robustsnap.lastIndex != uint64(len(logs)) {
		t.Fatalf("snapshot does not retain the last message, got: %d, want: %d", robustsnap.lastIndex, len(logs))
	}

	fss, err := raft.NewFileSnapshotStore(tempdir, 5, nil)
	if err != nil {
		t.Fatalf("%v", err)
	}

	sink, err := fss.Create(uint64(len(logs)), 1, []byte{})
	if err != nil {
		t.Fatalf("fss.Create: %v", err)
	}

	if err := snapshot.Persist(sink); err != nil {
		t.Fatalf("Unexpected error in snapshot.Persist(): %v", err)
	}

	snapshots, err := fss.List()
	if err != nil {
		t.Fatalf("fss.List(): %v", err)
	}
	if len(snapshots) != 1 {
		t.Fatalf("len(snapshots): got %d, want 1", len(snapshots))
	}
	_, readcloser, err := fss.Open(snapshots[0].ID)
	if err != nil {
		t.Fatalf("fss.Open(%s): %v", snapshots[0].ID, err)
	}

	if err := fsm.Restore(readcloser); err != nil {
		t.Fatalf("fsm.Restore(): %v", err)
	}

	first, _ := ircstore.FirstIndex()
	last, _ := ircstore.LastIndex()

	if last-first >= uint64(len(logs)) {
		t.Fatalf("Compaction did not decrease log size. got: %d, want: < %d", last-first, len(logs))
	}

	verifyEndState(t)
}
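Examples #4 and #5 build their input through an appendLog helper that is not shown here. A minimal sketch under the assumption that it mirrors the log construction loop from Example #1 (command entries with 1-based indices):

// appendLog appends a raft command entry carrying msg and returns the grown
// slice. Sketch based on the log construction in Example #1.
func appendLog(logs []*raft.Log, msg string) []*raft.Log {
	return append(logs, &raft.Log{
		Type:  raft.LogCommand,
		Index: uint64(len(logs) + 1),
		Data:  []byte(msg),
	})
}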
Example #5
// TestCompaction does a full snapshot, persists it to disk, restores it and
// makes sure the state matches expectations. The other test functions directly
// test what should be compacted.
func TestCompaction(t *testing.T) {
	ircServer = ircserver.NewIRCServer("", "testnetwork", time.Now())

	tempdir, err := ioutil.TempDir("", "robust-test-")
	if err != nil {
		t.Fatalf("ioutil.TempDir: %v", err)
	}
	defer os.RemoveAll(tempdir)

	flag.Set("raftdir", tempdir)

	logstore, err := raft_store.NewLevelDBStore(filepath.Join(tempdir, "raftlog"), false)
	if err != nil {
		t.Fatalf("Unexpected error in NewLevelDBStore: %v", err)
	}
	ircstore, err := raft_store.NewLevelDBStore(filepath.Join(tempdir, "irclog"), false)
	if err != nil {
		t.Fatalf("Unexpected error in NewLevelDBStore: %v", err)
	}
	fsm := FSM{
		store:             logstore,
		ircstore:          ircstore,
		lastSnapshotState: make(map[uint64][]byte),
	}

	var logs []*raft.Log
	logs = appendLog(logs, `{"Id": {"Id": 1}, "Type": 0, "Data": "auth"}`)
	logs = appendLog(logs, `{"Id": {"Id": 2}, "Session": {"Id": 1}, "Type": 2, "Data": "NICK sECuRE"}`)
	logs = appendLog(logs, `{"Id": {"Id": 3}, "Session": {"Id": 1}, "Type": 2, "Data": "USER blah 0 * :Michael Stapelberg"}`)
	logs = appendLog(logs, `{"Id": {"Id": 4}, "Session": {"Id": 1}, "Type": 2, "Data": "NICK secure_"}`)
	logs = appendLog(logs, `{"Id": {"Id": 5}, "Session": {"Id": 1}, "Type": 2, "Data": "JOIN #chaos-hd"}`)
	logs = appendLog(logs, `{"Id": {"Id": 6}, "Session": {"Id": 1}, "Type": 2, "Data": "JOIN #i3"}`)
	logs = appendLog(logs, `{"Id": {"Id": 7}, "Session": {"Id": 1}, "Type": 2, "Data": "PRIVMSG #chaos-hd :heya"}`)
	logs = appendLog(logs, `{"Id": {"Id": 8}, "Session": {"Id": 1}, "Type": 2, "Data": "PRIVMSG #chaos-hd :newer message"}`)
	logs = appendLog(logs, `{"Id": {"Id": 9}, "Session": {"Id": 1}, "Type": 2, "Data": "PART #i3"}`)

	// These messages are too new to be compacted.
	nowID := time.Now().UnixNano()
	logs = appendLog(logs, `{"Id": {"Id": `+strconv.FormatInt(nowID, 10)+`}, "Session": {"Id": 1}, "Type": 2, "Data": "PART #chaos-hd"}`)
	nowID++
	logs = appendLog(logs, `{"Id": {"Id": `+strconv.FormatInt(nowID, 10)+`}, "Session": {"Id": 1}, "Type": 2, "Data": "JOIN #chaos-hd"}`)

	if err := logstore.StoreLogs(logs); err != nil {
		t.Fatalf("Unexpected error in store.StoreLogs: %v", err)
	}
	for _, log := range logs {
		fsm.Apply(log)
	}

	verifyEndState(t)

	fss, err := raft.NewFileSnapshotStore(tempdir, 5, nil)
	if err != nil {
		t.Fatalf("%v", err)
	}

	// Snapshot twice so that we know state is carried over from one
	// snapshot to the next.
	if err := snapshot(&fsm, fss, uint64(len(logs))); err != nil {
		t.Fatal(err)
	}
	if err := snapshot(&fsm, fss, uint64(len(logs))); err != nil {
		t.Fatal(err)
	}

	ircServer = ircserver.NewIRCServer("", "testnetwork", time.Now())

	if err := restore(&fsm, fss, uint64(len(logs))); err != nil {
		t.Fatal(err)
	}

	verifyEndState(t)

	// Create a fresh FSM and Restore() into it, then take another snapshot,
	// restore that and verify the end state. This covers the code path where
	// the previous snapshot was not taken in the same process run.
	ircstore = fsm.ircstore
	fsm = FSM{
		store:             logstore,
		ircstore:          ircstore,
		lastSnapshotState: make(map[uint64][]byte),
	}

	if err := restore(&fsm, fss, uint64(len(logs))); err != nil {
		t.Fatal(err)
	}

	if err := snapshot(&fsm, fss, uint64(len(logs))); err != nil {
		t.Fatal(err)
	}

	ircServer = ircserver.NewIRCServer("", "testnetwork", time.Now())

	if err := restore(&fsm, fss, uint64(len(logs))); err != nil {
		t.Fatal(err)
	}

	verifyEndState(t)
}
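Example #5 relies on snapshot() and restore() helpers that are not included in this listing. A hedged reconstruction from the inline steps of Example #4; the unused index parameter in restore and the error text are assumptions:

// snapshot takes a snapshot of fsm and persists it into fss at the given
// index. Sketch reconstructed from the inline steps in Example #4.
func snapshot(fsm *FSM, fss *raft.FileSnapshotStore, index uint64) error {
	snap, err := fsm.Snapshot()
	if err != nil {
		return err
	}
	sink, err := fss.Create(index, 1, []byte{})
	if err != nil {
		return err
	}
	return snap.Persist(sink)
}

// restore opens the most recent snapshot in fss and restores fsm from it.
// The index parameter is kept only to match the call sites above.
func restore(fsm *FSM, fss *raft.FileSnapshotStore, index uint64) error {
	snapshots, err := fss.List()
	if err != nil {
		return err
	}
	if len(snapshots) == 0 {
		return fmt.Errorf("no snapshots found")
	}
	_, readcloser, err := fss.Open(snapshots[0].ID)
	if err != nil {
		return err
	}
	defer readcloser.Close()
	return fsm.Restore(readcloser)
}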