Exemplo n.º 1
0
// exitOnRecover is used to circumvent the recover handler that net/http
// installs. We need to exit in order to get restarted by the init
// system/supervisor and get into a clean state again.
func exitOnRecover() {
	if r := recover(); r != nil {
		// This mimics go/src/net/http/server.go.
		const size = 64 << 10
		buf := make([]byte, size)
		buf = buf[:runtime.Stack(buf, false)]
		glog.Errorf("http: panic serving: %v\n%s", r, buf)
		glog.Flush()
		os.Exit(1)
	}
}
Exemplo n.º 2
0
func TestMain(m *testing.M) {
	defer glog.Flush()
	flag.Parse()
	tempdir, err := ioutil.TempDir("", "robustirc-test-raftdir-")
	if err != nil {
		log.Fatal(err)
	}
	raftDir = &tempdir
	// TODO: cleanup tmp-outputstream and permanent-compaction*
	os.Exit(m.Run())
}
Exemplo n.º 3
0
func main() {
	defer glog.Flush()
	glog.CopyStandardLogTo("INFO")
	flag.Parse()
	if strings.TrimSpace(*network) == "" {
		log.Fatalf("You need to specify -network")
	}

	binaryHash := fileHash(*binaryPath)
	glog.Infof("binaryHash = %s", binaryHash)

	servers := util.ResolveNetwork(*network)
	log.Printf("Checking network health\n")
	if statuses, err := util.EnsureNetworkHealthy(servers, *networkPassword); err != nil {
		log.Fatalf("Aborting upgrade for safety: %v", err)
	} else {
		if allNodesUpdated(statuses, binaryHash) {
			log.Printf("All nodes are already running the requested version.\n")
			return
		}
	}

	log.Printf("Restarting %q nodes until their binary hash is %s\n", *network, binaryHash)

	for rtry := 0; rtry < 5; rtry++ {
		for _, server := range servers {
			var statuses map[string]util.ServerStatus
			var err error

			started := time.Now()
			for time.Since(started) < *networkHealthTimeout {
				statuses, err = util.EnsureNetworkHealthy(servers, *networkPassword)
				if err != nil {
					log.Printf("Network is not healthy: %v\n", err)
					time.Sleep(1 * time.Second)
					continue
				}
				log.Printf("Network became healthy.\n")
				break
			}
			if err != nil {
				log.Fatalf("Network did not become healthy within %v, aborting. (reason: %v)\n", *networkHealthTimeout, err)
			}

			if statuses[server].ExecutableHash == binaryHash {
				if allNodesUpdated(statuses, binaryHash) {
					log.Printf("All done!\n")
					return
				}
				log.Printf("Skipping %q which is already running the requested version\n", server)
				continue
			}

			lastApplied := statuses[server].AppliedIndex

			log.Printf("Killing node %q\n", server)
			if err := quit(server); err != nil {
				log.Printf("%v\n", err)
			}

			for htry := 0; htry < 60; htry++ {
				time.Sleep(1 * time.Second)
				current, err := util.GetServerStatus(server, *networkPassword)
				if err != nil {
					log.Printf("Node unhealthy: %v\n", err)
					continue
				}
				if current.ExecutableHash != binaryHash {
					log.Printf("Node %q came up with hash %s instead of %s?!\n",
						server, current.ExecutableHash, binaryHash)
					break
				}
				if current.AppliedIndex < lastApplied {
					log.Printf("Node %q has not yet applied all messages it saw before, waiting (got %d, want ≥ %d)\n",
						server, current.AppliedIndex, lastApplied)
					continue
				}
				log.Printf("Node %q was upgraded and is healthy again\n", server)
				break
			}
		}
	}

	log.Printf("All done!\n")
}
Exemplo n.º 4
0
func main() {
	flag.Usage = func() {
		// It is unfortunate that we need to re-implement flag.PrintDefaults(),
		// but I cannot see any other way to achieve the grouping of flags.
		fmt.Fprintf(os.Stderr, "RobustIRC server (= node)\n")
		fmt.Fprintf(os.Stderr, "\n")
		fmt.Fprintf(os.Stderr, "The following flags are REQUIRED:\n")
		printDefault(flag.Lookup("network_name"))
		printDefault(flag.Lookup("network_password"))
		printDefault(flag.Lookup("peer_addr"))
		printDefault(flag.Lookup("tls_cert_path"))
		printDefault(flag.Lookup("tls_key_path"))
		fmt.Fprintf(os.Stderr, "\n")
		fmt.Fprintf(os.Stderr, "The following flags are only relevant when bootstrapping the network (once):\n")
		printDefault(flag.Lookup("join"))
		printDefault(flag.Lookup("singlenode"))
		fmt.Fprintf(os.Stderr, "\n")
		fmt.Fprintf(os.Stderr, "The following flags are optional:\n")
		printDefault(flag.Lookup("dump_canary_state"))
		printDefault(flag.Lookup("dump_heap_profile"))
		printDefault(flag.Lookup("canary_compaction_start"))
		printDefault(flag.Lookup("listen"))
		printDefault(flag.Lookup("raftdir"))
		printDefault(flag.Lookup("tls_ca_file"))
		printDefault(flag.Lookup("version"))
		fmt.Fprintf(os.Stderr, "\n")
		fmt.Fprintf(os.Stderr, "The following flags are optional and provided by glog:\n")
		printDefault(flag.Lookup("alsologtostderr"))
		printDefault(flag.Lookup("log_backtrace_at"))
		printDefault(flag.Lookup("log_dir"))
		printDefault(flag.Lookup("log_total_bytes"))
		printDefault(flag.Lookup("logtostderr"))
		printDefault(flag.Lookup("stderrthreshold"))
		printDefault(flag.Lookup("v"))
		printDefault(flag.Lookup("vmodule"))
	}
	flag.Parse()

	// Store logs in -raftdir, unless otherwise specified.
	if flag.Lookup("log_dir").Value.String() == "" {
		flag.Set("log_dir", *raftDir)
	}

	defer glog.Flush()
	glog.MaxSize = 64 * 1024 * 1024
	glog.CopyStandardLogTo("INFO")

	log.Printf("RobustIRC %s\n", Version)
	if *version {
		return
	}

	if _, err := os.Stat(filepath.Join(*raftDir, "deletestate")); err == nil {
		if err := os.RemoveAll(*raftDir); err != nil {
			log.Fatal(err)
		}
		if err := os.Mkdir(*raftDir, 0700); err != nil {
			log.Fatal(err)
		}
		log.Printf("Deleted %q because %q existed\n", *raftDir, filepath.Join(*raftDir, "deletestate"))
	}

	if err := outputstream.DeleteOldDatabases(*raftDir); err != nil {
		log.Fatalf("Could not delete old outputstream databases: %v\n", err)
	}

	if err := deleteOldCompactionDatabases(*raftDir); err != nil {
		glog.Errorf("Could not delete old compaction databases: %v (ignoring)\n", err)
	}

	log.Printf("Initializing RobustIRC…\n")

	if *networkPassword == "" {
		*networkPassword = os.Getenv("ROBUSTIRC_NETWORK_PASSWORD")
	}
	if *networkPassword == "" {
		log.Fatalf("-network_password not set. You MUST protect your network.\n")
	}
	digest := sha1.New()
	digest.Write([]byte(*networkPassword))
	passwordHash := "{SHA}" + base64.StdEncoding.EncodeToString(digest.Sum(nil))

	if *network == "" {
		log.Fatalf("-network_name not set, but required.\n")
	}

	if *peerAddr == "" {
		log.Printf("-peer_addr not set, initializing to %q. Make sure %q is a host:port string that other raft nodes can connect to!\n", *listen, *listen)
		*peerAddr = *listen
	}

	ircServer = ircserver.NewIRCServer(*raftDir, *network, time.Now())

	transport := rafthttp.NewHTTPTransport(
		*peerAddr,
		// Not deadlined, otherwise snapshot installments fail.
		robusthttp.Client(*networkPassword, false),
		nil,
		"")

	peerStore = raft.NewJSONPeers(*raftDir, transport)

	if *join == "" && !*singleNode {
		peers, err := peerStore.Peers()
		if err != nil {
			log.Fatal(err.Error())
		}
		if len(peers) == 0 {
			if !*timesafeguard.DisableTimesafeguard {
				log.Fatalf("No peers known and -join not specified. Joining the network is not safe because timesafeguard cannot be called.\n")
			}
		} else {
			if len(peers) == 1 && peers[0] == *peerAddr {
				// To prevent crashlooping too frequently in case the init system directly restarts our process.
				time.Sleep(10 * time.Second)
				log.Fatalf("Only known peer is myself (%q), implying this node was removed from the network. Please kill the process and remove the data.\n", *peerAddr)
			}
			if err := timesafeguard.SynchronizedWithNetwork(*peerAddr, peers, *networkPassword); err != nil {
				log.Fatal(err.Error())
			}
		}
	}

	var p []string

	config := raft.DefaultConfig()
	config.Logger = log.New(glog.LogBridgeFor("INFO"), "", log.Lshortfile)
	if *singleNode {
		config.EnableSingleNode = true
	}

	// Keep 5 snapshots in *raftDir/snapshots, log to stderr.
	fss, err := raft.NewFileSnapshotStore(*raftDir, 5, nil)
	if err != nil {
		log.Fatal(err)
	}

	// How often to check whether a snapshot should be taken. The check is
	// cheap, and the default value far too high for networks with a high
	// number of messages/s.
	// At the same time, it is important that we don’t check too early,
	// otherwise recovering from the most recent snapshot doesn’t work because
	// after recovering, a new snapshot (over the 0 committed messages) will be
	// taken immediately, effectively overwriting the result of the snapshot
	// recovery.
	config.SnapshotInterval = 300 * time.Second

	// Batch as many messages as possible into a single appendEntries RPC.
	// There is no downside to setting this too high.
	config.MaxAppendEntries = 1024

	// It could be that the heartbeat goroutine is not scheduled for a while,
	// so relax the default of 500ms.
	config.LeaderLeaseTimeout = timesafeguard.ElectionTimeout
	config.HeartbeatTimeout = timesafeguard.ElectionTimeout
	config.ElectionTimeout = timesafeguard.ElectionTimeout

	// We use prometheus, so hook up the metrics package (used by raft) to
	// prometheus as well.
	sink, err := metrics_prometheus.NewPrometheusSink()
	if err != nil {
		log.Fatal(err)
	}
	metrics.NewGlobal(metrics.DefaultConfig("raftmetrics"), sink)

	bootstrapping := *singleNode || *join != ""
	logStore, err := raft_store.NewLevelDBStore(filepath.Join(*raftDir, "raftlog"), bootstrapping)
	if err != nil {
		log.Fatal(err)
	}
	ircStore, err = raft_store.NewLevelDBStore(filepath.Join(*raftDir, "irclog"), bootstrapping)
	if err != nil {
		log.Fatal(err)
	}
	fsm := &FSM{
		store:             logStore,
		ircstore:          ircStore,
		lastSnapshotState: make(map[uint64][]byte),
	}
	logcache, err := raft.NewLogCache(config.MaxAppendEntries, logStore)
	if err != nil {
		log.Fatal(err)
	}

	node, err = raft.NewRaft(config, fsm, logcache, logStore, fss, peerStore, transport)
	if err != nil {
		log.Fatal(err)
	}

	if *dumpCanaryState != "" {
		canary(fsm, *dumpCanaryState)
		if *dumpHeapProfile != "" {
			debug.FreeOSMemory()
			f, err := os.Create(*dumpHeapProfile)
			if err != nil {
				log.Fatal(err)
			}
			defer f.Close()
			pprof.WriteHeapProfile(f)
		}
		return
	}

	go func() {
		for {
			secondsInState.WithLabelValues(node.State().String()).Inc()
			time.Sleep(1 * time.Second)
		}
	}()

	privaterouter := httprouter.New()
	privaterouter.Handler("GET", "/", exitOnRecoverHandleFunc(handleStatus))
	privaterouter.Handler("GET", "/irclog", exitOnRecoverHandleFunc(handleIrclog))
	privaterouter.Handler("POST", "/raft/*rest", exitOnRecoverHandler(transport))
	privaterouter.Handler("POST", "/join", exitOnRecoverHandleFunc(handleJoin))
	privaterouter.Handler("POST", "/part", exitOnRecoverHandleFunc(handlePart))
	privaterouter.Handler("GET", "/snapshot", exitOnRecoverHandleFunc(handleSnapshot))
	privaterouter.Handler("GET", "/leader", exitOnRecoverHandleFunc(handleLeader))
	privaterouter.Handler("POST", "/quit", exitOnRecoverHandleFunc(handleQuit))
	privaterouter.Handler("GET", "/config", exitOnRecoverHandleFunc(handleGetConfig))
	privaterouter.Handler("POST", "/config", exitOnRecoverHandleFunc(handlePostConfig))
	privaterouter.Handler("GET", "/metrics", exitOnRecoverHandler(prometheus.Handler()))

	publicrouter := httprouter.New()
	publicrouter.Handle("POST", "/robustirc/v1/:sessionid", exitOnRecoverHandle(handleCreateSession))
	publicrouter.Handle("POST", "/robustirc/v1/:sessionid/message", exitOnRecoverHandle(handlePostMessage))
	publicrouter.Handle("GET", "/robustirc/v1/:sessionid/messages", exitOnRecoverHandle(handleGetMessages))
	publicrouter.Handle("DELETE", "/robustirc/v1/:sessionid", exitOnRecoverHandle(handleDeleteSession))

	a := auth.NewBasicAuthenticator("robustirc", func(user, realm string) string {
		if user == "robustirc" {
			return passwordHash
		}
		return ""
	})

	http.Handle("/robustirc/", publicrouter)

	http.Handle("/", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if username := a.CheckAuth(r); username == "" {
			a.RequireAuth(w, r)
		} else {
			privaterouter.ServeHTTP(w, r)
		}
	}))

	srv := http.Server{Addr: *listen}
	if err := http2.ConfigureServer(&srv, nil); err != nil {
		log.Fatal(err)
	}

	// Manually create the net.TCPListener so that joinMaster() does not run
	// into connection refused errors (the master will try to contact the
	// node before acknowledging the join).
	srv.TLSConfig.Certificates = make([]tls.Certificate, 1)
	srv.TLSConfig.Certificates[0], err = tls.LoadX509KeyPair(*tlsCertPath, *tlsKeyPath)
	if err != nil {
		log.Fatal(err)
	}

	ln, err := net.Listen("tcp", *listen)
	if err != nil {
		log.Fatal(err)
	}

	tlsListener := tls.NewListener(tcpKeepAliveListener{ln.(*net.TCPListener)}, srv.TLSConfig)
	go srv.Serve(tlsListener)

	log.Printf("RobustIRC listening on %q. For status, see %s\n",
		*peerAddr,
		fmt.Sprintf("https://*****:*****@%s/", *networkPassword, *peerAddr))

	if *join != "" {
		if err := timesafeguard.SynchronizedWithMasterAndNetwork(*peerAddr, *join, *networkPassword); err != nil {
			log.Fatal(err.Error())
		}

		p = joinMaster(*join, peerStore)
		// TODO(secure): properly handle joins on the server-side where the joining node is already in the network.
	}

	if len(p) > 0 {
		node.SetPeers(p)
	}

	expireSessionsTimer := time.After(expireSessionsInterval)
	secondTicker := time.Tick(1 * time.Second)
	for {
		select {
		case <-secondTicker:
			if node.State() == raft.Shutdown {
				log.Fatal("Node removed from the network (in raft state shutdown), terminating.")
			}
		case <-expireSessionsTimer:
			expireSessionsTimer = time.After(expireSessionsInterval)

			// Race conditions (a node becoming a leader or ceasing to be the
			// leader shortly before/after this runs) are okay, since the timer
			// is triggered often enough on every node so that it will
			// eventually run on the leader.
			if node.State() != raft.Leader {
				continue
			}

			applyMu.Lock()
			for _, msg := range ircServer.ExpireSessions() {
				// Cannot fail, no user input.
				msgbytes, _ := json.Marshal(msg)
				f := node.Apply(msgbytes, 10*time.Second)
				if err := f.Error(); err != nil {
					log.Printf("Apply(): %v\n", err)
					break
				}
			}
			applyMu.Unlock()
		}
	}
}