// NewNetwork creates nodeCount gossip nodes.
func NewNetwork(stopper *stop.Stopper, nodeCount int, createResolvers bool) *Network {
	log.Infof(context.TODO(), "simulating gossip network with %d nodes", nodeCount)

	n := &Network{
		Nodes:   []*Node{},
		Stopper: stopper,
	}
	n.rpcContext = rpc.NewContext(
		log.AmbientContext{},
		&base.Config{Insecure: true},
		hlc.NewClock(hlc.UnixNano, time.Nanosecond),
		n.Stopper,
	)
	var err error
	n.tlsConfig, err = n.rpcContext.GetServerTLSConfig()
	if err != nil {
		log.Fatal(context.TODO(), err)
	}

	for i := 0; i < nodeCount; i++ {
		node, err := n.CreateNode()
		if err != nil {
			log.Fatal(context.TODO(), err)
		}
		// Build a resolver for each instance or we'll get data races.
		if createResolvers {
			r, err := resolver.NewResolverFromAddress(n.Nodes[0].Addr())
			if err != nil {
				log.Fatalf(context.TODO(), "bad gossip address %s: %s", n.Nodes[0].Addr(), err)
			}
			node.Gossip.SetResolvers([]resolver.Resolver{r})
		}
	}
	return n
}
// initNodeID updates the internal NodeDescriptor with the given ID. If zero is
// supplied, a new NodeID is allocated with the first invocation. For all other
// values, the supplied ID is stored into the descriptor (unless one has been
// set previously, in which case a fatal error occurs).
//
// Upon setting a new NodeID, the descriptor is gossiped and the NodeID is
// stored into the gossip instance.
func (n *Node) initNodeID(id roachpb.NodeID) {
	ctx := n.AnnotateCtx(context.TODO())
	if id < 0 {
		log.Fatalf(ctx, "NodeID must not be negative")
	}

	if o := n.Descriptor.NodeID; o > 0 {
		if id == 0 {
			return
		}
		log.Fatalf(ctx, "cannot initialize NodeID to %d, already have %d", id, o)
	}
	var err error
	if id == 0 {
		ctxWithSpan, span := n.AnnotateCtxWithSpan(ctx, "alloc-node-id")
		id, err = allocateNodeID(ctxWithSpan, n.storeCfg.DB)
		if err != nil {
			log.Fatal(ctxWithSpan, err)
		}
		log.Infof(ctxWithSpan, "new node allocated ID %d", id)
		if id == 0 {
			log.Fatal(ctxWithSpan, "new node allocated illegal ID 0")
		}
		span.Finish()
		n.storeCfg.Gossip.NodeID.Set(ctx, id)
	} else {
		log.Infof(ctx, "node ID %d initialized", id)
	}
	// Gossip the node descriptor to make this node addressable by node ID.
	n.Descriptor.NodeID = id
	if err = n.storeCfg.Gossip.SetNodeDescriptor(&n.Descriptor); err != nil {
		log.Fatalf(ctx, "couldn't gossip descriptor for node %d: %s", n.Descriptor.NodeID, err)
	}
}
func (z *zeroSum) setup() uint32 {
	db := z.DB[0]
	if _, err := db.Exec("CREATE DATABASE IF NOT EXISTS zerosum"); err != nil {
		log.Fatal(context.Background(), err)
	}

	accounts := `
CREATE TABLE IF NOT EXISTS accounts (
  id INT PRIMARY KEY,
  balance INT NOT NULL
)
`
	if _, err := db.Exec(accounts); err != nil {
		log.Fatal(context.Background(), err)
	}

	tableIDQuery := `
SELECT tables.id FROM system.namespace tables
  JOIN system.namespace dbs ON dbs.id = tables.parentid
  WHERE dbs.name = $1 AND tables.name = $2
`
	var tableID uint32
	if err := db.QueryRow(tableIDQuery, "zerosum", "accounts").Scan(&tableID); err != nil {
		log.Fatal(context.Background(), err)
	}
	return tableID
}
// Freeze freezes (or thaws) the cluster. The freeze request is sent to the
// specified node.
func (c *Cluster) Freeze(nodeIdx int, freeze bool) {
	addr := c.RPCAddr(nodeIdx)
	conn, err := c.rpcCtx.GRPCDial(addr)
	if err != nil {
		log.Fatalf(context.Background(), "unable to dial: %s: %v", addr, err)
	}

	adminClient := serverpb.NewAdminClient(conn)
	stream, err := adminClient.ClusterFreeze(
		context.Background(), &serverpb.ClusterFreezeRequest{Freeze: freeze})
	if err != nil {
		log.Fatal(context.Background(), err)
	}
	for {
		resp, err := stream.Recv()
		if err != nil {
			if err == io.EOF {
				break
			}
			log.Fatal(context.Background(), err)
		}
		fmt.Println(resp.Message)
	}
	fmt.Println("ok")
}
// UpdateZoneConfig updates the default zone config for the cluster.
func (c *Cluster) UpdateZoneConfig(rangeMinBytes, rangeMaxBytes int64) {
	zone := config.DefaultZoneConfig()
	zone.RangeMinBytes = rangeMinBytes
	zone.RangeMaxBytes = rangeMaxBytes

	buf, err := protoutil.Marshal(&zone)
	if err != nil {
		log.Fatal(context.Background(), err)
	}
	_, err = c.DB[0].Exec(`UPSERT INTO system.zones (id, config) VALUES (0, $1)`, buf)
	if err != nil {
		log.Fatal(context.Background(), err)
	}
}
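// Example (hedged): a minimal sketch of how a test harness might call
// UpdateZoneConfig to shrink ranges so that splits and rebalancing happen
// quickly. The 1 MiB / 4 MiB values and the function name are illustrative
// only; the defaults from config.DefaultZoneConfig() are much larger.
func useSmallRanges(c *Cluster) {
	c.UpdateZoneConfig(1<<20 /* 1 MiB */, 4<<20 /* 4 MiB */)
}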
func newCLITest() cliTest {
	// Reset the client context for each test. We don't reset the
	// pointer (because they are tied into the flags), but instead
	// overwrite the existing struct's values.
	baseCfg.InitDefaults()
	cliCtx.InitCLIDefaults()

	osStderr = os.Stdout

	s, err := serverutils.StartServerRaw(base.TestServerArgs{})
	if err != nil {
		log.Fatalf(context.Background(), "Could not start server: %s", err)
	}

	tempDir, err := ioutil.TempDir("", "cli-test")
	if err != nil {
		log.Fatal(context.Background(), err)
	}

	// Copy these assets to disk from embedded strings, so this test can
	// run from a standalone binary.
	// Disable embedded certs, or the security library will try to load
	// our real files as embedded assets.
	security.ResetReadFileFn()

	assets := []string{
		filepath.Join(security.EmbeddedCertsDir, security.EmbeddedCACert),
		filepath.Join(security.EmbeddedCertsDir, security.EmbeddedCAKey),
		filepath.Join(security.EmbeddedCertsDir, security.EmbeddedNodeCert),
		filepath.Join(security.EmbeddedCertsDir, security.EmbeddedNodeKey),
		filepath.Join(security.EmbeddedCertsDir, security.EmbeddedRootCert),
		filepath.Join(security.EmbeddedCertsDir, security.EmbeddedRootKey),
	}
	for _, a := range assets {
		securitytest.RestrictedCopy(nil, a, tempDir, filepath.Base(a))
	}

	return cliTest{
		TestServer: s.(*server.TestServer),
		certsDir:   tempDir,
		cleanupFunc: func() {
			if err := os.RemoveAll(tempDir); err != nil {
				log.Fatal(context.Background(), err)
			}
		},
	}
}
func (c *Cluster) makeNode(nodeIdx int, extraArgs, extraEnv []string) *Node {
	name := fmt.Sprintf("%d", nodeIdx+1)
	dir := filepath.Join(dataDir, name)
	logDir := filepath.Join(dir, "logs")
	if err := os.MkdirAll(logDir, 0755); err != nil {
		log.Fatal(context.Background(), err)
	}

	args := []string{
		cockroachBin,
		"start",
		"--insecure",
		fmt.Sprintf("--port=%d", c.RPCPort(nodeIdx)),
		fmt.Sprintf("--http-port=%d", c.HTTPPort(nodeIdx)),
		fmt.Sprintf("--store=%s", dir),
		"--cache=256MiB",
		"--logtostderr",
	}
	if nodeIdx > 0 {
		args = append(args, fmt.Sprintf("--join=localhost:%d", c.RPCPort(0)))
	}
	args = append(args, extraArgs...)

	node := &Node{
		logDir: logDir,
		args:   args,
		env:    extraEnv,
	}
	node.Start()
	return node
}
func (a *allocSim) setup() {
	db := a.DB[0]
	if _, err := db.Exec("CREATE DATABASE IF NOT EXISTS allocsim"); err != nil {
		log.Fatal(context.Background(), err)
	}

	blocks := `
CREATE TABLE IF NOT EXISTS blocks (
  id INT NOT NULL,
  num INT NOT NULL,
  data BYTES NOT NULL,
  PRIMARY KEY (id, num)
)
`
	if _, err := db.Exec(blocks); err != nil {
		log.Fatal(context.Background(), err)
	}
}
// SimulateNetwork runs until the simCallback returns false.
//
// At each cycle, every node gossips a key equal to its address (unique)
// with the cycle as the value. The received cycle value can be used
// to determine the aging of information between any two nodes in the
// network.
//
// At each cycle of the simulation, node 0 gossips the sentinel.
//
// The simulation callback receives the cycle and the network as arguments.
func (n *Network) SimulateNetwork(simCallback func(cycle int, network *Network) bool) {
	n.Start()
	nodes := n.Nodes
	for cycle := 1; ; cycle++ {
		// Node 0 gossips sentinel & cluster ID every cycle.
		if err := nodes[0].Gossip.AddInfo(
			gossip.KeySentinel,
			encoding.EncodeUint64Ascending(nil, uint64(cycle)),
			time.Hour,
		); err != nil {
			log.Fatal(context.TODO(), err)
		}
		if err := nodes[0].Gossip.AddInfo(
			gossip.KeyClusterID,
			encoding.EncodeUint64Ascending(nil, uint64(cycle)),
			0*time.Second,
		); err != nil {
			log.Fatal(context.TODO(), err)
		}
		// Every node gossips every cycle.
		for _, node := range nodes {
			if err := node.Gossip.AddInfo(
				node.Addr().String(),
				encoding.EncodeUint64Ascending(nil, uint64(cycle)),
				time.Hour,
			); err != nil {
				log.Fatal(context.TODO(), err)
			}
			node.Gossip.SimulationCycle()
		}
		// If the simCallback returns false, we're done with the
		// simulation; exit the loop. This condition is tested here
		// instead of in the for statement to guarantee that we run at
		// least one iteration of this loop, so that the cluster ID and
		// sentinel are gossiped.
		if !simCallback(cycle, n) {
			break
		}
		time.Sleep(5 * time.Millisecond)
	}
	log.Infof(context.TODO(), "gossip network simulation: total infos sent=%d, received=%d",
		n.infosSent(), n.infosReceived())
}
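// Example (hedged): a minimal driver for SimulateNetwork. The callback simply
// stops the simulation after a fixed number of cycles; a real caller would
// typically inspect the network state to decide when to stop. The function
// name, node count, and cycle limit are illustrative only.
func runFixedCycleSimulation(stopper *stop.Stopper) {
	n := NewNetwork(stopper, 3 /* nodeCount */, true /* createResolvers */)
	n.SimulateNetwork(func(cycle int, network *Network) bool {
		// Returning false ends the simulation.
		return cycle < 10
	})
}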
// Start starts all gossip nodes.
// TODO(spencer): make all methods in Network return errors instead of
// fatal logging.
func (n *Network) Start() {
	if n.started {
		return
	}
	n.started = true
	for _, node := range n.Nodes {
		if err := n.StartNode(node); err != nil {
			log.Fatal(context.TODO(), err)
		}
	}
}
// RestrictedCopy creates an on-disk copy of the embedded security asset
// with the provided path. The copy will be created in the provided directory.
// Returns the path of the file.
//
// The file will have restrictive file permissions (0600), making it
// appropriate for usage by libraries that require security assets to have such
// restrictive permissions.
func RestrictedCopy(t util.Tester, path, tempdir, name string) string {
	contents, err := Asset(path)
	if err != nil {
		if t == nil {
			log.Fatal(context.TODO(), err)
		} else {
			t.Fatal(err)
		}
	}
	return util.CreateRestrictedFile(t, contents, tempdir, name)
}
// CreateRestrictedFile creates a file on disk which contains the
// supplied byte string as its content. The resulting file will have restrictive
// permissions; specifically, u=rw (0600). Returns the path of the created file.
//
// This is needed for some Go libraries (e.g. postgres SQL driver) which will
// refuse to open certificate files that have overly permissive permissions.
func CreateRestrictedFile(t Tester, contents []byte, tempdir, name string) string {
	tempPath := filepath.Join(tempdir, name)
	if err := ioutil.WriteFile(tempPath, contents, 0600); err != nil {
		if t == nil {
			log.Fatal(context.TODO(), err)
		} else {
			t.Fatal(err)
		}
	}
	return tempPath
}
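// Example (hedged): restoring the embedded CA certificate to a temporary
// directory with 0600 permissions, as newCLITest does above for the full set
// of test certs. Passing a nil Tester makes any failure fatal via log.Fatal;
// the function name is illustrative only.
func copyCACert(tempDir string) string {
	return securitytest.RestrictedCopy(
		nil,
		filepath.Join(security.EmbeddedCertsDir, security.EmbeddedCACert),
		tempDir,
		security.EmbeddedCACert,
	)
}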
// bootstrapStores bootstraps uninitialized stores once the cluster
// and node IDs have been established for this node. Store IDs are
// allocated via a sequence id generator stored at a system key per
// node.
func (n *Node) bootstrapStores(
	ctx context.Context, bootstraps []*storage.Store, stopper *stop.Stopper,
) {
	if n.ClusterID == *uuid.EmptyUUID {
		panic("ClusterID missing during store bootstrap of auxiliary store")
	}

	// Bootstrap all waiting stores by allocating a new store id for
	// each and invoking store.Bootstrap() to persist.
	inc := int64(len(bootstraps))
	firstID, err := allocateStoreIDs(ctx, n.Descriptor.NodeID, inc, n.storeCfg.DB)
	if err != nil {
		log.Fatal(ctx, err)
	}
	sIdent := roachpb.StoreIdent{
		ClusterID: n.ClusterID,
		NodeID:    n.Descriptor.NodeID,
		StoreID:   firstID,
	}
	for _, s := range bootstraps {
		if err := s.Bootstrap(sIdent); err != nil {
			log.Fatal(ctx, err)
		}
		if err := s.Start(ctx, stopper); err != nil {
			log.Fatal(ctx, err)
		}
		n.addStore(s)
		sIdent.StoreID++
		log.Infof(ctx, "bootstrapped store %s", s)
		// Done regularly in Node.startGossip, but this cuts down the time
		// until this store is used for range allocations.
		if err := s.GossipStore(ctx); err != nil {
			log.Warningf(ctx, "error doing initial gossiping: %s", err)
		}
	}
	// Write a new status summary after all stores have been bootstrapped; this
	// helps the UI remain responsive when new nodes are added.
	if err := n.writeSummaries(ctx); err != nil {
		log.Warningf(ctx, "error writing node summary after store bootstrap: %s", err)
	}
}
func (a *allocSim) rangeInfo() (total int, replicas []int, leases []int) {
	replicas = make([]int, len(a.Nodes))
	leases = make([]int, len(a.Nodes))

	// Retrieve the metrics for each node and extract the replica and leaseholder
	// counts.
	var wg sync.WaitGroup
	wg.Add(len(a.Status))
	for i := range a.Status {
		go func(i int) {
			defer wg.Done()
			resp, err := a.Status[i].Metrics(context.Background(), &serverpb.MetricsRequest{
				NodeId: fmt.Sprintf("%d", i+1),
			})
			if err != nil {
				log.Fatal(context.Background(), err)
			}
			var metrics map[string]interface{}
			if err := json.Unmarshal(resp.Data, &metrics); err != nil {
				log.Fatal(context.Background(), err)
			}
			stores := metrics["stores"].(map[string]interface{})
			for _, v := range stores {
				storeMetrics := v.(map[string]interface{})
				if v, ok := storeMetrics["replicas"]; ok {
					replicas[i] += int(v.(float64))
				}
				if v, ok := storeMetrics["replicas.leaseholders"]; ok {
					leases[i] += int(v.(float64))
				}
			}
		}(i)
	}
	wg.Wait()

	for _, v := range replicas {
		total += v
	}
	return total, replicas, leases
}
func (c *Cluster) makeDB(nodeIdx, numWorkers int, dbName string) *gosql.DB {
	url := fmt.Sprintf("postgresql://root@localhost:%d/%s?sslmode=disable",
		c.RPCPort(nodeIdx), dbName)
	conn, err := gosql.Open("postgres", url)
	if err != nil {
		log.Fatal(context.Background(), err)
	}
	if numWorkers == 0 {
		numWorkers = 1
	}
	conn.SetMaxOpenConns(numWorkers)
	conn.SetMaxIdleConns(numWorkers)
	return conn
}
func (z *zeroSum) worker() {
	r := newRand()
	zipf := z.accountDistribution(r)

	for {
		from := zipf.Uint64()
		to := zipf.Uint64()
		if from == to {
			continue
		}

		db := z.DB[z.RandNode(r.Intn)]
		err := crdb.ExecuteTx(db, func(tx *gosql.Tx) error {
			rows, err := tx.Query(`SELECT id, balance FROM accounts WHERE id IN ($1, $2)`, from, to)
			if err != nil {
				return err
			}

			var fromBalance, toBalance int64
			for rows.Next() {
				var id uint64
				var balance int64
				if err = rows.Scan(&id, &balance); err != nil {
					log.Fatal(context.Background(), err)
				}
				switch id {
				case from:
					fromBalance = balance
				case to:
					toBalance = balance
				default:
					panic(fmt.Sprintf("got unexpected account %d", id))
				}
			}

			upsert := `UPSERT INTO accounts VALUES ($1, $3), ($2, $4)`
			_, err = tx.Exec(upsert, to, from, toBalance+1, fromBalance-1)
			return err
		})
		if err != nil {
			z.maybeLogError(err)
		} else {
			atomic.AddUint64(&z.stats.ops, 1)
			z.accounts.Lock()
			z.accounts.m[from] = struct{}{}
			z.accounts.m[to] = struct{}{}
			z.accounts.Unlock()
		}
	}
}
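// Example (hedged): the retry-wrapper pattern used by (*zeroSum).worker above,
// reduced to a single statement. crdb.ExecuteTx re-runs the closure on
// retryable transaction errors, so the closure should be idempotent and touch
// the database only through the supplied *gosql.Tx. The function name and the
// table and column names are illustrative only.
func addToBalance(db *gosql.DB, id uint64, delta int64) error {
	return crdb.ExecuteTx(db, func(tx *gosql.Tx) error {
		_, err := tx.Exec(`UPDATE accounts SET balance = balance + $1 WHERE id = $2`, delta, id)
		return err
	})
}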
// NewExecutor creates an Executor and registers a callback on the
// system config.
func NewExecutor(
	cfg ExecutorConfig, stopper *stop.Stopper, startupMemMetrics *MemoryMetrics,
) *Executor {
	exec := &Executor{
		cfg:     cfg,
		reCache: parser.NewRegexpCache(512),

		Latency:          metric.NewLatency(MetaLatency, cfg.MetricsSampleInterval),
		TxnBeginCount:    metric.NewCounter(MetaTxnBegin),
		TxnCommitCount:   metric.NewCounter(MetaTxnCommit),
		TxnAbortCount:    metric.NewCounter(MetaTxnAbort),
		TxnRollbackCount: metric.NewCounter(MetaTxnRollback),
		SelectCount:      metric.NewCounter(MetaSelect),
		UpdateCount:      metric.NewCounter(MetaUpdate),
		InsertCount:      metric.NewCounter(MetaInsert),
		DeleteCount:      metric.NewCounter(MetaDelete),
		DdlCount:         metric.NewCounter(MetaDdl),
		MiscCount:        metric.NewCounter(MetaMisc),
		QueryCount:       metric.NewCounter(MetaQuery),
	}

	exec.systemConfigCond = sync.NewCond(exec.systemConfigMu.RLocker())

	gossipUpdateC := cfg.Gossip.RegisterSystemConfigChannel()
	stopper.RunWorker(func() {
		for {
			select {
			case <-gossipUpdateC:
				sysCfg, _ := cfg.Gossip.GetSystemConfig()
				exec.updateSystemConfig(sysCfg)
			case <-stopper.ShouldStop():
				return
			}
		}
	})

	ctx := log.WithLogTag(context.Background(), "startup", nil)
	startupSession := NewSession(ctx, SessionArgs{}, exec, nil, startupMemMetrics)
	if err := exec.virtualSchemas.init(&startupSession.planner); err != nil {
		log.Fatal(ctx, err)
	}
	startupSession.Finish(exec)

	return exec
}
func TestAllocateWithStopper(t *testing.T) {
	defer leaktest.AfterTest(t)()
	store, _, stopper := createTestStore(t)
	idAlloc, err := newIDAllocator(
		log.AmbientContext{}, keys.RangeIDGenerator, store.cfg.DB, 2, 10, stopper,
	)
	if err != nil {
		log.Fatal(context.Background(), err)
	}

	stopper.Stop()

	if _, err := idAlloc.Allocate(); err == nil {
		t.Errorf("unexpected success")
	} else if !strings.Contains(err.Error(), "system is draining") {
		t.Errorf("unexpected error: %s", err)
	}
}
func hasImage(l *LocalCluster, ref string) bool {
	name := strings.Split(ref, ":")[0]
	images, err := l.client.ImageList(context.Background(), types.ImageListOptions{MatchName: name})
	if err != nil {
		log.Fatal(context.TODO(), err)
	}
	for _, image := range images {
		for _, repoTag := range image.RepoTags {
			// The Image.RepoTags field contains strings of the form <repo>:<tag>.
			if ref == repoTag {
				return true
			}
		}
	}
	for _, image := range images {
		for _, tag := range image.RepoTags {
			log.Infof(context.TODO(), "ImageList %s %s", tag, image.ID)
		}
	}
	return false
}
// MakeServer constructs a Server that tracks active connections, closing them
// when signalled by stopper.
func MakeServer(stopper *stop.Stopper, tlsConfig *tls.Config, handler http.Handler) Server {
	var mu syncutil.Mutex
	activeConns := make(map[net.Conn]struct{})
	server := Server{
		Server: &http.Server{
			Handler:   handler,
			TLSConfig: tlsConfig,
			ConnState: func(conn net.Conn, state http.ConnState) {
				mu.Lock()
				switch state {
				case http.StateNew:
					activeConns[conn] = struct{}{}
				case http.StateClosed:
					delete(activeConns, conn)
				}
				mu.Unlock()
			},
			ErrorLog: httpLogger,
		},
	}

	// net/http.(*Server).Serve/http2.ConfigureServer are not thread safe with
	// respect to net/http.(*Server).TLSConfig, so we call it synchronously here.
	if err := http2.ConfigureServer(server.Server, nil); err != nil {
		log.Fatal(context.TODO(), err)
	}

	stopper.RunWorker(func() {
		<-stopper.ShouldStop()

		mu.Lock()
		for conn := range activeConns {
			conn.Close()
		}
		mu.Unlock()
	})

	return server
}
// maybeTransferRaftLeadership attempts to transfer the leadership away from
// this node to target, if this node is the current raft leader.
// The transfer might silently fail, particularly (only?) if the transferee is
// behind on applying the log.
func (r *Replica) maybeTransferRaftLeadership(
	ctx context.Context, replicaID roachpb.ReplicaID, target roachpb.ReplicaID,
) {
	err := r.withRaftGroup(func(raftGroup *raft.RawNode) (bool, error) {
		if raftGroup.Status().RaftState == raft.StateLeader {
			// Only the raft leader can attempt a leadership transfer.
			log.Infof(ctx, "range %s: transferring raft leadership to replica ID %v",
				r, target)
			raftGroup.TransferLeader(uint64(target))
		}
		return true, nil
	})
	if err != nil {
		// An error here indicates that this Replica has been destroyed
		// while lacking the necessary synchronization (or even worse, it
		// fails spuriously - could be a storage error), and so we avoid
		// sweeping that under the rug.
		//
		// TODO(tschottdorf): this error is not handled any more
		// at this level.
		log.Fatal(ctx, NewReplicaCorruptionError(err))
	}
}
// maybeTransferRaftLeadership attempts to transfer the leadership
// away from this node to target, if this node is the current raft
// leader. We don't attempt to transfer leadership if the transferee
// is behind on applying the log.
func (r *Replica) maybeTransferRaftLeadership(ctx context.Context, target roachpb.ReplicaID) {
	err := r.withRaftGroup(func(raftGroup *raft.RawNode) (bool, error) {
		// Only the raft leader can attempt a leadership transfer.
		if status := raftGroup.Status(); status.RaftState == raft.StateLeader {
			// Only attempt this if the target has all the log entries.
			if pr, ok := status.Progress[uint64(target)]; ok && pr.Match == r.mu.lastIndex {
				log.VEventf(ctx, 1, "transferring raft leadership to replica ID %v", target)
				r.store.metrics.RangeRaftLeaderTransfers.Inc(1)
				raftGroup.TransferLeader(uint64(target))
			}
		}
		return true, nil
	})
	if err != nil {
		// An error here indicates that this Replica has been destroyed
		// while lacking the necessary synchronization (or even worse, it
		// fails spuriously - could be a storage error), and so we avoid
		// sweeping that under the rug.
		//
		// TODO(tschottdorf): this error is not handled any more
		// at this level.
		log.Fatal(ctx, NewReplicaCorruptionError(err))
	}
}
// sendPartialBatch sends the supplied batch to the range specified by // desc. The batch request is first truncated so that it contains only // requests which intersect the range descriptor and keys for each // request are limited to the range's key span. The send occurs in a // retry loop to handle send failures. On failure to send to any // replicas, we backoff and retry by refetching the range // descriptor. If the underlying range seems to have split, we // recursively invoke divideAndSendBatchToRanges to re-enumerate the // ranges in the span and resend to each. func (ds *DistSender) sendPartialBatch( ctx context.Context, ba roachpb.BatchRequest, rs roachpb.RSpan, desc *roachpb.RangeDescriptor, evictToken *EvictionToken, isFirst bool, ) response { var reply *roachpb.BatchResponse var pErr *roachpb.Error isReverse := ba.IsReverse() // Truncate the request to range descriptor. intersected, err := rs.Intersect(desc) if err != nil { return response{pErr: roachpb.NewError(err)} } truncBA, numActive, err := truncate(ba, intersected) if numActive == 0 && err == nil { // This shouldn't happen in the wild, but some tests exercise it. return response{ pErr: roachpb.NewErrorf("truncation resulted in empty batch on %s: %s", intersected, ba), } } if err != nil { return response{pErr: roachpb.NewError(err)} } // Start a retry loop for sending the batch to the range. for r := retry.StartWithCtx(ctx, ds.rpcRetryOptions); r.Next(); { // If we've cleared the descriptor on a send failure, re-lookup. if desc == nil { var descKey roachpb.RKey if isReverse { descKey = intersected.EndKey } else { descKey = intersected.Key } desc, evictToken, err = ds.getDescriptor(ctx, descKey, nil, isReverse) if err != nil { log.ErrEventf(ctx, "range descriptor re-lookup failed: %s", err) continue } } reply, pErr = ds.sendSingleRange(ctx, truncBA, desc) // If sending succeeded, return immediately. if pErr == nil { return response{reply: reply} } log.ErrEventf(ctx, "reply error %s: %s", ba, pErr) // Error handling: If the error indicates that our range // descriptor is out of date, evict it from the cache and try // again. Errors that apply only to a single replica were // handled in send(). // // TODO(bdarnell): Don't retry endlessly. If we fail twice in a // row and the range descriptor hasn't changed, return the error // to our caller. switch tErr := pErr.GetDetail().(type) { case *roachpb.SendError: // We've tried all the replicas without success. Either // they're all down, or we're using an out-of-date range // descriptor. Invalidate the cache and try again with the new // metadata. log.Event(ctx, "evicting range descriptor on send error and backoff for re-lookup") if err := evictToken.Evict(ctx); err != nil { return response{pErr: roachpb.NewError(err)} } // Clear the descriptor to reload on the next attempt. desc = nil continue case *roachpb.RangeKeyMismatchError: // Range descriptor might be out of date - evict it. This is // likely the result of a range split. If we have new range // descriptors, insert them instead as long as they are different // from the last descriptor to avoid endless loops. 
var replacements []roachpb.RangeDescriptor different := func(rd *roachpb.RangeDescriptor) bool { return !desc.RSpan().Equal(rd.RSpan()) } if tErr.MismatchedRange != nil && different(tErr.MismatchedRange) { replacements = append(replacements, *tErr.MismatchedRange) } if tErr.SuggestedRange != nil && different(tErr.SuggestedRange) { if includesFrontOfCurSpan(isReverse, tErr.SuggestedRange, rs) { replacements = append(replacements, *tErr.SuggestedRange) } } // Same as Evict() if replacements is empty. if err := evictToken.EvictAndReplace(ctx, replacements...); err != nil { return response{pErr: roachpb.NewError(err)} } // On addressing errors (likely a split), we need to re-invoke // the range descriptor lookup machinery, so we recurse by // sending batch to just the partial span this descriptor was // supposed to cover. log.VEventf(ctx, 1, "likely split; resending batch to span: %s", tErr) reply, pErr = ds.divideAndSendBatchToRanges(ctx, ba, intersected, isFirst) return response{reply: reply, pErr: pErr} } break } // Propagate error if either the retry closer or context done // channels were closed. if pErr == nil { if pErr = ds.deduceRetryEarlyExitError(ctx); pErr == nil { log.Fatal(ctx, "exited retry loop without an error") } } return response{pErr: pErr} }
// outputDotFile generates a .dot file describing the current state of // the gossip network. nodes is a map from network address to gossip // node. edgeSet is empty on the first invocation, but // its content is set to encompass the entire set of edges in the // network when this method returns. It should be resupplied with each // successive invocation, as it is used to determine which edges are // new and which have been deleted and show those changes visually in // the output graph. New edges are drawn green; edges which were // removed over the course of the last simulation step(s) are drawn in // a lightly-dashed red. // // The format of the output looks like this: // // digraph G { // node [shape=record]; // node1 [fontsize=12,label="{Node 1|MH=3}"] // node1 -> node3 [color=green] // node1 -> node4 // node1 -> node5 [color=red,style=dotted] // node2 [fontsize=24,label="{Node 2|MH=2}"] // node2 -> node5 // node3 [fontsize=18,label="{Node 3|MH=5}"] // node3 -> node5 // node3 -> node4 // node4 [fontsize=24,label="{Node 4|MH=4}"] // node4 -> node2 // node5 [fontsize=24,label="{Node 5|MH=1}"] // node5 -> node2 // node5 -> node3 // } // // Returns the name of the output file and a boolean for whether or not // the network has quiesced (that is, no new edges, and all nodes are // connected). func outputDotFile( dotFN string, cycle int, network *simulation.Network, edgeSet map[string]edge, ) (string, bool) { f, err := os.Create(dotFN) if err != nil { log.Fatalf(context.TODO(), "unable to create temp file: %s", err) } defer f.Close() // Determine maximum number of incoming connections. Create outgoing // edges, keeping track of which are new since last time (added=true). outgoingMap := make(edgeMap) var maxIncoming int quiescent := true // The order the graph file is written influences the arrangement // of nodes in the output image, so it makes sense to eliminate // randomness here. Unfortunately with graphviz it's fairly hard // to get a consistent ordering. for _, simNode := range network.Nodes { node := simNode.Gossip incoming := node.Incoming() for _, iNode := range incoming { e := edge{dest: node.NodeID.Get()} key := fmt.Sprintf("%d:%d", iNode, node.NodeID.Get()) if _, ok := edgeSet[key]; !ok { e.added = true quiescent = false } delete(edgeSet, key) outgoingMap.addEdge(iNode, e) } if len(incoming) > maxIncoming { maxIncoming = len(incoming) } } // Find all edges which were deleted. for key, e := range edgeSet { e.added = false e.deleted = true quiescent = false nodeID, err := strconv.Atoi(strings.Split(key, ":")[0]) if err != nil { log.Fatal(context.TODO(), err) } outgoingMap.addEdge(roachpb.NodeID(nodeID), e) delete(edgeSet, key) } fmt.Fprintln(f, "digraph G {") fmt.Fprintln(f, "node [shape=record];") for _, simNode := range network.Nodes { node := simNode.Gossip var missing []roachpb.NodeID var totalAge int64 for _, otherNode := range network.Nodes { if otherNode == simNode { continue // skip the node's own info } infoKey := otherNode.Addr().String() // GetInfo returns an error if the info is missing. if info, err := node.GetInfo(infoKey); err != nil { missing = append(missing, otherNode.Gossip.NodeID.Get()) quiescent = false } else { _, val, err := encoding.DecodeUint64Ascending(info) if err != nil { log.Fatalf(context.TODO(), "bad decode of node info cycle: %s", err) } totalAge += int64(cycle) - int64(val) } } log.Infof(context.TODO(), "node %d: missing infos for nodes %s", node.NodeID.Get(), missing) var sentinelAge int64 // GetInfo returns an error if the info is missing. 
if info, err := node.GetInfo(gossip.KeySentinel); err != nil { log.Infof(context.TODO(), "error getting info for sentinel gossip key %q: %s", gossip.KeySentinel, err) } else { _, val, err := encoding.DecodeUint64Ascending(info) if err != nil { log.Fatalf(context.TODO(), "bad decode of sentinel cycle: %s", err) } sentinelAge = int64(cycle) - int64(val) } var age, nodeColor string if len(missing) > 0 { nodeColor = "color=red," age = fmt.Sprintf("missing %d", len(missing)) } else { age = strconv.FormatFloat(float64(totalAge)/float64(len(network.Nodes)-1-len(missing)), 'f', 4, 64) } fontSize := minDotFontSize if maxIncoming > 0 { fontSize = minDotFontSize + int(math.Floor(float64(len(node.Incoming())* (maxDotFontSize-minDotFontSize))/float64(maxIncoming))) } fmt.Fprintf(f, "\t%s [%sfontsize=%d,label=\"{%s|AA=%s, MH=%d, SA=%d}\"]\n", node.NodeID.Get(), nodeColor, fontSize, node.NodeID.Get(), age, node.MaxHops(), sentinelAge) outgoing := outgoingMap[node.NodeID.Get()] for _, e := range outgoing { destSimNode, ok := network.GetNodeFromID(e.dest) if !ok { continue } dest := destSimNode.Gossip style := "" if e.added { style = " [color=green]" } else if e.deleted { style = " [color=red,style=dotted]" } fmt.Fprintf(f, "\t%s -> %s%s\n", node.NodeID.Get(), dest.NodeID.Get(), style) if !e.deleted { edgeSet[fmt.Sprintf("%d:%d", node.NodeID.Get(), e.dest)] = e } } } fmt.Fprintln(f, "}") return f.Name(), quiescent }
// FatalIfUnexpected calls Log.Fatal(err) unless err is nil,
// cmux.ErrListenerClosed, or the net package's errClosed.
func FatalIfUnexpected(err error) {
	if err != nil && !IsClosedConnection(err) {
		log.Fatal(context.TODO(), err)
	}
}
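// Example (hedged): how MakeServer and FatalIfUnexpected are meant to be used
// together, mirroring the calls made in (*Server).Start below. The worker
// exits quietly when the listener is closed during shutdown and fatals on any
// other Serve error. The function name and listen address are illustrative
// only.
func serveHTTP(stopper *stop.Stopper, tlsConfig *tls.Config, handler http.Handler) error {
	ln, err := net.Listen("tcp", "127.0.0.1:0")
	if err != nil {
		return err
	}
	srv := netutil.MakeServer(stopper, tlsConfig, handler)
	stopper.RunWorker(func() {
		netutil.FatalIfUnexpected(srv.Serve(ln))
	})
	return nil
}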
// Start starts the server on the specified port, starts gossip and initializes // the node using the engines from the server's context. // // The passed context can be used to trace the server startup. The context // should represent the general startup operation. func (s *Server) Start(ctx context.Context) error { ctx = s.AnnotateCtx(ctx) startTime := timeutil.Now() tlsConfig, err := s.cfg.GetServerTLSConfig() if err != nil { return err } httpServer := netutil.MakeServer(s.stopper, tlsConfig, s) plainRedirectServer := netutil.MakeServer(s.stopper, tlsConfig, http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { http.Redirect(w, r, "https://"+r.Host+r.RequestURI, http.StatusPermanentRedirect) })) // The following code is a specialization of util/net.go's ListenAndServe // which adds pgwire support. A single port is used to serve all protocols // (pg, http, h2) via the following construction: // // non-TLS case: // net.Listen -> cmux.New // | // - -> pgwire.Match -> pgwire.Server.ServeConn // - -> cmux.Any -> grpc.(*Server).Serve // // TLS case: // net.Listen -> cmux.New // | // - -> pgwire.Match -> pgwire.Server.ServeConn // - -> cmux.Any -> grpc.(*Server).Serve // // Note that the difference between the TLS and non-TLS cases exists due to // Go's lack of an h2c (HTTP2 Clear Text) implementation. See inline comments // in util.ListenAndServe for an explanation of how h2c is implemented there // and here. ln, err := net.Listen("tcp", s.cfg.Addr) if err != nil { return err } log.Eventf(ctx, "listening on port %s", s.cfg.Addr) unresolvedListenAddr, err := officialAddr(s.cfg.Addr, ln.Addr()) if err != nil { return err } s.cfg.Addr = unresolvedListenAddr.String() unresolvedAdvertAddr, err := officialAddr(s.cfg.AdvertiseAddr, ln.Addr()) if err != nil { return err } s.cfg.AdvertiseAddr = unresolvedAdvertAddr.String() s.rpcContext.SetLocalInternalServer(s.node) m := cmux.New(ln) pgL := m.Match(pgwire.Match) anyL := m.Match(cmux.Any()) httpLn, err := net.Listen("tcp", s.cfg.HTTPAddr) if err != nil { return err } unresolvedHTTPAddr, err := officialAddr(s.cfg.HTTPAddr, httpLn.Addr()) if err != nil { return err } s.cfg.HTTPAddr = unresolvedHTTPAddr.String() workersCtx := s.AnnotateCtx(context.Background()) s.stopper.RunWorker(func() { <-s.stopper.ShouldQuiesce() if err := httpLn.Close(); err != nil { log.Fatal(workersCtx, err) } }) if tlsConfig != nil { httpMux := cmux.New(httpLn) clearL := httpMux.Match(cmux.HTTP1()) tlsL := httpMux.Match(cmux.Any()) s.stopper.RunWorker(func() { netutil.FatalIfUnexpected(httpMux.Serve()) }) s.stopper.RunWorker(func() { netutil.FatalIfUnexpected(plainRedirectServer.Serve(clearL)) }) httpLn = tls.NewListener(tlsL, tlsConfig) } s.stopper.RunWorker(func() { netutil.FatalIfUnexpected(httpServer.Serve(httpLn)) }) s.stopper.RunWorker(func() { <-s.stopper.ShouldQuiesce() netutil.FatalIfUnexpected(anyL.Close()) <-s.stopper.ShouldStop() s.grpc.Stop() }) s.stopper.RunWorker(func() { netutil.FatalIfUnexpected(s.grpc.Serve(anyL)) }) s.stopper.RunWorker(func() { pgCtx := s.pgServer.AmbientCtx.AnnotateCtx(context.Background()) netutil.FatalIfUnexpected(httpServer.ServeWith(s.stopper, pgL, func(conn net.Conn) { connCtx := log.WithLogTagStr(pgCtx, "client", conn.RemoteAddr().String()) if err := s.pgServer.ServeConn(connCtx, conn); err != nil && !netutil.IsClosedConnection(err) { // Report the error on this connection's context, so that we // know which remote client caused the error when looking at // the logs. 
log.Error(connCtx, err) } })) }) if len(s.cfg.SocketFile) != 0 { // Unix socket enabled: postgres protocol only. unixLn, err := net.Listen("unix", s.cfg.SocketFile) if err != nil { return err } s.stopper.RunWorker(func() { <-s.stopper.ShouldQuiesce() if err := unixLn.Close(); err != nil { log.Fatal(workersCtx, err) } }) s.stopper.RunWorker(func() { pgCtx := s.pgServer.AmbientCtx.AnnotateCtx(context.Background()) netutil.FatalIfUnexpected(httpServer.ServeWith(s.stopper, unixLn, func(conn net.Conn) { connCtx := log.WithLogTagStr(pgCtx, "client", conn.RemoteAddr().String()) if err := s.pgServer.ServeConn(connCtx, conn); err != nil && !netutil.IsClosedConnection(err) { // Report the error on this connection's context, so that we // know which remote client caused the error when looking at // the logs. log.Error(connCtx, err) } })) }) } // Enable the debug endpoints first to provide an earlier window // into what's going on with the node in advance of exporting node // functionality. // TODO(marc): when cookie-based authentication exists, // apply it for all web endpoints. s.mux.HandleFunc(debugEndpoint, http.HandlerFunc(handleDebug)) s.gossip.Start(unresolvedAdvertAddr) log.Event(ctx, "started gossip") s.engines, err = s.cfg.CreateEngines() if err != nil { return errors.Wrap(err, "failed to create engines") } s.stopper.AddCloser(&s.engines) // We might have to sleep a bit to protect against this node producing non- // monotonic timestamps. Before restarting, its clock might have been driven // by other nodes' fast clocks, but when we restarted, we lost all this // information. For example, a client might have written a value at a // timestamp that's in the future of the restarted node's clock, and if we // don't do something, the same client's read would not return the written // value. So, we wait up to MaxOffset; we couldn't have served timestamps more // than MaxOffset in the future (assuming that MaxOffset was not changed, see // #9733). // // As an optimization for tests, we don't sleep if all the stores are brand // new. In this case, the node will not serve anything anyway until it // synchronizes with other nodes. { anyStoreBootstrapped := false for _, e := range s.engines { if _, err := storage.ReadStoreIdent(ctx, e); err != nil { // NotBootstrappedError is expected. if _, ok := err.(*storage.NotBootstrappedError); !ok { return err } } else { anyStoreBootstrapped = true break } } if anyStoreBootstrapped { sleepDuration := s.clock.MaxOffset() - timeutil.Since(startTime) if sleepDuration > 0 { log.Infof(ctx, "sleeping for %s to guarantee HLC monotonicity", sleepDuration) time.Sleep(sleepDuration) } } } // Now that we have a monotonic HLC wrt previous incarnations of the process, // init all the replicas. err = s.node.start( ctx, unresolvedAdvertAddr, s.engines, s.cfg.NodeAttributes, s.cfg.Locality, ) if err != nil { return err } log.Event(ctx, "started node") s.nodeLiveness.StartHeartbeat(ctx, s.stopper) // We can now add the node registry. s.recorder.AddNode(s.registry, s.node.Descriptor, s.node.startedAt) // Begin recording runtime statistics. s.startSampleEnvironment(s.cfg.MetricsSampleInterval) // Begin recording time series data collected by the status monitor. s.tsDB.PollSource( s.cfg.AmbientCtx, s.recorder, s.cfg.MetricsSampleInterval, ts.Resolution10s, s.stopper, ) // Begin recording status summaries. s.node.startWriteSummaries(s.cfg.MetricsSampleInterval) // Create and start the schema change manager only after a NodeID // has been assigned. 
testingKnobs := &sql.SchemaChangerTestingKnobs{} if s.cfg.TestingKnobs.SQLSchemaChanger != nil { testingKnobs = s.cfg.TestingKnobs.SQLSchemaChanger.(*sql.SchemaChangerTestingKnobs) } sql.NewSchemaChangeManager(testingKnobs, *s.db, s.gossip, s.leaseMgr).Start(s.stopper) s.distSQLServer.Start() log.Infof(ctx, "starting %s server at %s", s.cfg.HTTPRequestScheme(), unresolvedHTTPAddr) log.Infof(ctx, "starting grpc/postgres server at %s", unresolvedListenAddr) log.Infof(ctx, "advertising CockroachDB node at %s", unresolvedAdvertAddr) if len(s.cfg.SocketFile) != 0 { log.Infof(ctx, "starting postgres server at unix:%s", s.cfg.SocketFile) } s.stopper.RunWorker(func() { netutil.FatalIfUnexpected(m.Serve()) }) log.Event(ctx, "accepting connections") // Initialize grpc-gateway mux and context. jsonpb := &protoutil.JSONPb{ EnumsAsInts: true, EmitDefaults: true, Indent: " ", } protopb := new(protoutil.ProtoPb) gwMux := gwruntime.NewServeMux( gwruntime.WithMarshalerOption(gwruntime.MIMEWildcard, jsonpb), gwruntime.WithMarshalerOption(httputil.JSONContentType, jsonpb), gwruntime.WithMarshalerOption(httputil.AltJSONContentType, jsonpb), gwruntime.WithMarshalerOption(httputil.ProtoContentType, protopb), gwruntime.WithMarshalerOption(httputil.AltProtoContentType, protopb), ) gwCtx, gwCancel := context.WithCancel(s.AnnotateCtx(context.Background())) s.stopper.AddCloser(stop.CloserFn(gwCancel)) // Setup HTTP<->gRPC handlers. conn, err := s.rpcContext.GRPCDial(s.cfg.Addr) if err != nil { return errors.Errorf("error constructing grpc-gateway: %s; are your certificates valid?", err) } for _, gw := range []grpcGatewayServer{s.admin, s.status, &s.tsServer} { if err := gw.RegisterGateway(gwCtx, gwMux, conn); err != nil { return err } } var uiFileSystem http.FileSystem uiDebug := envutil.EnvOrDefaultBool("COCKROACH_DEBUG_UI", false) if uiDebug { uiFileSystem = http.Dir("pkg/ui") } else { uiFileSystem = &assetfs.AssetFS{ Asset: ui.Asset, AssetDir: ui.AssetDir, AssetInfo: ui.AssetInfo, } } uiFileServer := http.FileServer(uiFileSystem) s.mux.HandleFunc("/", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { if r.URL.Path == "/" { if uiDebug { r.URL.Path = "debug.html" } else { r.URL.Path = "release.html" } } uiFileServer.ServeHTTP(w, r) })) // TODO(marc): when cookie-based authentication exists, // apply it for all web endpoints. s.mux.Handle(adminPrefix, gwMux) s.mux.Handle(ts.URLPrefix, gwMux) s.mux.Handle(statusPrefix, gwMux) s.mux.Handle("/health", gwMux) s.mux.Handle(statusVars, http.HandlerFunc(s.status.handleVars)) log.Event(ctx, "added http endpoints") if err := sdnotify.Ready(); err != nil { log.Errorf(ctx, "failed to signal readiness using systemd protocol: %s", err) } log.Event(ctx, "server ready") return nil }
// NewServer creates a Server from a server.Context. func NewServer(cfg Config, stopper *stop.Stopper) (*Server, error) { if _, err := net.ResolveTCPAddr("tcp", cfg.AdvertiseAddr); err != nil { return nil, errors.Errorf("unable to resolve RPC address %q: %v", cfg.AdvertiseAddr, err) } if cfg.AmbientCtx.Tracer == nil { cfg.AmbientCtx.Tracer = tracing.NewTracer() } // Try loading the TLS configs before anything else. if _, err := cfg.GetServerTLSConfig(); err != nil { return nil, err } if _, err := cfg.GetClientTLSConfig(); err != nil { return nil, err } s := &Server{ mux: http.NewServeMux(), clock: hlc.NewClock(hlc.UnixNano, cfg.MaxOffset), stopper: stopper, cfg: cfg, } // Add a dynamic log tag value for the node ID. // // We need to pass an ambient context to the various server components, but we // won't know the node ID until we Start(). At that point it's too late to // change the ambient contexts in the components (various background processes // will have already started using them). // // NodeIDContainer allows us to add the log tag to the context now and update // the value asynchronously. It's not significantly more expensive than a // regular tag since it's just doing an (atomic) load when a log/trace message // is constructed. The node ID is set by the Store if this host was // bootstrapped; otherwise a new one is allocated in Node. s.cfg.AmbientCtx.AddLogTag("n", &s.nodeIDContainer) ctx := s.AnnotateCtx(context.Background()) if s.cfg.Insecure { log.Warning(ctx, "running in insecure mode, this is strongly discouraged. See --insecure.") } s.rpcContext = rpc.NewContext(s.cfg.AmbientCtx, s.cfg.Config, s.clock, s.stopper) s.rpcContext.HeartbeatCB = func() { if err := s.rpcContext.RemoteClocks.VerifyClockOffset(); err != nil { log.Fatal(ctx, err) } } s.grpc = rpc.NewServer(s.rpcContext) s.registry = metric.NewRegistry() s.gossip = gossip.New( s.cfg.AmbientCtx, &s.nodeIDContainer, s.rpcContext, s.grpc, s.cfg.GossipBootstrapResolvers, s.stopper, s.registry, ) s.storePool = storage.NewStorePool( s.cfg.AmbientCtx, s.gossip, s.clock, s.rpcContext, s.cfg.TimeUntilStoreDead, s.stopper, /* deterministic */ false, ) // A custom RetryOptions is created which uses stopper.ShouldQuiesce() as // the Closer. This prevents infinite retry loops from occurring during // graceful server shutdown // // Such a loop loop occurs with the DistSender attempts a connection to the // local server during shutdown, and receives an internal server error (HTTP // Code 5xx). This is the correct error for a server to return when it is // shutting down, and is normally retryable in a cluster environment. // However, on a single-node setup (such as a test), retries will never // succeed because the only server has been shut down; thus, thus the // DistSender needs to know that it should not retry in this situation. retryOpts := base.DefaultRetryOptions() retryOpts.Closer = s.stopper.ShouldQuiesce() distSenderCfg := kv.DistSenderConfig{ AmbientCtx: s.cfg.AmbientCtx, Clock: s.clock, RPCContext: s.rpcContext, RPCRetryOptions: &retryOpts, } s.distSender = kv.NewDistSender(distSenderCfg, s.gossip) txnMetrics := kv.MakeTxnMetrics(s.cfg.MetricsSampleInterval) s.registry.AddMetricStruct(txnMetrics) s.txnCoordSender = kv.NewTxnCoordSender( s.cfg.AmbientCtx, s.distSender, s.clock, s.cfg.Linearizable, s.stopper, txnMetrics, ) s.db = client.NewDB(s.txnCoordSender) // Use the range lease expiration and renewal durations as the node // liveness expiration and heartbeat interval. 
active, renewal := storage.RangeLeaseDurations( storage.RaftElectionTimeout(s.cfg.RaftTickInterval, s.cfg.RaftElectionTimeoutTicks)) s.nodeLiveness = storage.NewNodeLiveness( s.cfg.AmbientCtx, s.clock, s.db, s.gossip, active, renewal, ) s.registry.AddMetricStruct(s.nodeLiveness.Metrics()) s.raftTransport = storage.NewRaftTransport( s.cfg.AmbientCtx, storage.GossipAddressResolver(s.gossip), s.grpc, s.rpcContext, ) s.kvDB = kv.NewDBServer(s.cfg.Config, s.txnCoordSender, s.stopper) roachpb.RegisterExternalServer(s.grpc, s.kvDB) // Set up internal memory metrics for use by internal SQL executors. s.internalMemMetrics = sql.MakeMemMetrics("internal") s.registry.AddMetricStruct(s.internalMemMetrics) // Set up Lease Manager var lmKnobs sql.LeaseManagerTestingKnobs if cfg.TestingKnobs.SQLLeaseManager != nil { lmKnobs = *s.cfg.TestingKnobs.SQLLeaseManager.(*sql.LeaseManagerTestingKnobs) } s.leaseMgr = sql.NewLeaseManager(&s.nodeIDContainer, *s.db, s.clock, lmKnobs, s.stopper, &s.internalMemMetrics) s.leaseMgr.RefreshLeases(s.stopper, s.db, s.gossip) // Set up the DistSQL server distSQLCfg := distsql.ServerConfig{ AmbientContext: s.cfg.AmbientCtx, DB: s.db, RPCContext: s.rpcContext, Stopper: s.stopper, } s.distSQLServer = distsql.NewServer(distSQLCfg) distsql.RegisterDistSQLServer(s.grpc, s.distSQLServer) // Set up admin memory metrics for use by admin SQL executors. s.adminMemMetrics = sql.MakeMemMetrics("admin") s.registry.AddMetricStruct(s.adminMemMetrics) // Set up Executor execCfg := sql.ExecutorConfig{ AmbientCtx: s.cfg.AmbientCtx, NodeID: &s.nodeIDContainer, DB: s.db, Gossip: s.gossip, LeaseManager: s.leaseMgr, Clock: s.clock, DistSQLSrv: s.distSQLServer, MetricsSampleInterval: s.cfg.MetricsSampleInterval, } if s.cfg.TestingKnobs.SQLExecutor != nil { execCfg.TestingKnobs = s.cfg.TestingKnobs.SQLExecutor.(*sql.ExecutorTestingKnobs) } else { execCfg.TestingKnobs = &sql.ExecutorTestingKnobs{} } if s.cfg.TestingKnobs.SQLSchemaChanger != nil { execCfg.SchemaChangerTestingKnobs = s.cfg.TestingKnobs.SQLSchemaChanger.(*sql.SchemaChangerTestingKnobs) } else { execCfg.SchemaChangerTestingKnobs = &sql.SchemaChangerTestingKnobs{} } s.sqlExecutor = sql.NewExecutor(execCfg, s.stopper, &s.adminMemMetrics) s.registry.AddMetricStruct(s.sqlExecutor) s.pgServer = pgwire.MakeServer( s.cfg.AmbientCtx, s.cfg.Config, s.sqlExecutor, &s.internalMemMetrics, s.cfg.SQLMemoryPoolSize, ) s.registry.AddMetricStruct(s.pgServer.Metrics()) s.tsDB = ts.NewDB(s.db) s.tsServer = ts.MakeServer(s.cfg.AmbientCtx, s.tsDB, s.cfg.TimeSeriesServerConfig, s.stopper) // TODO(bdarnell): make StoreConfig configurable. 
storeCfg := storage.StoreConfig{ AmbientCtx: s.cfg.AmbientCtx, Clock: s.clock, DB: s.db, Gossip: s.gossip, NodeLiveness: s.nodeLiveness, Transport: s.raftTransport, RaftTickInterval: s.cfg.RaftTickInterval, ScanInterval: s.cfg.ScanInterval, ScanMaxIdleTime: s.cfg.ScanMaxIdleTime, ConsistencyCheckInterval: s.cfg.ConsistencyCheckInterval, ConsistencyCheckPanicOnFailure: s.cfg.ConsistencyCheckPanicOnFailure, MetricsSampleInterval: s.cfg.MetricsSampleInterval, StorePool: s.storePool, SQLExecutor: sql.InternalExecutor{ LeaseManager: s.leaseMgr, }, LogRangeEvents: s.cfg.EventLogEnabled, AllocatorOptions: storage.AllocatorOptions{ AllowRebalance: true, }, RangeLeaseActiveDuration: active, RangeLeaseRenewalDuration: renewal, TimeSeriesDataStore: s.tsDB, } if s.cfg.TestingKnobs.Store != nil { storeCfg.TestingKnobs = *s.cfg.TestingKnobs.Store.(*storage.StoreTestingKnobs) } s.recorder = status.NewMetricsRecorder(s.clock) s.registry.AddMetricStruct(s.rpcContext.RemoteClocks.Metrics()) s.runtime = status.MakeRuntimeStatSampler(s.clock) s.registry.AddMetricStruct(s.runtime) s.node = NewNode(storeCfg, s.recorder, s.registry, s.stopper, txnMetrics, sql.MakeEventLogger(s.leaseMgr)) roachpb.RegisterInternalServer(s.grpc, s.node) storage.RegisterConsistencyServer(s.grpc, s.node.storesServer) storage.RegisterFreezeServer(s.grpc, s.node.storesServer) s.admin = newAdminServer(s) s.status = newStatusServer( s.cfg.AmbientCtx, s.db, s.gossip, s.recorder, s.rpcContext, s.node.stores, ) for _, gw := range []grpcGatewayServer{s.admin, s.status, &s.tsServer} { gw.RegisterService(s.grpc) } return s, nil }
// EnsureMigrations should be run during node startup to ensure that all
// required migrations have been run (and running all those that are definitely
// safe to run).
func (m *Manager) EnsureMigrations(ctx context.Context) error {
	// First, check whether there are any migrations that need to be run.
	completedMigrations, err := m.getCompletedMigrations(ctx)
	if err != nil {
		return err
	}
	allMigrationsCompleted := true
	for _, migration := range backwardCompatibleMigrations {
		key := migrationKey(migration)
		if _, ok := completedMigrations[string(key)]; !ok {
			allMigrationsCompleted = false
		}
	}
	if allMigrationsCompleted {
		return nil
	}

	// If there are any, grab the migration lease to ensure that only one
	// node is ever doing migrations at a time.
	// Note that we shouldn't ever let client.LeaseNotAvailableErrors cause us
	// to stop trying, because if we return an error the server will be shut down,
	// and this server being down may prevent the leaseholder from finishing.
	var lease *client.Lease
	if log.V(1) {
		log.Info(ctx, "trying to acquire lease")
	}
	for r := retry.StartWithCtx(ctx, base.DefaultRetryOptions()); r.Next(); {
		lease, err = m.leaseManager.AcquireLease(ctx, keys.MigrationLease)
		if err == nil {
			break
		}
		log.Errorf(ctx, "failed attempt to acquire migration lease: %s", err)
	}
	if err != nil {
		return errors.Wrapf(err, "failed to acquire lease for running necessary migrations")
	}

	// Ensure that we hold the lease throughout the migration process and release
	// it when we're done.
	done := make(chan interface{}, 1)
	defer func() {
		done <- nil
		if log.V(1) {
			log.Info(ctx, "trying to release the lease")
		}
		if err := m.leaseManager.ReleaseLease(ctx, lease); err != nil {
			log.Errorf(ctx, "failed to release migration lease: %s", err)
		}
	}()
	if err := m.stopper.RunAsyncTask(ctx, func(ctx context.Context) {
		select {
		case <-done:
			return
		case <-time.After(leaseRefreshInterval):
			if err := m.leaseManager.ExtendLease(ctx, lease); err != nil {
				log.Warningf(ctx, "unable to extend ownership of expiration lease: %s", err)
			}
			if m.leaseManager.TimeRemaining(lease) < leaseRefreshInterval {
				// Note that we may be able to do better than this by influencing the
				// deadline of migrations' transactions based on the lease expiration
				// time, but simply kill the process for now for the sake of simplicity.
				log.Fatal(ctx, "not enough time left on migration lease, terminating for safety")
			}
		}
	}); err != nil {
		return err
	}

	// Re-get the list of migrations in case any of them were completed between
	// our initial check and our grabbing of the lease.
	completedMigrations, err = m.getCompletedMigrations(ctx)
	if err != nil {
		return err
	}

	startTime := timeutil.Now().String()
	r := runner{
		db:          m.db,
		sqlExecutor: m.sqlExecutor,
	}
	for _, migration := range backwardCompatibleMigrations {
		key := migrationKey(migration)
		if _, ok := completedMigrations[string(key)]; ok {
			continue
		}

		if log.V(1) {
			log.Infof(ctx, "running migration %q", migration.name)
		}
		if err := migration.workFn(ctx, r); err != nil {
			return errors.Wrapf(err, "failed to run migration %q", migration.name)
		}

		if log.V(1) {
			log.Infof(ctx, "trying to persist record of completing migration %s", migration.name)
		}
		if err := m.db.Put(ctx, key, startTime); err != nil {
			return errors.Wrapf(err, "failed to persist record of completing migration %q",
				migration.name)
		}
	}

	return nil
}
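// Example (hedged): the shape of an entry in backwardCompatibleMigrations as
// implied by the loop above -- a human-readable name plus a workFn that
// receives a context and the runner. The struct name "migrationDescriptor"
// and the body of the work function are assumptions for illustration only.
// workFn should be idempotent, since the record of completion is written
// separately and the migration may be retried.
var exampleMigration = migrationDescriptor{
	name: "example: create a system table",
	workFn: func(ctx context.Context, r runner) error {
		// Idempotent work would go here.
		return nil
	},
}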
func (r *Replica) handleProposalData( ctx context.Context, originReplica roachpb.ReplicaDescriptor, pd ProposalData, ) { if pd.BlockReads { r.readOnlyCmdMu.Lock() defer r.readOnlyCmdMu.Unlock() pd.BlockReads = false } // Update MVCC stats and Raft portion of ReplicaState. r.mu.Lock() r.mu.state.Stats = pd.State.Stats r.mu.state.RaftAppliedIndex = pd.State.RaftAppliedIndex r.mu.state.LeaseAppliedIndex = pd.State.LeaseAppliedIndex r.mu.Unlock() pd.State.Stats = enginepb.MVCCStats{} pd.State.LeaseAppliedIndex = 0 pd.State.RaftAppliedIndex = 0 // The above are always present, so we assert only if there are // "nontrivial" actions below. shouldAssert := (pd.ReplicatedProposalData != storagebase.ReplicatedProposalData{}) // Process Split or Merge. This needs to happen after stats update because // of the ContainsEstimates hack. if pd.Split != nil { // TODO(tschottdorf): We want to let the usual MVCCStats-delta // machinery update our stats for the left-hand side. But there is no // way to pass up an MVCCStats object that will clear out the // ContainsEstimates flag. We should introduce one, but the migration // makes this worth a separate effort (ContainsEstimates would need to // have three possible values, 'UNCHANGED', 'NO', and 'YES'). // Until then, we're left with this rather crude hack. { r.mu.Lock() r.mu.state.Stats.ContainsEstimates = false stats := r.mu.state.Stats r.mu.Unlock() if err := setMVCCStats(ctx, r.store.Engine(), r.RangeID, stats); err != nil { log.Fatal(ctx, errors.Wrap(err, "unable to write MVCC stats")) } } splitPostApply( r.AnnotateCtx(context.TODO()), pd.Split.RHSDelta, &pd.Split.SplitTrigger, r, ) pd.Split = nil } if pd.Merge != nil { if err := r.store.MergeRange(ctx, r, pd.Merge.LeftDesc.EndKey, pd.Merge.RightDesc.RangeID, ); err != nil { // Our in-memory state has diverged from the on-disk state. log.Fatalf(ctx, "failed to update store after merging range: %s", err) } pd.Merge = nil } // Update the remaining ReplicaState. if pd.State.Frozen != storagebase.ReplicaState_FROZEN_UNSPECIFIED { r.mu.Lock() r.mu.state.Frozen = pd.State.Frozen r.mu.Unlock() } pd.State.Frozen = storagebase.ReplicaState_FrozenEnum(0) if newDesc := pd.State.Desc; newDesc != nil { pd.State.Desc = nil // for assertion if err := r.setDesc(newDesc); err != nil { // Log the error. There's not much we can do because the commit may // have already occurred at this point. log.Fatalf( ctx, "failed to update range descriptor to %+v: %s", newDesc, err, ) } } if newLease := pd.State.Lease; newLease != nil { pd.State.Lease = nil // for assertion r.mu.Lock() replicaID := r.mu.replicaID prevLease := r.mu.state.Lease r.mu.state.Lease = newLease r.mu.Unlock() r.leasePostApply(ctx, newLease, replicaID, prevLease) } if newTruncState := pd.State.TruncatedState; newTruncState != nil { pd.State.TruncatedState = nil // for assertion r.mu.Lock() r.mu.state.TruncatedState = newTruncState r.mu.Unlock() // Clear any entries in the Raft log entry cache for this range up // to and including the most recently truncated index. r.store.raftEntryCache.clearTo(r.RangeID, newTruncState.Index+1) } if newThresh := pd.State.GCThreshold; newThresh != hlc.ZeroTimestamp { r.mu.Lock() r.mu.state.GCThreshold = newThresh r.mu.Unlock() pd.State.GCThreshold = hlc.ZeroTimestamp } if newThresh := pd.State.TxnSpanGCThreshold; newThresh != hlc.ZeroTimestamp { r.mu.Lock() r.mu.state.TxnSpanGCThreshold = newThresh r.mu.Unlock() pd.State.TxnSpanGCThreshold = hlc.ZeroTimestamp } // ====================== // Non-state updates and actions. 
// ====================== r.store.metrics.addMVCCStats(pd.delta) pd.delta = enginepb.MVCCStats{} if originReplica.StoreID == r.store.StoreID() { // On the replica on which this command originated, resolve skipped // intents asynchronously - even on failure. // // TODO(tschottdorf): EndTransaction will use this pathway to return // intents which should immediately be resolved. However, there's // a slight chance that an error between the origin of that intents // slice and here still results in that intent slice arriving here // without the EndTransaction having committed. We should clearly // separate the part of the ProposalData which also applies on errors. if pd.intents != nil { r.store.intentResolver.processIntentsAsync(r, *pd.intents) } } pd.intents = nil // The above are present too often, so we assert only if there are // "nontrivial" actions below. shouldAssert = shouldAssert || (pd.LocalProposalData != LocalProposalData{}) if pd.raftLogSize != nil { r.mu.Lock() r.mu.raftLogSize = *pd.raftLogSize r.mu.Unlock() pd.raftLogSize = nil } if pd.gossipFirstRange { // We need to run the gossip in an async task because gossiping requires // the range lease and we'll deadlock if we try to acquire it while // holding processRaftMu. Specifically, Replica.redirectOnOrAcquireLease // blocks waiting for the lease acquisition to finish but it can't finish // because we're not processing raft messages due to holding // processRaftMu (and running on the processRaft goroutine). if err := r.store.Stopper().RunAsyncTask(ctx, func(ctx context.Context) { hasLease, pErr := r.getLeaseForGossip(ctx) if pErr != nil { log.Infof(ctx, "unable to gossip first range; hasLease=%t, err=%s", hasLease, pErr) } else if !hasLease { return } r.gossipFirstRange(ctx) }); err != nil { log.Infof(ctx, "unable to gossip first range: %s", err) } pd.gossipFirstRange = false } if pd.addToReplicaGCQueue { if _, err := r.store.replicaGCQueue.Add(r, replicaGCPriorityRemoved); err != nil { // Log the error; the range should still be GC'd eventually. log.Errorf(ctx, "unable to add to replica GC queue: %s", err) } pd.addToReplicaGCQueue = false } if pd.maybeAddToSplitQueue { r.store.splitQueue.MaybeAdd(r, r.store.Clock().Now()) pd.maybeAddToSplitQueue = false } if pd.maybeGossipSystemConfig { r.maybeGossipSystemConfig() pd.maybeGossipSystemConfig = false } if originReplica.StoreID == r.store.StoreID() { if pd.leaseMetricsResult != nil { r.store.metrics.leaseRequestComplete(*pd.leaseMetricsResult) } if pd.maybeGossipNodeLiveness != nil { r.maybeGossipNodeLiveness(*pd.maybeGossipNodeLiveness) } } // Satisfy the assertions for all of the items processed only on the // proposer (the block just above). pd.leaseMetricsResult = nil pd.maybeGossipNodeLiveness = nil if pd.ComputeChecksum != nil { r.computeChecksumPostApply(ctx, *pd.ComputeChecksum) pd.ComputeChecksum = nil } if (pd != ProposalData{}) { log.Fatalf(context.TODO(), "unhandled field in ProposalData: %s", pretty.Diff(pd, ProposalData{})) } if shouldAssert { // Assert that the on-disk state doesn't diverge from the in-memory // state as a result of the side effects. r.assertState(r.store.Engine()) } }
func (r *Replica) handleReplicatedProposalData( ctx context.Context, rpd storagebase.ReplicatedProposalData, ) (shouldAssert bool) { // Fields for which no action is taken in this method are zeroed so that // they don't trigger an assertion at the end of the method (which checks // that all fields were handled). { rpd.IsLeaseRequest = false rpd.IsConsistencyRelated = false rpd.IsFreeze = false rpd.Timestamp = hlc.ZeroTimestamp } if rpd.BlockReads { r.readOnlyCmdMu.Lock() defer r.readOnlyCmdMu.Unlock() rpd.BlockReads = false } // Update MVCC stats and Raft portion of ReplicaState. r.mu.Lock() r.mu.state.Stats.Add(rpd.Delta) if rpd.State.RaftAppliedIndex != 0 { r.mu.state.RaftAppliedIndex = rpd.State.RaftAppliedIndex } if rpd.State.LeaseAppliedIndex != 0 { r.mu.state.LeaseAppliedIndex = rpd.State.LeaseAppliedIndex } needsSplitBySize := r.needsSplitBySizeLocked() r.mu.Unlock() r.store.metrics.addMVCCStats(rpd.Delta) rpd.Delta = enginepb.MVCCStats{} const raftLogCheckFrequency = 1 + RaftLogQueueStaleThreshold/4 if rpd.State.RaftAppliedIndex%raftLogCheckFrequency == 1 { r.store.raftLogQueue.MaybeAdd(r, r.store.Clock().Now()) } if needsSplitBySize { r.store.splitQueue.MaybeAdd(r, r.store.Clock().Now()) } rpd.State.Stats = enginepb.MVCCStats{} rpd.State.LeaseAppliedIndex = 0 rpd.State.RaftAppliedIndex = 0 // The above are always present, so we assert only if there are // "nontrivial" actions below. shouldAssert = (rpd != storagebase.ReplicatedProposalData{}) // Process Split or Merge. This needs to happen after stats update because // of the ContainsEstimates hack. if rpd.Split != nil { // TODO(tschottdorf): We want to let the usual MVCCStats-delta // machinery update our stats for the left-hand side. But there is no // way to pass up an MVCCStats object that will clear out the // ContainsEstimates flag. We should introduce one, but the migration // makes this worth a separate effort (ContainsEstimates would need to // have three possible values, 'UNCHANGED', 'NO', and 'YES'). // Until then, we're left with this rather crude hack. { r.mu.Lock() r.mu.state.Stats.ContainsEstimates = false stats := r.mu.state.Stats r.mu.Unlock() if err := setMVCCStats(ctx, r.store.Engine(), r.RangeID, stats); err != nil { log.Fatal(ctx, errors.Wrap(err, "unable to write MVCC stats")) } } splitPostApply( r.AnnotateCtx(ctx), rpd.Split.RHSDelta, &rpd.Split.SplitTrigger, r, ) rpd.Split = nil } if rpd.Merge != nil { if err := r.store.MergeRange(ctx, r, rpd.Merge.LeftDesc.EndKey, rpd.Merge.RightDesc.RangeID, ); err != nil { // Our in-memory state has diverged from the on-disk state. log.Fatalf(ctx, "failed to update store after merging range: %s", err) } rpd.Merge = nil } // Update the remaining ReplicaState. if rpd.State.Frozen != storagebase.ReplicaState_FROZEN_UNSPECIFIED { r.mu.Lock() r.mu.state.Frozen = rpd.State.Frozen r.mu.Unlock() } rpd.State.Frozen = storagebase.ReplicaState_FROZEN_UNSPECIFIED if newDesc := rpd.State.Desc; newDesc != nil { if err := r.setDesc(newDesc); err != nil { // Log the error. There's not much we can do because the commit may // have already occurred at this point. 
log.Fatalf( ctx, "failed to update range descriptor to %+v: %s", newDesc, err, ) } rpd.State.Desc = nil } if change := rpd.ChangeReplicas; change != nil { if change.ChangeType == roachpb.REMOVE_REPLICA && r.store.StoreID() == change.Replica.StoreID { // This wants to run as late as possible, maximizing the chances // that the other nodes have finished this command as well (since // processing the removal from the queue looks up the Range at the // lease holder, being too early here turns this into a no-op). if _, err := r.store.replicaGCQueue.Add(r, replicaGCPriorityRemoved); err != nil { // Log the error; the range should still be GC'd eventually. log.Errorf(ctx, "unable to add to replica GC queue: %s", err) } } rpd.ChangeReplicas = nil } if newLease := rpd.State.Lease; newLease != nil { rpd.State.Lease = nil // for assertion r.mu.Lock() replicaID := r.mu.replicaID prevLease := r.mu.state.Lease r.mu.state.Lease = newLease r.mu.Unlock() r.leasePostApply(ctx, newLease, replicaID, prevLease) } if newTruncState := rpd.State.TruncatedState; newTruncState != nil { rpd.State.TruncatedState = nil // for assertion r.mu.Lock() r.mu.state.TruncatedState = newTruncState r.mu.Unlock() // Clear any entries in the Raft log entry cache for this range up // to and including the most recently truncated index. r.store.raftEntryCache.clearTo(r.RangeID, newTruncState.Index+1) } if newThresh := rpd.State.GCThreshold; newThresh != hlc.ZeroTimestamp { r.mu.Lock() r.mu.state.GCThreshold = newThresh r.mu.Unlock() rpd.State.GCThreshold = hlc.ZeroTimestamp } if newThresh := rpd.State.TxnSpanGCThreshold; newThresh != hlc.ZeroTimestamp { r.mu.Lock() r.mu.state.TxnSpanGCThreshold = newThresh r.mu.Unlock() rpd.State.TxnSpanGCThreshold = hlc.ZeroTimestamp } if rpd.ComputeChecksum != nil { r.computeChecksumPostApply(ctx, *rpd.ComputeChecksum) rpd.ComputeChecksum = nil } if (rpd != storagebase.ReplicatedProposalData{}) { log.Fatalf(ctx, "unhandled field in ReplicatedProposalData: %s", pretty.Diff(rpd, storagebase.ReplicatedProposalData{})) } return shouldAssert }