// recordJoinEvent begins an asynchronous task which attempts to log a
// "node join" or "node restart" event. This query will retry until it
// succeeds or the server stops.
func (n *Node) recordJoinEvent() {
	if !n.storeCfg.LogRangeEvents {
		return
	}

	logEventType := sql.EventLogNodeRestart
	if n.initialBoot {
		logEventType = sql.EventLogNodeJoin
	}

	n.stopper.RunWorker(func() {
		ctx, span := n.AnnotateCtxWithSpan(context.Background(), "record-join-event")
		defer span.Finish()
		retryOpts := base.DefaultRetryOptions()
		retryOpts.Closer = n.stopper.ShouldStop()
		for r := retry.Start(retryOpts); r.Next(); {
			if err := n.storeCfg.DB.Txn(ctx, func(txn *client.Txn) error {
				return n.eventLogger.InsertEventRecord(txn,
					logEventType,
					int32(n.Descriptor.NodeID),
					int32(n.Descriptor.NodeID),
					struct {
						Descriptor roachpb.NodeDescriptor
						ClusterID  uuid.UUID
						StartedAt  int64
					}{n.Descriptor, n.ClusterID, n.startedAt},
				)
			}); err != nil {
				log.Warningf(ctx, "%s: unable to log %s event: %s", n, logEventType, err)
			} else {
				return
			}
		}
	})
}
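// Illustrative sketch (not part of the CockroachDB source): the same
// "retry until success or shutdown" shape as recordJoinEvent, using only the
// standard library (time). The quit channel stands in for stopper.ShouldStop()
// and attempt() for the event-log transaction; both names are placeholders.
func retryUntilDoneOrQuit(attempt func() error, quit <-chan struct{}) {
	backoff := 50 * time.Millisecond
	const maxBackoff = 2 * time.Second
	for {
		if err := attempt(); err == nil {
			return // success: stop retrying, mirroring the `else { return }` above
		}
		select {
		case <-quit:
			return // shutdown requested: give up silently
		case <-time.After(backoff):
		}
		backoff *= 2
		if backoff > maxBackoff {
			backoff = maxBackoff
		}
	}
}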
// createTestNode creates an rpc server using the specified address,
// gossip instance, KV database and a node using the specified slice
// of engines. The server, clock and node are returned. If gossipBS is
// not nil, the gossip bootstrap address is set to gossipBS.
func createTestNode(
	addr net.Addr, engines []engine.Engine, gossipBS net.Addr, t *testing.T,
) (*grpc.Server, net.Addr, *hlc.Clock, *Node, *stop.Stopper) {
	cfg := storage.StoreConfig{}

	stopper := stop.NewStopper()
	cfg.Clock = hlc.NewClock(hlc.UnixNano)
	nodeRPCContext := rpc.NewContext(log.AmbientContext{}, nodeTestBaseContext, cfg.Clock, stopper)
	cfg.ScanInterval = 10 * time.Hour
	cfg.ConsistencyCheckInterval = 10 * time.Hour
	grpcServer := rpc.NewServer(nodeRPCContext)
	serverCfg := makeTestConfig()
	cfg.Gossip = gossip.NewTest(
		0,
		nodeRPCContext,
		grpcServer,
		serverCfg.GossipBootstrapResolvers,
		stopper,
		metric.NewRegistry(),
	)
	ln, err := netutil.ListenAndServeGRPC(stopper, grpcServer, addr)
	if err != nil {
		t.Fatal(err)
	}
	if gossipBS != nil {
		// Handle possibility of a :0 port specification.
		if gossipBS.Network() == addr.Network() && gossipBS.String() == addr.String() {
			gossipBS = ln.Addr()
		}
		r, err := resolver.NewResolverFromAddress(gossipBS)
		if err != nil {
			t.Fatalf("bad gossip address %s: %s", gossipBS, err)
		}
		cfg.Gossip.SetResolvers([]resolver.Resolver{r})
		cfg.Gossip.Start(ln.Addr())
	}
	retryOpts := base.DefaultRetryOptions()
	retryOpts.Closer = stopper.ShouldQuiesce()
	distSender := kv.NewDistSender(kv.DistSenderConfig{
		Clock:           cfg.Clock,
		RPCContext:      nodeRPCContext,
		RPCRetryOptions: &retryOpts,
	}, cfg.Gossip)
	cfg.AmbientCtx.Tracer = tracing.NewTracer()
	sender := kv.NewTxnCoordSender(
		cfg.AmbientCtx, distSender, cfg.Clock, false, stopper,
		kv.MakeTxnMetrics(metric.TestSampleInterval),
	)
	cfg.DB = client.NewDB(sender)
	cfg.Transport = storage.NewDummyRaftTransport()
	cfg.MetricsSampleInterval = metric.TestSampleInterval
	node := NewNode(cfg, status.NewMetricsRecorder(cfg.Clock), metric.NewRegistry(), stopper,
		kv.MakeTxnMetrics(metric.TestSampleInterval), sql.MakeEventLogger(nil))
	roachpb.RegisterInternalServer(grpcServer, node)
	return grpcServer, ln.Addr(), cfg.Clock, node, stopper
}
// execSchemaChanges releases schema leases and runs the queued
// schema changers. This needs to be run after the transaction
// scheduling the schema change has finished.
//
// The list of closures is cleared after (attempting) execution.
//
// Args:
//  results: The results from all statements in the group that scheduled the
//    schema changes we're about to execute. Results corresponding to the
//    schema change statements will be changed in case an error occurs.
func (scc *schemaChangerCollection) execSchemaChanges(
	e *Executor, planMaker *planner, results ResultList,
) {
	if planMaker.txn != nil {
		panic("trying to execute schema changes while still in a transaction")
	}
	ctx := e.AnnotateCtx(context.TODO())
	// Release the leases once a transaction is complete.
	planMaker.releaseLeases()
	if e.cfg.SchemaChangerTestingKnobs.SyncFilter != nil {
		e.cfg.SchemaChangerTestingKnobs.SyncFilter(TestingSchemaChangerCollection{scc})
	}
	// Execute any schema changes that were scheduled, in the order of the
	// statements that scheduled them.
	for _, scEntry := range scc.schemaChangers {
		sc := &scEntry.sc
		sc.db = *e.cfg.DB
		sc.testingKnobs = e.cfg.SchemaChangerTestingKnobs
		for r := retry.Start(base.DefaultRetryOptions()); r.Next(); {
			if done, err := sc.IsDone(); err != nil {
				log.Warning(ctx, err)
				break
			} else if done {
				break
			}
			if err := sc.exec(); err != nil {
				if isSchemaChangeRetryError(err) {
					// Try again.
					continue
				}
				// All other errors can be reported; we report it as the result
				// corresponding to the statement that enqueued this changer.
				// There's some sketchiness here: we assume there's a single result
				// per statement and we clobber the result/error of the corresponding
				// statement.
				// There's also another subtlety: we can only report results for
				// statements in the current batch; we can't modify the results of older
				// statements.
				if scEntry.epoch == scc.curGroupNum {
					results[scEntry.idx] = Result{Err: err}
				}
				log.Warningf(ctx, "error executing schema change: %s", err)
			}
			break
		}
	}
	scc.schemaChangers = scc.schemaChangers[:0]
}
// start launches a worker that reserves blocks of IDs from the database via
// atomic increments and feeds the individual IDs into the ids channel until
// the stopper signals shutdown.
func (ia *idAllocator) start() {
	ia.stopper.RunWorker(func() {
		ctx := ia.AnnotateCtx(context.Background())
		defer close(ia.ids)

		for {
			var newValue int64
			for newValue <= int64(ia.minID) {
				var err error
				var res client.KeyValue
				for r := retry.Start(base.DefaultRetryOptions()); r.Next(); {
					idKey := ia.idKey.Load().(roachpb.Key)
					if err := ia.stopper.RunTask(func() {
						res, err = ia.db.Inc(ctx, idKey, int64(ia.blockSize))
					}); err != nil {
						log.Warning(ctx, err)
						return
					}
					if err == nil {
						newValue = res.ValueInt()
						break
					}

					log.Warningf(ctx, "unable to allocate %d ids from %s: %s", ia.blockSize, idKey, err)
				}
				if err != nil {
					panic(fmt.Sprintf("unexpectedly exited id allocation retry loop: %s", err))
				}
			}

			end := newValue + 1
			start := end - int64(ia.blockSize)

			if start < int64(ia.minID) {
				start = int64(ia.minID)
			}

			// Add all new ids to the channel for consumption.
			for i := start; i < end; i++ {
				select {
				case ia.ids <- uint32(i):
				case <-ia.stopper.ShouldStop():
					return
				}
			}
		}
	})
}
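// Illustrative sketch (not part of the CockroachDB source): the block
// allocation idea behind idAllocator, reduced to the standard library. A
// local counter stands in for the KV increment; ids, quit and blockSize are
// placeholders for ia.ids, the stopper channel and ia.blockSize.
func allocateIDBlocks(ids chan<- uint32, quit <-chan struct{}, blockSize uint32) {
	defer close(ids)
	var counter uint32 // in the real allocator this value lives in the database
	for {
		// Reserve a whole block with one "increment", then hand the block out
		// one ID at a time so consumers rarely wait on the expensive step.
		counter += blockSize
		start, end := counter-blockSize+1, counter+1
		for i := start; i < end; i++ {
			select {
			case ids <- i:
			case <-quit:
				return
			}
		}
	}
}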
// heartbeat is called to update a node's expiration timestamp. This
// method does a conditional put on the node liveness record, and if
// successful, stores the updated liveness record in the nodes map.
func (nl *NodeLiveness) heartbeat(ctx context.Context) error {
	nodeID := nl.gossip.NodeID.Get()

	var newLiveness Liveness
	var oldLiveness *Liveness
	liveness, err := nl.GetLiveness(nodeID)
	if err == nil {
		oldLiveness = &liveness
		newLiveness = liveness
	} else {
		newLiveness = Liveness{
			NodeID: nodeID,
			Epoch:  1,
		}
	}

	// Retry heartbeat in the event the conditional put fails.
	for r := retry.StartWithCtx(ctx, base.DefaultRetryOptions()); r.Next(); {
		newLiveness.Expiration = nl.clock.Now().Add(nl.livenessThreshold.Nanoseconds(), 0)
		tryAgain := false
		if err := nl.updateLiveness(ctx, nodeID, &newLiveness, oldLiveness, func(actual Liveness) {
			oldLiveness = &actual
			newLiveness = actual
			tryAgain = true
		}); err != nil {
			nl.metrics.HeartbeatFailures.Inc(1)
			return err
		}
		if !tryAgain {
			break
		}
	}

	log.VEventf(ctx, 1, "heartbeat node %d liveness with expiration %s", nodeID, newLiveness.Expiration)
	nl.mu.Lock()
	defer nl.mu.Unlock()
	nl.mu.self = newLiveness
	nl.metrics.HeartbeatSuccesses.Inc(1)
	return nil
}
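// Illustrative sketch (not part of the CockroachDB source): the
// compare-and-swap retry shape heartbeat relies on, run against an in-memory
// map instead of the liveness table (standard library only: sync, time).
// cas, update and bumpExpiration are placeholder names; update plays the role
// of the conditional put, and its "actual value on mismatch" return mirrors
// the callback passed to updateLiveness.
type cas struct {
	mu sync.Mutex
	m  map[string]int64
}

// update writes newVal only if the stored value still equals expected; on a
// mismatch it returns the actual stored value and false.
func (c *cas) update(key string, expected, newVal int64) (int64, bool) {
	c.mu.Lock()
	defer c.mu.Unlock()
	if c.m == nil {
		c.m = make(map[string]int64)
	}
	if cur := c.m[key]; cur != expected {
		return cur, false
	}
	c.m[key] = newVal
	return newVal, true
}

func bumpExpiration(c *cas, key string, lease time.Duration) {
	var expected int64 // zero means "no existing record", like the fresh Liveness case
	for {
		newExp := time.Now().Add(lease).UnixNano()
		actual, ok := c.update(key, expected, newExp)
		if ok {
			return
		}
		expected = actual // condition failed: retry from the value actually stored
	}
}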
// InitSenderForLocalTestCluster initializes a TxnCoordSender that can be used
// with LocalTestCluster.
func InitSenderForLocalTestCluster(
	nodeDesc *roachpb.NodeDescriptor,
	tracer opentracing.Tracer,
	clock *hlc.Clock,
	latency time.Duration,
	stores client.Sender,
	stopper *stop.Stopper,
	gossip *gossip.Gossip,
) client.Sender {
	retryOpts := base.DefaultRetryOptions()
	retryOpts.Closer = stopper.ShouldQuiesce()
	senderTransportFactory := SenderTransportFactory(tracer, stores)
	distSender := NewDistSender(DistSenderConfig{
		Clock:           clock,
		RPCRetryOptions: &retryOpts,
		nodeDescriptor:  nodeDesc,
		TransportFactory: func(
			opts SendOptions,
			rpcContext *rpc.Context,
			replicas ReplicaSlice,
			args roachpb.BatchRequest,
		) (Transport, error) {
			transport, err := senderTransportFactory(opts, rpcContext, replicas, args)
			if err != nil {
				return nil, err
			}
			return &localTestClusterTransport{transport, latency}, nil
		},
	}, gossip)

	ambient := log.AmbientContext{Tracer: tracer}
	return NewTxnCoordSender(
		ambient,
		distSender,
		clock,
		false, /* !linearizable */
		stopper,
		MakeTxnMetrics(metric.TestSampleInterval),
	)
}
// StartHeartbeat starts a periodic heartbeat to refresh this node's
// last heartbeat in the node liveness table.
func (nl *NodeLiveness) StartHeartbeat(ctx context.Context, stopper *stop.Stopper) {
	log.VEventf(ctx, 1, "starting liveness heartbeat")
	retryOpts := base.DefaultRetryOptions()
	retryOpts.Closer = stopper.ShouldQuiesce()

	stopper.RunWorker(func() {
		ambient := nl.ambientCtx
		ambient.AddLogTag("hb", nil)
		ticker := time.NewTicker(nl.heartbeatInterval)
		defer ticker.Stop()
		for {
			if !nl.pauseHeartbeat.Load().(bool) {
				ctx, sp := ambient.AnnotateCtxWithSpan(context.Background(), "heartbeat")
				ctx, cancel := context.WithTimeout(ctx, nl.heartbeatInterval)
				// Retry heartbeat in the event the conditional put fails.
				for r := retry.StartWithCtx(ctx, retryOpts); r.Next(); {
					liveness, err := nl.Self()
					if err != nil && err != ErrNoLivenessRecord {
						log.Errorf(ctx, "unexpected error getting liveness: %v", err)
					}
					if err := nl.Heartbeat(ctx, liveness); err != nil {
						if err == errSkippedHeartbeat {
							continue
						}
						log.Errorf(ctx, "failed liveness heartbeat: %v", err)
					}
					break
				}
				cancel()
				sp.Finish()
			}
			select {
			case <-ticker.C:
			case <-stopper.ShouldStop():
				return
			}
		}
	})
}
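// Illustrative sketch (not part of the CockroachDB source): the periodic work
// loop used by StartHeartbeat, reduced to the standard library (context, log,
// time). Each tick gets its own timeout so a stuck heartbeat cannot outlive
// its interval; runPeriodically, beat and quit are placeholder names.
func runPeriodically(interval time.Duration, beat func(context.Context) error, quit <-chan struct{}) {
	ticker := time.NewTicker(interval)
	defer ticker.Stop()
	for {
		// Bound each attempt by the interval, as the liveness loop does with
		// context.WithTimeout(ctx, nl.heartbeatInterval).
		ctx, cancel := context.WithTimeout(context.Background(), interval)
		if err := beat(ctx); err != nil {
			log.Printf("heartbeat failed: %v", err)
		}
		cancel()
		select {
		case <-ticker.C:
		case <-quit:
			return
		}
	}
}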
// waitForStoreFrozen polls the given stores until they all report having no
// unfrozen Replicas (or an error or timeout occurs).
func (s *adminServer) waitForStoreFrozen(
	stream serverpb.Admin_ClusterFreezeServer,
	stores map[roachpb.StoreID]roachpb.NodeID,
	wantFrozen bool,
) error {
	mu := struct {
		syncutil.Mutex
		oks map[roachpb.StoreID]bool
	}{
		oks: make(map[roachpb.StoreID]bool),
	}

	opts := base.DefaultRetryOptions()
	opts.Closer = s.server.stopper.ShouldQuiesce()
	opts.MaxRetries = 20
	sem := make(chan struct{}, 256)
	errChan := make(chan error, 1)
	sendErr := func(err error) {
		select {
		case errChan <- err:
		default:
		}
	}

	numWaiting := len(stores) // loop until this drops to zero
	var err error
	for r := retry.Start(opts); r.Next(); {
		mu.Lock()
		for storeID, nodeID := range stores {
			storeID, nodeID := storeID, nodeID // loop-local copies for goroutine
			var nodeDesc roachpb.NodeDescriptor
			if err := s.server.gossip.GetInfoProto(gossip.MakeNodeIDKey(nodeID), &nodeDesc); err != nil {
				sendErr(err)
				break
			}
			addr := nodeDesc.Address.String()

			if _, inflightOrSucceeded := mu.oks[storeID]; inflightOrSucceeded {
				continue
			}
			mu.oks[storeID] = false // mark as inflight
			action := func() (err error) {
				var resp *storage.PollFrozenResponse
				defer func() {
					message := fmt.Sprintf("node %d, store %d: ", nodeID, storeID)

					if err != nil {
						message += err.Error()
					} else {
						numMismatching := len(resp.Results)
						mu.Lock()
						if numMismatching == 0 {
							// If the Store is in the right state, mark it as such.
							// This means we won't try it again.
							message += "ready"
							mu.oks[storeID] = true
						} else {
							// Otherwise, forget that we tried the Store so that
							// the retry loop picks it up again.
							message += fmt.Sprintf("%d replicas report wrong status", numMismatching)
							if limit := 10; numMismatching > limit {
								message += " [truncated]: "
								resp.Results = resp.Results[:limit]
							} else {
								message += ": "
							}
							message += fmt.Sprintf("%+v", resp.Results)
							delete(mu.oks, storeID)
						}
						mu.Unlock()
					}
					err = stream.Send(&serverpb.ClusterFreezeResponse{
						Message: message,
					})
				}()
				conn, err := s.server.rpcContext.GRPCDial(addr)
				if err != nil {
					return err
				}
				client := storage.NewFreezeClient(conn)
				resp, err = client.PollFrozen(context.TODO(),
					&storage.PollFrozenRequest{
						StoreRequestHeader: storage.StoreRequestHeader{
							NodeID:  nodeID,
							StoreID: storeID,
						},
						// If we are looking to freeze everything, we want to
						// collect thawed Replicas, and vice versa.
						CollectFrozen: !wantFrozen,
					})
				return err
			}
			// Run a limited, non-blocking task. That means the task simply
			// won't run if the semaphore is full (or the node is draining).
			// Both are handled by the surrounding retry loop.
			if err := s.server.stopper.RunLimitedAsyncTask(
				context.TODO(), sem, true /* wait */, func(_ context.Context) {
					if err := action(); err != nil {
						sendErr(err)
					}
				}); err != nil {
				// Node draining.
				sendErr(err)
				break
			}
		}

		numWaiting = len(stores)
		for _, ok := range mu.oks {
			if ok {
				// Store has reported that it is frozen.
				numWaiting--
				continue
			}
		}
		mu.Unlock()

		select {
		case err = <-errChan:
		default:
		}

		// Keep going unless there's been an error or everyone's frozen.
		if err != nil || numWaiting == 0 {
			break
		}
		if err := stream.Send(&serverpb.ClusterFreezeResponse{
			Message: fmt.Sprintf("waiting for %d store%s to apply operation",
				numWaiting, util.Pluralize(int64(numWaiting))),
		}); err != nil {
			return err
		}
	}
	if err != nil {
		return err
	}

	if numWaiting > 0 {
		err = fmt.Errorf("timed out waiting for %d store%s to report freeze",
			numWaiting, util.Pluralize(int64(numWaiting)))
	}
	return err
}
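// Illustrative sketch (not part of the CockroachDB source): the two small
// concurrency devices waitForStoreFrozen leans on, in standard-library form
// (sync). A buffered channel caps the number of in-flight probes (like the
// sem passed to RunLimitedAsyncTask), and a one-slot error channel records
// only the first failure (like sendErr). probeAll, targets and probe are
// placeholder names.
func probeAll(targets []string, probe func(string) error) error {
	sem := make(chan struct{}, 8) // at most 8 concurrent probes
	errCh := make(chan error, 1)  // keeps the first error, drops the rest
	var wg sync.WaitGroup
	for _, t := range targets {
		t := t // loop-local copy for the goroutine
		wg.Add(1)
		sem <- struct{}{} // acquire
		go func() {
			defer wg.Done()
			defer func() { <-sem }() // release
			if err := probe(t); err != nil {
				select {
				case errCh <- err:
				default:
				}
			}
		}()
	}
	wg.Wait()
	select {
	case err := <-errCh:
		return err
	default:
		return nil
	}
}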
// DefaultDBContext returns (a copy of) the default options for
// NewDBWithContext.
func DefaultDBContext() DBContext {
	return DBContext{
		UserPriority:    roachpb.NormalUserPriority,
		TxnRetryOptions: base.DefaultRetryOptions(),
	}
}
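// Illustrative sketch (not part of the CockroachDB source): why defaults are
// returned by value here and by base.DefaultRetryOptions. Each caller gets its
// own copy to tweak, so per-caller changes never leak back into the shared
// defaults. Options and DefaultOptions are placeholder stand-ins, not the real
// retry.Options API.
type Options struct {
	InitialBackoff time.Duration
	MaxRetries     int
}

func DefaultOptions() Options {
	return Options{InitialBackoff: 50 * time.Millisecond, MaxRetries: 0} // 0 = retry indefinitely
}

func exampleCustomize() Options {
	opts := DefaultOptions()
	opts.MaxRetries = 20 // affects only this copy, as with opts.MaxRetries = 20 in waitForStoreFrozen
	return opts
}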
// NewDistSender returns a batch.Sender instance which connects to the
// Cockroach cluster via the supplied gossip instance. Supplying a
// DistSenderConfig or the fields within is optional. For omitted values, sane
// defaults will be used.
func NewDistSender(cfg DistSenderConfig, g *gossip.Gossip) *DistSender {
	ds := &DistSender{gossip: g}

	ds.AmbientContext = cfg.AmbientCtx
	if ds.AmbientContext.Tracer == nil {
		ds.AmbientContext.Tracer = tracing.NewTracer()
	}

	ds.clock = cfg.Clock
	if ds.clock == nil {
		ds.clock = hlc.NewClock(hlc.UnixNano)
	}

	if cfg.nodeDescriptor != nil {
		atomic.StorePointer(&ds.nodeDescriptor, unsafe.Pointer(cfg.nodeDescriptor))
	}
	rcSize := cfg.RangeDescriptorCacheSize
	if rcSize <= 0 {
		rcSize = defaultRangeDescriptorCacheSize
	}
	rdb := cfg.RangeDescriptorDB
	if rdb == nil {
		rdb = ds
	}
	ds.rangeCache = newRangeDescriptorCache(ds.AnnotateCtx(context.TODO()), rdb, int(rcSize))
	lcSize := cfg.LeaseHolderCacheSize
	if lcSize <= 0 {
		lcSize = defaultLeaseHolderCacheSize
	}
	ds.leaseHolderCache = newLeaseHolderCache(int(lcSize))
	if cfg.RangeLookupMaxRanges <= 0 {
		ds.rangeLookupMaxRanges = defaultRangeLookupMaxRanges
	}
	if cfg.TransportFactory != nil {
		ds.transportFactory = cfg.TransportFactory
	}
	ds.rpcRetryOptions = base.DefaultRetryOptions()
	if cfg.RPCRetryOptions != nil {
		ds.rpcRetryOptions = *cfg.RPCRetryOptions
	}
	if cfg.RPCContext != nil {
		ds.rpcContext = cfg.RPCContext
		if ds.rpcRetryOptions.Closer == nil {
			ds.rpcRetryOptions.Closer = ds.rpcContext.Stopper.ShouldQuiesce()
		}
	}
	if cfg.SendNextTimeout != 0 {
		ds.sendNextTimeout = cfg.SendNextTimeout
	} else {
		ds.sendNextTimeout = defaultSendNextTimeout
	}
	if cfg.SenderConcurrency != 0 {
		ds.asyncSenderSem = make(chan struct{}, cfg.SenderConcurrency)
	} else {
		ds.asyncSenderSem = make(chan struct{}, defaultSenderConcurrency)
	}

	if g != nil {
		ctx := ds.AnnotateCtx(context.Background())
		g.RegisterCallback(gossip.KeyFirstRangeDescriptor,
			func(_ string, value roachpb.Value) {
				if log.V(1) {
					var desc roachpb.RangeDescriptor
					if err := value.GetProto(&desc); err != nil {
						log.Errorf(ctx, "unable to parse gossiped first range descriptor: %s", err)
					} else {
						log.Infof(ctx, "gossiped first range descriptor: %+v", desc.Replicas)
					}
				}
				err := ds.rangeCache.EvictCachedRangeDescriptor(roachpb.RKeyMin, nil, false)
				if err != nil {
					log.Warningf(ctx, "failed to evict first range descriptor: %s", err)
				}
			})
	}
	return ds
}
// TestMultiRangeScanWithMaxResults tests that commands which access multiple
// ranges with MaxResults parameter are carried out properly.
func TestMultiRangeScanWithMaxResults(t *testing.T) {
	defer leaktest.AfterTest(t)()
	testCases := []struct {
		splitKeys []roachpb.Key
		keys      []roachpb.Key
	}{
		{[]roachpb.Key{roachpb.Key("m")},
			[]roachpb.Key{roachpb.Key("a"), roachpb.Key("z")}},
		{[]roachpb.Key{roachpb.Key("h"), roachpb.Key("q")},
			[]roachpb.Key{roachpb.Key("b"), roachpb.Key("f"), roachpb.Key("k"),
				roachpb.Key("r"), roachpb.Key("w"), roachpb.Key("y")}},
	}

	for i, tc := range testCases {
		s, _, _ := serverutils.StartServer(t, base.TestServerArgs{})
		defer s.Stopper().Stop()
		ts := s.(*TestServer)
		retryOpts := base.DefaultRetryOptions()
		retryOpts.Closer = ts.stopper.ShouldQuiesce()
		ds := kv.NewDistSender(kv.DistSenderConfig{
			Clock:           s.Clock(),
			RPCContext:      s.RPCContext(),
			RPCRetryOptions: &retryOpts,
		}, ts.Gossip())
		ambient := log.AmbientContext{Tracer: tracing.NewTracer()}
		tds := kv.NewTxnCoordSender(
			ambient,
			ds,
			ts.Clock(),
			ts.Cfg.Linearizable,
			ts.stopper,
			kv.MakeTxnMetrics(metric.TestSampleInterval),
		)

		for _, sk := range tc.splitKeys {
			if err := ts.node.storeCfg.DB.AdminSplit(context.TODO(), sk); err != nil {
				t.Fatal(err)
			}
		}

		for _, k := range tc.keys {
			put := roachpb.NewPut(k, roachpb.MakeValueFromBytes(k))
			if _, err := client.SendWrapped(context.Background(), tds, put); err != nil {
				t.Fatal(err)
			}
		}

		// Try every possible ScanRequest startKey.
		for start := 0; start < len(tc.keys); start++ {
			// Try every possible maxResults, from 1 to beyond the size of key array.
			for maxResults := 1; maxResults <= len(tc.keys)-start+1; maxResults++ {
				scan := roachpb.NewScan(tc.keys[start], tc.keys[len(tc.keys)-1].Next())
				reply, err := client.SendWrappedWith(
					context.Background(), tds, roachpb.Header{MaxSpanRequestKeys: int64(maxResults)}, scan,
				)
				if err != nil {
					t.Fatal(err)
				}
				rows := reply.(*roachpb.ScanResponse).Rows
				if start+maxResults <= len(tc.keys) && len(rows) != maxResults {
					t.Errorf("%d: start=%s: expected %d rows, but got %d", i, tc.keys[start], maxResults, len(rows))
				} else if start+maxResults == len(tc.keys)+1 && len(rows) != maxResults-1 {
					t.Errorf("%d: expected %d rows, but got %d", i, maxResults-1, len(rows))
				}
			}
		}
	}
}
// TestMultiRangeScanDeleteRange tests that commands which access multiple
// ranges are carried out properly.
func TestMultiRangeScanDeleteRange(t *testing.T) {
	defer leaktest.AfterTest(t)()
	s, _, _ := serverutils.StartServer(t, base.TestServerArgs{})
	defer s.Stopper().Stop()
	ts := s.(*TestServer)
	retryOpts := base.DefaultRetryOptions()
	retryOpts.Closer = ts.stopper.ShouldQuiesce()
	ds := kv.NewDistSender(kv.DistSenderConfig{
		Clock:           s.Clock(),
		RPCContext:      s.RPCContext(),
		RPCRetryOptions: &retryOpts,
	}, ts.Gossip())
	ambient := log.AmbientContext{Tracer: tracing.NewTracer()}
	tds := kv.NewTxnCoordSender(
		ambient,
		ds,
		s.Clock(),
		ts.Cfg.Linearizable,
		ts.stopper,
		kv.MakeTxnMetrics(metric.TestSampleInterval),
	)

	if err := ts.node.storeCfg.DB.AdminSplit(context.TODO(), "m"); err != nil {
		t.Fatal(err)
	}
	writes := []roachpb.Key{roachpb.Key("a"), roachpb.Key("z")}
	get := &roachpb.GetRequest{
		Span: roachpb.Span{Key: writes[0]},
	}
	get.EndKey = writes[len(writes)-1]
	if _, err := client.SendWrapped(context.Background(), tds, get); err == nil {
		t.Errorf("able to call Get with a key range: %v", get)
	}
	var delTS hlc.Timestamp
	for i, k := range writes {
		put := roachpb.NewPut(k, roachpb.MakeValueFromBytes(k))
		if _, err := client.SendWrapped(context.Background(), tds, put); err != nil {
			t.Fatal(err)
		}
		scan := roachpb.NewScan(writes[0], writes[len(writes)-1].Next())
		reply, err := client.SendWrapped(context.Background(), tds, scan)
		if err != nil {
			t.Fatal(err)
		}
		sr := reply.(*roachpb.ScanResponse)
		if sr.Txn != nil {
			// This was the other way around at some point in the past.
			// Same below for Delete, etc.
			t.Errorf("expected no transaction in response header")
		}
		if rows := sr.Rows; len(rows) != i+1 {
			t.Fatalf("expected %d rows, but got %d", i+1, len(rows))
		}
	}

	del := &roachpb.DeleteRangeRequest{
		Span: roachpb.Span{
			Key:    writes[0],
			EndKey: roachpb.Key(writes[len(writes)-1]).Next(),
		},
		ReturnKeys: true,
	}
	reply, err := client.SendWrappedWith(context.Background(), tds, roachpb.Header{Timestamp: delTS}, del)
	if err != nil {
		t.Fatal(err)
	}
	dr := reply.(*roachpb.DeleteRangeResponse)
	if dr.Txn != nil {
		t.Errorf("expected no transaction in response header")
	}
	if !reflect.DeepEqual(dr.Keys, writes) {
		t.Errorf("expected %v keys to be deleted, but got %v instead", writes, dr.Keys)
	}

	scan := roachpb.NewScan(writes[0], writes[len(writes)-1].Next())
	txn := &roachpb.Transaction{Name: "MyTxn"}
	reply, err = client.SendWrappedWith(context.Background(), tds, roachpb.Header{Txn: txn}, scan)
	if err != nil {
		t.Fatal(err)
	}
	sr := reply.(*roachpb.ScanResponse)
	if txn := sr.Txn; txn == nil || txn.Name != "MyTxn" {
		t.Errorf("wanted Txn to persist, but it changed to %v", txn)
	}
	if rows := sr.Rows; len(rows) > 0 {
		t.Fatalf("scan after delete returned rows: %v", rows)
	}
}
// restoreTable restores the given TableDescriptor into the named database. If
// the name conflicts with an existing table, the one being restored is rekeyed
// with a new ID and the old data is deleted.
func restoreTable(
	ctx context.Context,
	db client.DB,
	database sqlbase.DatabaseDescriptor,
	table *sqlbase.TableDescriptor,
	ranges []sqlbase.BackupRangeDescriptor,
) error {
	if log.V(1) {
		log.Infof(ctx, "Restoring Table %q", table.Name)
	}

	var newTableID sqlbase.ID
	if err := db.Txn(ctx, func(txn *client.Txn) error {
		// Make sure there's a database with a name that matches the original.
		if _, err := getDescriptorID(txn, tableKey{name: database.Name}); err != nil {
			return errors.Wrapf(err, "a database named %q needs to exist to restore table %q",
				database.Name, table.Name)
		}

		// Assign a new ID for the table. TODO(dan): For now, we're always
		// generating a new ID, but varints get longer as they get bigger and so
		// our keys will, too. We should someday figure out how to overwrite an
		// existing table and steal its ID.
		var err error
		newTableID, err = GenerateUniqueDescID(txn)
		return err
	}); err != nil {
		return err
	}

	// Create the iteration keys before we give the table its new ID.
	tableStartKeyOld := roachpb.Key(sqlbase.MakeIndexKeyPrefix(table, table.PrimaryIndex.ID))
	tableEndKeyOld := tableStartKeyOld.PrefixEnd()

	// This loop makes restoring multiple tables O(N*M), where N is the number
	// of tables and M is the number of ranges. We could reduce this using an
	// interval tree if necessary.
	var wg sync.WaitGroup
	result := struct {
		syncutil.Mutex
		firstErr error
		numErrs  int
	}{}
	for _, rangeDesc := range ranges {
		if len(rangeDesc.Path) == 0 {
			// Empty path means empty range.
			continue
		}

		intersectBegin, intersectEnd := IntersectHalfOpen(
			rangeDesc.StartKey, rangeDesc.EndKey, tableStartKeyOld, tableEndKeyOld)
		if intersectBegin != nil && intersectEnd != nil {
			// Write the data under the new ID.
			// TODO(dan): There's no SQL descriptors that point at this yet, so it
			// should be possible to remove it from the one txn this is all currently
			// run under. If we do that, make sure this data gets cleaned up on errors.
			wg.Add(1)
			go func(desc sqlbase.BackupRangeDescriptor) {
				for r := retry.StartWithCtx(ctx, base.DefaultRetryOptions()); r.Next(); {
					err := db.Txn(ctx, func(txn *client.Txn) error {
						return Ingest(ctx, txn, desc.Path, desc.CRC, intersectBegin, intersectEnd, newTableID)
					})
					if _, ok := err.(*client.AutoCommitError); ok {
						log.Errorf(ctx, "auto commit error during ingest: %s", err)
						// TODO(dan): Ingest currently does not rely on the
						// range being empty, but the plan is that it will. When
						// that change happens, this will have to delete any
						// partially ingested data or something.
						continue
					}

					if err != nil {
						log.Errorf(ctx, "%T %s", err, err)
						result.Lock()
						defer result.Unlock()
						if result.firstErr == nil {
							result.firstErr = err
						}
						result.numErrs++
					}
					break
				}
				wg.Done()
			}(rangeDesc)
		}
	}
	wg.Wait()
	// All concurrent accesses have finished, we don't need the lock anymore.
	if result.firstErr != nil {
		// This leaves the data that did get imported in case the user wants to
		// retry.
		// TODO(dan): Build tooling to allow a user to restart a failed restore.
		return errors.Wrapf(result.firstErr, "ingest encountered %d errors", result.numErrs)
	}

	table.ID = newTableID
	return db.Txn(ctx, func(txn *client.Txn) error {
		// Pass the descriptors by value to keep this idempotent.
		return restoreTableDesc(ctx, txn, database, *table)
	})
}
// NewServer creates a Server from a server.Context.
func NewServer(cfg Config, stopper *stop.Stopper) (*Server, error) {
	if _, err := net.ResolveTCPAddr("tcp", cfg.AdvertiseAddr); err != nil {
		return nil, errors.Errorf("unable to resolve RPC address %q: %v", cfg.AdvertiseAddr, err)
	}

	if cfg.AmbientCtx.Tracer == nil {
		cfg.AmbientCtx.Tracer = tracing.NewTracer()
	}

	// Try loading the TLS configs before anything else.
	if _, err := cfg.GetServerTLSConfig(); err != nil {
		return nil, err
	}
	if _, err := cfg.GetClientTLSConfig(); err != nil {
		return nil, err
	}

	s := &Server{
		mux:     http.NewServeMux(),
		clock:   hlc.NewClock(hlc.UnixNano, cfg.MaxOffset),
		stopper: stopper,
		cfg:     cfg,
	}
	// Add a dynamic log tag value for the node ID.
	//
	// We need to pass an ambient context to the various server components, but we
	// won't know the node ID until we Start(). At that point it's too late to
	// change the ambient contexts in the components (various background processes
	// will have already started using them).
	//
	// NodeIDContainer allows us to add the log tag to the context now and update
	// the value asynchronously. It's not significantly more expensive than a
	// regular tag since it's just doing an (atomic) load when a log/trace message
	// is constructed. The node ID is set by the Store if this host was
	// bootstrapped; otherwise a new one is allocated in Node.
	s.cfg.AmbientCtx.AddLogTag("n", &s.nodeIDContainer)

	ctx := s.AnnotateCtx(context.Background())
	if s.cfg.Insecure {
		log.Warning(ctx, "running in insecure mode, this is strongly discouraged. See --insecure.")
	}

	s.rpcContext = rpc.NewContext(s.cfg.AmbientCtx, s.cfg.Config, s.clock, s.stopper)
	s.rpcContext.HeartbeatCB = func() {
		if err := s.rpcContext.RemoteClocks.VerifyClockOffset(); err != nil {
			log.Fatal(ctx, err)
		}
	}
	s.grpc = rpc.NewServer(s.rpcContext)

	s.registry = metric.NewRegistry()
	s.gossip = gossip.New(
		s.cfg.AmbientCtx,
		&s.nodeIDContainer,
		s.rpcContext,
		s.grpc,
		s.cfg.GossipBootstrapResolvers,
		s.stopper,
		s.registry,
	)
	s.storePool = storage.NewStorePool(
		s.cfg.AmbientCtx,
		s.gossip,
		s.clock,
		s.rpcContext,
		s.cfg.TimeUntilStoreDead,
		s.stopper,
		/* deterministic */ false,
	)

	// A custom RetryOptions is created which uses stopper.ShouldQuiesce() as
	// the Closer. This prevents infinite retry loops from occurring during
	// graceful server shutdown.
	//
	// Such a loop occurs when the DistSender attempts a connection to the
	// local server during shutdown, and receives an internal server error (HTTP
	// Code 5xx). This is the correct error for a server to return when it is
	// shutting down, and is normally retryable in a cluster environment.
	// However, on a single-node setup (such as a test), retries will never
	// succeed because the only server has been shut down; thus the DistSender
	// needs to know that it should not retry in this situation.
	retryOpts := base.DefaultRetryOptions()
	retryOpts.Closer = s.stopper.ShouldQuiesce()
	distSenderCfg := kv.DistSenderConfig{
		AmbientCtx:      s.cfg.AmbientCtx,
		Clock:           s.clock,
		RPCContext:      s.rpcContext,
		RPCRetryOptions: &retryOpts,
	}
	s.distSender = kv.NewDistSender(distSenderCfg, s.gossip)

	txnMetrics := kv.MakeTxnMetrics(s.cfg.MetricsSampleInterval)
	s.registry.AddMetricStruct(txnMetrics)
	s.txnCoordSender = kv.NewTxnCoordSender(
		s.cfg.AmbientCtx,
		s.distSender,
		s.clock,
		s.cfg.Linearizable,
		s.stopper,
		txnMetrics,
	)
	s.db = client.NewDB(s.txnCoordSender)

	// Use the range lease expiration and renewal durations as the node
	// liveness expiration and heartbeat interval.
	active, renewal := storage.RangeLeaseDurations(
		storage.RaftElectionTimeout(s.cfg.RaftTickInterval, s.cfg.RaftElectionTimeoutTicks))
	s.nodeLiveness = storage.NewNodeLiveness(
		s.cfg.AmbientCtx, s.clock, s.db, s.gossip, active, renewal,
	)
	s.registry.AddMetricStruct(s.nodeLiveness.Metrics())

	s.raftTransport = storage.NewRaftTransport(
		s.cfg.AmbientCtx, storage.GossipAddressResolver(s.gossip), s.grpc, s.rpcContext,
	)

	s.kvDB = kv.NewDBServer(s.cfg.Config, s.txnCoordSender, s.stopper)
	roachpb.RegisterExternalServer(s.grpc, s.kvDB)

	// Set up internal memory metrics for use by internal SQL executors.
	s.internalMemMetrics = sql.MakeMemMetrics("internal")
	s.registry.AddMetricStruct(s.internalMemMetrics)

	// Set up Lease Manager
	var lmKnobs sql.LeaseManagerTestingKnobs
	if cfg.TestingKnobs.SQLLeaseManager != nil {
		lmKnobs = *s.cfg.TestingKnobs.SQLLeaseManager.(*sql.LeaseManagerTestingKnobs)
	}
	s.leaseMgr = sql.NewLeaseManager(&s.nodeIDContainer, *s.db, s.clock, lmKnobs,
		s.stopper, &s.internalMemMetrics)
	s.leaseMgr.RefreshLeases(s.stopper, s.db, s.gossip)

	// Set up the DistSQL server
	distSQLCfg := distsql.ServerConfig{
		AmbientContext: s.cfg.AmbientCtx,
		DB:             s.db,
		RPCContext:     s.rpcContext,
		Stopper:        s.stopper,
	}
	s.distSQLServer = distsql.NewServer(distSQLCfg)
	distsql.RegisterDistSQLServer(s.grpc, s.distSQLServer)

	// Set up admin memory metrics for use by admin SQL executors.
	s.adminMemMetrics = sql.MakeMemMetrics("admin")
	s.registry.AddMetricStruct(s.adminMemMetrics)

	// Set up Executor
	execCfg := sql.ExecutorConfig{
		AmbientCtx:            s.cfg.AmbientCtx,
		NodeID:                &s.nodeIDContainer,
		DB:                    s.db,
		Gossip:                s.gossip,
		LeaseManager:          s.leaseMgr,
		Clock:                 s.clock,
		DistSQLSrv:            s.distSQLServer,
		MetricsSampleInterval: s.cfg.MetricsSampleInterval,
	}
	if s.cfg.TestingKnobs.SQLExecutor != nil {
		execCfg.TestingKnobs = s.cfg.TestingKnobs.SQLExecutor.(*sql.ExecutorTestingKnobs)
	} else {
		execCfg.TestingKnobs = &sql.ExecutorTestingKnobs{}
	}
	if s.cfg.TestingKnobs.SQLSchemaChanger != nil {
		execCfg.SchemaChangerTestingKnobs =
			s.cfg.TestingKnobs.SQLSchemaChanger.(*sql.SchemaChangerTestingKnobs)
	} else {
		execCfg.SchemaChangerTestingKnobs = &sql.SchemaChangerTestingKnobs{}
	}
	s.sqlExecutor = sql.NewExecutor(execCfg, s.stopper, &s.adminMemMetrics)
	s.registry.AddMetricStruct(s.sqlExecutor)

	s.pgServer = pgwire.MakeServer(
		s.cfg.AmbientCtx, s.cfg.Config, s.sqlExecutor, &s.internalMemMetrics, s.cfg.SQLMemoryPoolSize,
	)
	s.registry.AddMetricStruct(s.pgServer.Metrics())

	s.tsDB = ts.NewDB(s.db)
	s.tsServer = ts.MakeServer(s.cfg.AmbientCtx, s.tsDB, s.cfg.TimeSeriesServerConfig, s.stopper)

	// TODO(bdarnell): make StoreConfig configurable.
	storeCfg := storage.StoreConfig{
		AmbientCtx:                     s.cfg.AmbientCtx,
		Clock:                          s.clock,
		DB:                             s.db,
		Gossip:                         s.gossip,
		NodeLiveness:                   s.nodeLiveness,
		Transport:                      s.raftTransport,
		RaftTickInterval:               s.cfg.RaftTickInterval,
		ScanInterval:                   s.cfg.ScanInterval,
		ScanMaxIdleTime:                s.cfg.ScanMaxIdleTime,
		ConsistencyCheckInterval:       s.cfg.ConsistencyCheckInterval,
		ConsistencyCheckPanicOnFailure: s.cfg.ConsistencyCheckPanicOnFailure,
		MetricsSampleInterval:          s.cfg.MetricsSampleInterval,
		StorePool:                      s.storePool,
		SQLExecutor: sql.InternalExecutor{
			LeaseManager: s.leaseMgr,
		},
		LogRangeEvents: s.cfg.EventLogEnabled,
		AllocatorOptions: storage.AllocatorOptions{
			AllowRebalance: true,
		},
		RangeLeaseActiveDuration:  active,
		RangeLeaseRenewalDuration: renewal,
		TimeSeriesDataStore:       s.tsDB,
	}
	if s.cfg.TestingKnobs.Store != nil {
		storeCfg.TestingKnobs = *s.cfg.TestingKnobs.Store.(*storage.StoreTestingKnobs)
	}

	s.recorder = status.NewMetricsRecorder(s.clock)
	s.registry.AddMetricStruct(s.rpcContext.RemoteClocks.Metrics())

	s.runtime = status.MakeRuntimeStatSampler(s.clock)
	s.registry.AddMetricStruct(s.runtime)

	s.node = NewNode(storeCfg, s.recorder, s.registry, s.stopper, txnMetrics, sql.MakeEventLogger(s.leaseMgr))
	roachpb.RegisterInternalServer(s.grpc, s.node)
	storage.RegisterConsistencyServer(s.grpc, s.node.storesServer)
	storage.RegisterFreezeServer(s.grpc, s.node.storesServer)

	s.admin = newAdminServer(s)
	s.status = newStatusServer(
		s.cfg.AmbientCtx, s.db, s.gossip, s.recorder, s.rpcContext, s.node.stores,
	)
	for _, gw := range []grpcGatewayServer{s.admin, s.status, &s.tsServer} {
		gw.RegisterService(s.grpc)
	}

	return s, nil
}
// EnsureMigrations should be run during node startup to ensure that all
// required migrations have been run (and running all those that are definitely
// safe to run).
func (m *Manager) EnsureMigrations(ctx context.Context) error {
	// First, check whether there are any migrations that need to be run.
	completedMigrations, err := m.getCompletedMigrations(ctx)
	if err != nil {
		return err
	}
	allMigrationsCompleted := true
	for _, migration := range backwardCompatibleMigrations {
		key := migrationKey(migration)
		if _, ok := completedMigrations[string(key)]; !ok {
			allMigrationsCompleted = false
		}
	}
	if allMigrationsCompleted {
		return nil
	}

	// If there are any, grab the migration lease to ensure that only one
	// node is ever doing migrations at a time.
	// Note that we shouldn't ever let client.LeaseNotAvailableErrors cause us
	// to stop trying, because if we return an error the server will be shut down,
	// and this server being down may prevent the leaseholder from finishing.
	var lease *client.Lease
	if log.V(1) {
		log.Info(ctx, "trying to acquire lease")
	}
	for r := retry.StartWithCtx(ctx, base.DefaultRetryOptions()); r.Next(); {
		lease, err = m.leaseManager.AcquireLease(ctx, keys.MigrationLease)
		if err == nil {
			break
		}
		log.Errorf(ctx, "failed attempt to acquire migration lease: %s", err)
	}
	if err != nil {
		return errors.Wrapf(err, "failed to acquire lease for running necessary migrations")
	}

	// Ensure that we hold the lease throughout the migration process and release
	// it when we're done.
	done := make(chan interface{}, 1)
	defer func() {
		done <- nil
		if log.V(1) {
			log.Info(ctx, "trying to release the lease")
		}
		if err := m.leaseManager.ReleaseLease(ctx, lease); err != nil {
			log.Errorf(ctx, "failed to release migration lease: %s", err)
		}
	}()
	if err := m.stopper.RunAsyncTask(ctx, func(ctx context.Context) {
		select {
		case <-done:
			return
		case <-time.After(leaseRefreshInterval):
			if err := m.leaseManager.ExtendLease(ctx, lease); err != nil {
				log.Warningf(ctx, "unable to extend ownership of expiration lease: %s", err)
			}
			if m.leaseManager.TimeRemaining(lease) < leaseRefreshInterval {
				// Note that we may be able to do better than this by influencing the
				// deadline of migrations' transactions based on the lease expiration
				// time, but simply kill the process for now for the sake of simplicity.
				log.Fatal(ctx, "not enough time left on migration lease, terminating for safety")
			}
		}
	}); err != nil {
		return err
	}

	// Re-get the list of migrations in case any of them were completed between
	// our initial check and our grabbing of the lease.
	completedMigrations, err = m.getCompletedMigrations(ctx)
	if err != nil {
		return err
	}

	startTime := timeutil.Now().String()
	r := runner{
		db:          m.db,
		sqlExecutor: m.sqlExecutor,
	}
	for _, migration := range backwardCompatibleMigrations {
		key := migrationKey(migration)
		if _, ok := completedMigrations[string(key)]; ok {
			continue
		}

		if log.V(1) {
			log.Infof(ctx, "running migration %q", migration.name)
		}
		if err := migration.workFn(ctx, r); err != nil {
			return errors.Wrapf(err, "failed to run migration %q", migration.name)
		}

		if log.V(1) {
			log.Infof(ctx, "trying to persist record of completing migration %s", migration.name)
		}
		if err := m.db.Put(ctx, key, startTime); err != nil {
			return errors.Wrapf(err, "failed to persist record of completing migration %q",
				migration.name)
		}
	}

	return nil
}
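// Illustrative sketch (not part of the CockroachDB source): a looping variant
// of the lease keep-alive shape used by EnsureMigrations, using only the
// standard library (log, time). refresh, release and doWork are placeholder
// callbacks; the done channel plays the same role as the one signalled in the
// deferred release above.
func withKeepAlive(refreshEvery time.Duration, refresh func() error, release func(), doWork func() error) error {
	done := make(chan struct{})
	go func() {
		for {
			select {
			case <-done:
				return
			case <-time.After(refreshEvery):
				if err := refresh(); err != nil {
					log.Printf("unable to extend lease: %v", err)
				}
			}
		}
	}()
	defer func() {
		close(done) // stop the refresher first ...
		release()   // ... then give the lease back
	}()
	return doWork()
}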
// Publish updates a table descriptor. It also maintains the invariant that
// there are at most two versions of the descriptor out in the wild at any time
// by first waiting for all nodes to be on the current (pre-update) version of
// the table desc.
// The update closure is called after the wait, and it provides the new version
// of the descriptor to be written. In a multi-step schema operation, this
// update should perform a single step.
// The closure may be called multiple times if retries occur; make sure it does
// not have side effects.
// Returns the updated version of the descriptor.
func (s LeaseStore) Publish(
	tableID sqlbase.ID, update func(*sqlbase.TableDescriptor) error, logEvent func(*client.Txn) error,
) (*sqlbase.Descriptor, error) {
	errLeaseVersionChanged := errors.New("lease version changed")
	// Retry while getting errLeaseVersionChanged.
	for r := retry.Start(base.DefaultRetryOptions()); r.Next(); {
		// Wait until there are no unexpired leases on the previous version
		// of the table.
		expectedVersion, err := s.waitForOneVersion(tableID, base.DefaultRetryOptions())
		if err != nil {
			return nil, err
		}

		desc := &sqlbase.Descriptor{}
		// There should be only one version of the descriptor, but it's
		// a race now to update to the next version.
		err = s.db.Txn(context.TODO(), func(txn *client.Txn) error {
			descKey := sqlbase.MakeDescMetadataKey(tableID)

			// Re-read the current version of the table descriptor, this time
			// transactionally.
			if err := txn.GetProto(descKey, desc); err != nil {
				return err
			}
			tableDesc := desc.GetTable()
			if tableDesc == nil {
				return errors.Errorf("ID %d is not a table", tableID)
			}
			if expectedVersion != tableDesc.Version {
				// The version changed out from under us. Someone else must be
				// performing a schema change operation.
				if log.V(3) {
					log.Infof(txn.Context, "publish (version changed): %d != %d",
						expectedVersion, tableDesc.Version)
				}
				return errLeaseVersionChanged
			}

			// Run the update closure.
			if err := update(tableDesc); err != nil {
				return err
			}

			// Bump the version and modification time.
			tableDesc.Version++
			now := s.clock.Now()
			tableDesc.ModificationTime = now
			if log.V(3) {
				log.Infof(txn.Context, "publish: descID=%d (%s) version=%d mtime=%s",
					tableDesc.ID, tableDesc.Name, tableDesc.Version, now.GoTime())
			}
			if err := tableDesc.ValidateTable(); err != nil {
				return err
			}

			// Write the updated descriptor.
			txn.SetSystemConfigTrigger()
			b := txn.NewBatch()
			b.Put(descKey, desc)
			if logEvent != nil {
				// If an event log is required for this update, ensure that the
				// descriptor change occurs first in the transaction. This is
				// necessary to ensure that the System configuration change is
				// gossiped. See the documentation for
				// transaction.SetSystemConfigTrigger() for more information.
				if err := txn.Run(b); err != nil {
					return err
				}
				if err := logEvent(txn); err != nil {
					return err
				}
				return txn.Commit()
			}
			// More efficient batching can be used if no event log message
			// is required.
			return txn.CommitInBatch(b)
		})

		switch err {
		case nil, errDidntUpdateDescriptor:
			return desc, nil
		case errLeaseVersionChanged:
			// will loop around to retry
		default:
			return nil, err
		}
	}

	panic("not reached")
}
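// Illustrative sketch (not part of the CockroachDB source): the sentinel-error
// retry shape Publish uses, with standard-library pieces only (errors).
// errRetry plays the role of errLeaseVersionChanged: it tells the loop to try
// again, while any other error (or success) ends the loop. attempt and
// maxAttempts are placeholders.
var errRetry = errors.New("state changed, retry")

func runWithSentinelRetry(attempt func() error, maxAttempts int) error {
	for i := 0; i < maxAttempts; i++ {
		switch err := attempt(); err {
		case nil:
			return nil
		case errRetry:
			// Someone else moved the state forward under us; loop around and
			// re-read before trying again, as Publish does on
			// errLeaseVersionChanged.
			continue
		default:
			return err
		}
	}
	return errors.New("too many retries")
}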