func (at *allocatorTest) Run(ctx context.Context, t *testing.T) {
	at.f = MakeFarmer(t, at.Prefix, stopper)

	if at.CockroachDiskSizeGB != 0 {
		at.f.AddVars["cockroach_disk_size"] = strconv.Itoa(at.CockroachDiskSizeGB)
	}

	log.Infof(ctx, "creating cluster with %d node(s)", at.StartNodes)
	if err := at.f.Resize(at.StartNodes); err != nil {
		t.Fatal(err)
	}
	CheckGossip(ctx, t, at.f, longWaitTime, HasPeers(at.StartNodes))
	at.f.Assert(ctx, t)
	log.Info(ctx, "initial cluster is up")

	// We must stop the cluster because a) `nodectl` pokes at the data directory
	// and, more importantly, b) we don't want the cluster above and the cluster
	// below to ever talk to each other (see #7224).
	log.Info(ctx, "stopping cluster")
	for i := 0; i < at.f.NumNodes(); i++ {
		if err := at.f.Kill(ctx, i); err != nil {
			t.Fatalf("error stopping node %d: %s", i, err)
		}
	}

	log.Info(ctx, "downloading archived stores from Google Cloud Storage in parallel")
	errors := make(chan error, at.f.NumNodes())
	for i := 0; i < at.f.NumNodes(); i++ {
		go func(nodeNum int) {
			errors <- at.f.Exec(nodeNum, "./nodectl download "+at.StoreURL)
		}(i)
	}
	for i := 0; i < at.f.NumNodes(); i++ {
		if err := <-errors; err != nil {
			t.Fatalf("error downloading store %d: %s", i, err)
		}
	}

	log.Info(ctx, "restarting cluster with archived store(s)")
	for i := 0; i < at.f.NumNodes(); i++ {
		if err := at.f.Restart(ctx, i); err != nil {
			t.Fatalf("error restarting node %d: %s", i, err)
		}
	}
	at.f.Assert(ctx, t)

	log.Infof(ctx, "resizing cluster to %d nodes", at.EndNodes)
	if err := at.f.Resize(at.EndNodes); err != nil {
		t.Fatal(err)
	}
	CheckGossip(ctx, t, at.f, longWaitTime, HasPeers(at.EndNodes))
	at.f.Assert(ctx, t)

	log.Info(ctx, "waiting for rebalance to finish")
	if err := at.WaitForRebalance(ctx, t); err != nil {
		t.Fatal(err)
	}
	at.f.Assert(ctx, t)
}
// Stop signals all live workers to stop and then waits for each to
// confirm it has stopped.
func (s *Stopper) Stop() {
	defer s.Recover()
	defer unregister(s)

	log.Info(context.TODO(), "stop has been called, stopping or quiescing all running tasks")
	// Don't bother doing stuff cleanly if we're panicking, that would likely
	// block. Instead, best effort only. This cleans up the stack traces,
	// avoids stalls and helps some tests in `./cli` finish cleanly (where
	// panics happen on purpose).
	if r := recover(); r != nil {
		go s.Quiesce()
		close(s.stopper)
		close(s.stopped)
		s.mu.Lock()
		for _, c := range s.mu.closers {
			go c.Close()
		}
		s.mu.Unlock()
		panic(r)
	}

	s.Quiesce()
	close(s.stopper)
	s.stop.Wait()
	s.mu.Lock()
	defer s.mu.Unlock()
	for _, c := range s.mu.closers {
		c.Close()
	}
	close(s.stopped)
}
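// What follows is a minimal, self-contained sketch of how callers elsewhere in
// this section drive a Stopper (RunWorker, AddCloser, ShouldStop, Stop). The
// import path and the NewStopper constructor are assumed for illustration and
// are not part of the snippet above.
package main

import (
	"time"

	"github.com/cockroachdb/cockroach/pkg/util/stop"
)

func main() {
	s := stop.NewStopper()

	// Cleanup registered here runs as part of Stop().
	s.AddCloser(stop.CloserFn(func() { /* release resources */ }))

	// A long-running worker that exits once the stopper begins stopping.
	s.RunWorker(func() {
		for {
			select {
			case <-time.After(100 * time.Millisecond):
				// Periodic work would go here.
			case <-s.ShouldStop():
				return
			}
		}
	})

	// Stop signals the worker, waits for it to finish, then runs the closers.
	s.Stop()
}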
func (rq *replicateQueue) process(
	ctx context.Context, now hlc.Timestamp, repl *Replica, sysCfg config.SystemConfig,
) error {
	retryOpts := retry.Options{
		InitialBackoff: 50 * time.Millisecond,
		MaxBackoff:     1 * time.Second,
		Multiplier:     2,
		MaxRetries:     5,
	}

	// Use a retry loop in order to back off in the case of preemptive
	// snapshot errors, usually signalling that a rebalancing
	// reservation could not be made with the selected target.
	for r := retry.StartWithCtx(ctx, retryOpts); r.Next(); {
		if err := rq.processOneChange(ctx, now, repl, sysCfg); err != nil {
			if IsPreemptiveSnapshotError(err) {
				// If ChangeReplicas failed because the preemptive snapshot failed, we
				// log the error but then return success indicating we should retry the
				// operation. The most likely causes of the preemptive snapshot failing are
				// a declined reservation or the remote node being unavailable. In either
				// case we don't want to wait another scanner cycle before reconsidering
				// the range.
				log.Info(ctx, err)
				continue
			}
			return err
		}

		// Enqueue this replica again to see if there are more changes to be made.
		rq.MaybeAdd(repl, rq.clock.Now())
		return nil
	}
	return errors.Errorf("failed to replicate %s after %d retries", repl, retryOpts.MaxRetries)
}
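// A minimal sketch of the exponential-backoff loop shape used in process()
// above, assuming the util/retry import path; attempt() is a hypothetical
// operation standing in for processOneChange.
package main

import (
	"context"
	"errors"
	"fmt"
	"time"

	"github.com/cockroachdb/cockroach/pkg/util/retry"
)

func attempt(i int) error {
	if i < 2 {
		return errors.New("transient failure")
	}
	return nil
}

func main() {
	opts := retry.Options{
		InitialBackoff: 50 * time.Millisecond, // wait before the first retry
		MaxBackoff:     1 * time.Second,       // cap on the backoff
		Multiplier:     2,                     // exponential growth factor
		MaxRetries:     5,                     // give up after this many retries
	}
	ctx := context.Background()
	i := 0
	for r := retry.StartWithCtx(ctx, opts); r.Next(); {
		if err := attempt(i); err != nil {
			fmt.Println("retrying after error:", err)
			i++
			continue
		}
		fmt.Println("succeeded")
		return
	}
	fmt.Println("gave up after retries")
}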
func (c *sqlConn) Close() {
	if c.conn != nil {
		err := c.conn.Close()
		if err != nil && err != driver.ErrBadConn {
			log.Info(context.TODO(), err)
		}
		c.conn = nil
	}
}
// deleteAllRows runs the kv operations necessary to delete all sql rows in the
// table passed at construction. This may require a scan.
//
// resume is the resume-span which should be used for the table deletion when
// the table deletion is chunked. The first call to this method should use a
// zero resume-span. After a chunk is deleted a new resume-span is returned.
//
// limit is a limit on either the number of keys or table-rows (for
// interleaved tables) deleted in the operation.
func (td *tableDeleter) deleteAllRows(
	ctx context.Context, resume roachpb.Span, limit int64,
) (roachpb.Span, error) {
	if td.rd.helper.tableDesc.IsInterleaved() {
		if log.V(2) {
			log.Info(ctx, "delete forced to scan: table is interleaved")
		}
		return td.deleteAllRowsScan(ctx, resume, limit)
	}
	return td.deleteAllRowsFast(ctx, resume, limit)
}
// deleteIndex runs the kv operations necessary to delete all kv entries in the
// given index. This may require a scan.
//
// resume is the resume-span which should be used for the index deletion
// when the index deletion is chunked. The first call to this method should
// use a zero resume-span. After a chunk of the index is deleted a new resume-
// span is returned.
//
// limit is a limit on the number of index entries deleted in the operation.
func (td *tableDeleter) deleteIndex(
	ctx context.Context, idx *sqlbase.IndexDescriptor, resume roachpb.Span, limit int64,
) (roachpb.Span, error) {
	if len(idx.Interleave.Ancestors) > 0 || len(idx.InterleavedBy) > 0 {
		if log.V(2) {
			log.Info(ctx, "delete forced to scan: table is interleaved")
		}
		return td.deleteIndexScan(ctx, idx, resume, limit)
	}
	return td.deleteIndexFast(ctx, idx, resume, limit)
}
// fastPathAvailable returns true if the fastDelete optimization can be used.
func (td *tableDeleter) fastPathAvailable(ctx context.Context) bool {
	if len(td.rd.helper.indexes) != 0 {
		if log.V(2) {
			log.Infof(ctx, "delete forced to scan: values required to update %d secondary indexes",
				len(td.rd.helper.indexes))
		}
		return false
	}
	if td.rd.helper.tableDesc.IsInterleaved() {
		if log.V(2) {
			log.Info(ctx, "delete forced to scan: table is interleaved")
		}
		return false
	}
	if len(td.rd.helper.tableDesc.PrimaryIndex.ReferencedBy) > 0 {
		if log.V(2) {
			log.Info(ctx, "delete forced to scan: table is referenced by foreign keys")
		}
		return false
	}
	return true
}
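// A small sketch of the verbosity-gated logging pattern used above: the
// log.V(2) guard skips the log call (and any argument formatting) unless
// verbose logging at level 2 is enabled. The util/log import path is assumed
// for illustration.
package main

import (
	"context"

	"github.com/cockroachdb/cockroach/pkg/util/log"
)

func main() {
	ctx := context.Background()
	if log.V(2) {
		// Only reached when verbosity level 2 is enabled for this file.
		log.Infof(ctx, "expensive detail: %d secondary indexes", 3)
	}
}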
func (m *Manager) getCompletedMigrations(ctx context.Context) (map[string]struct{}, error) {
	if log.V(1) {
		log.Info(ctx, "trying to get the list of completed migrations")
	}
	keyvals, err := m.db.Scan(ctx, keys.MigrationPrefix, keys.MigrationKeyMax, 0 /* maxRows */)
	if err != nil {
		return nil, errors.Wrapf(err, "failed to get list of completed migrations")
	}
	completedMigrations := make(map[string]struct{})
	for _, keyval := range keyvals {
		completedMigrations[string(keyval.Key)] = struct{}{}
	}
	return completedMigrations, nil
}
// WaitForRebalance waits until there's been no recent range adds, removes, or
// splits. We wait until the cluster is balanced or until `StableInterval`
// elapses, whichever comes first. Then, it prints stats about the rebalancing
// process. If the replica count appears unbalanced, an error is returned.
//
// This method is crude but necessary. If we were to wait until range counts
// were just about even, we'd miss potential post-rebalance thrashing.
func (at *allocatorTest) WaitForRebalance(ctx context.Context, t *testing.T) error {
	const statsInterval = 20 * time.Second

	db, err := gosql.Open("postgres", at.f.PGUrl(ctx, 0))
	if err != nil {
		return err
	}
	defer func() {
		_ = db.Close()
	}()

	var statsTimer timeutil.Timer
	var assertTimer timeutil.Timer
	defer statsTimer.Stop()
	defer assertTimer.Stop()
	statsTimer.Reset(statsInterval)
	assertTimer.Reset(0)
	for {
		select {
		case <-statsTimer.C:
			statsTimer.Read = true
			stats, err := at.allocatorStats(db)
			if err != nil {
				return err
			}

			log.Info(ctx, stats)
			if StableInterval <= stats.ElapsedSinceLastEvent {
				host := at.f.Hostname(0)
				log.Infof(context.Background(), "replica count = %f, max = %f",
					stats.ReplicaCountStdDev, *flagATMaxStdDev)
				if stats.ReplicaCountStdDev > *flagATMaxStdDev {
					_ = at.printRebalanceStats(db, host)
					return errors.Errorf(
						"%s elapsed without changes, but replica count standard "+
							"deviation is %.2f (>%.2f)", stats.ElapsedSinceLastEvent,
						stats.ReplicaCountStdDev, *flagATMaxStdDev)
				}
				return at.printRebalanceStats(db, host)
			}
			statsTimer.Reset(statsInterval)
		case <-assertTimer.C:
			assertTimer.Read = true
			at.f.Assert(ctx, t)
			assertTimer.Reset(time.Minute)
		case <-stopper.ShouldStop():
			return errors.New("interrupted")
		}
	}
}
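// A minimal sketch of the timeutil.Timer pattern used above: after receiving
// from C, the Read field must be set before the next Reset so the wrapper
// knows the channel was drained. The import path is assumed for illustration.
package main

import (
	"fmt"
	"time"

	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
)

func main() {
	var t timeutil.Timer
	defer t.Stop()
	t.Reset(0) // fire immediately on the first iteration
	for i := 0; i < 3; i++ {
		<-t.C
		t.Read = true // mark the channel as drained before resetting
		fmt.Println("tick", i)
		t.Reset(10 * time.Millisecond)
	}
}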
// addHistory persists a line of input to the readline history
// file.
func (c *cliState) addHistory(line string) {
	if !isInteractive {
		return
	}

	// ins.SaveHistory will push the command into memory and try to
	// persist it to disk (if ins's config.HistoryFile is set). err can
	// only be non-nil if it hit an I/O error while trying to persist.
	if err := c.ins.SaveHistory(line); err != nil {
		log.Warningf(context.TODO(), "cannot save command-line history: %s", err)
		log.Info(context.TODO(), "command-line history will not be saved in this session")
		cfg := c.ins.Config.Clone()
		cfg.HistoryFile = ""
		c.ins.SetConfig(cfg)
	}
}
func (rq *replicateQueue) addReplica(
	ctx context.Context,
	repl *Replica,
	repDesc roachpb.ReplicaDescriptor,
	desc *roachpb.RangeDescriptor,
) error {
	err := repl.ChangeReplicas(ctx, roachpb.ADD_REPLICA, repDesc, desc)
	if IsPreemptiveSnapshotError(err) {
		// If the ChangeReplicas failed because the preemptive snapshot failed, we
		// log the error but then return success indicating we should retry the
		// operation. The most likely causes of the preemptive snapshot failing are
		// a declined reservation or the remote node being unavailable. In either
		// case we don't want to wait another scanner cycle before reconsidering
		// the range.
		log.Info(ctx, err)
		return nil
	}
	return err
}
func (c *cliState) doStart(nextState cliStateEnum) cliStateEnum {
	// Common initialization.
	c.syntax = parser.Traditional
	c.querySyntax = true
	c.partialLines = []string{}

	if isInteractive {
		c.fullPrompt, c.continuePrompt = preparePrompts(c.conn.url)

		// We only enable history management when the terminal is actually
		// interactive. This saves on memory when e.g. piping a large SQL
		// script through the command-line client.
		userAcct, err := user.Current()
		if err != nil {
			if log.V(2) {
				log.Warningf(context.TODO(), "cannot retrieve user information: %s", err)
				log.Info(context.TODO(), "cannot load or save the command-line history")
			}
		} else {
			histFile := filepath.Join(userAcct.HomeDir, cmdHistFile)
			cfg := c.ins.Config.Clone()
			cfg.HistoryFile = histFile
			cfg.HistorySearchFold = true
			c.ins.SetConfig(cfg)
		}

		// The user only gets to see the info screen in interactive sessions.
		fmt.Print(infoMessage)

		c.checkSyntax = true
		c.normalizeHistory = true
		c.errExit = false
	} else {
		// When running non-interactively, by default we want errors to stop
		// further processing and all syntax checking to be done
		// server-side.
		c.errExit = true
		c.checkSyntax = false
	}

	return nextState
}
func (at *allocatorTest) runSchemaChanges(ctx context.Context, t *testing.T) error {
	db, err := gosql.Open("postgres", at.f.PGUrl(ctx, 0))
	if err != nil {
		return err
	}
	defer func() {
		_ = db.Close()
	}()

	const tableName = "datablocks.blocks"
	schemaChanges := []string{
		"ALTER TABLE %s ADD COLUMN newcol DECIMAL DEFAULT (DECIMAL '1.4')",
		"CREATE INDEX foo ON %s (block_id)",
	}
	var wg sync.WaitGroup
	wg.Add(len(schemaChanges))
	for i := range schemaChanges {
		go func(i int) {
			// Mark this schema change as finished even on the error path, so
			// that wg.Wait() below cannot block forever.
			defer wg.Done()
			start := timeutil.Now()
			cmd := fmt.Sprintf(schemaChanges[i], tableName)
			log.Infof(ctx, "starting schema change: %s", cmd)
			if _, err := db.Exec(cmd); err != nil {
				log.Infof(ctx, "hit schema change error: %s, for %s, in %s", err, cmd, timeutil.Since(start))
				t.Error(err)
				return
			}
			log.Infof(ctx, "completed schema change: %s, in %s", cmd, timeutil.Since(start))
			// TODO(vivek): Monitor progress of schema changes and log progress.
		}(i)
	}

	wg.Wait()

	log.Info(ctx, "validate applied schema changes")
	if err := at.ValidateSchemaChanges(ctx, t); err != nil {
		t.Fatal(err)
	}
	return nil
}
// EnsureMigrations should be run during node startup to ensure that all
// required migrations have been run (and running all those that are definitely
// safe to run).
func (m *Manager) EnsureMigrations(ctx context.Context) error {
	// First, check whether there are any migrations that need to be run.
	completedMigrations, err := m.getCompletedMigrations(ctx)
	if err != nil {
		return err
	}
	allMigrationsCompleted := true
	for _, migration := range backwardCompatibleMigrations {
		key := migrationKey(migration)
		if _, ok := completedMigrations[string(key)]; !ok {
			allMigrationsCompleted = false
		}
	}
	if allMigrationsCompleted {
		return nil
	}

	// If there are any, grab the migration lease to ensure that only one
	// node is ever doing migrations at a time.
	// Note that we shouldn't ever let client.LeaseNotAvailableErrors cause us
	// to stop trying, because if we return an error the server will be shut down,
	// and this server being down may prevent the leaseholder from finishing.
	var lease *client.Lease
	if log.V(1) {
		log.Info(ctx, "trying to acquire lease")
	}
	for r := retry.StartWithCtx(ctx, base.DefaultRetryOptions()); r.Next(); {
		lease, err = m.leaseManager.AcquireLease(ctx, keys.MigrationLease)
		if err == nil {
			break
		}
		log.Errorf(ctx, "failed attempt to acquire migration lease: %s", err)
	}
	if err != nil {
		return errors.Wrapf(err, "failed to acquire lease for running necessary migrations")
	}

	// Ensure that we hold the lease throughout the migration process and release
	// it when we're done.
	done := make(chan interface{}, 1)
	defer func() {
		done <- nil
		if log.V(1) {
			log.Info(ctx, "trying to release the lease")
		}
		if err := m.leaseManager.ReleaseLease(ctx, lease); err != nil {
			log.Errorf(ctx, "failed to release migration lease: %s", err)
		}
	}()
	if err := m.stopper.RunAsyncTask(ctx, func(ctx context.Context) {
		select {
		case <-done:
			return
		case <-time.After(leaseRefreshInterval):
			if err := m.leaseManager.ExtendLease(ctx, lease); err != nil {
				log.Warningf(ctx, "unable to extend ownership of expiration lease: %s", err)
			}
			if m.leaseManager.TimeRemaining(lease) < leaseRefreshInterval {
				// Note that we may be able to do better than this by influencing the
				// deadline of migrations' transactions based on the lease expiration
				// time, but simply kill the process for now for the sake of simplicity.
				log.Fatal(ctx, "not enough time left on migration lease, terminating for safety")
			}
		}
	}); err != nil {
		return err
	}

	// Re-get the list of migrations in case any of them were completed between
	// our initial check and our grabbing of the lease.
	completedMigrations, err = m.getCompletedMigrations(ctx)
	if err != nil {
		return err
	}

	startTime := timeutil.Now().String()
	r := runner{
		db:          m.db,
		sqlExecutor: m.sqlExecutor,
	}
	for _, migration := range backwardCompatibleMigrations {
		key := migrationKey(migration)
		if _, ok := completedMigrations[string(key)]; ok {
			continue
		}

		if log.V(1) {
			log.Infof(ctx, "running migration %q", migration.name)
		}
		if err := migration.workFn(ctx, r); err != nil {
			return errors.Wrapf(err, "failed to run migration %q", migration.name)
		}

		if log.V(1) {
			log.Infof(ctx, "trying to persist record of completing migration %s", migration.name)
		}
		if err := m.db.Put(ctx, key, startTime); err != nil {
			return errors.Wrapf(err, "failed to persist record of completing migration %q",
				migration.name)
		}
	}

	return nil
}
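// A stdlib-only sketch of the lease keep-alive shape used in EnsureMigrations
// above: a background task periodically extends a lease-like resource until
// the main work signals completion on a done channel. Unlike the snippet
// above it loops; extend() is a hypothetical stand-in for ExtendLease.
package main

import (
	"fmt"
	"time"
)

func extend() { fmt.Println("extending lease") } // hypothetical stand-in

func main() {
	done := make(chan struct{}, 1)
	go func() {
		for {
			select {
			case <-done:
				return
			case <-time.After(50 * time.Millisecond):
				extend()
			}
		}
	}()

	time.Sleep(120 * time.Millisecond) // stand-in for running the migrations
	done <- struct{}{}                 // stop the keep-alive before releasing the lease
}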
// Start starts the server on the specified port, starts gossip and initializes
// the node using the engines from the server's context.
//
// The passed context can be used to trace the server startup. The context
// should represent the general startup operation.
func (s *Server) Start(ctx context.Context) error {
	ctx = s.AnnotateCtx(ctx)

	startTime := timeutil.Now()

	tlsConfig, err := s.cfg.GetServerTLSConfig()
	if err != nil {
		return err
	}

	httpServer := netutil.MakeServer(s.stopper, tlsConfig, s)
	plainRedirectServer := netutil.MakeServer(s.stopper, tlsConfig,
		http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
			http.Redirect(w, r, "https://"+r.Host+r.RequestURI, http.StatusPermanentRedirect)
		}))

	// The following code is a specialization of util/net.go's ListenAndServe
	// which adds pgwire support. A single port is used to serve all protocols
	// (pg, http, h2) via the following construction:
	//
	// non-TLS case:
	// net.Listen -> cmux.New
	//               |
	//               -  -> pgwire.Match -> pgwire.Server.ServeConn
	//               -  -> cmux.Any -> grpc.(*Server).Serve
	//
	// TLS case:
	// net.Listen -> cmux.New
	//               |
	//               -  -> pgwire.Match -> pgwire.Server.ServeConn
	//               -  -> cmux.Any -> grpc.(*Server).Serve
	//
	// Note that the difference between the TLS and non-TLS cases exists due to
	// Go's lack of an h2c (HTTP2 Clear Text) implementation. See inline comments
	// in util.ListenAndServe for an explanation of how h2c is implemented there
	// and here.
	ln, err := net.Listen("tcp", s.cfg.Addr)
	if err != nil {
		return err
	}
	log.Eventf(ctx, "listening on port %s", s.cfg.Addr)
	unresolvedListenAddr, err := officialAddr(s.cfg.Addr, ln.Addr())
	if err != nil {
		return err
	}
	s.cfg.Addr = unresolvedListenAddr.String()
	unresolvedAdvertAddr, err := officialAddr(s.cfg.AdvertiseAddr, ln.Addr())
	if err != nil {
		return err
	}
	s.cfg.AdvertiseAddr = unresolvedAdvertAddr.String()

	s.rpcContext.SetLocalInternalServer(s.node)

	m := cmux.New(ln)
	pgL := m.Match(pgwire.Match)
	anyL := m.Match(cmux.Any())

	httpLn, err := net.Listen("tcp", s.cfg.HTTPAddr)
	if err != nil {
		return err
	}
	unresolvedHTTPAddr, err := officialAddr(s.cfg.HTTPAddr, httpLn.Addr())
	if err != nil {
		return err
	}
	s.cfg.HTTPAddr = unresolvedHTTPAddr.String()

	workersCtx := s.AnnotateCtx(context.Background())

	s.stopper.RunWorker(func() {
		<-s.stopper.ShouldQuiesce()
		if err := httpLn.Close(); err != nil {
			log.Fatal(workersCtx, err)
		}
	})

	if tlsConfig != nil {
		httpMux := cmux.New(httpLn)
		clearL := httpMux.Match(cmux.HTTP1())
		tlsL := httpMux.Match(cmux.Any())

		s.stopper.RunWorker(func() {
			netutil.FatalIfUnexpected(httpMux.Serve())
		})

		s.stopper.RunWorker(func() {
			netutil.FatalIfUnexpected(plainRedirectServer.Serve(clearL))
		})

		httpLn = tls.NewListener(tlsL, tlsConfig)
	}

	s.stopper.RunWorker(func() {
		netutil.FatalIfUnexpected(httpServer.Serve(httpLn))
	})

	s.stopper.RunWorker(func() {
		<-s.stopper.ShouldQuiesce()
		netutil.FatalIfUnexpected(anyL.Close())
		<-s.stopper.ShouldStop()
		s.grpc.Stop()
	})

	s.stopper.RunWorker(func() {
		netutil.FatalIfUnexpected(s.grpc.Serve(anyL))
	})

	// Running the SQL migrations safely requires that we aren't serving SQL
	// requests at the same time -- to ensure that, block the serving of SQL
	// traffic until the migrations are done, as indicated by this channel.
	serveSQL := make(chan bool)
	s.stopper.RunWorker(func() {
		select {
		case <-serveSQL:
		case <-s.stopper.ShouldQuiesce():
			return
		}
		pgCtx := s.pgServer.AmbientCtx.AnnotateCtx(context.Background())
		netutil.FatalIfUnexpected(httpServer.ServeWith(s.stopper, pgL, func(conn net.Conn) {
			connCtx := log.WithLogTagStr(pgCtx, "client", conn.RemoteAddr().String())
			if err := s.pgServer.ServeConn(connCtx, conn); err != nil &&
				!netutil.IsClosedConnection(err) {
				// Report the error on this connection's context, so that we
				// know which remote client caused the error when looking at
				// the logs.
				log.Error(connCtx, err)
			}
		}))
	})

	// Enable the debug endpoints first to provide an earlier window
	// into what's going on with the node in advance of exporting node
	// functionality.
	// TODO(marc): when cookie-based authentication exists,
	// apply it for all web endpoints.
	s.mux.HandleFunc(debugEndpoint, http.HandlerFunc(handleDebug))

	s.gossip.Start(unresolvedAdvertAddr)
	log.Event(ctx, "started gossip")

	s.engines, err = s.cfg.CreateEngines()
	if err != nil {
		return errors.Wrap(err, "failed to create engines")
	}
	s.stopper.AddCloser(&s.engines)

	// We might have to sleep a bit to protect against this node producing non-
	// monotonic timestamps. Before restarting, its clock might have been driven
	// by other nodes' fast clocks, but when we restarted, we lost all this
	// information. For example, a client might have written a value at a
	// timestamp that's in the future of the restarted node's clock, and if we
	// don't do something, the same client's read would not return the written
	// value. So, we wait up to MaxOffset; we couldn't have served timestamps more
	// than MaxOffset in the future (assuming that MaxOffset was not changed, see
	// #9733).
	//
	// As an optimization for tests, we don't sleep if all the stores are brand
	// new. In this case, the node will not serve anything anyway until it
	// synchronizes with other nodes.
	{
		anyStoreBootstrapped := false
		for _, e := range s.engines {
			if _, err := storage.ReadStoreIdent(ctx, e); err != nil {
				// NotBootstrappedError is expected.
				if _, ok := err.(*storage.NotBootstrappedError); !ok {
					return err
				}
			} else {
				anyStoreBootstrapped = true
				break
			}
		}
		if anyStoreBootstrapped {
			sleepDuration := s.clock.MaxOffset() - timeutil.Since(startTime)
			if sleepDuration > 0 {
				log.Infof(ctx, "sleeping for %s to guarantee HLC monotonicity", sleepDuration)
				time.Sleep(sleepDuration)
			}
		}
	}

	// Now that we have a monotonic HLC wrt previous incarnations of the process,
	// init all the replicas.
	err = s.node.start(
		ctx,
		unresolvedAdvertAddr,
		s.engines,
		s.cfg.NodeAttributes,
		s.cfg.Locality,
	)
	if err != nil {
		return err
	}
	log.Event(ctx, "started node")

	// We can now add the node registry.
	s.recorder.AddNode(s.registry, s.node.Descriptor, s.node.startedAt)

	// Begin recording runtime statistics.
	s.startSampleEnvironment(s.cfg.MetricsSampleInterval)

	// Begin recording time series data collected by the status monitor.
	s.tsDB.PollSource(
		s.cfg.AmbientCtx, s.recorder, s.cfg.MetricsSampleInterval, ts.Resolution10s, s.stopper,
	)

	// Begin recording status summaries.
	s.node.startWriteSummaries(s.cfg.MetricsSampleInterval)

	// Create and start the schema change manager only after a NodeID
	// has been assigned.
	testingKnobs := &sql.SchemaChangerTestingKnobs{}
	if s.cfg.TestingKnobs.SQLSchemaChanger != nil {
		testingKnobs = s.cfg.TestingKnobs.SQLSchemaChanger.(*sql.SchemaChangerTestingKnobs)
	}
	sql.NewSchemaChangeManager(testingKnobs, *s.db, s.gossip, s.leaseMgr).Start(s.stopper)

	s.sqlExecutor.Start(ctx, &s.adminMemMetrics, s.node.Descriptor)
	s.distSQLServer.Start()

	log.Infof(ctx, "starting %s server at %s", s.cfg.HTTPRequestScheme(), unresolvedHTTPAddr)
	log.Infof(ctx, "starting grpc/postgres server at %s", unresolvedListenAddr)
	log.Infof(ctx, "advertising CockroachDB node at %s", unresolvedAdvertAddr)

	s.stopper.RunWorker(func() {
		netutil.FatalIfUnexpected(m.Serve())
	})

	if len(s.cfg.SocketFile) != 0 {
		log.Infof(ctx, "starting postgres server at unix:%s", s.cfg.SocketFile)

		// Unix socket enabled: postgres protocol only.
		unixLn, err := net.Listen("unix", s.cfg.SocketFile)
		if err != nil {
			return err
		}

		s.stopper.RunWorker(func() {
			<-s.stopper.ShouldQuiesce()
			if err := unixLn.Close(); err != nil {
				log.Fatal(workersCtx, err)
			}
		})

		s.stopper.RunWorker(func() {
			select {
			case <-serveSQL:
			case <-s.stopper.ShouldQuiesce():
				return
			}
			pgCtx := s.pgServer.AmbientCtx.AnnotateCtx(context.Background())
			netutil.FatalIfUnexpected(httpServer.ServeWith(s.stopper, unixLn, func(conn net.Conn) {
				connCtx := log.WithLogTagStr(pgCtx, "client", conn.RemoteAddr().String())
				if err := s.pgServer.ServeConn(connCtx, conn); err != nil &&
					!netutil.IsClosedConnection(err) {
					// Report the error on this connection's context, so that we
					// know which remote client caused the error when looking at
					// the logs.
					log.Error(connCtx, err)
				}
			}))
		})
	}

	log.Event(ctx, "accepting connections")

	s.nodeLiveness.StartHeartbeat(ctx, s.stopper)

	// Before serving SQL requests, we have to make sure the database is
	// in an acceptable form for this version of the software.
	// We have to do this after actually starting up the server to be able to
	// seamlessly use the kv client against other nodes in the cluster.
	migMgr := migrations.NewManager(
		s.stopper, s.db, s.sqlExecutor, s.clock, s.NodeID().String())
	if err := migMgr.EnsureMigrations(ctx); err != nil {
		log.Fatal(ctx, err)
	}
	log.Infof(ctx, "done ensuring all necessary migrations have run")
	close(serveSQL)
	log.Info(ctx, "serving sql connections")

	// Initialize grpc-gateway mux and context.
	jsonpb := &protoutil.JSONPb{
		EnumsAsInts:  true,
		EmitDefaults: true,
		Indent:       "  ",
	}
	protopb := new(protoutil.ProtoPb)
	gwMux := gwruntime.NewServeMux(
		gwruntime.WithMarshalerOption(gwruntime.MIMEWildcard, jsonpb),
		gwruntime.WithMarshalerOption(httputil.JSONContentType, jsonpb),
		gwruntime.WithMarshalerOption(httputil.AltJSONContentType, jsonpb),
		gwruntime.WithMarshalerOption(httputil.ProtoContentType, protopb),
		gwruntime.WithMarshalerOption(httputil.AltProtoContentType, protopb),
	)
	gwCtx, gwCancel := context.WithCancel(s.AnnotateCtx(context.Background()))
	s.stopper.AddCloser(stop.CloserFn(gwCancel))

	// Setup HTTP<->gRPC handlers.
	conn, err := s.rpcContext.GRPCDial(s.cfg.Addr)
	if err != nil {
		return errors.Errorf("error constructing grpc-gateway: %s; are your certificates valid?", err)
	}

	for _, gw := range []grpcGatewayServer{s.admin, s.status, &s.tsServer} {
		if err := gw.RegisterGateway(gwCtx, gwMux, conn); err != nil {
			return err
		}
	}

	var uiFileSystem http.FileSystem
	uiDebug := envutil.EnvOrDefaultBool("COCKROACH_DEBUG_UI", false)
	if uiDebug {
		uiFileSystem = http.Dir("pkg/ui")
	} else {
		uiFileSystem = &assetfs.AssetFS{
			Asset:     ui.Asset,
			AssetDir:  ui.AssetDir,
			AssetInfo: ui.AssetInfo,
		}
	}
	uiFileServer := http.FileServer(uiFileSystem)

	s.mux.HandleFunc("/", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if r.URL.Path == "/" {
			if uiDebug {
				r.URL.Path = "debug.html"
			} else {
				r.URL.Path = "release.html"
			}
		}
		uiFileServer.ServeHTTP(w, r)
	}))

	// TODO(marc): when cookie-based authentication exists,
	// apply it for all web endpoints.
	s.mux.Handle(adminPrefix, gwMux)
	s.mux.Handle(ts.URLPrefix, gwMux)
	s.mux.Handle(statusPrefix, gwMux)
	s.mux.Handle("/health", gwMux)
	s.mux.Handle(statusVars, http.HandlerFunc(s.status.handleVars))
	log.Event(ctx, "added http endpoints")

	if err := sdnotify.Ready(); err != nil {
		log.Errorf(ctx, "failed to signal readiness using systemd protocol: %s", err)
	}
	log.Event(ctx, "server ready")

	return nil
}
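// A minimal sketch of the cmux wiring that Start builds above: one TCP
// listener multiplexed between protocols by match rules, with everything left
// over going to a catch-all listener. Plain net/http is used for both branches
// purely for illustration; the real server routes to pgwire and gRPC, and the
// cmux import path shown here is an assumption.
package main

import (
	"net"
	"net/http"

	"github.com/soheilhy/cmux"
)

func main() {
	ln, err := net.Listen("tcp", "127.0.0.1:0")
	if err != nil {
		panic(err)
	}

	m := cmux.New(ln)
	httpL := m.Match(cmux.HTTP1Fast()) // HTTP/1.x requests
	anyL := m.Match(cmux.Any())        // everything else

	go func() {
		_ = http.Serve(httpL, http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
			_, _ = w.Write([]byte("hello over http\n"))
		}))
	}()
	go func() {
		for {
			c, err := anyL.Accept()
			if err != nil {
				return
			}
			_ = c.Close() // non-HTTP connections are simply dropped in this sketch
		}
	}()

	// Serve blocks, dispatching accepted connections to the matched listeners.
	_ = m.Serve()
}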
// RefreshLeases starts a goroutine that refreshes the lease manager
// leases for tables received in the latest system configuration via gossip.
func (m *LeaseManager) RefreshLeases(s *stop.Stopper, db *client.DB, gossip *gossip.Gossip) {
	s.RunWorker(func() {
		descKeyPrefix := keys.MakeTablePrefix(uint32(sqlbase.DescriptorTable.ID))
		gossipUpdateC := gossip.RegisterSystemConfigChannel()
		for {
			select {
			case <-gossipUpdateC:
				cfg, _ := gossip.GetSystemConfig()
				if m.testingKnobs.GossipUpdateEvent != nil {
					m.testingKnobs.GossipUpdateEvent(cfg)
				}
				// Read all tables and their versions
				if log.V(2) {
					log.Info(context.TODO(), "received a new config; will refresh leases")
				}

				// Loop through the configuration to find all the tables.
				for _, kv := range cfg.Values {
					if !bytes.HasPrefix(kv.Key, descKeyPrefix) {
						continue
					}
					// Attempt to unmarshal config into a table/database descriptor.
					var descriptor sqlbase.Descriptor
					if err := kv.Value.GetProto(&descriptor); err != nil {
						log.Warningf(context.TODO(), "%s: unable to unmarshal descriptor %v", kv.Key, kv.Value)
						continue
					}
					switch union := descriptor.Union.(type) {
					case *sqlbase.Descriptor_Table:
						table := union.Table
						table.MaybeUpgradeFormatVersion()
						if err := table.ValidateTable(); err != nil {
							log.Errorf(context.TODO(), "%s: received invalid table descriptor: %v", kv.Key, table)
							continue
						}
						if log.V(2) {
							log.Infof(context.TODO(), "%s: refreshing lease table: %d (%s), version: %d, deleted: %t",
								kv.Key, table.ID, table.Name, table.Version, table.Dropped())
						}
						// Try to refresh the table lease to one >= this version.
						if t := m.findTableState(table.ID, false /* create */); t != nil {
							if err := t.purgeOldLeases(
								db, table.Dropped(), table.Version, m.LeaseStore); err != nil {
								log.Warningf(context.TODO(), "error purging leases for table %d(%s): %s",
									table.ID, table.Name, err)
							}
						}
					case *sqlbase.Descriptor_Database:
						// Ignore.
					}
				}
				if m.testingKnobs.TestingLeasesRefreshedEvent != nil {
					m.testingKnobs.TestingLeasesRefreshedEvent(cfg)
				}

			case <-s.ShouldStop():
				return
			}
		}
	})
}
// Start starts a goroutine that runs outstanding schema changes
// for tables received in the latest system configuration via gossip.
func (s *SchemaChangeManager) Start(stopper *stop.Stopper) {
	stopper.RunWorker(func() {
		descKeyPrefix := keys.MakeTablePrefix(uint32(sqlbase.DescriptorTable.ID))
		gossipUpdateC := s.gossip.RegisterSystemConfigChannel()
		timer := &time.Timer{}
		delay := 360 * time.Second
		if s.testingKnobs.AsyncExecQuickly {
			delay = 20 * time.Millisecond
		}

		for {
			select {
			case <-gossipUpdateC:
				cfg, _ := s.gossip.GetSystemConfig()
				// Read all tables and their versions
				if log.V(2) {
					log.Info(context.TODO(), "received a new config")
				}
				schemaChanger := SchemaChanger{
					nodeID:       s.leaseMgr.nodeID.Get(),
					db:           s.db,
					leaseMgr:     s.leaseMgr,
					testingKnobs: s.testingKnobs,
				}
				// Keep track of existing schema changers.
				oldSchemaChangers := make(map[sqlbase.ID]struct{}, len(s.schemaChangers))
				for k := range s.schemaChangers {
					oldSchemaChangers[k] = struct{}{}
				}
				execAfter := timeutil.Now().Add(delay)
				// Loop through the configuration to find all the tables.
				for _, kv := range cfg.Values {
					if !bytes.HasPrefix(kv.Key, descKeyPrefix) {
						continue
					}
					// Attempt to unmarshal config into a table/database descriptor.
					var descriptor sqlbase.Descriptor
					if err := kv.Value.GetProto(&descriptor); err != nil {
						log.Warningf(context.TODO(), "%s: unable to unmarshal descriptor %v", kv.Key, kv.Value)
						continue
					}
					switch union := descriptor.Union.(type) {
					case *sqlbase.Descriptor_Table:
						table := union.Table
						table.MaybeUpgradeFormatVersion()
						if err := table.ValidateTable(); err != nil {
							log.Errorf(context.TODO(), "%s: received invalid table descriptor: %v", kv.Key, table)
							continue
						}

						// Keep track of outstanding schema changes.
						// If all schema change commands always set UpVersion, why
						// check for the presence of mutations?
						// A schema change execution might fail soon after
						// unsetting UpVersion, and we still want to process
						// outstanding mutations. Similarly for a table marked for deletion.
						if table.UpVersion || table.Dropped() || table.Adding() ||
							table.Renamed() || len(table.Mutations) > 0 {
							if log.V(2) {
								log.Infof(context.TODO(), "%s: queue up pending schema change; table: %d, version: %d",
									kv.Key, table.ID, table.Version)
							}

							// Only track the first schema change. We depend on
							// gossip to renotify us when a schema change has been
							// completed.
							schemaChanger.tableID = table.ID
							if len(table.Mutations) == 0 {
								schemaChanger.mutationID = sqlbase.InvalidMutationID
							} else {
								schemaChanger.mutationID = table.Mutations[0].MutationID
							}
							schemaChanger.execAfter = execAfter

							// Keep track of this schema change.
							// Remove from oldSchemaChangers map.
							delete(oldSchemaChangers, table.ID)
							if sc, ok := s.schemaChangers[table.ID]; ok {
								if sc.mutationID == schemaChanger.mutationID {
									// Ignore duplicate.
									continue
								}
							}
							s.schemaChangers[table.ID] = schemaChanger
						}

					case *sqlbase.Descriptor_Database:
						// Ignore.
					}
				}
				// Delete old schema changers.
				for k := range oldSchemaChangers {
					delete(s.schemaChangers, k)
				}
				timer = s.newTimer()

			case <-timer.C:
				if s.testingKnobs.AsyncExecNotification != nil &&
					s.testingKnobs.AsyncExecNotification() != nil {
					timer = s.newTimer()
					continue
				}
				for tableID, sc := range s.schemaChangers {
					if timeutil.Since(sc.execAfter) > 0 {
						err := sc.exec()
						if err != nil {
							if err == errExistingSchemaChangeLease {
								// Another node holds the schema change lease;
								// nothing to do here.
							} else if err == sqlbase.ErrDescriptorNotFound {
								// Someone deleted this table. Don't try to run the schema
								// changer again. Note that there's no gossip update for the
								// deletion which would remove this schemaChanger.
								delete(s.schemaChangers, tableID)
							} else {
								// We don't need to act on integrity
								// constraints violations because exec()
								// purges mutations that violate integrity
								// constraints.
								log.Warningf(context.TODO(), "Error executing schema change: %s", err)
							}
						}
						// Advance the execAfter time so that this schema
						// changer doesn't get called again for a while.
						sc.execAfter = timeutil.Now().Add(delay)
					}
					// Only attempt to run one schema changer.
					break
				}
				timer = s.newTimer()

			case <-stopper.ShouldStop():
				return
			}
		}
	})
}
func TestReplicateQueueRebalance(t *testing.T) {
	defer leaktest.AfterTest(t)()

	// Set the gossip stores interval lower to speed up rebalancing. With the
	// default of 5s we have to wait ~5s for the rebalancing to start.
	defer func(v time.Duration) {
		gossip.GossipStoresInterval = v
	}(gossip.GossipStoresInterval)
	gossip.GossipStoresInterval = 100 * time.Millisecond

	// TODO(peter): Remove when lease rebalancing is the default.
	defer func(v bool) {
		storage.EnableLeaseRebalancing = v
	}(storage.EnableLeaseRebalancing)
	storage.EnableLeaseRebalancing = true

	const numNodes = 5
	tc := testcluster.StartTestCluster(t, numNodes,
		base.TestClusterArgs{ReplicationMode: base.ReplicationAuto},
	)
	defer tc.Stopper().Stop()

	const newRanges = 5
	for i := 0; i < newRanges; i++ {
		tableID := keys.MaxReservedDescID + i + 1
		splitKey := keys.MakeRowSentinelKey(keys.MakeTablePrefix(uint32(tableID)))
		for {
			if _, _, err := tc.SplitRange(splitKey); err != nil {
				if testutils.IsError(err, "split at key .* failed: conflict updating range descriptors") ||
					testutils.IsError(err, "range is already split at key") {
					continue
				}
				t.Fatal(err)
			}
			break
		}
	}

	countReplicas := func() []int {
		counts := make([]int, len(tc.Servers))
		for _, s := range tc.Servers {
			err := s.Stores().VisitStores(func(s *storage.Store) error {
				counts[s.StoreID()-1] += s.ReplicaCount()
				return nil
			})
			if err != nil {
				t.Fatal(err)
			}
		}
		return counts
	}

	numRanges := newRanges + server.ExpectedInitialRangeCount()
	numReplicas := numRanges * 3
	const minThreshold = 0.9
	minReplicas := int(math.Floor(minThreshold * (float64(numReplicas) / numNodes)))

	util.SucceedsSoon(t, func() error {
		counts := countReplicas()
		for _, c := range counts {
			if c < minReplicas {
				err := errors.Errorf("not balanced: %d", counts)
				log.Info(context.Background(), err)
				return err
			}
		}
		return nil
	})
}
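// A minimal sketch of the polling assertion used at the end of the test above:
// SucceedsSoon retries the closure until it returns nil or the helper's
// deadline is hit. The util import path and the trivial condition below are
// assumptions made purely for illustration.
package main

import (
	"fmt"
	"testing"

	"github.com/cockroachdb/cockroach/pkg/util"
)

func TestEventuallyReady(t *testing.T) {
	attempts := 0
	util.SucceedsSoon(t, func() error {
		attempts++
		if attempts < 3 {
			// Returning an error makes SucceedsSoon retry the closure.
			return fmt.Errorf("not ready yet (attempt %d)", attempts)
		}
		return nil
	})
}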
func (s *server) gossipReceiver(
	ctx context.Context,
	argsPtr **Request,
	senderFn func(*Response) error,
	receiverFn func() (*Request, error),
) error {
	s.mu.Lock()
	defer s.mu.Unlock()

	reply := new(Response)

	// This loop receives gossip from the client. It does not attempt to send the
	// server's gossip to the client.
	for {
		args := *argsPtr
		if args.NodeID != 0 {
			// Decide whether or not we can accept the incoming connection
			// as a permanent peer.
			if args.NodeID == s.NodeID.Get() {
				// This is an incoming loopback connection which should be closed by
				// the client.
				if log.V(2) {
					log.Infof(ctx, "ignoring gossip from node %d (loopback)", args.NodeID)
				}
			} else if s.mu.incoming.hasNode(args.NodeID) {
				// Do nothing.
				if log.V(2) {
					log.Infof(ctx, "gossip received from node %d", args.NodeID)
				}
			} else if s.mu.incoming.hasSpace() {
				if log.V(2) {
					log.Infof(ctx, "adding node %d to incoming set", args.NodeID)
				}

				s.mu.incoming.addNode(args.NodeID)
				s.mu.nodeMap[args.Addr] = serverInfo{
					peerID:    args.NodeID,
					createdAt: timeutil.Now(),
				}

				defer func(nodeID roachpb.NodeID, addr util.UnresolvedAddr) {
					if log.V(2) {
						log.Infof(ctx, "removing node %d from incoming set", args.NodeID)
					}
					s.mu.incoming.removeNode(nodeID)
					delete(s.mu.nodeMap, addr)
				}(args.NodeID, args.Addr)
			} else {
				var alternateAddr util.UnresolvedAddr
				var alternateNodeID roachpb.NodeID
				// Choose a random peer for forwarding.
				altIdx := rand.Intn(len(s.mu.nodeMap))
				for addr, info := range s.mu.nodeMap {
					if altIdx == 0 {
						alternateAddr = addr
						alternateNodeID = info.peerID
						break
					}
					altIdx--
				}

				log.Infof(ctx, "refusing gossip from node %d (max %d conns); forwarding to %d (%s)",
					args.NodeID, s.mu.incoming.maxSize, alternateNodeID, alternateAddr)

				*reply = Response{
					NodeID:          s.NodeID.Get(),
					AlternateAddr:   &alternateAddr,
					AlternateNodeID: alternateNodeID,
				}

				s.mu.Unlock()
				err := senderFn(reply)
				s.mu.Lock()
				// Naively, we would return err here unconditionally, but that
				// introduces a race. Specifically, the client may observe the
				// end of the connection before it has a chance to receive and
				// process this message, which instructs it to hang up anyway.
				// Instead, we send the message and proceed to gossip
				// normally, depending on the client to end the connection.
				if err != nil {
					return err
				}
			}
		} else {
			log.Info(ctx, "received gossip from unknown node")
		}

		bytesReceived := int64(args.Size())
		infosReceived := int64(len(args.Delta))
		s.nodeMetrics.BytesReceived.Inc(bytesReceived)
		s.nodeMetrics.InfosReceived.Inc(infosReceived)
		s.serverMetrics.BytesReceived.Inc(bytesReceived)
		s.serverMetrics.InfosReceived.Inc(infosReceived)

		freshCount, err := s.mu.is.combine(args.Delta, args.NodeID)
		if err != nil {
			log.Warningf(ctx, "failed to fully combine gossip delta from node %d: %s", args.NodeID, err)
		}
		if log.V(1) {
			log.Infof(ctx, "received %s from node %d (%d fresh)", extractKeys(args.Delta), args.NodeID, freshCount)
		}
		s.maybeTightenLocked()

		*reply = Response{
			NodeID:          s.NodeID.Get(),
			HighWaterStamps: s.mu.is.getHighWaterStamps(),
		}

		s.mu.Unlock()
		err = senderFn(reply)
		s.mu.Lock()
		if err != nil {
			return err
		}

		if cycler := s.simulationCycler; cycler != nil {
			cycler.Wait()
		}

		s.mu.Unlock()
		recvArgs, err := receiverFn()
		s.mu.Lock()
		if err != nil {
			return err
		}

		// *argsPtr holds the remote peer state; we need to update it whenever we
		// receive a new non-nil request. We avoid assigning to *argsPtr directly
		// because the gossip sender above has closed over *argsPtr and will NPE if
		// *argsPtr were set to nil.
		*argsPtr = recvArgs
	}
}