// maybeWriteResumeSpan checkpoints the progress of a schema-change backfill by
// persisting the given resume span into the mutation at mutationIdx of the
// table descriptor. To bound descriptor-write traffic it is a no-op unless at
// least checkpointInterval has elapsed since *lastCheckpoint; on a successful
// write, *lastCheckpoint is advanced to the current time.
// Returns errVersionMismatch if the descriptor version changed since the
// caller read it, signaling that the backfill must restart.
func (sc *SchemaChanger) maybeWriteResumeSpan(
	txn *client.Txn,
	version sqlbase.DescriptorVersion,
	resume roachpb.Span,
	mutationIdx int,
	lastCheckpoint *time.Time,
) error {
	// Deliberate self-shadow: start from the package-level default and let
	// the testing knob override it.
	checkpointInterval := checkpointInterval
	if sc.testingKnobs.WriteCheckpointInterval > 0 {
		checkpointInterval = sc.testingKnobs.WriteCheckpointInterval
	}
	// Rate-limit checkpoint writes.
	if timeutil.Since(*lastCheckpoint) < checkpointInterval {
		return nil
	}
	tableDesc, err := sqlbase.GetTableDescFromID(txn, sc.tableID)
	if err != nil {
		return err
	}
	if tableDesc.Version != version {
		return errVersionMismatch
	}
	tableDesc.Mutations[mutationIdx].ResumeSpan = resume
	// The descriptor lives in the system config span, so the txn must
	// trigger a config gossip on commit.
	txn.SetSystemConfigTrigger()
	if err := txn.Put(sqlbase.MakeDescMetadataKey(tableDesc.GetID()),
		sqlbase.WrapDescriptor(tableDesc)); err != nil {
		return err
	}
	*lastCheckpoint = timeutil.Now()
	return nil
}
// processReplica processes a single replica. This should not be
// called externally to the queue. bq.mu.Lock must not be held
// while calling this method.
func (bq *baseQueue) processReplica(
	queueCtx context.Context, repl *Replica, clock *hlc.Clock,
) error {
	// Serialize processing within this queue; only one replica is
	// processed at a time.
	bq.processMu.Lock()
	defer bq.processMu.Unlock()

	// Load the system config; without it zone configs (and thus processing
	// decisions) are unavailable, so skip rather than error.
	cfg, ok := bq.gossip.GetSystemConfig()
	if !ok {
		log.VEventf(queueCtx, 1, "no system config available, skipping")
		return nil
	}

	if bq.requiresSplit(cfg, repl) {
		// Range needs to be split due to zone configs, but queue does
		// not accept unsplit ranges.
		log.VEventf(queueCtx, 3, "%s: split needed; skipping", repl)
		return nil
	}

	// Putting a span in a context means that events will no longer go to the
	// event log. Use queueCtx for events that are intended for the event log.
	ctx, span := bq.AnnotateCtxWithSpan(queueCtx, bq.name)
	defer span.Finish()
	// Also add the Replica annotations to ctx.
	ctx = repl.AnnotateCtx(ctx)
	// Bound the processing time for this replica.
	ctx, cancel := context.WithTimeout(ctx, bq.processTimeout)
	defer cancel()
	log.Eventf(ctx, "processing replica")

	// If the queue requires a replica to have the range lease in
	// order to be processed, check whether this replica has range lease
	// and renew or acquire if necessary.
	if bq.needsLease {
		// Create a "fake" get request in order to invoke redirectOnOrAcquireLease.
		if err := repl.redirectOnOrAcquireLease(ctx); err != nil {
			// Another replica holding the lease is expected and not a
			// processing failure.
			if _, harmless := err.GetDetail().(*roachpb.NotLeaseHolderError); harmless {
				log.VEventf(queueCtx, 3, "not holding lease; skipping")
				return nil
			}
			return errors.Wrapf(err.GoError(), "%s: could not obtain lease", repl)
		}
		log.Event(ctx, "got range lease")
	}

	log.VEventf(queueCtx, 3, "processing")
	start := timeutil.Now()
	err := bq.impl.process(ctx, clock.Now(), repl, cfg)
	duration := timeutil.Since(start)
	// Processing time is recorded whether or not processing succeeded.
	bq.processingNanos.Inc(duration.Nanoseconds())
	if err != nil {
		return err
	}
	log.VEventf(queueCtx, 2, "done: %s", duration)
	log.Event(ctx, "done")
	bq.successes.Inc(1)
	return nil
}
func testDecimalSingleArgFunc( t *testing.T, f func(*inf.Dec, *inf.Dec, inf.Scale) (*inf.Dec, error), s inf.Scale, tests []decimalOneArgTestCase, ) { for _, tc := range tests { t.Run(fmt.Sprintf("%s = %s", tc.input, tc.expected), func(t *testing.T) { x, exp := new(inf.Dec), new(inf.Dec) x.SetString(tc.input) exp.SetString(tc.expected) // Test allocated return value. var z *inf.Dec var err error done := make(chan struct{}, 1) start := timeutil.Now() go func() { z, err = f(nil, x, s) done <- struct{}{} }() var after <-chan time.Time if *flagDurationLimit > 0 { after = time.After(*flagDurationLimit) } select { case <-done: t.Logf("execute duration: %s", timeutil.Since(start)) case <-after: t.Fatalf("timedout after %s", *flagDurationLimit) } if err != nil { if tc.expected != err.Error() { t.Errorf("expected error %s, got %s", tc.expected, err) } return } if exp.Cmp(z) != 0 { t.Errorf("expected %s, got %s", exp, z) } // Test provided decimal mutation. z.SetString("0.0") _, _ = f(z, x, s) if exp.Cmp(z) != 0 { t.Errorf("expected %s, got %s", exp, z) } // Test same arg mutation. _, _ = f(x, x, s) if exp.Cmp(x) != 0 { t.Errorf("expected %s, got %s", exp, x) } x.SetString(tc.input) }) } }
// testPutInner drives one Put-heavy writer goroutine per node for
// cfg.Duration, writing random-sized values under globally unique keys,
// while periodically logging throughput and checking cluster consistency.
func testPutInner(ctx context.Context, t *testing.T, c cluster.Cluster, cfg cluster.TestConfig) {
	db, err := c.NewClient(ctx, 0)
	if err != nil {
		t.Fatal(err)
	}

	errs := make(chan error, c.NumNodes())
	start := timeutil.Now()
	deadline := start.Add(cfg.Duration)
	// count is shared across all writers and accessed via atomics; it also
	// serves as the source of unique keys.
	var count int64
	for i := 0; i < c.NumNodes(); i++ {
		go func() {
			r, _ := randutil.NewPseudoRand()
			value := randutil.RandBytes(r, 8192)
			for timeutil.Now().Before(deadline) {
				// k is unique across the writer goroutines.
				k := atomic.AddInt64(&count, 1)
				// Write a random-length prefix of the 8KB value.
				v := value[:r.Intn(len(value))]
				if err := db.Put(ctx, fmt.Sprintf("%08d", k), v); err != nil {
					errs <- err
					return
				}
			}
			errs <- nil
		}()
	}

	// Collect exactly one result per writer, logging progress once a second
	// while waiting.
	for i := 0; i < c.NumNodes(); {
		baseCount := atomic.LoadInt64(&count)
		select {
		case <-stopper.ShouldStop():
			t.Fatalf("interrupted")
		case err := <-errs:
			if err != nil {
				t.Fatal(err)
			}
			i++
		case <-time.After(1 * time.Second):
			// Periodically print out progress so that we know the test is still
			// running.
			loadedCount := atomic.LoadInt64(&count)
			log.Infof(ctx, "%d (%d/s)", loadedCount, loadedCount-baseCount)
			c.Assert(ctx, t)
			if err := cluster.Consistent(ctx, c, 0); err != nil {
				t.Fatal(err)
			}
		}
	}

	elapsed := timeutil.Since(start)
	log.Infof(ctx, "%d %.1f/sec", count, float64(count)/elapsed.Seconds())
}
func (at *allocatorTest) runSchemaChanges(ctx context.Context, t *testing.T) error { db, err := gosql.Open("postgres", at.f.PGUrl(ctx, 0)) if err != nil { return err } defer func() { _ = db.Close() }() const tableName = "datablocks.blocks" schemaChanges := []string{ "ALTER TABLE %s ADD COLUMN newcol DECIMAL DEFAULT (DECIMAL '1.4')", "CREATE INDEX foo ON %s (block_id)", } var wg sync.WaitGroup wg.Add(len(schemaChanges)) for i := range schemaChanges { go func(i int) { start := timeutil.Now() cmd := fmt.Sprintf(schemaChanges[i], tableName) log.Infof(ctx, "starting schema change: %s", cmd) if _, err := db.Exec(cmd); err != nil { log.Infof(ctx, "hit schema change error: %s, for %s, in %s", err, cmd, timeutil.Since(start)) t.Error(err) return } log.Infof(ctx, "completed schema change: %s, in %s", cmd, timeutil.Since(start)) wg.Done() // TODO(vivek): Monitor progress of schema changes and log progress. }(i) } wg.Wait() log.Info(ctx, "validate applied schema changes") if err := at.ValidateSchemaChanges(ctx, t); err != nil { t.Fatal(err) } return nil }
// testClusterRecoveryInner runs the bank-transfer workload with one client
// per node while a chaos monkey disrupts random subsets of nodes, then
// verifies that the account balances are still consistent.
func testClusterRecoveryInner(
	ctx context.Context, t *testing.T, c cluster.Cluster, cfg cluster.TestConfig,
) {
	num := c.NumNodes()

	// One client for each node.
	initBank(t, c.PGUrl(ctx, 0))

	start := timeutil.Now()
	state := testState{
		t:        t,
		errChan:  make(chan error, num),
		teardown: make(chan struct{}),
		deadline: start.Add(cfg.Duration),
		clients:  make([]testClient, num),
	}

	for i := 0; i < num; i++ {
		// Hold the client's lock during initialization so the transfer
		// loop never observes a half-initialized client.
		state.clients[i].Lock()
		state.initClient(ctx, t, c, i)
		state.clients[i].Unlock()
		go transferMoneyLoop(ctx, i, &state, *numAccounts, *maxTransfer)
	}

	// Block until the workload has fully torn down before returning.
	defer func() {
		<-state.teardown
	}()

	// Chaos monkey: picks a random non-empty subset of nodes each round.
	rnd, seed := randutil.NewPseudoRand()
	log.Warningf(ctx, "monkey starts (seed %d)", seed)
	pickNodes := func() []int {
		return rnd.Perm(num)[:rnd.Intn(num)+1]
	}
	go chaosMonkey(ctx, &state, c, true, pickNodes, 0)

	waitClientsStop(ctx, num, &state, stall)

	// Verify accounts.
	verifyAccounts(t, &state.clients[0])

	elapsed := timeutil.Since(start)
	var count uint64
	counts := state.counts()
	for _, c := range counts {
		count += c
	}
	log.Infof(ctx, "%d %.1f/sec", count, float64(count)/elapsed.Seconds())
}
func TestSucceedsSoon(t *testing.T) { // Try a method which always succeeds. SucceedsSoon(t, func() error { return nil }) // Try a method which succeeds after a known duration. start := timeutil.Now() duration := time.Millisecond * 10 SucceedsSoon(t, func() error { elapsed := timeutil.Since(start) if elapsed > duration { return nil } return errors.Errorf("%s elapsed, waiting until %s elapses", elapsed, duration) }) }
func (s *server) status() string { s.mu.Lock() defer s.mu.Unlock() var buf bytes.Buffer fmt.Fprintf(&buf, "gossip server (%d/%d cur/max conns, %s)\n", s.mu.incoming.gauge.Value(), s.mu.incoming.maxSize, s.serverMetrics) for addr, info := range s.mu.nodeMap { // TODO(peter): Report per connection sent/received statistics. The // structure of server.Gossip and server.gossipReceiver makes this // irritating to track. fmt.Fprintf(&buf, " %d: %s (%s)\n", info.peerID, addr.AddressField, roundSecs(timeutil.Since(info.createdAt))) } return buf.String() }
// waitForFullReplication waits for the cluster to be fully replicated. func (c *Cluster) waitForFullReplication() { for i := 1; true; i++ { done, detail := c.isReplicated() if (done && i >= 50) || (i%50) == 0 { fmt.Print(detail) log.Infof(context.Background(), "waiting for replication") } if done { break } time.Sleep(100 * time.Millisecond) } log.Infof(context.Background(), "replicated %.3fs", timeutil.Since(c.started).Seconds()) }
func (g *Gossip) clientStatus() string { var buf bytes.Buffer g.mu.Lock() defer g.mu.Unlock() g.clientsMu.Lock() defer g.clientsMu.Unlock() fmt.Fprintf(&buf, "gossip client (%d/%d cur/max conns)\n", len(g.clientsMu.clients), g.outgoing.maxSize) for _, c := range g.clientsMu.clients { fmt.Fprintf(&buf, " %d: %s (%s: %s)\n", c.peerID, c.addr, roundSecs(timeutil.Since(c.createdAt)), c.clientMetrics) } return buf.String() }
// testNodeRestartInner runs a single bank-transfer client against the last
// node while a chaos monkey restarts one random node among the others, then
// verifies that the accounts still balance.
func testNodeRestartInner(
	ctx context.Context, t *testing.T, c cluster.Cluster, cfg cluster.TestConfig,
) {
	num := c.NumNodes()
	if minNum := 3; num < minNum {
		t.Skipf("need at least %d nodes, got %d", minNum, num)
	}

	// One client for each node.
	initBank(t, c.PGUrl(ctx, 0))

	start := timeutil.Now()
	state := testState{
		t:        t,
		errChan:  make(chan error, 1),
		teardown: make(chan struct{}),
		deadline: start.Add(cfg.Duration),
		clients:  make([]testClient, 1),
	}

	// The single client talks to the last node, which the monkey is told
	// never to restart (excludeNode below).
	clientIdx := num - 1
	client := &state.clients[0]
	client.Lock()
	client.db = makePGClient(t, c.PGUrl(ctx, clientIdx))
	client.Unlock()
	go transferMoneyLoop(ctx, 0, &state, *numAccounts, *maxTransfer)

	// Block until the workload has fully torn down before returning.
	defer func() {
		<-state.teardown
	}()

	// Chaos monkey: restart one random node among [0, clientIdx).
	rnd, seed := randutil.NewPseudoRand()
	log.Warningf(ctx, "monkey starts (seed %d)", seed)
	pickNodes := func() []int {
		return []int{rnd.Intn(clientIdx)}
	}
	go chaosMonkey(ctx, &state, c, false, pickNodes, clientIdx)

	waitClientsStop(ctx, 1, &state, stall)

	// Verify accounts.
	verifyAccounts(t, client)

	elapsed := timeutil.Since(start)
	count := atomic.LoadUint64(&client.count)
	log.Infof(ctx, "%d %.1f/sec", count, float64(count)/elapsed.Seconds())
}
// finishSQLTxn closes the root span for the current SQL txn. // This needs to be called before resetForNewSQLTransaction() is called for // starting another SQL txn. // The session context is just used for logging the SQL trace. func (ts *txnState) finishSQLTxn(sessionCtx context.Context) { ts.mon.Stop(ts.Ctx) if ts.sp == nil { panic("No span in context? Was resetForNewSQLTxn() called previously?") } sampledFor7881 := (ts.sp.BaggageItem(keyFor7881Sample) != "") ts.sp.Finish() ts.sp = nil if (traceSQL && timeutil.Since(ts.sqlTimestamp) >= traceSQLDuration) || (traceSQLFor7881 && sampledFor7881) { dump := tracing.FormatRawSpans(ts.CollectedSpans) if len(dump) > 0 { log.Infof(sessionCtx, "SQL trace:\n%s", dump) } } }
// scanLoop loops endlessly, scanning through replicas available via
// the replica set, or until the scanner is stopped. The iteration
// is paced to complete a full scan in approximately the scan interval.
func (rs *replicaScanner) scanLoop(clock *hlc.Clock, stopper *stop.Stopper) {
	stopper.RunWorker(func() {
		ctx := rs.AnnotateCtx(context.Background())
		// start marks the beginning of the current full scan; it is reset
		// after each completed iteration below.
		start := timeutil.Now()

		// waitTimer is reset in each call to waitAndProcess.
		defer rs.waitTimer.Stop()

		for {
			if rs.GetDisabled() {
				// Block until the scanner is re-enabled (or stopped).
				if done := rs.waitEnabled(stopper); done {
					return
				}
				continue
			}
			var shouldStop bool
			count := 0
			rs.replicas.Visit(func(repl *Replica) bool {
				count++
				shouldStop = rs.waitAndProcess(ctx, start, clock, stopper, repl)
				// Returning false aborts the visit once a stop is requested.
				return !shouldStop
			})
			if count == 0 {
				// No replicas processed, just wait.
				shouldStop = rs.waitAndProcess(ctx, start, clock, stopper, nil)
			}

			// A non-nil RunTask error means the stopper is draining, which
			// is treated the same as an explicit stop request.
			shouldStop = shouldStop || nil != stopper.RunTask(func() {
				// Increment iteration count.
				rs.mu.Lock()
				defer rs.mu.Unlock()
				rs.mu.scanCount++
				rs.mu.total += timeutil.Since(start)
				if log.V(6) {
					log.Infof(ctx, "reset replica scan iteration")
				}

				// Reset iteration and start time.
				start = timeutil.Now()
			})
			if shouldStop {
				return
			}
		}
	})
}
// Start starts a cluster. The numWorkers parameter controls the SQL connection // settings to avoid unnecessary connection creation. The args parameter can be // used to pass extra arguments to each node. func (c *Cluster) Start(db string, numWorkers int, args, env []string) { c.started = timeutil.Now() baseCtx := &base.Config{ User: security.NodeUser, Insecure: true, } c.rpcCtx = rpc.NewContext(log.AmbientContext{}, baseCtx, nil, c.stopper) for i := range c.Nodes { c.Nodes[i] = c.makeNode(i, args, env) c.Clients[i] = c.makeClient(i) c.Status[i] = c.makeStatus(i) c.DB[i] = c.makeDB(i, numWorkers, db) } log.Infof(context.Background(), "started %.3fs", timeutil.Since(c.started).Seconds()) c.waitForFullReplication() }
// PeriodicallyCheckForUpdates starts a background worker that periodically // phones home to check for updates and report usage. func (s *Server) PeriodicallyCheckForUpdates() { s.stopper.RunWorker(func() { startup := timeutil.Now() for { // `maybeCheckForUpdates` and `maybeReportUsage` both return the // duration until they should next be checked. // Wait for the shorter of the durations returned by the two checks. wait := s.maybeCheckForUpdates() if reportWait := s.maybeReportUsage(timeutil.Since(startup)); reportWait < wait { wait = reportWait } jitter := rand.Intn(updateCheckJitterSeconds) - updateCheckJitterSeconds/2 wait = wait + (time.Duration(jitter) * time.Second) select { case <-s.stopper.ShouldQuiesce(): return case <-time.After(wait): } } }) }
// Commit applies the batch's accumulated mutations to the underlying engine
// and closes the batch. Panics if called on an already-committed batch.
// Commits that exceed batchCommitWarnThreshold are logged with a stack trace.
func (r *rocksDBBatch) Commit() error {
	if r.closed() {
		panic("this batch was already committed")
	}

	start := timeutil.Now()
	var count, size int

	r.distinctOpen = false
	if r.flushes > 0 {
		// We've previously flushed mutations to the C++ batch, so we have to flush
		// any remaining mutations as well and then commit the batch.
		r.flushMutations()
		if err := statusToError(C.DBCommitAndCloseBatch(r.batch)); err != nil {
			return err
		}
		// The C++ call above closed the batch; mark it unusable.
		r.batch = nil
		count, size = r.flushedCount, r.flushedSize
	} else if r.builder.count > 0 {
		count, size = r.builder.count, len(r.builder.repr)

		// Fast-path which avoids flushing mutations to the C++ batch. Instead, we
		// directly apply the mutations to the database.
		if err := r.parent.ApplyBatchRepr(r.builder.Finish()); err != nil {
			return err
		}
		C.DBClose(r.batch)
		r.batch = nil
	}

	const batchCommitWarnThreshold = 500 * time.Millisecond
	if elapsed := timeutil.Since(start); elapsed >= batchCommitWarnThreshold {
		log.Warningf(context.TODO(), "batch [%d/%d/%d] commit took %s (>%s):\n%s",
			count, size, r.flushes, elapsed, batchCommitWarnThreshold, debug.Stack())
	}

	return nil
}
// TestConcurrentBatch commits 16 large (4 MB) batches concurrently while
// issuing individual Puts, and fails if any single Put is delayed
// excessively (e.g. by hitting the max memtable count or the L0 stop-writes
// threshold).
func TestConcurrentBatch(t *testing.T) {
	defer leaktest.AfterTest(t)()

	dir, err := ioutil.TempDir("", "TestConcurrentBatch")
	if err != nil {
		t.Fatal(err)
	}
	defer func() {
		if err := os.RemoveAll(dir); err != nil {
			t.Fatal(err)
		}
	}()

	db, err := NewRocksDB(roachpb.Attributes{}, dir, RocksDBCache{}, 0, DefaultMaxOpenFiles)
	if err != nil {
		t.Fatalf("could not create new rocksdb db instance at %s: %v", dir, err)
	}
	defer db.Close()

	// Prepare 16 4 MB batches containing non-overlapping contents.
	var batches []Batch
	for i := 0; i < 16; i++ {
		batch := db.NewBatch()
		for j := 0; true; j++ {
			// Keys are namespaced by batch index i, so batches never overlap.
			key := encoding.EncodeUvarintAscending([]byte("bar"), uint64(i))
			key = encoding.EncodeUvarintAscending(key, uint64(j))
			if err := batch.Put(MakeMVCCMetadataKey(key), nil); err != nil {
				t.Fatal(err)
			}
			if len(batch.Repr()) >= 4<<20 {
				break
			}
		}
		batches = append(batches, batch)
	}

	errChan := make(chan error, len(batches))

	// Concurrently write all the batches.
	for _, batch := range batches {
		go func(batch Batch) {
			errChan <- batch.Commit()
		}(batch)
	}

	// While the batch writes are in progress, try to write another key.
	time.Sleep(100 * time.Millisecond)
	remainingBatches := len(batches)
	for i := 0; remainingBatches > 0; i++ {
		select {
		case err := <-errChan:
			if err != nil {
				t.Fatal(err)
			}
			remainingBatches--
		default:
			// Non-blocking: keep issuing individual writes while the batch
			// commits are still in flight.
		}

		// This write can get delayed excessively if we hit the max memtable count
		// or the L0 stop writes threshold.
		start := timeutil.Now()
		key := encoding.EncodeUvarintAscending([]byte("foo"), uint64(i))
		if err := db.Put(MakeMVCCMetadataKey(key), nil); err != nil {
			t.Fatal(err)
		}
		if elapsed := timeutil.Since(start); elapsed >= 10*time.Second {
			t.Fatalf("write took %0.1fs\n", elapsed.Seconds())
		}
	}
}
// grpcTransportFactory during race builds wraps the implementation and
// intercepts all BatchRequests, reading them in a tight loop. This allows the
// race detector to catch any mutations of a batch passed to the transport.
func grpcTransportFactory(
	opts SendOptions, rpcContext *rpc.Context, replicas ReplicaSlice, args roachpb.BatchRequest,
) (Transport, error) {
	// Only one background reader worker runs at a time; the first caller
	// through starts it.
	if atomic.AddInt32(&running, 1) <= 1 {
		rpcContext.Stopper.RunWorker(func() {
			var iters int
			var curIdx int
			defer func() {
				// Allow a future caller to start a fresh worker.
				atomic.StoreInt32(&running, 0)
				log.Infof(
					context.TODO(),
					"transport race promotion: ran %d iterations on up to %d requests",
					iters, curIdx+1,
				)
			}()
			// Make a fixed-size slice of *BatchRequest. When full, entries
			// are evicted in FIFO order.
			const size = 1000
			bas := make([]*roachpb.BatchRequest, size)
			encoder := gob.NewEncoder(ioutil.Discard)
			for {
				iters++
				start := timeutil.Now()
				// Read (gob-encode into the void) every request currently
				// held. Concurrent mutation of any of these batches is the
				// race this worker exists to expose.
				for _, ba := range bas {
					if ba != nil {
						if err := encoder.Encode(ba); err != nil {
							panic(err)
						}
					}
				}
				// Prevent the goroutine from spinning too hot as this lets CI
				// times skyrocket. Sleep on average for as long as we worked
				// on the last iteration so we spend no more than half our CPU
				// time on this task.
				jittered := time.After(jitter(timeutil.Since(start)))
				// Collect incoming requests until the jittered timer fires,
				// then access everything we have.
				for {
					select {
					case <-rpcContext.Stopper.ShouldStop():
						return
					case ba := <-incoming:
						bas[curIdx%size] = ba
						curIdx++
						continue
					case <-jittered:
					}
					break
				}
			}
		})
	}
	select {
	// We have a shallow copy here and so the top level scalar fields can't
	// really race, but making more copies doesn't make anything more
	// transparent, so from now on we operate on a pointer.
	case incoming <- &args:
	default:
		// Avoid slowing down the tests if we're backed up.
	}
	return grpcTransportFactoryImpl(opts, rpcContext, replicas, args)
}
// Test that a connection closed abruptly while a SQL txn is in progress results
// in that txn being rolled back.
func TestSessionFinishRollsBackTxn(t *testing.T) {
	defer leaktest.AfterTest(t)()
	aborter := NewTxnAborter()
	defer aborter.Close(t)
	params, _ := createTestServerParams()
	params.Knobs.SQLExecutor = aborter.executorKnobs()
	s, mainDB, _ := serverutils.StartServer(t, params)
	defer s.Stopper().Stop()
	{
		pgURL, cleanup := sqlutils.PGUrl(
			t, s.ServingAddr(), "TestSessionFinishRollsBackTxn", url.User(security.RootUser))
		defer cleanup()
		if err := aborter.Init(pgURL); err != nil {
			t.Fatal(err)
		}
	}

	if _, err := mainDB.Exec(`
CREATE DATABASE t;
CREATE TABLE t.test (k INT PRIMARY KEY, v TEXT);
`); err != nil {
		t.Fatal(err)
	}

	// We're going to test the rollback of transactions left in various states
	// when the connection closes abruptly.
	// For the state CommitWait, there's no actual rollback we can test for (since
	// the kv-level transaction has already been committed). But we still
	// exercise this state to check that the server doesn't crash (which used to
	// happen - #9879).
	tests := []sql.TxnStateEnum{sql.Open, sql.RestartWait, sql.CommitWait}
	for _, state := range tests {
		t.Run(state.String(), func(t *testing.T) {
			// Create a low-level lib/pq connection so we can close it at will.
			pgURL, cleanupDB := sqlutils.PGUrl(
				t, s.ServingAddr(), state.String(), url.User(security.RootUser))
			defer cleanupDB()
			conn, err := pq.Open(pgURL.String())
			if err != nil {
				t.Fatal(err)
			}
			// Track whether the test body has already closed the connection
			// so the deferred cleanup doesn't close it twice.
			connClosed := false
			defer func() {
				if connClosed {
					return
				}
				if err := conn.Close(); err != nil {
					t.Fatal(err)
				}
			}()

			txn, err := conn.Begin()
			if err != nil {
				t.Fatal(err)
			}
			tx := txn.(driver.Execer)
			if _, err := tx.Exec("SET TRANSACTION PRIORITY NORMAL", nil); err != nil {
				t.Fatal(err)
			}

			// RestartWait and CommitWait are only reachable via the
			// cockroach_restart savepoint protocol.
			if state == sql.RestartWait || state == sql.CommitWait {
				if _, err := tx.Exec("SAVEPOINT cockroach_restart", nil); err != nil {
					t.Fatal(err)
				}
			}

			insertStmt := "INSERT INTO t.test(k, v) VALUES (1, 'a')"
			if state == sql.RestartWait {
				// To get a txn in RestartWait, we'll use an aborter.
				if err := aborter.QueueStmtForAbortion(
					insertStmt, 1 /* restartCount */, false /* willBeRetriedIbid */); err != nil {
					t.Fatal(err)
				}
			}
			if _, err := tx.Exec(insertStmt, nil); err != nil {
				t.Fatal(err)
			}

			if err := aborter.VerifyAndClear(); err != nil {
				t.Fatal(err)
			}

			if state == sql.RestartWait || state == sql.CommitWait {
				_, err := tx.Exec("RELEASE SAVEPOINT cockroach_restart", nil)
				if state == sql.CommitWait {
					// The release succeeds and moves the txn to CommitWait.
					if err != nil {
						t.Fatal(err)
					}
				} else if !testutils.IsError(err, "pq: restart transaction:.*") {
					// In RestartWait the release is expected to fail with a
					// retryable error.
					t.Fatal(err)
				}
			}

			// Abruptly close the connection.
			connClosed = true
			if err := conn.Close(); err != nil {
				t.Fatal(err)
			}

			// Check that the txn we had above was rolled back. We do this by reading
			// after the preceding txn and checking that we don't get an error and
			// that we haven't been blocked by intents (we can't exactly test that we
			// haven't been blocked but we assert that the query didn't take too
			// long).
			// We do the read in an explicit txn so that automatic retries don't hide
			// any errors.
			// TODO(andrei): Figure out a better way to test for non-blocking.
			// Use a trace when the client-side tracing story gets good enough.
			txCheck, err := mainDB.Begin()
			if err != nil {
				t.Fatal(err)
			}
			// Run check at low priority so we don't push the previous transaction and
			// fool ourselves into thinking it had been rolled back.
			if _, err := txCheck.Exec("SET TRANSACTION PRIORITY LOW"); err != nil {
				t.Fatal(err)
			}
			ts := timeutil.Now()
			var count int
			if err := txCheck.QueryRow("SELECT COUNT(1) FROM t.test").Scan(&count); err != nil {
				t.Fatal(err)
			}
			// CommitWait actually committed, so we'll need to clean up.
			if state != sql.CommitWait {
				if count != 0 {
					t.Fatalf("expected no rows, got: %d", count)
				}
			} else {
				if _, err := txCheck.Exec("DELETE FROM t.test"); err != nil {
					t.Fatal(err)
				}
			}
			if err := txCheck.Commit(); err != nil {
				t.Fatal(err)
			}
			// Crude non-blocking assertion: the check txn should have
			// committed quickly if no intents were left behind.
			if d := timeutil.Since(ts); d > time.Second {
				t.Fatalf("Looks like the checking tx was unexpectedly blocked. "+
					"It took %s to commit.", d)
			}
		})
	}
}
// TestDumpRandom generates a random number of random rows with all data
// types. This data is dumped, inserted, and dumped again. The two dumps
// are compared for exactness. The data from the inserted dump is then
// SELECT'd and compared to the original generated data to ensure it is
// round-trippable.
func TestDumpRandom(t *testing.T) {
	defer leaktest.AfterTest(t)()
	c, err := newCLITest(t, false)
	if err != nil {
		t.Fatal(err)
	}
	defer c.stop(true)

	url, cleanup := sqlutils.PGUrl(t, c.ServingAddr(), "TestDumpRandom", url.User(security.RootUser))
	defer cleanup()

	conn := makeSQLConn(url.String())
	defer conn.Close()

	if err := conn.Exec(`
	CREATE DATABASE d;
	CREATE DATABASE o;
	CREATE TABLE d.t (
		rowid int,
		i int,
		f float,
		d date,
		m timestamp,
		n interval,
		o bool,
		e decimal,
		s string,
		b bytes,
		PRIMARY KEY (rowid, i, f, d, m, n, o, e, s, b)
	);
`, nil); err != nil {
		t.Fatal(err)
	}

	rnd, seed := randutil.NewPseudoRand()
	// Log the seed so failures can be reproduced.
	t.Logf("random seed: %v", seed)

	// Keep generating fresh random datasets until the configured test-time
	// budget is exhausted.
	start := timeutil.Now()
	for iteration := 0; timeutil.Since(start) < *randomTestTime; iteration++ {
		if err := conn.Exec(`DELETE FROM d.t`, nil); err != nil {
			t.Fatal(err)
		}
		var generatedRows [][]driver.Value
		count := rnd.Int63n(500)
		t.Logf("random iteration %v: %v rows", iteration, count)
		for _i := int64(0); _i < count; _i++ {
			// Generate a random number of random inserts.
			i := rnd.Int63()
			f := rnd.Float64()
			d := time.Unix(0, rnd.Int63()).Round(time.Hour * 24).UTC()
			m := time.Unix(0, rnd.Int63()).Round(time.Microsecond).UTC()
			n := time.Duration(rnd.Int63()).String()
			o := rnd.Intn(2) == 1
			e := strings.TrimRight(inf.NewDec(rnd.Int63(), inf.Scale(rnd.Int31n(20)-10)).String(), ".0")
			sr := make([]byte, rnd.Intn(500))
			if _, err := rnd.Read(sr); err != nil {
				t.Fatal(err)
			}
			// Build a random string, keeping only bytes that form valid
			// runes so the value survives a round trip through SQL.
			s := make([]byte, 0, len(sr))
			for _, b := range sr {
				r := rune(b)
				if !utf8.ValidRune(r) {
					continue
				}
				s = append(s, []byte(string(r))...)
			}
			b := make([]byte, rnd.Intn(500))
			if _, err := rnd.Read(b); err != nil {
				t.Fatal(err)
			}

			vals := []driver.Value{
				_i,
				i,
				f,
				d,
				m,
				[]byte(n), // intervals come out as `[]byte`s
				o,
				[]byte(e), // decimals come out as `[]byte`s
				string(s),
				b,
			}
			if err := conn.Exec("INSERT INTO d.t VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10)", vals); err != nil {
				t.Fatal(err)
			}
			// Remember everything except the rowid (vals[0]) for later
			// comparison against what we read back.
			generatedRows = append(generatedRows, vals[1:])
		}

		// check reads the table back and compares every fetched value with
		// the generated data, failing fast on the first mismatched row set.
		check := func(table string) {
			q := fmt.Sprintf("SELECT i, f, d, m, n, o, e, s, b FROM %s ORDER BY rowid", table)
			nrows, err := conn.Query(q, nil)
			if err != nil {
				t.Fatal(err)
			}
			defer func() {
				if err := nrows.Close(); err != nil {
					t.Fatal(err)
				}
			}()
			for gi, generatedRow := range generatedRows {
				fetched := make([]driver.Value, len(nrows.Columns()))
				if err := nrows.Next(fetched); err != nil {
					t.Fatal(err)
				}
				for i, fetchedVal := range fetched {
					generatedVal := generatedRow[i]
					if t, ok := fetchedVal.(time.Time); ok {
						// dates and timestamps come out with offset zero (but
						// not UTC specifically).
						fetchedVal = t.UTC()
					}
					if !reflect.DeepEqual(fetchedVal, generatedVal) {
						t.Errorf("NOT EQUAL: table %s, row %d, col %d\ngenerated (%T): %v\nselected (%T): %v\n", table, gi, i, generatedVal, generatedVal, fetchedVal, fetchedVal)
					}
				}
				if t.Failed() {
					t.FailNow()
				}
			}
		}

		check("d.t")

		var buf bytes.Buffer
		if err := DumpTable(&buf, conn, "d", "t"); err != nil {
			t.Fatal(err)
		}
		dump := buf.String()
		buf.Reset()

		// Replay the dump into a second database and verify both the data
		// and a re-dump match the originals exactly.
		if err := conn.Exec(`
	SET DATABASE = o;
	DROP TABLE IF EXISTS t;
`, nil); err != nil {
			t.Fatal(err)
		}
		if err := conn.Exec(dump, nil); err != nil {
			t.Fatal(err)
		}

		check("o.t")

		if err := DumpTable(&buf, conn, "o", "t"); err != nil {
			t.Fatal(err)
		}
		dump2 := buf.String()
		if dump != dump2 {
			t.Fatalf("unmatching dumps:\nFIRST:\n%s\n\nSECOND:\n%s", dump, dump2)
		}
	}
}
// Start starts a goroutine that runs outstanding schema changes // for tables received in the latest system configuration via gossip. func (s *SchemaChangeManager) Start(stopper *stop.Stopper) { stopper.RunWorker(func() { descKeyPrefix := keys.MakeTablePrefix(uint32(sqlbase.DescriptorTable.ID)) gossipUpdateC := s.gossip.RegisterSystemConfigChannel() timer := &time.Timer{} delay := 360 * time.Second if s.testingKnobs.AsyncExecQuickly { delay = 20 * time.Millisecond } for { select { case <-gossipUpdateC: cfg, _ := s.gossip.GetSystemConfig() // Read all tables and their versions if log.V(2) { log.Info(context.TODO(), "received a new config") } schemaChanger := SchemaChanger{ nodeID: s.leaseMgr.nodeID.Get(), db: s.db, leaseMgr: s.leaseMgr, testingKnobs: s.testingKnobs, } // Keep track of existing schema changers. oldSchemaChangers := make(map[sqlbase.ID]struct{}, len(s.schemaChangers)) for k := range s.schemaChangers { oldSchemaChangers[k] = struct{}{} } execAfter := timeutil.Now().Add(delay) // Loop through the configuration to find all the tables. for _, kv := range cfg.Values { if !bytes.HasPrefix(kv.Key, descKeyPrefix) { continue } // Attempt to unmarshal config into a table/database descriptor. var descriptor sqlbase.Descriptor if err := kv.Value.GetProto(&descriptor); err != nil { log.Warningf(context.TODO(), "%s: unable to unmarshal descriptor %v", kv.Key, kv.Value) continue } switch union := descriptor.Union.(type) { case *sqlbase.Descriptor_Table: table := union.Table table.MaybeUpgradeFormatVersion() if err := table.ValidateTable(); err != nil { log.Errorf(context.TODO(), "%s: received invalid table descriptor: %v", kv.Key, table) continue } // Keep track of outstanding schema changes. // If all schema change commands always set UpVersion, why // check for the presence of mutations? // A schema change execution might fail soon after // unsetting UpVersion, and we still want to process // outstanding mutations. Similar with a table marked for deletion. 
if table.UpVersion || table.Dropped() || table.Adding() || table.Renamed() || len(table.Mutations) > 0 { if log.V(2) { log.Infof(context.TODO(), "%s: queue up pending schema change; table: %d, version: %d", kv.Key, table.ID, table.Version) } // Only track the first schema change. We depend on // gossip to renotify us when a schema change has been // completed. schemaChanger.tableID = table.ID if len(table.Mutations) == 0 { schemaChanger.mutationID = sqlbase.InvalidMutationID } else { schemaChanger.mutationID = table.Mutations[0].MutationID } schemaChanger.execAfter = execAfter // Keep track of this schema change. // Remove from oldSchemaChangers map. delete(oldSchemaChangers, table.ID) if sc, ok := s.schemaChangers[table.ID]; ok { if sc.mutationID == schemaChanger.mutationID { // Ignore duplicate. continue } } s.schemaChangers[table.ID] = schemaChanger } case *sqlbase.Descriptor_Database: // Ignore. } } // Delete old schema changers. for k := range oldSchemaChangers { delete(s.schemaChangers, k) } timer = s.newTimer() case <-timer.C: if s.testingKnobs.AsyncExecNotification != nil && s.testingKnobs.AsyncExecNotification() != nil { timer = s.newTimer() continue } for tableID, sc := range s.schemaChangers { if timeutil.Since(sc.execAfter) > 0 { err := sc.exec() if err != nil { if err == errExistingSchemaChangeLease { } else if err == sqlbase.ErrDescriptorNotFound { // Someone deleted this table. Don't try to run the schema // changer again. Note that there's no gossip update for the // deletion which would remove this schemaChanger. delete(s.schemaChangers, tableID) } else { // We don't need to act on integrity // constraints violations because exec() // purges mutations that violate integrity // constraints. log.Warningf(context.TODO(), "Error executing schema change: %s", err) } } // Advance the execAfter time so that this schema // changer doesn't get called again for a while. sc.execAfter = timeutil.Now().Add(delay) } // Only attempt to run one schema changer. 
break } timer = s.newTimer() case <-stopper.ShouldStop(): return } } }) }
func testDecimalDoubleArgFunc( t *testing.T, f func(*inf.Dec, *inf.Dec, *inf.Dec, inf.Scale) (*inf.Dec, error), s inf.Scale, tests []decimalTwoArgsTestCase, ) { for _, tc := range tests { t.Run(fmt.Sprintf("%s, %s = %s", tc.input1, tc.input2, tc.expected), func(t *testing.T) { x, y, exp := new(inf.Dec), new(inf.Dec), new(inf.Dec) if _, ok := x.SetString(tc.input1); !ok { t.Fatalf("could not set decimal: %s", tc.input1) } if _, ok := y.SetString(tc.input2); !ok { t.Fatalf("could not set decimal: %s", tc.input2) } // Test allocated return value. var z *inf.Dec var err error done := make(chan struct{}, 1) start := timeutil.Now() go func() { z, err = f(nil, x, y, s) done <- struct{}{} }() var after <-chan time.Time if *flagDurationLimit > 0 { after = time.After(*flagDurationLimit) } select { case <-done: t.Logf("execute duration: %s", timeutil.Since(start)) case <-after: t.Fatalf("timedout after %s", *flagDurationLimit) } if err != nil { if tc.expected != err.Error() { t.Errorf("expected error %s, got %s", tc.expected, err) } return } if z == nil { if tc.expected != "nil" { t.Errorf("expected %s, got nil", tc.expected) } return } else if tc.expected == "nil" { t.Errorf("expected nil, got %s", z) return } if _, ok := exp.SetString(tc.expected); !ok { t.Errorf("could not set decimal: %s", tc.expected) return } if exp.Cmp(z) != 0 { t.Errorf("expected %s, got %s", exp, z) } // Test provided decimal mutation. z.SetString("0.0") _, _ = f(z, x, y, s) if exp.Cmp(z) != 0 { t.Errorf("expected %s, got %s", exp, z) } // Test first arg mutation. _, _ = f(x, x, y, s) if exp.Cmp(x) != 0 { t.Errorf("expected %s, got %s", exp, x) } x.SetString(tc.input1) // Test second arg mutation. _, _ = f(y, x, y, s) if exp.Cmp(y) != 0 { t.Errorf("expected %s, got %s", exp, y) } y.SetString(tc.input2) // Test both arg mutation, if possible. if tc.input1 == tc.input2 { _, _ = f(x, x, x, s) if exp.Cmp(x) != 0 { t.Errorf("expected %s, got %s", exp, x) } x.SetString(tc.input1) } }) } }
// Start starts the server on the specified port, starts gossip and initializes
// the node using the engines from the server's context.
//
// The passed context can be used to trace the server startup. The context
// should represent the general startup operation.
//
// Startup order matters here: listeners are wired up and workers launched
// before gossip and the node start, and the grpc-gateway is only dialed after
// the cmux has begun serving.
func (s *Server) Start(ctx context.Context) error {
	ctx = s.AnnotateCtx(ctx)
	// startTime is used below to bound the HLC monotonicity sleep.
	startTime := timeutil.Now()
	tlsConfig, err := s.cfg.GetServerTLSConfig()
	if err != nil {
		return err
	}
	httpServer := netutil.MakeServer(s.stopper, tlsConfig, s)
	// plainRedirectServer answers HTTP/1 on the clear-text listener by
	// permanently redirecting to the HTTPS equivalent of the same URL.
	plainRedirectServer := netutil.MakeServer(s.stopper, tlsConfig, http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		http.Redirect(w, r, "https://"+r.Host+r.RequestURI, http.StatusPermanentRedirect)
	}))
	// The following code is a specialization of util/net.go's ListenAndServe
	// which adds pgwire support. A single port is used to serve all protocols
	// (pg, http, h2) via the following construction:
	//
	// non-TLS case:
	// net.Listen -> cmux.New
	//               |
	//               -  -> pgwire.Match -> pgwire.Server.ServeConn
	//               -  -> cmux.Any -> grpc.(*Server).Serve
	//
	// TLS case:
	// net.Listen -> cmux.New
	//               |
	//               -  -> pgwire.Match -> pgwire.Server.ServeConn
	//               -  -> cmux.Any -> grpc.(*Server).Serve
	//
	// Note that the difference between the TLS and non-TLS cases exists due to
	// Go's lack of an h2c (HTTP2 Clear Text) implementation. See inline comments
	// in util.ListenAndServe for an explanation of how h2c is implemented there
	// and here.
	ln, err := net.Listen("tcp", s.cfg.Addr)
	if err != nil {
		return err
	}
	log.Eventf(ctx, "listening on port %s", s.cfg.Addr)
	// Resolve the configured addresses against what we actually bound so the
	// config reflects the real (possibly ephemeral) ports.
	unresolvedListenAddr, err := officialAddr(s.cfg.Addr, ln.Addr())
	if err != nil {
		return err
	}
	s.cfg.Addr = unresolvedListenAddr.String()
	unresolvedAdvertAddr, err := officialAddr(s.cfg.AdvertiseAddr, ln.Addr())
	if err != nil {
		return err
	}
	s.cfg.AdvertiseAddr = unresolvedAdvertAddr.String()

	s.rpcContext.SetLocalInternalServer(s.node)

	// Multiplex pgwire and everything else (grpc/http) over the one listener.
	m := cmux.New(ln)
	pgL := m.Match(pgwire.Match)
	anyL := m.Match(cmux.Any())

	httpLn, err := net.Listen("tcp", s.cfg.HTTPAddr)
	if err != nil {
		return err
	}
	unresolvedHTTPAddr, err := officialAddr(s.cfg.HTTPAddr, httpLn.Addr())
	if err != nil {
		return err
	}
	s.cfg.HTTPAddr = unresolvedHTTPAddr.String()

	workersCtx := s.AnnotateCtx(context.Background())

	// Close the HTTP listener when the stopper begins quiescing.
	s.stopper.RunWorker(func() {
		<-s.stopper.ShouldQuiesce()
		if err := httpLn.Close(); err != nil {
			log.Fatal(workersCtx, err)
		}
	})

	if tlsConfig != nil {
		// Split the HTTP listener: plain HTTP/1 requests get redirected to
		// https, everything else is treated as TLS.
		httpMux := cmux.New(httpLn)
		clearL := httpMux.Match(cmux.HTTP1())
		tlsL := httpMux.Match(cmux.Any())

		s.stopper.RunWorker(func() {
			netutil.FatalIfUnexpected(httpMux.Serve())
		})

		s.stopper.RunWorker(func() {
			netutil.FatalIfUnexpected(plainRedirectServer.Serve(clearL))
		})

		httpLn = tls.NewListener(tlsL, tlsConfig)
	}

	s.stopper.RunWorker(func() {
		netutil.FatalIfUnexpected(httpServer.Serve(httpLn))
	})

	// Stop accepting new grpc connections on quiesce, then stop the grpc
	// server itself once the stopper fully stops.
	s.stopper.RunWorker(func() {
		<-s.stopper.ShouldQuiesce()
		netutil.FatalIfUnexpected(anyL.Close())
		<-s.stopper.ShouldStop()
		s.grpc.Stop()
	})

	s.stopper.RunWorker(func() {
		netutil.FatalIfUnexpected(s.grpc.Serve(anyL))
	})

	// Serve pgwire connections from the multiplexed listener.
	s.stopper.RunWorker(func() {
		pgCtx := s.pgServer.AmbientCtx.AnnotateCtx(context.Background())
		netutil.FatalIfUnexpected(httpServer.ServeWith(s.stopper, pgL, func(conn net.Conn) {
			connCtx := log.WithLogTagStr(pgCtx, "client", conn.RemoteAddr().String())
			if err := s.pgServer.ServeConn(connCtx, conn); err != nil && !netutil.IsClosedConnection(err) {
				// Report the error on this connection's context, so that we
				// know which remote client caused the error when looking at
				// the logs.
				log.Error(connCtx, err)
			}
		}))
	})

	if len(s.cfg.SocketFile) != 0 {
		// Unix socket enabled: postgres protocol only.
		unixLn, err := net.Listen("unix", s.cfg.SocketFile)
		if err != nil {
			return err
		}

		s.stopper.RunWorker(func() {
			<-s.stopper.ShouldQuiesce()
			if err := unixLn.Close(); err != nil {
				log.Fatal(workersCtx, err)
			}
		})

		s.stopper.RunWorker(func() {
			pgCtx := s.pgServer.AmbientCtx.AnnotateCtx(context.Background())
			netutil.FatalIfUnexpected(httpServer.ServeWith(s.stopper, unixLn, func(conn net.Conn) {
				connCtx := log.WithLogTagStr(pgCtx, "client", conn.RemoteAddr().String())
				if err := s.pgServer.ServeConn(connCtx, conn); err != nil && !netutil.IsClosedConnection(err) {
					// Report the error on this connection's context, so that we
					// know which remote client caused the error when looking at
					// the logs.
					log.Error(connCtx, err)
				}
			}))
		})
	}

	// Enable the debug endpoints first to provide an earlier window
	// into what's going on with the node in advance of exporting node
	// functionality.
	// TODO(marc): when cookie-based authentication exists,
	// apply it for all web endpoints.
	s.mux.HandleFunc(debugEndpoint, http.HandlerFunc(handleDebug))

	s.gossip.Start(unresolvedAdvertAddr)
	log.Event(ctx, "started gossip")

	s.engines, err = s.cfg.CreateEngines()
	if err != nil {
		return errors.Wrap(err, "failed to create engines")
	}
	s.stopper.AddCloser(&s.engines)

	// We might have to sleep a bit to protect against this node producing non-
	// monotonic timestamps. Before restarting, its clock might have been driven
	// by other nodes' fast clocks, but when we restarted, we lost all this
	// information. For example, a client might have written a value at a
	// timestamp that's in the future of the restarted node's clock, and if we
	// don't do something, the same client's read would not return the written
	// value. So, we wait up to MaxOffset; we couldn't have served timestamps more
	// than MaxOffset in the future (assuming that MaxOffset was not changed, see
	// #9733).
	//
	// As an optimization for tests, we don't sleep if all the stores are brand
	// new. In this case, the node will not serve anything anyway until it
	// synchronizes with other nodes.
	{
		anyStoreBootstrapped := false
		for _, e := range s.engines {
			if _, err := storage.ReadStoreIdent(ctx, e); err != nil {
				// NotBootstrappedError is expected.
				if _, ok := err.(*storage.NotBootstrappedError); !ok {
					return err
				}
			} else {
				anyStoreBootstrapped = true
				break
			}
		}
		if anyStoreBootstrapped {
			sleepDuration := s.clock.MaxOffset() - timeutil.Since(startTime)
			if sleepDuration > 0 {
				log.Infof(ctx, "sleeping for %s to guarantee HLC monotonicity", sleepDuration)
				time.Sleep(sleepDuration)
			}
		}
	}

	// Now that we have a monotonic HLC wrt previous incarnations of the process,
	// init all the replicas.
	err = s.node.start(
		ctx,
		unresolvedAdvertAddr,
		s.engines,
		s.cfg.NodeAttributes,
		s.cfg.Locality,
	)
	if err != nil {
		return err
	}
	log.Event(ctx, "started node")

	s.nodeLiveness.StartHeartbeat(ctx, s.stopper)

	// We can now add the node registry.
	s.recorder.AddNode(s.registry, s.node.Descriptor, s.node.startedAt)

	// Begin recording runtime statistics.
	s.startSampleEnvironment(s.cfg.MetricsSampleInterval)

	// Begin recording time series data collected by the status monitor.
	s.tsDB.PollSource(
		s.cfg.AmbientCtx, s.recorder, s.cfg.MetricsSampleInterval, ts.Resolution10s, s.stopper,
	)

	// Begin recording status summaries.
	s.node.startWriteSummaries(s.cfg.MetricsSampleInterval)

	// Create and start the schema change manager only after a NodeID
	// has been assigned.
	testingKnobs := &sql.SchemaChangerTestingKnobs{}
	if s.cfg.TestingKnobs.SQLSchemaChanger != nil {
		testingKnobs = s.cfg.TestingKnobs.SQLSchemaChanger.(*sql.SchemaChangerTestingKnobs)
	}
	sql.NewSchemaChangeManager(testingKnobs, *s.db, s.gossip, s.leaseMgr).Start(s.stopper)

	s.distSQLServer.Start()

	log.Infof(ctx, "starting %s server at %s", s.cfg.HTTPRequestScheme(), unresolvedHTTPAddr)
	log.Infof(ctx, "starting grpc/postgres server at %s", unresolvedListenAddr)
	log.Infof(ctx, "advertising CockroachDB node at %s", unresolvedAdvertAddr)
	if len(s.cfg.SocketFile) != 0 {
		log.Infof(ctx, "starting postgres server at unix:%s", s.cfg.SocketFile)
	}

	s.stopper.RunWorker(func() {
		netutil.FatalIfUnexpected(m.Serve())
	})

	log.Event(ctx, "accepting connections")

	// Initialize grpc-gateway mux and context.
	jsonpb := &protoutil.JSONPb{
		EnumsAsInts:  true,
		EmitDefaults: true,
		Indent:       "  ",
	}
	protopb := new(protoutil.ProtoPb)
	gwMux := gwruntime.NewServeMux(
		gwruntime.WithMarshalerOption(gwruntime.MIMEWildcard, jsonpb),
		gwruntime.WithMarshalerOption(httputil.JSONContentType, jsonpb),
		gwruntime.WithMarshalerOption(httputil.AltJSONContentType, jsonpb),
		gwruntime.WithMarshalerOption(httputil.ProtoContentType, protopb),
		gwruntime.WithMarshalerOption(httputil.AltProtoContentType, protopb),
	)
	gwCtx, gwCancel := context.WithCancel(s.AnnotateCtx(context.Background()))
	s.stopper.AddCloser(stop.CloserFn(gwCancel))

	// Setup HTTP<->gRPC handlers. This dials our own grpc address, which the
	// cmux above is already serving.
	conn, err := s.rpcContext.GRPCDial(s.cfg.Addr)
	if err != nil {
		return errors.Errorf("error constructing grpc-gateway: %s; are your certificates valid?", err)
	}

	for _, gw := range []grpcGatewayServer{s.admin, s.status, &s.tsServer} {
		if err := gw.RegisterGateway(gwCtx, gwMux, conn); err != nil {
			return err
		}
	}

	// Serve UI assets either from disk (debug mode) or from the embedded
	// bindata filesystem.
	var uiFileSystem http.FileSystem
	uiDebug := envutil.EnvOrDefaultBool("COCKROACH_DEBUG_UI", false)
	if uiDebug {
		uiFileSystem = http.Dir("pkg/ui")
	} else {
		uiFileSystem = &assetfs.AssetFS{
			Asset:     ui.Asset,
			AssetDir:  ui.AssetDir,
			AssetInfo: ui.AssetInfo,
		}
	}
	uiFileServer := http.FileServer(uiFileSystem)

	s.mux.HandleFunc("/", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if r.URL.Path == "/" {
			if uiDebug {
				r.URL.Path = "debug.html"
			} else {
				r.URL.Path = "release.html"
			}
		}
		uiFileServer.ServeHTTP(w, r)
	}))

	// TODO(marc): when cookie-based authentication exists,
	// apply it for all web endpoints.
	s.mux.Handle(adminPrefix, gwMux)
	s.mux.Handle(ts.URLPrefix, gwMux)
	s.mux.Handle(statusPrefix, gwMux)
	s.mux.Handle("/health", gwMux)
	s.mux.Handle(statusVars, http.HandlerFunc(s.status.handleVars))
	log.Event(ctx, "added http endpoints")

	if err := sdnotify.Ready(); err != nil {
		log.Errorf(ctx, "failed to signal readiness using systemd protocol: %s", err)
	}
	log.Event(ctx, "server ready")

	return nil
}
// applySnapshot updates the replica based on the given snapshot and associated
// HardState (which may be empty, as Raft may apply some snapshots which don't
// require an update to the HardState). All snapshots must pass through Raft
// for correctness, i.e. the parameters to this method must be taken from
// a raft.Ready. It is the caller's responsibility to call
// r.store.processRangeDescriptorUpdate(r) after a successful applySnapshot.
//
// The method stages everything (range clear, snapshot data, log entries,
// HardState) in a single engine batch and commits it atomically before
// updating the in-memory replica state under r.mu.
func (r *Replica) applySnapshot(
	ctx context.Context, inSnap IncomingSnapshot, snap raftpb.Snapshot, hs raftpb.HardState,
) error {
	// Extract the updated range descriptor.
	desc := inSnap.RangeDescriptor

	r.mu.Lock()
	replicaID := r.mu.replicaID
	raftLogSize := r.mu.raftLogSize
	r.mu.Unlock()

	// A zero replica ID indicates a preemptive snapshot; only used for
	// accounting and log format.
	isPreemptive := replicaID == 0 // only used for accounting and log format

	var appliedSuccessfully bool
	defer func() {
		if appliedSuccessfully {
			if !isPreemptive {
				r.store.metrics.RangeSnapshotsNormalApplied.Inc(1)
			} else {
				r.store.metrics.RangeSnapshotsPreemptiveApplied.Inc(1)
			}
		}
	}()

	if raft.IsEmptySnap(snap) {
		// Raft discarded the snapshot, indicating that our local state is
		// already ahead of what the snapshot provides. But we count it for
		// stats (see the defer above).
		appliedSuccessfully = true
		return nil
	}

	snapType := "preemptive"
	if !isPreemptive {
		snapType = "Raft"
	}

	log.Infof(ctx, "applying %s snapshot at index %d "+
		"(id=%s, encoded size=%d, %d rocksdb batches, %d log entries)",
		snapType, snap.Metadata.Index, inSnap.SnapUUID.Short(),
		len(snap.Data), len(inSnap.Batches), len(inSnap.LogEntries))
	defer func(start time.Time) {
		log.Infof(ctx, "applied %s snapshot in %.3fs", snapType, timeutil.Since(start).Seconds())
	}(timeutil.Now())

	batch := r.store.Engine().NewBatch()
	defer batch.Close()

	// Clear the range using a distinct batch in order to prevent the iteration
	// from forcing the batch to flush from Go to C++.
	distinctBatch := batch.Distinct()

	// Delete everything in the range and recreate it from the snapshot.
	// We need to delete any old Raft log entries here because any log entries
	// that predate the snapshot will be orphaned and never truncated or GC'd.
	iter := NewReplicaDataIterator(&desc, distinctBatch, false /* !replicatedOnly */)
	defer iter.Close()
	for ; iter.Valid(); iter.Next() {
		if err := distinctBatch.Clear(iter.Key()); err != nil {
			return err
		}
	}

	distinctBatch.Close()

	// Write the snapshot into the range.
	for _, batchRepr := range inSnap.Batches {
		if err := batch.ApplyBatchRepr(batchRepr); err != nil {
			return err
		}
	}

	// The log entries are all written to distinct keys so we can use a
	// distinct batch.
	distinctBatch = batch.Distinct()

	logEntries := make([]raftpb.Entry, len(inSnap.LogEntries))
	for i, bytes := range inSnap.LogEntries {
		if err := logEntries[i].Unmarshal(bytes); err != nil {
			return err
		}
	}
	// Write the snapshot's Raft log into the range. Note that raftLogSize is
	// reassigned here with the post-append size.
	_, raftLogSize, err := r.append(ctx, distinctBatch, 0, raftLogSize, logEntries)
	if err != nil {
		return err
	}

	if !raft.IsEmptyHardState(hs) {
		if err := setHardState(ctx, distinctBatch, r.RangeID, hs); err != nil {
			return errors.Wrapf(err, "unable to persist HardState %+v", &hs)
		}
	} else {
		// Note that we don't require that Raft supply us with a nonempty
		// HardState on a snapshot. We don't want to make that assumption
		// because it's not guaranteed by the contract. Raft *must* send us
		// a HardState when it increases the committed index as a result of the
		// snapshot, but who is to say it isn't going to accept a snapshot
		// which is identical to the current state?
	}

	// We need to close the distinct batch and start using the normal batch for
	// the read below.
	distinctBatch.Close()

	s, err := loadState(ctx, batch, &desc)
	if err != nil {
		return err
	}

	if s.Desc.RangeID != r.RangeID {
		log.Fatalf(ctx, "unexpected range ID %d", s.Desc.RangeID)
	}

	// As outlined above, last and applied index are the same after applying
	// the snapshot (i.e. the snapshot has no uncommitted tail).
	if s.RaftAppliedIndex != snap.Metadata.Index {
		log.Fatalf(ctx, "snapshot RaftAppliedIndex %d doesn't match its metadata index %d",
			s.RaftAppliedIndex, snap.Metadata.Index)
	}

	if err := batch.Commit(); err != nil {
		return err
	}

	r.mu.Lock()
	// We set the persisted last index to the last applied index. This is
	// not a correctness issue, but means that we may have just transferred
	// some entries we're about to re-request from the leader and overwrite.
	// However, raft.MultiNode currently expects this behaviour, and the
	// performance implications are not likely to be drastic. If our
	// feelings about this ever change, we can add a LastIndex field to
	// raftpb.SnapshotMetadata.
	r.mu.lastIndex = s.RaftAppliedIndex
	r.mu.raftLogSize = raftLogSize
	// Update the range and store stats.
	r.store.metrics.subtractMVCCStats(r.mu.state.Stats)
	r.store.metrics.addMVCCStats(s.Stats)
	r.mu.state = s
	r.assertStateLocked(r.store.Engine())
	r.mu.Unlock()

	// As the last deferred action after committing the batch, update other
	// fields which are uninitialized or need updating. This may not happen
	// if the system config has not yet been loaded. While config update
	// will correctly set the fields, there is no order guarantee in
	// ApplySnapshot.
	// TODO: should go through the standard store lock when adding a replica.
	if err := r.updateRangeInfo(&desc); err != nil {
		panic(err)
	}

	r.setDescWithoutProcessUpdate(&desc)

	appliedSuccessfully = true
	return nil
}
// processLoop processes the entries in the queue until the provided
// stopper signals exit.
//
// The loop is driven by two channels: bq.incoming, which signals that a
// replica was added to an empty queue, and nextTime, a timer channel that
// fires when the next replica should be processed. nextTime is nil while the
// queue is empty (a nil channel blocks forever in select).
func (bq *baseQueue) processLoop(clock *hlc.Clock, stopper *stop.Stopper) {
	stopper.RunWorker(func() {
		ctx := bq.AnnotateCtx(context.Background())
		defer func() {
			bq.mu.Lock()
			bq.mu.stopped = true
			bq.mu.Unlock()
			bq.AmbientContext.FinishEventLog()
		}()

		// nextTime is initially nil; we don't start any timers until the queue
		// becomes non-empty.
		var nextTime <-chan time.Time

		// immediately is a closed channel, so receiving from it never blocks;
		// assigning it to nextTime makes the next select iteration fire at once.
		immediately := make(chan time.Time)
		close(immediately)

		for {
			select {
			// Exit on stopper.
			case <-stopper.ShouldStop():
				return

			// Incoming signal sets the next time to process if there were previously
			// no replicas in the queue.
			case <-bq.incoming:
				if nextTime == nil {
					// When a replica is added, wake up immediately. This is mainly
					// to facilitate testing without unnecessary sleeps.
					nextTime = immediately

					// In case we're in a test, still block on the impl.
					bq.impl.timer(0)
				}

			// Process replicas as the timer expires.
			case <-nextTime:
				repl := bq.pop()
				var duration time.Duration
				if repl != nil {
					if stopper.RunTask(func() {
						annotatedCtx := repl.AnnotateCtx(ctx)
						start := timeutil.Now()
						if err := bq.processReplica(annotatedCtx, repl, clock); err != nil {
							// Maybe add failing replica to purgatory if the queue supports it.
							bq.maybeAddToPurgatory(annotatedCtx, repl, err, clock, stopper)
						}
						duration = timeutil.Since(start)
						log.VEventf(annotatedCtx, 2, "done %s", duration)
						bq.processingNanos.Inc(duration.Nanoseconds())
					}) != nil {
						return
					}
				}
				// Re-arm (or disarm) the timer depending on whether replicas remain.
				if bq.Length() == 0 {
					nextTime = nil
				} else {
					nextTime = time.After(bq.impl.timer(duration))
				}
			}
		}
	})
}
// testSingleKeyInner hammers a single key with concurrent read-increment-write
// transactions, one worker per cluster node, for cfg.Duration, then verifies
// that the final stored value equals the number of successful increments.
func testSingleKeyInner(
	ctx context.Context, t *testing.T, c cluster.Cluster, cfg cluster.TestConfig,
) {
	num := c.NumNodes()

	// Initialize the value for our test key to zero.
	const key = "test-key"
	initDB := c.NewClient(ctx, t, 0)
	if err := initDB.Put(ctx, key, 0); err != nil {
		t.Fatal(err)
	}

	type result struct {
		err        error
		maxLatency time.Duration
	}

	resultCh := make(chan result, num)
	deadline := timeutil.Now().Add(cfg.Duration)
	// expected counts successful increments; updated atomically by workers.
	var expected int64

	// Start up num workers each reading and writing the same
	// key. Each worker is configured to talk to a different node in the
	// cluster.
	for i := 0; i < num; i++ {
		db := c.NewClient(ctx, t, i)
		go func() {
			var r result
			for timeutil.Now().Before(deadline) {
				start := timeutil.Now()
				err := db.Txn(ctx, func(txn *client.Txn) error {
					minExp := atomic.LoadInt64(&expected)
					// NOTE(review): this inner r (a KV get result) shadows the
					// outer result struct r for the remainder of the closure.
					r, err := txn.Get(key)
					if err != nil {
						return err
					}
					b := txn.NewBatch()
					v := r.ValueInt()
					b.Put(key, v+1)
					err = txn.CommitInBatch(b)
					// Atomic updates after the fact mean that we should read
					// exp or larger (since concurrent writers might have
					// committed but not yet performed their atomic update).
					if err == nil && v < minExp {
						return errors.Errorf("unexpected read: %d, expected >= %d", v, minExp)
					}
					return err
				})
				if err != nil {
					resultCh <- result{err: err}
					return
				}
				atomic.AddInt64(&expected, 1)
				latency := timeutil.Since(start)
				if r.maxLatency < latency {
					r.maxLatency = latency
				}
			}
			resultCh <- r
		}()
	}

	// Verify that none of the workers encountered an error.
	var results []result
	for len(results) < num {
		select {
		case <-stopper.ShouldStop():
			t.Fatalf("interrupted")
		case r := <-resultCh:
			if r.err != nil {
				t.Fatal(r.err)
			}
			results = append(results, r)
		case <-time.After(1 * time.Second):
			// Periodically print out progress so that we know the test is still
			// running.
			log.Infof(ctx, "%d", atomic.LoadInt64(&expected))
		}
	}

	// Verify the resulting value stored at the key is what we expect.
	// All workers have sent their results by now, so reading expected without
	// an atomic load is safe here.
	r, err := initDB.Get(ctx, key)
	if err != nil {
		t.Fatal(err)
	}
	v := r.ValueInt()
	if expected != v {
		t.Fatalf("expected %d, but found %d", expected, v)
	}
	var maxLatency []time.Duration
	for _, r := range results {
		maxLatency = append(maxLatency, r.maxLatency)
	}
	log.Infof(ctx, "%d increments: %s", v, maxLatency)
}
// Run a particular schema change and run some OLTP operations in parallel, as
// soon as the schema change starts executing its backfill.
//
// The OLTP operations (updates, deletes, inserts) are each undone afterwards
// so that the final key count can be validated against
// keyMultiple * (maxValue + numInserts + 1).
// NOTE(review): the "reupdate" step assumes each row's original value was
// maxValue-k — this matches the table setup used by the callers; confirm if
// reused elsewhere.
func runSchemaChangeWithOperations(
	t *testing.T,
	sqlDB *gosql.DB,
	kvDB *client.DB,
	schemaChange string,
	maxValue int,
	keyMultiple int,
	backfillNotification chan bool,
) {
	tableDesc := sqlbase.GetTableDescriptor(kvDB, "t", "test")

	// Run the schema change in a separate goroutine.
	var wg sync.WaitGroup
	wg.Add(1)
	go func() {
		start := timeutil.Now()
		// Start schema change that eventually runs a backfill.
		if _, err := sqlDB.Exec(schemaChange); err != nil {
			t.Error(err)
		}
		t.Logf("schema change %s took %v", schemaChange, timeutil.Since(start))
		wg.Done()
	}()

	// Wait until the schema change backfill starts.
	<-backfillNotification

	// Run a variety of operations during the backfill.

	// Grabbing a schema change lease on the table will fail, disallowing
	// another schema change from being simultaneously executed.
	sc := csql.NewSchemaChangerForTesting(tableDesc.ID, 0, 0, *kvDB, nil)
	if l, err := sc.AcquireLease(); err == nil {
		t.Fatalf("schema change lease acquisition on table %d succeeded: %v", tableDesc.ID, l)
	}

	// Update some rows.
	var updatedKeys []int
	for i := 0; i < 10; i++ {
		k := rand.Intn(maxValue)
		v := maxValue + i + 1
		if _, err := sqlDB.Exec(`UPDATE t.test SET v = $1 WHERE k = $2`, v, k); err != nil {
			t.Error(err)
		}
		updatedKeys = append(updatedKeys, k)
	}

	// Reupdate updated values back to what they were before.
	for _, k := range updatedKeys {
		if _, err := sqlDB.Exec(`UPDATE t.test SET v = $1 WHERE k = $2`, maxValue-k, k); err != nil {
			t.Error(err)
		}
	}

	// Delete some rows.
	deleteStartKey := rand.Intn(maxValue - 10)
	for i := 0; i < 10; i++ {
		if _, err := sqlDB.Exec(`DELETE FROM t.test WHERE k = $1`, deleteStartKey+i); err != nil {
			t.Error(err)
		}
	}
	// Reinsert deleted rows.
	for i := 0; i < 10; i++ {
		k := deleteStartKey + i
		if _, err := sqlDB.Exec(`INSERT INTO t.test VALUES($1, $2)`, k, maxValue-k); err != nil {
			t.Error(err)
		}
	}

	// Insert some new rows.
	numInserts := 10
	for i := 0; i < numInserts; i++ {
		k := maxValue + i + 1
		if _, err := sqlDB.Exec(`INSERT INTO t.test VALUES($1, $1)`, k); err != nil {
			t.Error(err)
		}
	}

	wg.Wait() // for schema change to complete.

	// Verify the number of keys left behind in the table to validate schema
	// change operations.
	tablePrefix := roachpb.Key(keys.MakeTablePrefix(uint32(tableDesc.ID)))
	tableEnd := tablePrefix.PrefixEnd()
	if kvs, err := kvDB.Scan(context.TODO(), tablePrefix, tableEnd, 0); err != nil {
		t.Fatal(err)
	} else if e := keyMultiple * (maxValue + numInserts + 1); len(kvs) != e {
		for _, kv := range kvs {
			t.Errorf("key %s, value %s", kv.Key, kv.Value)
		}
		t.Fatalf("expected %d key value pairs, but got %d", e, len(kvs))
	}

	// Delete the rows inserted.
	for i := 0; i < numInserts; i++ {
		if _, err := sqlDB.Exec(`DELETE FROM t.test WHERE k = $1`, maxValue+i+1); err != nil {
			t.Error(err)
		}
	}
}
// batchInternal authenticates the peer (must be the node user when TLS is in
// use), then executes the BatchRequest against the local stores inside a
// stopper task, wrapping execution in a tracing span. Requests carrying a
// "snowball" baggage item bypass the regular tracer: their raw spans are
// collected via a recorder and returned in the response.
func (n *Node) batchInternal(
	ctx context.Context, args *roachpb.BatchRequest,
) (*roachpb.BatchResponse, error) {
	// TODO(marc): grpc's authentication model (which gives credential access in
	// the request handler) doesn't really fit with the current design of the
	// security package (which assumes that TLS state is only given at connection
	// time) - that should be fixed.
	// Note: the local variable `peer` shadows the grpc peer package here.
	if peer, ok := peer.FromContext(ctx); ok {
		if tlsInfo, ok := peer.AuthInfo.(credentials.TLSInfo); ok {
			certUser, err := security.GetCertificateUser(&tlsInfo.State)
			if err != nil {
				return nil, err
			}
			if certUser != security.NodeUser {
				return nil, errors.Errorf("user %s is not allowed", certUser)
			}
		}
	}

	var br *roachpb.BatchResponse

	// snowballInfo guards span collection against late spans arriving after
	// the request has completed.
	type snowballInfo struct {
		syncutil.Mutex
		collectedSpans [][]byte
		done           bool
	}
	var snowball *snowballInfo

	if err := n.stopper.RunTaskWithErr(func() error {
		const opName = "node.Batch"
		sp, err := tracing.JoinOrNew(n.storeCfg.AmbientCtx.Tracer, args.TraceContext, opName)
		if err != nil {
			return err
		}
		// If this is a snowball span, it gets special treatment: It skips the
		// regular tracing machinery, and we instead send the collected spans
		// back with the response. This is more expensive, but then again,
		// those are individual requests traced by users, so they can be.
		if sp.BaggageItem(tracing.Snowball) != "" {
			sp.LogEvent("delegating to snowball tracing")
			sp.Finish()

			snowball = new(snowballInfo)
			recorder := func(rawSpan basictracer.RawSpan) {
				snowball.Lock()
				defer snowball.Unlock()
				if snowball.done {
					// This is a late span that we must discard because the request was
					// already completed.
					return
				}
				encSp, err := tracing.EncodeRawSpan(&rawSpan, nil)
				if err != nil {
					log.Warning(ctx, err)
				}
				snowball.collectedSpans = append(snowball.collectedSpans, encSp)
			}

			// Replace the span with a snowball one wired to the recorder.
			if sp, err = tracing.JoinOrNewSnowball(opName, args.TraceContext, recorder); err != nil {
				return err
			}
		}
		defer sp.Finish()
		traceCtx := opentracing.ContextWithSpan(ctx, sp)
		log.Event(traceCtx, args.Summary())

		tStart := timeutil.Now()
		var pErr *roachpb.Error
		br, pErr = n.stores.Send(traceCtx, *args)
		if pErr != nil {
			br = &roachpb.BatchResponse{}
			log.ErrEventf(traceCtx, "%T", pErr.GetDetail())
		}
		if br.Error != nil {
			panic(roachpb.ErrorUnexpectedlySet(n.stores, br))
		}
		n.metrics.callComplete(timeutil.Since(tStart), pErr)
		br.Error = pErr
		return nil
	}); err != nil {
		return nil, err
	}

	if snowball != nil {
		// Attach the collected spans and mark done so late spans are dropped.
		snowball.Lock()
		br.CollectedSpans = snowball.collectedSpans
		snowball.done = true
		snowball.Unlock()
	}

	return br, nil
}
func (rq *replicateQueue) canTransferLease() bool { if lastLeaseTransfer := rq.lastLeaseTransfer.Load(); lastLeaseTransfer != nil { return timeutil.Since(lastLeaseTransfer.(time.Time)) > minLeaseTransferInterval } return true }
// snapshot creates an OutgoingSnapshot containing a rocksdb snapshot for the given range.
//
// All range state (descriptor, applied index, term) is read from the provided
// engine snapshot rather than from the Replica struct, since the latter may
// change concurrently.
func snapshot(
	ctx context.Context,
	snap engine.Reader,
	rangeID roachpb.RangeID,
	eCache *raftEntryCache,
	startKey roachpb.RKey,
) (OutgoingSnapshot, error) {
	start := timeutil.Now()

	var desc roachpb.RangeDescriptor
	// We ignore intents on the range descriptor (consistent=false) because we
	// know they cannot be committed yet; operations that modify range
	// descriptors resolve their own intents when they commit.
	ok, err := engine.MVCCGetProto(ctx, snap, keys.RangeDescriptorKey(startKey),
		hlc.MaxTimestamp, false /* !consistent */, nil, &desc)
	if err != nil {
		return OutgoingSnapshot{}, errors.Errorf("failed to get desc: %s", err)
	}
	if !ok {
		return OutgoingSnapshot{}, errors.Errorf("couldn't find range descriptor")
	}

	var snapData roachpb.RaftSnapshotData
	// Store RangeDescriptor as metadata, it will be retrieved by ApplySnapshot()
	snapData.RangeDescriptor = desc

	// Read the range metadata from the snapshot instead of the members
	// of the Range struct because they might be changed concurrently.
	appliedIndex, _, err := loadAppliedIndex(ctx, snap, rangeID)
	if err != nil {
		return OutgoingSnapshot{}, err
	}

	// Synthesize our raftpb.ConfState from desc.
	var cs raftpb.ConfState
	for _, rep := range desc.Replicas {
		cs.Nodes = append(cs.Nodes, uint64(rep.ReplicaID))
	}

	term, err := term(ctx, snap, rangeID, eCache, appliedIndex)
	if err != nil {
		return OutgoingSnapshot{}, errors.Errorf("failed to fetch term of %d: %s", appliedIndex, err)
	}

	// Intentionally let this iterator and the snapshot escape so that the
	// streamer can send chunks from it bit by bit.
	iter := NewReplicaDataIterator(&desc, snap, true /* replicatedOnly */)
	snapUUID := uuid.NewV4()

	log.Infof(ctx, "generated snapshot %s for range %s at index %d in %s.",
		snapUUID.Short(), rangeID, appliedIndex, timeutil.Since(start))
	return OutgoingSnapshot{
		EngineSnap: snap,
		Iter:       iter,
		SnapUUID:   *snapUUID,
		RaftSnap: raftpb.Snapshot{
			Data: snapUUID.GetBytes(),
			Metadata: raftpb.SnapshotMetadata{
				Index:     appliedIndex,
				Term:      term,
				ConfState: cs,
			},
		},
	}, nil
}