func (ts *txnState) dumpTrace() {
    if traceSQL && ts.txn != nil {
        ts.sp.Finish()
        if timeutil.Since(ts.sqlTimestamp) >= traceSQLDuration {
            dump := tracing.FormatRawSpans(ts.txn.CollectedSpans)
            if len(dump) > 0 {
                log.Infof(context.Background(), "%s\n%s", ts.txn.Proto.ID, dump)
            }
        }
    }
    ts.sp = nil
}
func (g *Gossip) clientStatus() string {
    var buf bytes.Buffer

    g.mu.Lock()
    defer g.mu.Unlock()
    g.clientsMu.Lock()
    defer g.clientsMu.Unlock()

    fmt.Fprintf(&buf, "gossip client (%d/%d cur/max conns)\n",
        len(g.clientsMu.clients), g.outgoing.maxSize)
    for _, c := range g.clientsMu.clients {
        fmt.Fprintf(&buf, " %d: %s (%s: %d/%d sent/received)\n",
            c.peerID, c.addr, roundSecs(timeutil.Since(c.createdAt)), c.sent, c.received)
    }
    return buf.String()
}
func TestSucceedsSoon(t *testing.T) {
    // Try a method which always succeeds.
    SucceedsSoon(t, func() error { return nil })

    // Try a method which succeeds after a known duration.
    start := timeutil.Now()
    duration := time.Millisecond * 10
    SucceedsSoon(t, func() error {
        elapsed := timeutil.Since(start)
        if elapsed > duration {
            return nil
        }
        return errors.Errorf("%s elapsed, waiting until %s elapses", elapsed, duration)
    })
}
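// The helper under test is not shown in this corpus. A minimal sketch of
// what a SucceedsSoon-style helper can look like, assuming retry with a
// capped exponential backoff up to an arbitrary deadline; the real
// implementation's backoff policy and timeout may differ.
func succeedsSoonSketch(t *testing.T, fn func() error) {
    deadline := timeutil.Now().Add(45 * time.Second) // assumed timeout
    for wait := time.Millisecond; ; wait *= 2 {
        err := fn()
        if err == nil {
            return
        }
        if timeutil.Now().After(deadline) {
            t.Fatal(err)
        }
        if wait > time.Second {
            wait = time.Second // cap the backoff
        }
        time.Sleep(wait)
    }
}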
func (c *cluster) waitForFullReplication() {
    for i := 1; true; i++ {
        done, detail := c.isReplicated()
        if (done && i >= 50) || (i%50) == 0 {
            fmt.Print(detail)
            log.Infof(context.Background(), "waiting for replication")
        }
        if done {
            break
        }
        time.Sleep(100 * time.Millisecond)
    }
    log.Infof(context.Background(), "replicated %.3fs", timeutil.Since(c.started).Seconds())
}
func (s *server) status() string {
    s.mu.Lock()
    defer s.mu.Unlock()
    var buf bytes.Buffer
    fmt.Fprintf(&buf, "gossip server (%d/%d cur/max conns, %s)\n",
        s.incoming.gauge.Value(), s.incoming.maxSize, s.serverMetrics)
    for addr, info := range s.nodeMap {
        // TODO(peter): Report per connection sent/received statistics. The
        // structure of server.Gossip and server.gossipReceiver makes this
        // irritating to track.
        fmt.Fprintf(&buf, " %d: %s (%s)\n",
            info.peerID, addr.AddressField, roundSecs(timeutil.Since(info.createdAt)))
    }
    return buf.String()
}
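// roundSecs is used by both status functions above to render connection
// uptimes, but its body is not included in this corpus. A plausible sketch,
// assuming it simply rounds a duration to whole seconds for display:
func roundSecsSketch(d time.Duration) time.Duration {
    return (d + time.Second/2) / time.Second * time.Second
}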
func testPutInner(t *testing.T, c cluster.Cluster, cfg cluster.TestConfig) {
    db, dbStopper := c.NewClient(t, 0)
    defer dbStopper.Stop()

    errs := make(chan error, c.NumNodes())
    start := timeutil.Now()
    deadline := start.Add(cfg.Duration)
    var count int64
    for i := 0; i < c.NumNodes(); i++ {
        go func() {
            r, _ := randutil.NewPseudoRand()
            value := randutil.RandBytes(r, 8192)
            for timeutil.Now().Before(deadline) {
                k := atomic.AddInt64(&count, 1)
                v := value[:r.Intn(len(value))]
                if err := db.Put(fmt.Sprintf("%08d", k), v); err != nil {
                    errs <- err
                    return
                }
            }
            errs <- nil
        }()
    }

    for i := 0; i < c.NumNodes(); {
        baseCount := atomic.LoadInt64(&count)
        select {
        case <-stopper:
            t.Fatalf("interrupted")
        case err := <-errs:
            if err != nil {
                t.Fatal(err)
            }
            i++
        case <-time.After(1 * time.Second):
            // Periodically print out progress so that we know the test is still
            // running.
            loadedCount := atomic.LoadInt64(&count)
            log.Infof(context.Background(), "%d (%d/s)", loadedCount, loadedCount-baseCount)
            c.Assert(t)
            cluster.Consistent(t, c)
        }
    }

    elapsed := timeutil.Since(start)
    log.Infof(context.Background(), "%d %.1f/sec", count, float64(count)/elapsed.Seconds())
}
func testClusterRecoveryInner(t *testing.T, c cluster.Cluster, cfg cluster.TestConfig) {
    num := c.NumNodes()

    // One client for each node.
    initBank(t, c.PGUrl(0))

    start := timeutil.Now()
    state := testState{
        t:        t,
        errChan:  make(chan error, num),
        teardown: make(chan struct{}),
        deadline: start.Add(cfg.Duration),
        clients:  make([]testClient, num),
    }

    for i := 0; i < num; i++ {
        state.clients[i].Lock()
        state.initClient(t, c, i)
        state.clients[i].Unlock()
        go transferMoneyLoop(i, &state, *numAccounts, *maxTransfer)
    }

    defer func() { <-state.teardown }()

    // Chaos monkey.
    rnd, seed := randutil.NewPseudoRand()
    log.Warningf(context.Background(), "monkey starts (seed %d)", seed)
    pickNodes := func() []int {
        return rnd.Perm(num)[:rnd.Intn(num)+1]
    }
    go chaosMonkey(&state, c, true, pickNodes)

    waitClientsStop(num, &state, stall)

    // Verify accounts.
    verifyAccounts(t, &state.clients[0])

    elapsed := timeutil.Since(start)
    var count uint64
    counts := state.counts()
    for _, c := range counts {
        count += c
    }
    log.Infof(context.Background(), "%d %.1f/sec", count, float64(count)/elapsed.Seconds())
}
// processReplica processes a single replica. This should not be
// called externally to the queue. bq.mu.Lock should not be held
// while calling this method.
func (bq *baseQueue) processReplica(repl *Replica, clock *hlc.Clock) error {
    bq.processMu.Lock()
    defer bq.processMu.Unlock()

    // Load the system config.
    cfg, ok := bq.gossip.GetSystemConfig()
    if !ok {
        log.VEventf(1, bq.ctx, "no system config available. skipping")
        return nil
    }

    if bq.requiresSplit(cfg, repl) {
        // Range needs to be split due to zone configs, but queue does
        // not accept unsplit ranges.
        log.VEventf(3, bq.ctx, "%s: split needed; skipping", repl)
        return nil
    }

    sp := repl.store.Tracer().StartSpan(bq.name)
    ctx := opentracing.ContextWithSpan(context.Background(), sp)
    defer sp.Finish()
    log.Tracef(ctx, "processing replica %s", repl)

    // If the queue requires a replica to have the range lease in
    // order to be processed, check whether this replica has range lease
    // and renew or acquire if necessary.
    if bq.needsLease {
        // Create a "fake" get request in order to invoke redirectOnOrAcquireLease.
        if err := repl.redirectOnOrAcquireLease(ctx); err != nil {
            if _, harmless := err.GetDetail().(*roachpb.NotLeaseHolderError); harmless {
                log.VEventf(3, bq.ctx, "%s: not holding lease; skipping", repl)
                return nil
            }
            return errors.Wrapf(err.GoError(), "%s: could not obtain lease", repl)
        }
        log.Trace(ctx, "got range lease")
    }

    log.VEventf(3, bq.ctx, "%s: processing", repl)
    start := timeutil.Now()
    if err := bq.impl.process(ctx, clock.Now(), repl, cfg); err != nil {
        return err
    }
    log.VEventf(2, bq.ctx, "%s: done: %s", repl, timeutil.Since(start))
    log.Trace(ctx, "done")
    return nil
}
func (c *cluster) start(db string, args []string) {
    c.started = timeutil.Now()

    baseCtx := &base.Context{
        User:     security.NodeUser,
        Insecure: true,
    }
    c.rpcCtx = rpc.NewContext(baseCtx, nil, c.stopper)

    for i := range c.nodes {
        c.nodes[i] = c.makeNode(i, args)
        c.clients[i] = c.makeClient(i)
        c.db[i] = c.makeDB(i, db)
    }

    log.Infof(context.Background(), "started %.3fs", timeutil.Since(c.started).Seconds())
}
// scanLoop loops endlessly, scanning through replicas available via
// the replica set, or until the scanner is stopped. The iteration
// is paced to complete a full scan in approximately the scan interval.
func (rs *replicaScanner) scanLoop(clock *hlc.Clock, stopper *stop.Stopper) {
    stopper.RunWorker(func() {
        start := timeutil.Now()

        // waitTimer is reset in each call to waitAndProcess.
        defer rs.waitTimer.Stop()

        for {
            if rs.GetDisabled() {
                if done := rs.waitEnabled(stopper); done {
                    return
                }
                continue
            }
            var shouldStop bool
            count := 0
            rs.replicas.Visit(func(repl *Replica) bool {
                count++
                shouldStop = rs.waitAndProcess(start, clock, stopper, repl)
                return !shouldStop
            })
            if count == 0 {
                // No replicas processed, just wait.
                shouldStop = rs.waitAndProcess(start, clock, stopper, nil)
            }

            shouldStop = shouldStop || nil != stopper.RunTask(func() {
                // Increment iteration count.
                rs.mu.Lock()
                defer rs.mu.Unlock()
                rs.mu.scanCount++
                rs.mu.total += timeutil.Since(start)
                if log.V(6) {
                    log.Infof(context.TODO(), "reset replica scan iteration")
                }

                // Reset iteration and start time.
                start = timeutil.Now()
            })
            if shouldStop {
                return
            }
        }
    })
}
func testNodeRestartInner(t *testing.T, c cluster.Cluster, cfg cluster.TestConfig) {
    num := c.NumNodes()
    if minNum := 3; num < minNum {
        t.Skipf("need at least %d nodes, got %d", minNum, num)
    }

    // One client for each node.
    initBank(t, c.PGUrl(0))

    start := timeutil.Now()
    state := testState{
        t:        t,
        errChan:  make(chan error, 1),
        teardown: make(chan struct{}),
        deadline: start.Add(cfg.Duration),
        clients:  make([]testClient, 1),
    }
    client := &state.clients[0]
    client.Lock()
    client.db = makePGClient(t, c.PGUrl(num-1))
    client.Unlock()
    go transferMoneyLoop(0, &state, *numAccounts, *maxTransfer)

    defer func() { <-state.teardown }()

    // Chaos monkey.
    rnd, seed := randutil.NewPseudoRand()
    log.Warningf(context.Background(), "monkey starts (seed %d)", seed)
    pickNodes := func() []int {
        return []int{rnd.Intn(num - 1)}
    }
    go chaosMonkey(&state, c, false, pickNodes)

    waitClientsStop(1, &state, stall)

    // Verify accounts.
    verifyAccounts(t, client)

    elapsed := timeutil.Since(start)
    count := atomic.LoadUint64(&client.count)
    log.Infof(context.Background(), "%d %.1f/sec", count, float64(count)/elapsed.Seconds())
}
// processReplica processes a single replica. This should not be
// called externally to the queue. bq.mu.Lock should not be held
// while calling this method.
func (bq *baseQueue) processReplica(repl *Replica, clock *hlc.Clock) error {
    // Load the system config.
    cfg, ok := bq.gossip.GetSystemConfig()
    if !ok {
        bq.eventLog.VInfof(log.V(1), "no system config available. skipping")
        return nil
    }

    desc := repl.Desc()
    if !bq.acceptsUnsplitRanges && cfg.NeedsSplit(desc.StartKey, desc.EndKey) {
        // Range needs to be split due to zone configs, but queue does
        // not accept unsplit ranges.
        bq.eventLog.VInfof(log.V(3), "%s: split needed; skipping", repl)
        return nil
    }

    sp := repl.store.Tracer().StartSpan(bq.name)
    ctx := opentracing.ContextWithSpan(repl.context(context.Background()), sp)
    log.Trace(ctx, fmt.Sprintf("queue start for range %d", repl.RangeID))
    defer sp.Finish()

    // If the queue requires a replica to have the range leader lease in
    // order to be processed, check whether this replica has leader lease
    // and renew or acquire if necessary.
    if bq.needsLeaderLease {
        // Create a "fake" get request in order to invoke redirectOnOrAcquireLease.
        if err := repl.redirectOnOrAcquireLeaderLease(ctx); err != nil {
            if _, harmless := err.GetDetail().(*roachpb.NotLeaderError); harmless {
                bq.eventLog.VInfof(log.V(3), "%s: not holding lease; skipping", repl)
                return nil
            }
            return errors.Wrapf(err.GoError(), "%s: could not obtain lease", repl)
        }
        log.Trace(ctx, "got range lease")
    }

    bq.eventLog.VInfof(log.V(3), "%s: processing", repl)
    start := timeutil.Now()
    if err := bq.impl.process(ctx, clock.Now(), repl, cfg); err != nil {
        return err
    }
    bq.eventLog.VInfof(log.V(2), "%s: done: %s", repl, timeutil.Since(start))
    log.Trace(ctx, "done")
    return nil
}
// scanLoop loops endlessly, scanning through replicas available via
// the replica set, or until the scanner is stopped. The iteration
// is paced to complete a full scan in approximately the scan interval.
func (rs *replicaScanner) scanLoop(clock *hlc.Clock, stopper *stop.Stopper) {
    stopper.RunWorker(func() {
        start := timeutil.Now()

        // waitTimer is reset in each call to waitAndProcess.
        defer rs.waitTimer.Stop()

        for {
            var shouldStop bool
            count := 0
            rs.replicas.Visit(func(repl *Replica) bool {
                count++
                shouldStop = rs.waitAndProcess(start, clock, stopper, repl)
                return !shouldStop
            })
            if count == 0 {
                // No replicas processed, just wait.
                shouldStop = rs.waitAndProcess(start, clock, stopper, nil)
            }

            shouldStop = shouldStop || !stopper.RunTask(func() {
                // Increment iteration count.
                rs.completedScan.L.Lock()
                rs.count++
                rs.total += timeutil.Since(start)
                rs.completedScan.Broadcast()
                rs.completedScan.L.Unlock()
                if log.V(6) {
                    log.Infof("reset replica scan iteration")
                }

                // Reset iteration and start time.
                start = timeutil.Now()
            })
            if shouldStop {
                return
            }
        }
    })
}
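// waitAndProcess (not shown in this corpus) is what paces the scan loops
// above so that one full pass over the replicas takes roughly the target
// scan interval. A hypothetical sketch of that pacing computation, which
// divides the time left in the interval by the number of replicas still
// to visit; the function name and edge-case handling here are assumptions,
// not the real implementation:
func paceIntervalSketch(scanInterval, elapsed time.Duration, remaining int) time.Duration {
    if remaining <= 0 {
        return 0
    }
    left := scanInterval - elapsed
    if left < 0 {
        // Behind schedule: process the rest without waiting.
        return 0
    }
    return left / time.Duration(remaining)
}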
// PeriodicallyCheckForUpdates starts a background worker that periodically
// phones home to check for updates and report usage.
func (s *Server) PeriodicallyCheckForUpdates() {
    s.stopper.RunWorker(func() {
        startup := timeutil.Now()

        for {
            // `maybeCheckForUpdates` and `maybeReportUsage` both return the
            // duration until they should next be checked.
            // Wait for the shorter of the durations returned by the two checks.
            wait := s.maybeCheckForUpdates()
            if reportWait := s.maybeReportUsage(timeutil.Since(startup)); reportWait < wait {
                wait = reportWait
            }
            jitter := rand.Intn(updateCheckJitterSeconds) - updateCheckJitterSeconds/2
            wait = wait + (time.Duration(jitter) * time.Second)
            select {
            case <-s.stopper.ShouldQuiesce():
                return
            case <-time.After(wait):
            }
        }
    })
}
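// Illustration of the jitter term above: rand.Intn(n) is uniform in [0, n),
// so subtracting n/2 centers the jitter on zero. Assuming a hypothetical
// updateCheckJitterSeconds of 120 (the real constant is not shown in this
// corpus), the wait is smeared across roughly one minute either side:
func jitteredWaitSketch(wait time.Duration) time.Duration {
    const updateCheckJitterSeconds = 120 // assumed value
    jitter := rand.Intn(updateCheckJitterSeconds) - updateCheckJitterSeconds/2
    return wait + time.Duration(jitter)*time.Second // wait-60s .. wait+59s
}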
// Start starts a goroutine that runs outstanding schema changes
// for tables received in the latest system configuration via gossip.
func (s *SchemaChangeManager) Start(stopper *stop.Stopper) {
    stopper.RunWorker(func() {
        descKeyPrefix := keys.MakeTablePrefix(uint32(sqlbase.DescriptorTable.ID))
        gossipUpdateC := s.gossip.RegisterSystemConfigChannel()
        timer := &time.Timer{}
        delay := 360 * time.Second
        if s.testingKnobs.AsyncSchemaChangerExecQuickly {
            delay = 20 * time.Millisecond
        }
        for {
            select {
            case <-gossipUpdateC:
                cfg, _ := s.gossip.GetSystemConfig()
                // Read all tables and their versions.
                if log.V(2) {
                    log.Info("received a new config")
                }
                schemaChanger := SchemaChanger{
                    nodeID:   roachpb.NodeID(s.leaseMgr.nodeID),
                    db:       s.db,
                    leaseMgr: s.leaseMgr,
                }
                // Keep track of existing schema changers.
                oldSchemaChangers := make(map[sqlbase.ID]struct{}, len(s.schemaChangers))
                for k := range s.schemaChangers {
                    oldSchemaChangers[k] = struct{}{}
                }
                execAfter := timeutil.Now().Add(delay)
                // Loop through the configuration to find all the tables.
                for _, kv := range cfg.Values {
                    if !bytes.HasPrefix(kv.Key, descKeyPrefix) {
                        continue
                    }
                    // Attempt to unmarshal config into a table/database descriptor.
                    var descriptor sqlbase.Descriptor
                    if err := kv.Value.GetProto(&descriptor); err != nil {
                        log.Warningf("%s: unable to unmarshal descriptor %v", kv.Key, kv.Value)
                        continue
                    }
                    switch union := descriptor.Union.(type) {
                    case *sqlbase.Descriptor_Table:
                        table := union.Table
                        if err := table.Validate(); err != nil {
                            log.Errorf("%s: received invalid table descriptor: %v", kv.Key, table)
                            continue
                        }
                        // Keep track of outstanding schema changes.
                        // If all schema change commands always set UpVersion, why
                        // check for the presence of mutations?
                        // A schema change execution might fail soon after
                        // unsetting UpVersion, and we still want to process
                        // outstanding mutations. Similar with a table marked for deletion.
                        if table.UpVersion || table.Deleted() || table.Renamed() ||
                            len(table.Mutations) > 0 {
                            if log.V(2) {
                                log.Infof("%s: queue up pending schema change; table: %d, version: %d",
                                    kv.Key, table.ID, table.Version)
                            }

                            // Only track the first schema change. We depend on
                            // gossip to renotify us when a schema change has been
                            // completed.
                            schemaChanger.tableID = table.ID
                            if len(table.Mutations) == 0 {
                                schemaChanger.mutationID = sqlbase.InvalidMutationID
                            } else {
                                schemaChanger.mutationID = table.Mutations[0].MutationID
                            }
                            schemaChanger.execAfter = execAfter

                            // Keep track of this schema change.
                            // Remove from oldSchemaChangers map.
                            delete(oldSchemaChangers, table.ID)
                            if sc, ok := s.schemaChangers[table.ID]; ok {
                                if sc.mutationID == schemaChanger.mutationID {
                                    // Ignore duplicate.
                                    continue
                                }
                            }
                            s.schemaChangers[table.ID] = schemaChanger
                        }

                    case *sqlbase.Descriptor_Database:
                        // Ignore.
                    }
                }
                // Delete old schema changers.
                for k := range oldSchemaChangers {
                    delete(s.schemaChangers, k)
                }
                timer = s.newTimer()

            case <-timer.C:
                if s.testingKnobs.AsyncSchemaChangerExecNotification != nil &&
                    s.testingKnobs.AsyncSchemaChangerExecNotification() != nil {
                    timer = s.newTimer()
                    continue
                }
                for tableID, sc := range s.schemaChangers {
                    if timeutil.Since(sc.execAfter) > 0 {
                        err := sc.exec(nil, nil)
                        if err != nil {
                            if err == errExistingSchemaChangeLease {
                                // Another node holds the schema change lease;
                                // try again on the next timer tick.
                            } else if err == errDescriptorNotFound {
                                // Someone deleted this table. Don't try to run the schema
                                // changer again. Note that there's no gossip update for the
                                // deletion which would remove this schemaChanger.
                                delete(s.schemaChangers, tableID)
                            } else {
                                // We don't need to act on integrity
                                // constraints violations because exec()
                                // purges mutations that violate integrity
                                // constraints.
                                log.Warningf("Error executing schema change: %s", err)
                            }
                        }
                        // Advance the execAfter time so that this schema
                        // changer doesn't get called again for a while.
                        sc.execAfter = timeutil.Now().Add(delay)
                        // sc is a copy of the map value; write it back so the
                        // updated execAfter actually takes effect.
                        s.schemaChangers[tableID] = sc
                    }
                    // Only attempt to run one schema changer.
                    break
                }
                timer = s.newTimer()

            case <-stopper.ShouldStop():
                return
            }
        }
    })
}
func snapshot(
    ctx context.Context,
    snap engine.Reader,
    rangeID roachpb.RangeID,
    eCache *raftEntryCache,
    startKey roachpb.RKey,
) (raftpb.Snapshot, error) {
    start := timeutil.Now()
    var snapData roachpb.RaftSnapshotData

    truncState, err := loadTruncatedState(ctx, snap, rangeID)
    if err != nil {
        return raftpb.Snapshot{}, err
    }
    firstIndex := truncState.Index + 1

    // Read the range metadata from the snapshot instead of the members
    // of the Range struct because they might be changed concurrently.
    appliedIndex, _, err := loadAppliedIndex(ctx, snap, rangeID)
    if err != nil {
        return raftpb.Snapshot{}, err
    }

    var desc roachpb.RangeDescriptor
    // We ignore intents on the range descriptor (consistent=false) because we
    // know they cannot be committed yet; operations that modify range
    // descriptors resolve their own intents when they commit.
    ok, err := engine.MVCCGetProto(ctx, snap, keys.RangeDescriptorKey(startKey),
        hlc.MaxTimestamp, false /* !consistent */, nil, &desc)
    if err != nil {
        return raftpb.Snapshot{}, errors.Errorf("failed to get desc: %s", err)
    }
    if !ok {
        return raftpb.Snapshot{}, errors.Errorf("couldn't find range descriptor")
    }

    // Store RangeDescriptor as metadata, it will be retrieved by ApplySnapshot().
    snapData.RangeDescriptor = desc

    // Iterate over all the data in the range, including local-only data like
    // the sequence cache.
    iter := NewReplicaDataIterator(&desc, snap, true /* replicatedOnly */)
    defer iter.Close()
    var alloc bufalloc.ByteAllocator
    for ; iter.Valid(); iter.Next() {
        var key engine.MVCCKey
        var value []byte
        alloc, key, value = iter.allocIterKeyValue(alloc)
        snapData.KV = append(snapData.KV, roachpb.RaftSnapshotData_KeyValue{
            Key:       key.Key,
            Value:     value,
            Timestamp: key.Timestamp,
        })
    }

    endIndex := appliedIndex + 1
    snapData.LogEntries = make([][]byte, 0, endIndex-firstIndex)

    scanFunc := func(kv roachpb.KeyValue) (bool, error) {
        bytes, err := kv.Value.GetBytes()
        if err == nil {
            snapData.LogEntries = append(snapData.LogEntries, bytes)
        }
        return false, err
    }

    if err := iterateEntries(ctx, snap, rangeID, firstIndex, endIndex, scanFunc); err != nil {
        return raftpb.Snapshot{}, err
    }

    data, err := protoutil.Marshal(&snapData)
    if err != nil {
        return raftpb.Snapshot{}, err
    }

    // Synthesize our raftpb.ConfState from desc.
    var cs raftpb.ConfState
    for _, rep := range desc.Replicas {
        cs.Nodes = append(cs.Nodes, uint64(rep.ReplicaID))
    }

    term, err := term(ctx, snap, rangeID, eCache, appliedIndex)
    if err != nil {
        return raftpb.Snapshot{}, errors.Errorf("failed to fetch term of %d: %s", appliedIndex, err)
    }

    log.Infof(ctx, "generated snapshot for range %s at index %d in %s. encoded size=%d, %d KV pairs, %d log entries",
        rangeID, appliedIndex, timeutil.Since(start), len(data), len(snapData.KV), len(snapData.LogEntries))

    return raftpb.Snapshot{
        Data: data,
        Metadata: raftpb.SnapshotMetadata{
            Index:     appliedIndex,
            Term:      term,
            ConfState: cs,
        },
    }, nil
}
// TestDumpRandom generates a random number of random rows with all data
// types. This data is dumped, inserted, and dumped again. The two dumps
// are compared for exactness. The data from the inserted dump is then
// SELECT'd and compared to the original generated data to ensure it is
// round-trippable.
func TestDumpRandom(t *testing.T) {
    defer leaktest.AfterTest(t)()

    s, _, _ := serverutils.StartServer(t, base.TestServerArgs{})
    defer s.Stopper().Stop()

    url, cleanup := sqlutils.PGUrl(t, s.ServingAddr(), security.RootUser, "TestDumpRandom")
    defer cleanup()

    conn := makeSQLConn(url.String())
    defer conn.Close()

    if err := conn.Exec(`
        CREATE DATABASE d;
        CREATE DATABASE o;
        CREATE TABLE d.t (
            rowid int,
            i int,
            f float,
            d date,
            m timestamp,
            n interval,
            o bool,
            e decimal,
            s string,
            b bytes,
            PRIMARY KEY (rowid, i, f, d, m, n, o, e, s, b)
        );
    `, nil); err != nil {
        t.Fatal(err)
    }

    rnd, seed := randutil.NewPseudoRand()
    t.Logf("random seed: %v", seed)

    start := timeutil.Now()

    for iteration := 0; timeutil.Since(start) < *randomTestTime; iteration++ {
        if err := conn.Exec(`DELETE FROM d.t`, nil); err != nil {
            t.Fatal(err)
        }
        var generatedRows [][]driver.Value
        count := rnd.Int63n(500)
        t.Logf("random iteration %v: %v rows", iteration, count)
        for _i := int64(0); _i < count; _i++ {
            // Generate a random number of random inserts.
            i := rnd.Int63()
            f := rnd.Float64()
            d := time.Unix(0, rnd.Int63()).Round(time.Hour * 24).UTC()
            m := time.Unix(0, rnd.Int63()).Round(time.Microsecond).UTC()
            n := time.Duration(rnd.Int63()).String()
            o := rnd.Intn(2) == 1
            e := strings.TrimRight(inf.NewDec(rnd.Int63(), inf.Scale(rnd.Int31n(20)-10)).String(), ".0")
            s := make([]byte, rnd.Intn(500))
            if _, err := rnd.Read(s); err != nil {
                t.Fatal(err)
            }
            b := make([]byte, rnd.Intn(500))
            if _, err := rnd.Read(b); err != nil {
                t.Fatal(err)
            }
            vals := []driver.Value{
                _i,
                i,
                f,
                d,
                m,
                []byte(n), // intervals come out as `[]byte`s
                o,
                []byte(e), // decimals come out as `[]byte`s
                string(s),
                b,
            }
            if err := conn.Exec("INSERT INTO d.t VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10)", vals); err != nil {
                t.Fatal(err)
            }
            generatedRows = append(generatedRows, vals[1:])
        }

        check := func(table string) {
            q := fmt.Sprintf("SELECT i, f, d, m, n, o, e, s, b FROM %s ORDER BY rowid", table)
            nrows, err := conn.Query(q, nil)
            if err != nil {
                t.Fatal(err)
            }
            defer func() {
                if err := nrows.Close(); err != nil {
                    t.Fatal(err)
                }
            }()
            for gi, generatedRow := range generatedRows {
                fetched := make([]driver.Value, len(nrows.Columns()))
                if err := nrows.Next(fetched); err != nil {
                    t.Fatal(err)
                }
                for i, fetchedVal := range fetched {
                    generatedVal := generatedRow[i]
                    if t, ok := fetchedVal.(time.Time); ok {
                        // dates and timestamps come out with offset zero (but
                        // not UTC specifically).
                        fetchedVal = t.UTC()
                    }
                    if !reflect.DeepEqual(fetchedVal, generatedVal) {
                        t.Errorf("NOT EQUAL: table %s, row %d, col %d\ngenerated (%T): %v\nselected (%T): %v\n",
                            table, gi, i, generatedVal, generatedVal, fetchedVal, fetchedVal)
                    }
                }
                if t.Failed() {
                    t.FailNow()
                }
            }
        }

        check("d.t")

        var buf bytes.Buffer
        if err := dumpTable(&buf, conn, "d", "t"); err != nil {
            t.Fatal(err)
        }
        dump := buf.String()
        buf.Reset()

        if err := conn.Exec(`
            SET DATABASE = o;
            DROP TABLE IF EXISTS t;
        `, nil); err != nil {
            t.Fatal(err)
        }
        if err := conn.Exec(dump, nil); err != nil {
            t.Fatal(err)
        }

        check("o.t")

        if err := dumpTable(&buf, conn, "o", "t"); err != nil {
            t.Fatal(err)
        }
        dump2 := buf.String()
        if dump != dump2 {
            t.Fatalf("unmatching dumps:\nFIRST:\n%s\n\nSECOND:\n%s", dump, dump2)
        }
    }
}
// Batch implements the roachpb.KVServer interface.
func (n *Node) Batch(ctx context.Context, args *roachpb.BatchRequest) (*roachpb.BatchResponse, error) {
    // TODO(marc): this code is duplicated in kv/db.go, which should be fixed.
    // Also, grpc's authentication model (which gives credential access in the
    // request handler) doesn't really fit with the current design of the
    // security package (which assumes that TLS state is only given at connection
    // time) - that should be fixed.
    if peer, ok := peer.FromContext(ctx); ok {
        if tlsInfo, ok := peer.AuthInfo.(credentials.TLSInfo); ok {
            certUser, err := security.GetCertificateUser(&tlsInfo.State)
            if err != nil {
                return nil, err
            }
            if certUser != security.NodeUser {
                return nil, util.Errorf("user %s is not allowed", certUser)
            }
        }
    }

    var br *roachpb.BatchResponse
    opName := "node " + strconv.Itoa(int(n.Descriptor.NodeID)) // could save allocs here

    fail := func(err error) {
        br = &roachpb.BatchResponse{}
        br.Error = roachpb.NewError(err)
    }

    f := func() {
        sp, err := tracing.JoinOrNew(n.ctx.Tracer, args.Trace, opName)
        if err != nil {
            fail(err)
            return
        }
        // If this is a snowball span, it gets special treatment: It skips the
        // regular tracing machinery, and we instead send the collected spans
        // back with the response. This is more expensive, but then again,
        // those are individual requests traced by users, so they can be.
        if sp.BaggageItem(tracing.Snowball) != "" {
            sp.LogEvent("delegating to snowball tracing")
            sp.Finish()
            if sp, err = tracing.JoinOrNewSnowball(opName, args.Trace, func(rawSpan basictracer.RawSpan) {
                encSp, err := tracing.EncodeRawSpan(&rawSpan, nil)
                if err != nil {
                    log.Warning(err)
                }
                br.CollectedSpans = append(br.CollectedSpans, encSp)
            }); err != nil {
                fail(err)
                return
            }
        }
        defer sp.Finish()
        traceCtx := opentracing.ContextWithSpan(n.context(ctx), sp)

        tStart := timeutil.Now()
        var pErr *roachpb.Error
        br, pErr = n.stores.Send(traceCtx, *args)
        if pErr != nil {
            br = &roachpb.BatchResponse{}
            log.Trace(traceCtx, fmt.Sprintf("error: %T", pErr.GetDetail()))
        }
        if br.Error != nil {
            panic(roachpb.ErrorUnexpectedlySet(n.stores, br))
        }
        n.metrics.callComplete(timeutil.Since(tStart), pErr)
        br.Error = pErr
    }

    if !n.stopper.RunTask(f) {
        return nil, util.Errorf("node %d stopped", n.Descriptor.NodeID)
    }
    return br, nil
}
// applySnapshot updates the replica based on the given snapshot.
// Returns the new last index.
func (r *Replica) applySnapshot(batch engine.Batch, snap raftpb.Snapshot) (uint64, error) {
    snapData := roachpb.RaftSnapshotData{}
    err := proto.Unmarshal(snap.Data, &snapData)
    if err != nil {
        return 0, err
    }

    rangeID := r.RangeID

    // Extract the updated range descriptor.
    desc := snapData.RangeDescriptor

    r.mu.Lock()
    replicaID := r.mu.replicaID
    r.mu.Unlock()

    log.Infof("replica %d received snapshot for range %d at index %d. encoded size=%d, %d KV pairs, %d log entries",
        replicaID, rangeID, snap.Metadata.Index, len(snap.Data), len(snapData.KV), len(snapData.LogEntries))
    defer func(start time.Time) {
        log.Infof("replica %d applied snapshot for range %d in %s", replicaID, rangeID, timeutil.Since(start))
    }(timeutil.Now())

    // Delete everything in the range and recreate it from the snapshot.
    // We need to delete any old Raft log entries here because any log entries
    // that predate the snapshot will be orphaned and never truncated or GC'd.
    iter := newReplicaDataIterator(&desc, batch, false /* !replicatedOnly */)
    defer iter.Close()
    for ; iter.Valid(); iter.Next() {
        if err := batch.Clear(iter.Key()); err != nil {
            return 0, err
        }
    }

    // Determine the unreplicated key prefix so we can drop any
    // unreplicated keys from the snapshot.
    unreplicatedPrefix := keys.MakeRangeIDUnreplicatedPrefix(desc.RangeID)

    // Write the snapshot into the range.
    for _, kv := range snapData.KV {
        if bytes.HasPrefix(kv.Key, unreplicatedPrefix) {
            continue
        }
        mvccKey := engine.MVCCKey{
            Key:       kv.Key,
            Timestamp: kv.Timestamp,
        }
        if err := batch.Put(mvccKey, kv.Value); err != nil {
            return 0, err
        }
    }

    logEntries := make([]raftpb.Entry, len(snapData.LogEntries))
    for i, bytes := range snapData.LogEntries {
        if err := logEntries[i].Unmarshal(bytes); err != nil {
            return 0, err
        }
    }

    // Write the snapshot's Raft log into the range.
    if _, err := r.append(batch, 0, logEntries); err != nil {
        return 0, err
    }

    // Read the leader lease.
    lease, err := loadLeaderLease(batch, desc.RangeID)
    if err != nil {
        return 0, err
    }

    frozen, err := loadFrozenStatus(batch, desc.RangeID)
    if err != nil {
        return 0, err
    }

    lastThreshold, err := loadGCThreshold(batch, desc.RangeID)
    if err != nil {
        return 0, err
    }

    // Load updated range stats. The local newStats variable will be assigned
    // to r.stats after the batch commits.
    newStats, err := newRangeStats(desc.RangeID, batch)
    if err != nil {
        return 0, err
    }

    // The next line sets the persisted last index to the last applied index.
    // This is not a correctness issue, but means that we may have just
    // transferred some entries we're about to re-request from the leader and
    // overwrite.
    // However, raft.MultiNode currently expects this behaviour, and the
    // performance implications are not likely to be drastic. If our feelings
    // about this ever change, we can add a LastIndex field to
    // raftpb.SnapshotMetadata.
    if err := setLastIndex(batch, rangeID, snap.Metadata.Index); err != nil {
        return 0, err
    }

    batch.Defer(func() {
        // Update the range and store stats.
        r.store.metrics.subtractMVCCStats(r.stats.mvccStats)
        r.stats.Replace(newStats)
        r.store.metrics.addMVCCStats(r.stats.mvccStats)

        r.mu.Lock()
        // As outlined above, last and applied index are the same after applying
        // the snapshot.
        r.mu.appliedIndex = snap.Metadata.Index
        r.mu.leaderLease = lease
        r.mu.frozen = frozen
        r.mu.gcThreshold = lastThreshold
        r.mu.Unlock()

        // Update other fields which are uninitialized or need updating.
        // This may not happen if the system config has not yet been loaded.
        // While config update will correctly set the fields, there is no order
        // guarantee in ApplySnapshot.
        // TODO: should go through the standard store lock when adding a replica.
        if err := r.updateRangeInfo(&desc); err != nil {
            panic(err)
        }

        // Fill the reservation if there was one for this range.
        r.store.bookie.Fill(rangeID)

        // Update the range descriptor. This is done last as this is the step that
        // makes the Replica visible in the Store.
        if err := r.setDesc(&desc); err != nil {
            panic(err)
        }
    })
    return snap.Metadata.Index, nil
}
// applySnapshot updates the replica based on the given snapshot.
// Returns the new last index.
func (r *Replica) applySnapshot(snap raftpb.Snapshot, typ snapshotType) (uint64, error) {
    // We use a separate batch to apply the snapshot since the Replica (and in
    // particular the last index) is updated after the batch commits. Using a
    // separate batch also allows for future optimization (such as using a
    // Distinct() batch).
    batch := r.store.Engine().NewBatch()
    defer batch.Close()

    snapData := roachpb.RaftSnapshotData{}
    err := proto.Unmarshal(snap.Data, &snapData)
    if err != nil {
        return 0, err
    }

    // Extract the updated range descriptor.
    desc := snapData.RangeDescriptor

    r.mu.Lock()
    replicaID := r.mu.replicaID
    raftLogSize := r.mu.raftLogSize
    r.mu.Unlock()

    log.Infof("replica %d received snapshot for range %d at index %d. "+
        "encoded size=%d, %d KV pairs, %d log entries",
        replicaID, desc.RangeID, snap.Metadata.Index,
        len(snap.Data), len(snapData.KV), len(snapData.LogEntries))
    defer func(start time.Time) {
        log.Infof("replica %d applied snapshot for range %d in %s",
            replicaID, desc.RangeID, timeutil.Since(start))
    }(timeutil.Now())

    // Delete everything in the range and recreate it from the snapshot.
    // We need to delete any old Raft log entries here because any log entries
    // that predate the snapshot will be orphaned and never truncated or GC'd.
    iter := NewReplicaDataIterator(&desc, batch, false /* !replicatedOnly */)
    defer iter.Close()
    for ; iter.Valid(); iter.Next() {
        if err := batch.Clear(iter.Key()); err != nil {
            return 0, err
        }
    }

    // Determine the unreplicated key prefix so we can drop any
    // unreplicated keys from the snapshot.
    unreplicatedPrefix := keys.MakeRangeIDUnreplicatedPrefix(desc.RangeID)

    // Write the snapshot into the range.
    for _, kv := range snapData.KV {
        if bytes.HasPrefix(kv.Key, unreplicatedPrefix) {
            continue
        }
        mvccKey := engine.MVCCKey{
            Key:       kv.Key,
            Timestamp: kv.Timestamp,
        }
        if err := batch.Put(mvccKey, kv.Value); err != nil {
            return 0, err
        }
    }

    logEntries := make([]raftpb.Entry, len(snapData.LogEntries))
    for i, bytes := range snapData.LogEntries {
        if err := logEntries[i].Unmarshal(bytes); err != nil {
            return 0, err
        }
    }
    // Write the snapshot's Raft log into the range.
    _, raftLogSize, err = r.append(batch, 0, raftLogSize, logEntries)
    if err != nil {
        return 0, err
    }

    s, err := loadState(batch, &desc)
    if err != nil {
        return 0, err
    }

    // As outlined above, last and applied index are the same after applying
    // the snapshot.
    if s.RaftAppliedIndex != snap.Metadata.Index {
        log.Fatalf("%d: snapshot resulted in appliedIndex=%d, metadataIndex=%d",
            s.Desc.RangeID, s.RaftAppliedIndex, snap.Metadata.Index)
    }

    if replicaID == 0 {
        // The replica is not part of the Raft group so we need to write the Raft
        // hard state for the replica in order for the Raft state machine to start
        // correctly.
        if err := updateHardState(batch, s); err != nil {
            return 0, err
        }
    }

    if err := batch.Commit(); err != nil {
        return 0, err
    }

    r.mu.Lock()
    // We set the persisted last index to the last applied index. This is
    // not a correctness issue, but means that we may have just transferred
    // some entries we're about to re-request from the leader and overwrite.
    // However, raft.MultiNode currently expects this behaviour, and the
    // performance implications are not likely to be drastic. If our
    // feelings about this ever change, we can add a LastIndex field to
    // raftpb.SnapshotMetadata.
    r.mu.lastIndex = s.RaftAppliedIndex
    r.mu.raftLogSize = raftLogSize
    // Update the range and store stats.
    r.store.metrics.subtractMVCCStats(r.mu.state.Stats)
    r.store.metrics.addMVCCStats(s.Stats)
    r.mu.state = s
    lastIndex := r.mu.lastIndex
    r.assertStateLocked(r.store.Engine())
    r.mu.Unlock()

    // Update other fields which are uninitialized or need updating.
    // This may not happen if the system config has not yet been loaded.
    // While config update will correctly set the fields, there is no order
    // guarantee in ApplySnapshot.
    // TODO: should go through the standard store lock when adding a replica.
    if err := r.updateRangeInfo(&desc); err != nil {
        panic(err)
    }

    // Fill the reservation if there was one for this range.
    r.store.bookie.Fill(desc.RangeID)

    // Update the range descriptor. This is done last as this is the step that
    // makes the Replica visible in the Store.
    if err := r.setDesc(&desc); err != nil {
        panic(err)
    }

    switch typ {
    case normalSnapshot:
        r.store.metrics.rangeSnapshotsNormalApplied.Inc(1)
    case preemptiveSnapshot:
        r.store.metrics.rangeSnapshotsPreemptiveApplied.Inc(1)
    default:
        panic("not reached")
    }
    return lastIndex, nil
}
// applySnapshot updates the replica based on the given snapshot and associated
// HardState. The supplied HardState must be empty if a preemptive snapshot is
// being applied (which is the case if and only if the ReplicaID is zero), in
// which case it will be synthesized from any existing on-disk HardState
// appropriately. For a regular snapshot, a HardState may or may not be
// supplied, though in the common case it is (since the commit index changes as
// a result of the snapshot application, so Raft will supply us with one).
// The HardState, if altered or supplied, is persisted along with the applied
// snapshot and the new last index is returned.
//
// During preemptive snapshots, we (must) run additional safety checks. For
// example, the HardState, Raft's view of term, vote and committed log entries,
// and other Raft state (like acknowledged log entries) must not move backwards.
func (r *Replica) applySnapshot(
    snap raftpb.Snapshot, hs raftpb.HardState,
) (uint64, error) {
    // We use a separate batch to apply the snapshot since the Replica (and in
    // particular the last index) is updated after the batch commits. Using a
    // separate batch also allows for future optimization (such as using a
    // Distinct() batch).
    batch := r.store.Engine().NewBatch()
    defer batch.Close()

    snapData := roachpb.RaftSnapshotData{}
    err := proto.Unmarshal(snap.Data, &snapData)
    if err != nil {
        return 0, err
    }

    // Extract the updated range descriptor.
    desc := snapData.RangeDescriptor

    // Fill the reservation if there was one for this range, regardless of
    // whether the application succeeded.
    defer r.store.bookie.Fill(desc.RangeID)

    r.mu.Lock()
    replicaID := r.mu.replicaID
    raftLogSize := r.mu.raftLogSize
    r.mu.Unlock()

    isPreemptive := replicaID == 0

    replicaIDStr := "[?]"
    snapType := "preemptive"
    if !isPreemptive {
        replicaIDStr = strconv.FormatInt(int64(replicaID), 10)
        snapType = "Raft"
    }

    log.Infof("%s: with replicaID %s, applying %s snapshot for range %d at index %d "+
        "(encoded size=%d, %d KV pairs, %d log entries)",
        r, replicaIDStr, snapType, desc.RangeID, snap.Metadata.Index,
        len(snap.Data), len(snapData.KV), len(snapData.LogEntries))
    defer func(start time.Time) {
        log.Infof("%s: with replicaID %s, applied %s snapshot for range %d in %s",
            r, replicaIDStr, snapType, desc.RangeID, timeutil.Since(start))
    }(timeutil.Now())

    // Remember the old last index to verify that the snapshot doesn't wipe out
    // log entries which have been acknowledged, which is possible with
    // preemptive snapshots. We assert on it later in this call.
    oldLastIndex, err := loadLastIndex(batch, desc.RangeID)
    if err != nil {
        return 0, errors.Wrap(err, "error loading last index")
    }
    // Similar strategy for the HardState.
    oldHardState, err := loadHardState(batch, desc.RangeID)
    if err != nil {
        return 0, errors.Wrap(err, "unable to load HardState")
    }

    // Delete everything in the range and recreate it from the snapshot.
    // We need to delete any old Raft log entries here because any log entries
    // that predate the snapshot will be orphaned and never truncated or GC'd.
    iter := NewReplicaDataIterator(&desc, batch, false /* !replicatedOnly */)
    defer iter.Close()
    for ; iter.Valid(); iter.Next() {
        if err := batch.Clear(iter.Key()); err != nil {
            return 0, err
        }
    }

    // Determine the unreplicated key prefix so we can drop any
    // unreplicated keys from the snapshot.
    unreplicatedPrefix := keys.MakeRangeIDUnreplicatedPrefix(desc.RangeID)

    // Write the snapshot into the range.
    for _, kv := range snapData.KV {
        if bytes.HasPrefix(kv.Key, unreplicatedPrefix) {
            continue
        }
        mvccKey := engine.MVCCKey{
            Key:       kv.Key,
            Timestamp: kv.Timestamp,
        }
        if err := batch.Put(mvccKey, kv.Value); err != nil {
            return 0, err
        }
    }

    logEntries := make([]raftpb.Entry, len(snapData.LogEntries))
    for i, bytes := range snapData.LogEntries {
        if err := logEntries[i].Unmarshal(bytes); err != nil {
            return 0, err
        }
    }
    // Write the snapshot's Raft log into the range.
    _, raftLogSize, err = r.append(batch, 0, raftLogSize, logEntries)
    if err != nil {
        return 0, err
    }

    s, err := loadState(batch, &desc)
    if err != nil {
        return 0, err
    }

    // As outlined above, last and applied index are the same after applying
    // the snapshot (i.e. the snapshot has no uncommitted tail).
    if s.RaftAppliedIndex != snap.Metadata.Index {
        log.Fatalf("%s with state loaded from %d: snapshot resulted in appliedIndex=%d, metadataIndex=%d",
            r, s.Desc.RangeID, s.RaftAppliedIndex, snap.Metadata.Index)
    }

    if !raft.IsEmptyHardState(hs) {
        if isPreemptive {
            return 0, errors.Errorf("unexpected HardState %+v on preemptive snapshot", &hs)
        }
        if err := setHardState(batch, s.Desc.RangeID, hs); err != nil {
            return 0, errors.Wrapf(err, "unable to persist HardState %+v", &hs)
        }
    } else if isPreemptive {
        // Preemptive snapshots get special verifications (see #7619) of their
        // last index and (necessarily synthesized) HardState.
        if snap.Metadata.Index < oldLastIndex {
            // We are not aware of a specific way in which this could happen
            // (Raft itself should not emit such snapshots, and no Replica can
            // ever apply two preemptive snapshots), but it doesn't hurt to
            // check.
            return 0, errors.Errorf("%s: preemptive snapshot would erase acknowledged log entries", r)
        }
        if snap.Metadata.Term < oldHardState.Term {
            return 0, errors.Errorf("%s: cannot apply preemptive snapshot from past term %d at term %d",
                r, snap.Metadata.Term, oldHardState.Term)
        }
        if err := synthesizeHardState(batch, s, oldHardState); err != nil {
            return 0, errors.Wrapf(err, "%s: unable to write synthesized HardState", r)
        }
    } else {
        // Note that we don't require that Raft supply us with a nonempty
        // HardState on a snapshot. We don't want to make that assumption
        // because it's not guaranteed by the contract. Raft *must* send us
        // a HardState when it increases the committed index as a result of the
        // snapshot, but who is to say it isn't going to accept a snapshot
        // which is identical to the current state?
    }

    if err := batch.Commit(); err != nil {
        return 0, err
    }

    r.mu.Lock()
    // We set the persisted last index to the last applied index. This is
    // not a correctness issue, but means that we may have just transferred
    // some entries we're about to re-request from the leader and overwrite.
    // However, raft.MultiNode currently expects this behaviour, and the
    // performance implications are not likely to be drastic. If our
    // feelings about this ever change, we can add a LastIndex field to
    // raftpb.SnapshotMetadata.
    r.mu.lastIndex = s.RaftAppliedIndex
    r.mu.raftLogSize = raftLogSize
    // Update the range and store stats.
    r.store.metrics.subtractMVCCStats(r.mu.state.Stats)
    r.store.metrics.addMVCCStats(s.Stats)
    r.mu.state = s
    lastIndex := r.mu.lastIndex
    r.assertStateLocked(r.store.Engine())
    r.mu.Unlock()

    // Update other fields which are uninitialized or need updating.
    // This may not happen if the system config has not yet been loaded.
    // While config update will correctly set the fields, there is no order
    // guarantee in ApplySnapshot.
    // TODO: should go through the standard store lock when adding a replica.
    if err := r.updateRangeInfo(&desc); err != nil {
        panic(err)
    }

    // Update the range descriptor. This is done last as this is the step that
    // makes the Replica visible in the Store.
    if err := r.setDesc(&desc); err != nil {
        panic(err)
    }

    if !isPreemptive {
        r.store.metrics.rangeSnapshotsNormalApplied.Inc(1)
    } else {
        r.store.metrics.rangeSnapshotsPreemptiveApplied.Inc(1)
    }
    return lastIndex, nil
}
// applySnapshot updates the replica based on the given snapshot and associated
// HardState (which may be empty, as Raft may apply some snapshots which don't
// require an update to the HardState). All snapshots must pass through Raft
// for correctness, i.e. the parameters to this method must be taken from
// a raft.Ready. It is the caller's responsibility to call
// r.store.processRangeDescriptorUpdate(r) after a successful applySnapshot.
func (r *Replica) applySnapshot(
    ctx context.Context, snap raftpb.Snapshot, hs raftpb.HardState,
) error {
    // We use a separate batch to apply the snapshot since the Replica (and in
    // particular the last index) is updated after the batch commits. Using a
    // separate batch also allows for future optimization (such as using a
    // Distinct() batch).
    batch := r.store.Engine().NewBatch()
    defer batch.Close()

    snapData := roachpb.RaftSnapshotData{}
    err := proto.Unmarshal(snap.Data, &snapData)
    if err != nil {
        return err
    }

    // Extract the updated range descriptor.
    desc := snapData.RangeDescriptor

    // Fill the reservation if there was one for this range, regardless of
    // whether the application succeeded.
    defer r.store.bookie.Fill(desc.RangeID)

    r.mu.Lock()
    replicaID := r.mu.replicaID
    raftLogSize := r.mu.raftLogSize
    r.mu.Unlock()

    isPreemptive := replicaID == 0 // only used for accounting and log format

    replicaIDStr := "[?]"
    snapType := "preemptive"
    if !isPreemptive {
        replicaIDStr = strconv.FormatInt(int64(replicaID), 10)
        snapType = "Raft"
    }

    log.Infof(ctx, "%s: with replicaID %s, applying %s snapshot at index %d "+
        "(encoded size=%d, %d KV pairs, %d log entries)",
        r, replicaIDStr, snapType, snap.Metadata.Index,
        len(snap.Data), len(snapData.KV), len(snapData.LogEntries))
    defer func(start time.Time) {
        log.Infof(ctx, "%s: with replicaID %s, applied %s snapshot in %.3fs",
            r, replicaIDStr, snapType, timeutil.Since(start).Seconds())
    }(timeutil.Now())

    // Delete everything in the range and recreate it from the snapshot.
    // We need to delete any old Raft log entries here because any log entries
    // that predate the snapshot will be orphaned and never truncated or GC'd.
    //
    // The KVs and log entries are all written to distinct keys so we can use a
    // distinct batch. Note that we clear keys here that are potentially
    // overwritten below, which violates the spirit of the distinct batch. This
    // is safe because we don't do any reads until after the distinct batch is
    // closed (the raft log writes are "blind").
    distinctBatch := batch.Distinct()

    iter := NewReplicaDataIterator(&desc, distinctBatch, false /* !replicatedOnly */)
    defer iter.Close()
    for ; iter.Valid(); iter.Next() {
        if err := distinctBatch.Clear(iter.Key()); err != nil {
            return err
        }
    }

    // Determine the unreplicated key prefix so we can drop any
    // unreplicated keys from the snapshot.
    unreplicatedPrefix := keys.MakeRangeIDUnreplicatedPrefix(desc.RangeID)

    // Write the snapshot into the range.
    for _, kv := range snapData.KV {
        if bytes.HasPrefix(kv.Key, unreplicatedPrefix) {
            continue
        }
        mvccKey := engine.MVCCKey{
            Key:       kv.Key,
            Timestamp: kv.Timestamp,
        }
        if err := distinctBatch.Put(mvccKey, kv.Value); err != nil {
            return err
        }
    }

    logEntries := make([]raftpb.Entry, len(snapData.LogEntries))
    for i, bytes := range snapData.LogEntries {
        if err := logEntries[i].Unmarshal(bytes); err != nil {
            return err
        }
    }

    // Write the snapshot's Raft log into the range.
    _, raftLogSize, err = r.append(ctx, distinctBatch, 0, raftLogSize, logEntries)
    if err != nil {
        return err
    }

    if !raft.IsEmptyHardState(hs) {
        if err := setHardState(ctx, distinctBatch, r.RangeID, hs); err != nil {
            return errors.Wrapf(err, "unable to persist HardState %+v", &hs)
        }
    } else {
        // Note that we don't require that Raft supply us with a nonempty
        // HardState on a snapshot. We don't want to make that assumption
        // because it's not guaranteed by the contract. Raft *must* send us
        // a HardState when it increases the committed index as a result of the
        // snapshot, but who is to say it isn't going to accept a snapshot
        // which is identical to the current state?
    }

    // We need to close the distinct batch and start using the normal batch for
    // the read below.
    distinctBatch.Close()

    s, err := loadState(ctx, batch, &desc)
    if err != nil {
        return err
    }

    if s.Desc.RangeID != r.RangeID {
        log.Fatalf(ctx, "%s: unexpected range ID %d", r, s.Desc.RangeID)
    }

    // As outlined above, last and applied index are the same after applying
    // the snapshot (i.e. the snapshot has no uncommitted tail).
    if s.RaftAppliedIndex != snap.Metadata.Index {
        log.Fatalf(ctx, "%s: with state loaded from %d: snapshot resulted in appliedIndex=%d, metadataIndex=%d",
            r, s.Desc.RangeID, s.RaftAppliedIndex, snap.Metadata.Index)
    }

    if err := batch.Commit(); err != nil {
        return err
    }

    r.mu.Lock()
    // We set the persisted last index to the last applied index. This is
    // not a correctness issue, but means that we may have just transferred
    // some entries we're about to re-request from the leader and overwrite.
    // However, raft.MultiNode currently expects this behaviour, and the
    // performance implications are not likely to be drastic. If our
    // feelings about this ever change, we can add a LastIndex field to
    // raftpb.SnapshotMetadata.
    r.mu.lastIndex = s.RaftAppliedIndex
    r.mu.raftLogSize = raftLogSize
    // Update the range and store stats.
    r.store.metrics.subtractMVCCStats(r.mu.state.Stats)
    r.store.metrics.addMVCCStats(s.Stats)
    r.mu.state = s
    r.assertStateLocked(r.store.Engine())
    r.mu.Unlock()

    // As the last deferred action after committing the batch, update other
    // fields which are uninitialized or need updating. This may not happen
    // if the system config has not yet been loaded. While config update
    // will correctly set the fields, there is no order guarantee in
    // ApplySnapshot.
    // TODO: should go through the standard store lock when adding a replica.
    if err := r.updateRangeInfo(&desc); err != nil {
        panic(err)
    }

    r.setDescWithoutProcessUpdate(&desc)

    if !isPreemptive {
        r.store.metrics.RangeSnapshotsNormalApplied.Inc(1)
    } else {
        r.store.metrics.RangeSnapshotsPreemptiveApplied.Inc(1)
    }
    return nil
}
// Run a particular schema change and run some OLTP operations in parallel, as
// soon as the schema change starts executing its backfill.
func runSchemaChangeWithOperations(
    t *testing.T,
    sqlDB *gosql.DB,
    kvDB *client.DB,
    schemaChange string,
    maxValue int,
    keyMultiple int,
    backfillNotification chan bool,
) {
    tableDesc := sqlbase.GetTableDescriptor(kvDB, "t", "test")

    // Run the schema change in a separate goroutine.
    var wg sync.WaitGroup
    wg.Add(1)
    go func() {
        start := timeutil.Now()
        // Start schema change that eventually runs a backfill.
        if _, err := sqlDB.Exec(schemaChange); err != nil {
            t.Error(err)
        }
        t.Logf("schema change %s took %v", schemaChange, timeutil.Since(start))
        wg.Done()
    }()

    // Wait until the schema change backfill starts.
    <-backfillNotification

    // Run a variety of operations during the backfill.

    // Grabbing a schema change lease on the table will fail, disallowing
    // another schema change from being simultaneously executed.
    sc := csql.NewSchemaChangerForTesting(tableDesc.ID, 0, 0, *kvDB, nil)
    if l, err := sc.AcquireLease(); err == nil {
        t.Fatalf("schema change lease acquisition on table %d succeeded: %v", tableDesc.ID, l)
    }

    // Update some rows.
    var updatedKeys []int
    for i := 0; i < 10; i++ {
        k := rand.Intn(maxValue)
        v := maxValue + i + 1
        if _, err := sqlDB.Exec(`UPDATE t.test SET v = $2 WHERE k = $1`, k, v); err != nil {
            t.Fatal(err)
        }
        updatedKeys = append(updatedKeys, k)
    }

    // Restore the updated rows to their original values.
    for _, k := range updatedKeys {
        if _, err := sqlDB.Exec(`UPDATE t.test SET v = $2 WHERE k = $1`, k, maxValue-k); err != nil {
            t.Fatal(err)
        }
    }

    // Delete some rows.
    deleteStartKey := rand.Intn(maxValue - 10)
    for i := 0; i < 10; i++ {
        if _, err := sqlDB.Exec(`DELETE FROM t.test WHERE k = $1`, deleteStartKey+i); err != nil {
            t.Fatal(err)
        }
    }
    // Reinsert deleted rows.
    for i := 0; i < 10; i++ {
        k := deleteStartKey + i
        if _, err := sqlDB.Exec(`INSERT INTO t.test VALUES($1, $2)`, k, maxValue-k); err != nil {
            t.Fatal(err)
        }
    }

    // Insert some new rows.
    numInserts := 10
    for i := 0; i < numInserts; i++ {
        if _, err := sqlDB.Exec(`INSERT INTO t.test VALUES($1, $2)`, maxValue+i+1, maxValue+i+1); err != nil {
            t.Fatal(err)
        }
    }

    wg.Wait() // for schema change to complete.

    // Verify the number of keys left behind in the table to validate schema
    // change operations.
    tablePrefix := roachpb.Key(keys.MakeTablePrefix(uint32(tableDesc.ID)))
    tableEnd := tablePrefix.PrefixEnd()
    if kvs, err := kvDB.Scan(tablePrefix, tableEnd, 0); err != nil {
        t.Fatal(err)
    } else if e := keyMultiple * (maxValue + numInserts + 1); len(kvs) != e {
        t.Fatalf("expected %d key value pairs, but got %d", e, len(kvs))
    }

    // Delete the rows inserted.
    for i := 0; i < numInserts; i++ {
        if _, err := sqlDB.Exec(`DELETE FROM t.test WHERE k = $1`, maxValue+i+1); err != nil {
            t.Fatal(err)
        }
    }
}
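// The expected-count arithmetic in the verification above, worked through:
// each row in t.test contributes keyMultiple KV pairs (for example, one for
// the primary index plus one per index added by the schema change), and
// after the inserts the table holds rows with keys 0..maxValue plus
// numInserts new ones. The concrete values below are assumptions chosen
// purely for illustration:
func expectedKVsSketch() int {
    const maxValue, numInserts, keyMultiple = 200, 10, 2 // assumed values
    rows := maxValue + 1 + numInserts // keys 0..maxValue, plus the new inserts
    return keyMultiple * rows         // 2 * 211 = 422 KV pairs
}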
func testSingleKeyInner(t *testing.T, c cluster.Cluster, cfg cluster.TestConfig) {
    num := c.NumNodes()

    // Initialize the value for our test key to zero.
    const key = "test-key"
    initDB, initDBStopper := c.NewClient(t, 0)
    defer initDBStopper.Stop()
    if err := initDB.Put(key, 0); err != nil {
        t.Fatal(err)
    }

    type result struct {
        err        error
        maxLatency time.Duration
    }

    resultCh := make(chan result, num)
    deadline := timeutil.Now().Add(cfg.Duration)
    var expected int64

    // Start up num workers each reading and writing the same
    // key. Each worker is configured to talk to a different node in the
    // cluster.
    for i := 0; i < num; i++ {
        db, dbStopper := c.NewClient(t, i)
        defer dbStopper.Stop()
        go func() {
            var r result
            for timeutil.Now().Before(deadline) {
                start := timeutil.Now()
                err := db.Txn(func(txn *client.Txn) error {
                    minExp := atomic.LoadInt64(&expected)
                    r, err := txn.Get(key)
                    if err != nil {
                        return err
                    }
                    b := txn.NewBatch()
                    v := r.ValueInt()
                    b.Put(key, v+1)
                    err = txn.CommitInBatch(b)
                    // Atomic updates after the fact mean that we should read
                    // exp or larger (since concurrent writers might have
                    // committed but not yet performed their atomic update).
                    if err == nil && v < minExp {
                        return util.Errorf("unexpected read: %d, expected >= %d", v, minExp)
                    }
                    return err
                })
                if err != nil {
                    resultCh <- result{err: err}
                    return
                }
                atomic.AddInt64(&expected, 1)
                latency := timeutil.Since(start)
                if r.maxLatency < latency {
                    r.maxLatency = latency
                }
            }
            resultCh <- r
        }()
    }

    // Verify that none of the workers encountered an error.
    var results []result
    for len(results) < num {
        select {
        case <-stopper:
            t.Fatalf("interrupted")
        case r := <-resultCh:
            if r.err != nil {
                t.Fatal(r.err)
            }
            results = append(results, r)
        case <-time.After(1 * time.Second):
            // Periodically print out progress so that we know the test is still
            // running.
            log.Infof("%d", atomic.LoadInt64(&expected))
        }
    }

    // Verify the resulting value stored at the key is what we expect.
    r, err := initDB.Get(key)
    if err != nil {
        t.Fatal(err)
    }
    v := r.ValueInt()
    if expected != v {
        t.Fatalf("expected %d, but found %d", expected, v)
    }

    var maxLatency []time.Duration
    for _, r := range results {
        maxLatency = append(maxLatency, r.maxLatency)
    }
    log.Infof("%d increments: %s", v, maxLatency)
}
// applySnapshot updates the replica based on the given snapshot.
// Returns the new last index.
func (r *Replica) applySnapshot(batch engine.Engine, snap raftpb.Snapshot) (uint64, error) {
    snapData := roachpb.RaftSnapshotData{}
    err := proto.Unmarshal(snap.Data, &snapData)
    if err != nil {
        return 0, err
    }

    log.Infof("received snapshot for range %s at index %d. encoded size=%d, %d KV pairs, %d log entries",
        r.RangeID, snap.Metadata.Index, len(snap.Data), len(snapData.KV), len(snapData.LogEntries))
    defer func(start time.Time) {
        log.Infof("applied snapshot for range %s in %s", r.RangeID, timeutil.Since(start))
    }(timeutil.Now())

    rangeID := r.RangeID

    // First, save the HardState. The HardState must not be changed
    // because it may record a previous vote cast by this node. This is
    // usually unnecessary because a snapshot is nearly always
    // accompanied by a new HardState which incorporates both our former
    // state and new information from the leader, but in the event that
    // the HardState has not changed, we want to use our own previous
    // HardState and not one that was transmitted via the snapshot.
    hardStateKey := keys.RaftHardStateKey(rangeID)
    hardState, _, err := engine.MVCCGet(context.Background(), batch, hardStateKey,
        roachpb.ZeroTimestamp, true /* consistent */, nil)
    if err != nil {
        return 0, err
    }

    // Extract the updated range descriptor.
    desc := snapData.RangeDescriptor

    // Delete everything in the range and recreate it from the snapshot.
    // We need to delete any old Raft log entries here because any log entries
    // that predate the snapshot will be orphaned and never truncated or GC'd.
    iter := newReplicaDataIterator(&desc, batch, false /* !replicatedOnly */)
    defer iter.Close()
    for ; iter.Valid(); iter.Next() {
        if err := batch.Clear(iter.Key()); err != nil {
            return 0, err
        }
    }

    // Determine the unreplicated key prefix so we can drop any
    // unreplicated keys from the snapshot.
    unreplicatedPrefix := keys.MakeRangeIDUnreplicatedPrefix(desc.RangeID)

    // Write the snapshot into the range.
    for _, kv := range snapData.KV {
        if bytes.HasPrefix(kv.Key, unreplicatedPrefix) {
            continue
        }
        mvccKey := engine.MVCCKey{
            Key:       kv.Key,
            Timestamp: kv.Timestamp,
        }
        if err := batch.Put(mvccKey, kv.Value); err != nil {
            return 0, err
        }
    }

    logEntries := make([]raftpb.Entry, len(snapData.LogEntries))
    for i, bytes := range snapData.LogEntries {
        if err := logEntries[i].Unmarshal(bytes); err != nil {
            return 0, err
        }
    }

    // Write the snapshot's Raft log into the range.
    if _, err := r.append(batch, 0, logEntries); err != nil {
        return 0, err
    }

    // Restore the saved HardState.
    if hardState == nil {
        err := engine.MVCCDelete(context.Background(), batch, nil, hardStateKey,
            roachpb.ZeroTimestamp, nil)
        if err != nil {
            return 0, err
        }
    } else {
        err := engine.MVCCPut(context.Background(), batch, nil, hardStateKey,
            roachpb.ZeroTimestamp, *hardState, nil)
        if err != nil {
            return 0, err
        }
    }

    // Read the leader lease.
    lease, err := loadLeaderLease(batch, desc.RangeID)
    if err != nil {
        return 0, err
    }

    // Load updated range stats. The local newStats variable will be assigned
    // to r.stats after the batch commits.
    newStats, err := newRangeStats(desc.RangeID, batch)
    if err != nil {
        return 0, err
    }

    // The next line sets the persisted last index to the last applied index.
    // This is not a correctness issue, but means that we may have just
    // transferred some entries we're about to re-request from the leader and
    // overwrite.
    // However, raft.MultiNode currently expects this behaviour, and the
    // performance implications are not likely to be drastic. If our feelings
    // about this ever change, we can add a LastIndex field to
    // raftpb.SnapshotMetadata.
    if err := setLastIndex(batch, rangeID, snap.Metadata.Index); err != nil {
        return 0, err
    }

    batch.Defer(func() {
        // Update the range stats.
        r.stats.Replace(newStats)

        r.mu.Lock()
        // As outlined above, last and applied index are the same after applying
        // the snapshot.
        r.mu.appliedIndex = snap.Metadata.Index
        r.mu.leaderLease = lease
        r.mu.Unlock()

        // Update other fields which are uninitialized or need updating.
        // This may not happen if the system config has not yet been loaded.
        // While config update will correctly set the fields, there is no order
        // guarantee in ApplySnapshot.
        // TODO: should go through the standard store lock when adding a replica.
        if err := r.updateRangeInfo(&desc); err != nil {
            panic(err)
        }

        // Update the range descriptor. This is done last as this is the step that
        // makes the Replica visible in the Store.
        if err := r.setDesc(&desc); err != nil {
            panic(err)
        }
    })
    return snap.Metadata.Index, nil
}