// WaitForRebalance waits until there have been no recent range adds, removes,
// or splits. We wait until the cluster is balanced or until `StableInterval`
// elapses, whichever comes first. It then prints stats about the rebalancing
// process. If the replica count appears unbalanced, an error is returned.
//
// This method is crude but necessary. If we were to wait until range counts
// were just about even, we'd miss potential post-rebalance thrashing.
func (at *allocatorTest) WaitForRebalance(ctx context.Context, t *testing.T) error {
	const statsInterval = 20 * time.Second

	db, err := gosql.Open("postgres", at.f.PGUrl(ctx, 0))
	if err != nil {
		return err
	}
	defer func() {
		_ = db.Close()
	}()

	var statsTimer timeutil.Timer
	var assertTimer timeutil.Timer
	defer statsTimer.Stop()
	defer assertTimer.Stop()
	statsTimer.Reset(statsInterval)
	assertTimer.Reset(0)
	for {
		select {
		case <-statsTimer.C:
			statsTimer.Read = true
			stats, err := at.allocatorStats(db)
			if err != nil {
				return err
			}

			log.Info(ctx, stats)
			if StableInterval <= stats.ElapsedSinceLastEvent {
				host := at.f.Hostname(0)
				log.Infof(ctx, "replica count stddev = %f, max allowed = %f",
					stats.ReplicaCountStdDev, *flagATMaxStdDev)
				if stats.ReplicaCountStdDev > *flagATMaxStdDev {
					_ = at.printRebalanceStats(db, host)
					return errors.Errorf(
						"%s elapsed without changes, but replica count standard "+
							"deviation is %.2f (>%.2f)", stats.ElapsedSinceLastEvent,
						stats.ReplicaCountStdDev, *flagATMaxStdDev)
				}
				return at.printRebalanceStats(db, host)
			}

			statsTimer.Reset(statsInterval)
		case <-assertTimer.C:
			assertTimer.Read = true
			at.f.Assert(ctx, t)
			assertTimer.Reset(time.Minute)
		case <-stopper.ShouldStop():
			return errors.New("interrupted")
		}
	}
}
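// The functions collected here all share the same timeutil.Timer idiom: the
// timer is declared once, Stop is deferred, Reset is called before each wait,
// and Read is set to true after receiving from C so the next Reset knows the
// channel has already been drained. Below is a minimal sketch of that idiom;
// pollLoop, work, and stop are hypothetical names, not part of any of the
// packages shown in this section.
func pollLoop(interval time.Duration, stop <-chan struct{}, work func()) {
	var t timeutil.Timer
	defer t.Stop()
	for {
		// Reset must be called on every iteration; the Read flag tells the
		// timer whether the previous value on C was consumed.
		t.Reset(interval)
		select {
		case <-t.C:
			t.Read = true
			work()
		case <-stop:
			return
		}
	}
}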
func (ctx *Context) runHeartbeat(cc *grpc.ClientConn, remoteAddr string) error {
	request := PingRequest{
		Addr:           ctx.Addr,
		MaxOffsetNanos: ctx.localClock.MaxOffset().Nanoseconds(),
	}
	heartbeatClient := NewHeartbeatClient(cc)

	var heartbeatTimer timeutil.Timer
	defer heartbeatTimer.Stop()

	// Give the first iteration a wait-free heartbeat attempt.
	nextHeartbeat := 0 * time.Nanosecond
	for {
		heartbeatTimer.Reset(nextHeartbeat)
		select {
		case <-ctx.Stopper.ShouldStop():
			return nil
		case <-heartbeatTimer.C:
			heartbeatTimer.Read = true
		}

		sendTime := ctx.localClock.PhysicalTime()
		response, err := ctx.heartbeat(heartbeatClient, request)
		ctx.setConnHealthy(remoteAddr, err == nil)
		if err == nil {
			receiveTime := ctx.localClock.PhysicalTime()

			// Only update the clock offset measurement if we actually got a
			// successful response from the server.
			if pingDuration := receiveTime.Sub(sendTime); pingDuration > maximumPingDurationMult*ctx.localClock.MaxOffset() {
				request.Offset.Reset()
			} else {
				// Offset and error are measured using the remote clock reading
				// technique described in
				// http://se.inf.tu-dresden.de/pubs/papers/SRDS1994.pdf, page 6.
				// However, we assume that drift and min message delay are 0, for
				// now.
				request.Offset.MeasuredAt = receiveTime.UnixNano()
				request.Offset.Uncertainty = (pingDuration / 2).Nanoseconds()
				remoteTimeNow := time.Unix(0, response.ServerTime).Add(pingDuration / 2)
				request.Offset.Offset = remoteTimeNow.Sub(receiveTime).Nanoseconds()
			}
			ctx.RemoteClocks.UpdateOffset(remoteAddr, request.Offset)

			if cb := ctx.HeartbeatCB; cb != nil {
				cb()
			}
		}

		// If the heartbeat timed out, run the next one immediately. Otherwise,
		// wait out the heartbeat interval on the next iteration.
		if grpc.Code(err) == codes.DeadlineExceeded {
			nextHeartbeat = 0
		} else {
			nextHeartbeat = ctx.HeartbeatInterval
		}
	}
}
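// The offset arithmetic above can be read as a standalone computation: the
// server's clock is assumed to have been sampled roughly halfway through the
// round trip, so the best estimate of "remote now" is serverTime + RTT/2 and
// the uncertainty is RTT/2 (drift and minimum message delay are treated as
// zero, as the comment notes). estimateOffset is a hypothetical helper that
// mirrors the calculation in runHeartbeat; it is not part of the rpc package.
func estimateOffset(
	sendTime, receiveTime time.Time, serverTimeNanos int64,
) (offsetNanos, uncertaintyNanos int64) {
	rtt := receiveTime.Sub(sendTime)
	remoteNow := time.Unix(0, serverTimeNanos).Add(rtt / 2)
	return remoteNow.Sub(receiveTime).Nanoseconds(), (rtt / 2).Nanoseconds()
}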
// bootstrap connects the node to the gossip network. Bootstrapping
// commences in the event there are no connected clients or the
// sentinel gossip info is not available. After a successful bootstrap
// connection, this method will block on the stalled channel, which
// receives notifications that gossip network connectivity has been
// lost and requires re-bootstrapping.
func (g *Gossip) bootstrap() {
	g.server.stopper.RunWorker(func() {
		ctx := g.AnnotateCtx(context.Background())
		ctx = log.WithLogTag(ctx, "bootstrap", nil)
		var bootstrapTimer timeutil.Timer
		defer bootstrapTimer.Stop()
		for {
			if g.server.stopper.RunTask(func() {
				g.mu.Lock()
				defer g.mu.Unlock()
				haveClients := g.outgoing.len() > 0
				haveSentinel := g.mu.is.getInfo(KeySentinel) != nil
				log.Eventf(ctx, "have clients: %t, have sentinel: %t", haveClients, haveSentinel)
				if !haveClients || !haveSentinel {
					// Try to get another bootstrap address from the resolvers.
					if addr := g.getNextBootstrapAddress(); addr != nil {
						g.startClient(addr, g.NodeID.Get())
					} else {
						bootstrapAddrs := make([]string, 0, len(g.bootstrapping))
						for addr := range g.bootstrapping {
							bootstrapAddrs = append(bootstrapAddrs, addr)
						}
						log.Eventf(ctx, "no next bootstrap address; currently bootstrapping: %v", bootstrapAddrs)
						// We couldn't start a client; signal that we're stalled so
						// that we'll retry.
						g.maybeSignalStatusChangeLocked()
					}
				}
			}) != nil {
				return
			}

			// Pause an interval before next possible bootstrap.
			bootstrapTimer.Reset(g.bootstrapInterval)
			log.Eventf(ctx, "sleeping %s until bootstrap", g.bootstrapInterval)
			select {
			case <-bootstrapTimer.C:
				bootstrapTimer.Read = true
				// break
			case <-g.server.stopper.ShouldStop():
				return
			}

			log.Eventf(ctx, "idling until bootstrap required")
			// Block until we need bootstrapping again.
			select {
			case <-g.stalledCh:
				log.Eventf(ctx, "detected stall; commencing bootstrap")
				// break
			case <-g.server.stopper.ShouldStop():
				return
			}
		}
	})
}
// start will run continuously and expire old reservations.
func (b *bookie) start(stopper *stop.Stopper) {
	stopper.RunWorker(func() {
		var timeoutTimer timeutil.Timer
		defer timeoutTimer.Stop()
		ctx := context.TODO()
		for {
			var timeout time.Duration
			b.mu.Lock()
			nextExpiration := b.mu.queue.peek()
			if nextExpiration == nil {
				// No reservations to expire.
				timeout = b.reservationTimeout
			} else {
				now := b.clock.Now()
				if now.GoTime().After(nextExpiration.expireAt.GoTime()) {
					// We have a reservation expiration, remove it.
					expiredReservation := b.mu.queue.dequeue()
					// Is it an active reservation?
					if b.mu.reservationsByRangeID[expiredReservation.RangeID] == expiredReservation {
						b.fillReservationLocked(ctx, expiredReservation)
					} else if log.V(2) {
						log.Infof(ctx, "[r%d] expired reservation has already been filled",
							expiredReservation.RangeID)
					}
					// Set the timeout to 0 to force another peek.
					timeout = 0
				} else {
					timeout = nextExpiration.expireAt.GoTime().Sub(now.GoTime())
				}
			}
			b.mu.Unlock()
			timeoutTimer.Reset(timeout)
			select {
			case <-timeoutTimer.C:
				timeoutTimer.Read = true
			case <-stopper.ShouldStop():
				return
			}
		}
	})
}
// start will run continuously and mark stores as offline if they haven't been
// heard from in longer than timeUntilStoreDead.
func (sp *StorePool) start(stopper *stop.Stopper) {
	stopper.RunWorker(func() {
		var timeoutTimer timeutil.Timer
		defer timeoutTimer.Stop()
		for {
			var timeout time.Duration
			sp.mu.Lock()
			detail := sp.mu.queue.peek()
			if detail == nil {
				// No stores yet, wait the full timeout.
				timeout = sp.timeUntilStoreDead
			} else {
				// Check to see if the store should be marked as dead.
				deadAsOf := detail.lastUpdatedTime.GoTime().Add(sp.timeUntilStoreDead)
				now := sp.clock.Now()
				if now.GoTime().After(deadAsOf) {
					deadDetail := sp.mu.queue.dequeue()
					deadDetail.markDead(now)
					// The next store might be dead as well, set the timeout to
					// 0 to process it immediately.
					timeout = 0
				} else {
					// The store is still alive, so schedule the next check for
					// when it should time out.
					timeout = deadAsOf.Sub(now.GoTime())
				}
			}
			sp.mu.Unlock()
			timeoutTimer.Reset(timeout)
			select {
			case <-timeoutTimer.C:
				timeoutTimer.Read = true
			case <-stopper.ShouldStop():
				return
			}
		}
	})
}
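// bookie.start and StorePool.start above share the same structure: the next
// timer duration is derived from the head of an expiration queue. With an
// empty queue they wait a full default interval, with an already-expired head
// they use a zero timeout so the next iteration peeks again immediately, and
// otherwise they sleep exactly until the head expires. nextTimeout is a
// hypothetical distillation of that rule, not code from either package.
func nextTimeout(head *time.Time, now time.Time, defaultInterval time.Duration) time.Duration {
	if head == nil {
		// Nothing queued; re-check after the default interval.
		return defaultInterval
	}
	if now.After(*head) {
		// The head has expired; process it and peek again right away.
		return 0
	}
	// Sleep until the head is due.
	return head.Sub(now)
}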
// sendToReplicas sends one or more RPCs to clients specified by the
// slice of replicas. On success, it returns the first successful
// reply. If an error occurs which is not specific to a single
// replica, it's returned immediately. Otherwise, when all replicas
// have been tried and have failed, a send error is returned.
func (ds *DistSender) sendToReplicas(
	opts SendOptions,
	rangeID roachpb.RangeID,
	replicas ReplicaSlice,
	args roachpb.BatchRequest,
	rpcContext *rpc.Context,
) (*roachpb.BatchResponse, error) {
	if len(replicas) < 1 {
		return nil, roachpb.NewSendError(
			fmt.Sprintf("insufficient replicas (%d) to satisfy send request of %d",
				len(replicas), 1))
	}

	var ambiguousResult bool
	var haveCommit bool
	// We only check for committed txns, not aborts because aborts may
	// be retried without any risk of inconsistencies.
	if etArg, ok := args.GetArg(roachpb.EndTransaction); ok &&
		etArg.(*roachpb.EndTransactionRequest).Commit {
		haveCommit = true
	}
	done := make(chan BatchCall, len(replicas))

	transportFactory := opts.transportFactory
	if transportFactory == nil {
		transportFactory = grpcTransportFactory
	}
	transport, err := transportFactory(opts, rpcContext, replicas, args)
	if err != nil {
		return nil, err
	}
	defer transport.Close()
	if transport.IsExhausted() {
		return nil, roachpb.NewSendError(
			fmt.Sprintf("sending to all %d replicas failed", len(replicas)))
	}

	// Send the first request.
	pending := 1
	log.VEventf(opts.ctx, 2, "sending RPC for batch: %s", args.Summary())
	transport.SendNext(done)

	// Wait for completions. This loop will retry operations that fail
	// with errors that reflect per-replica state and may succeed on
	// other replicas.
	var sendNextTimer timeutil.Timer
	defer sendNextTimer.Stop()
	for {
		sendNextTimer.Reset(opts.SendNextTimeout)
		select {
		case <-sendNextTimer.C:
			sendNextTimer.Read = true
			// On successive RPC timeouts, send to additional replicas if available.
			if !transport.IsExhausted() {
				log.VEventf(opts.ctx, 2, "timeout, trying next peer")
				pending++
				transport.SendNext(done)
			}

		case call := <-done:
			pending--
			err := call.Err
			if err == nil {
				if log.V(2) {
					log.Infof(opts.ctx, "RPC reply: %s", call.Reply)
				} else if log.V(1) && call.Reply.Error != nil {
					log.Infof(opts.ctx, "application error: %s", call.Reply.Error)
				}

				if call.Reply.Error == nil {
					return call.Reply, nil
				} else if !ds.handlePerReplicaError(opts.ctx, transport, rangeID, call.Reply.Error) {
					// The error received is not specific to this replica, so we
					// should return it instead of trying other replicas. However,
					// if we're trying to commit a transaction and there are
					// still other RPCs outstanding or an ambiguous RPC error
					// was already received, we must return an ambiguous commit
					// error instead of the returned error.
					if haveCommit && (pending > 0 || ambiguousResult) {
						return nil, roachpb.NewAmbiguousResultError()
					}
					return call.Reply, nil
				}

				// Extract the detail so it can be included in the error
				// message if this is our last replica.
				//
				// TODO(bdarnell): The last error is not necessarily the best
				// one to return; we may want to remember the "best" error
				// we've seen (for example, a NotLeaseHolderError conveys more
				// information than a RangeNotFound).
				err = call.Reply.Error.GoError()
			} else {
				if log.V(1) {
					log.Warningf(opts.ctx, "RPC error: %s", err)
				}
				// All connection errors, except for an unavailable node (this
				// is GRPC's fail-fast error), may mean that the request
				// succeeded on the remote server, but we were unable to
				// receive the reply. Set the ambiguous commit flag.
				//
				// We retry ambiguous commit batches to avoid returning the
				// unrecoverable AmbiguousResultError. This is safe because
				// repeating an already-successfully applied batch is
				// guaranteed to return either a TransactionReplayError (in
				// case the replay happens at the original leader), or a
				// TransactionRetryError (in case the replay happens at a new
				// leader). If the original attempt merely timed out or was
				// lost, then the batch will succeed and we can be assured the
				// commit was applied just once.
				//
				// The Unavailable code is used by GRPC to indicate that a
				// request fails fast and is not sent, so we can be sure there
				// is no ambiguity on these errors. Note that these are common
				// if a node is down.
				// See https://github.com/grpc/grpc-go/blob/52f6504dc290bd928a8139ba94e3ab32ed9a6273/call.go#L182
				// See https://github.com/grpc/grpc-go/blob/52f6504dc290bd928a8139ba94e3ab32ed9a6273/stream.go#L158
				if haveCommit && grpc.Code(err) != codes.Unavailable {
					ambiguousResult = true
				}
			}

			// Send to additional replicas if available.
			if !transport.IsExhausted() {
				log.VEventf(opts.ctx, 2, "error, trying next peer: %s", err)
				pending++
				transport.SendNext(done)
			}
			if pending == 0 {
				if ambiguousResult {
					err = roachpb.NewAmbiguousResultError()
				} else {
					err = roachpb.NewSendError(
						fmt.Sprintf("sending to all %d replicas failed; last error: %v",
							len(replicas), err),
					)
				}
				if log.V(2) {
					log.ErrEvent(opts.ctx, err.Error())
				}
				return nil, err
			}
		}
	}
}
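// The ambiguity rule used by sendToReplicas can be summarized in isolation:
// for a batch carrying a commit, any RPC error other than gRPC's fail-fast
// Unavailable code may mean the request was applied even though the reply was
// lost, so the result must be treated as ambiguous. isPossiblyAmbiguous is a
// hypothetical helper expressing that check; the DistSender performs it
// inline rather than through such a function.
func isPossiblyAmbiguous(haveCommit bool, err error) bool {
	// Unavailable means the request was never sent, so there is no ambiguity.
	return haveCommit && err != nil && grpc.Code(err) != codes.Unavailable
}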
// Run performs a continuous load test with the given parameters, checking the
// health of the cluster regularly. Any failure of a CockroachDB node or load
// generator constitutes a test failure. The test runs for the duration given
// by the `test.timeout` flag, minus the time it takes to reliably tear down
// the test cluster.
func (cl continuousLoadTest) Run(ctx context.Context, t testing.TB) {
	f := MakeFarmer(t, cl.Prefix+cl.shortTestTimeout(), stopper)
	// If the timeout flag was set, calculate an appropriate lower timeout by
	// subtracting expected cluster creation and teardown times to allow for
	// proper shutdown at the end of the test.
	if fl := flag.Lookup("test.timeout"); fl != nil {
		timeout, err := time.ParseDuration(fl.Value.String())
		if err != nil {
			t.Fatal(err)
		}
		// We've empirically observed 6-7 minute teardown times. We set aside a
		// larger duration to account for outliers and setup time.
		if setupTeardownDuration := 10 * time.Minute; timeout <= setupTeardownDuration {
			t.Fatalf("test.timeout must be greater than create/destroy interval %s", setupTeardownDuration)
		} else {
			var cancel context.CancelFunc
			ctx, cancel = context.WithTimeout(ctx, timeout-setupTeardownDuration)
			defer cancel()
		}
	}
	if deadline, ok := ctx.Deadline(); ok {
		log.Infof(ctx, "load test will end at %s", deadline)
	} else {
		log.Infof(ctx, "load test will run indefinitely")
	}

	// Start cluster and load.
	defer func() {
		if r := recover(); r != nil {
			t.Errorf("recovered from panic to destroy cluster: %v", r)
		}
		f.MustDestroy(t)
	}()
	f.AddVars["benchmark_name"] = cl.BenchmarkPrefix + cl.shortTestTimeout()
	if cl.CockroachDiskSizeGB != 0 {
		f.AddVars["cockroach_disk_size"] = strconv.Itoa(cl.CockroachDiskSizeGB)
	}
	if err := f.Resize(cl.NumNodes); err != nil {
		t.Fatal(err)
	}
	CheckGossip(ctx, t, f, longWaitTime, HasPeers(cl.NumNodes))
	start := timeutil.Now()
	if err := f.StartLoad(ctx, cl.Process, *flagCLTWriters); err != nil {
		t.Fatal(err)
	}
	f.Assert(ctx, t)

	// Run load, checking the health of the cluster periodically.
	const healthCheckInterval = 2 * time.Minute
	var healthCheckTimer timeutil.Timer
	healthCheckTimer.Reset(healthCheckInterval)
	lastQueryCount := 0.0
	lastHealthCheck := timeutil.Now()
	for {
		select {
		case <-healthCheckTimer.C:
			// Check that all nodes are up and that queries are being processed.
			healthCheckTimer.Read = true
			f.Assert(ctx, t)
			queryCount, err := cl.queryCount(f)
			if err != nil {
				t.Fatal(err)
			}
			if lastQueryCount == queryCount {
				t.Fatalf("no queries in the last %s", healthCheckInterval)
			}
			qps := (queryCount - lastQueryCount) / timeutil.Now().Sub(lastHealthCheck).Seconds()
			if qps < *flagCLTMinQPS {
				t.Fatalf("qps (%.1f) < min (%.1f)", qps, *flagCLTMinQPS)
			}
			lastQueryCount = queryCount
			lastHealthCheck = timeutil.Now()
			healthCheckTimer.Reset(healthCheckInterval)
			log.Infof(ctx, "health check ok %s (%.1f qps)", timeutil.Now().Sub(start), qps)
		case <-ctx.Done():
			log.Infof(ctx, "load test finished")
			f.Assert(ctx, t)
			if err := f.Stop(ctx, 0, cl.Process); err != nil {
				t.Error(err)
			}
			return
		case <-stopper.ShouldStop():
			t.Fatal("interrupted")
		}
	}
}
// printStatsLoop blocks and periodically logs transaction statistics
// (throughput, success rates, durations, ...). Note that this only captures
// write txns, since read-only txns are stateless as far as TxnCoordSender is
// concerned.
func (tc *TxnCoordSender) printStatsLoop(ctx context.Context) {
	res := time.Millisecond // for duration logging resolution
	var statusLogTimer timeutil.Timer
	defer statusLogTimer.Stop()
	scale := metric.Scale1M
	for {
		statusLogTimer.Reset(statusLogInterval)
		select {
		case <-statusLogTimer.C:
			statusLogTimer.Read = true
			// Take a snapshot of metrics. There's some chance of skew, since the
			// snapshots are not done atomically, but that should be fine for
			// these debug stats.
			metrics := tc.metrics
			durations, durationsWindow := metrics.Durations.Windowed()
			restarts, restartsWindow := metrics.Restarts.Windowed()
			if restartsWindow != durationsWindow {
				log.Fatalf(ctx, "misconfigured windowed histograms: %s != %s",
					restartsWindow, durationsWindow,
				)
			}
			commitRate := metrics.Commits.Rates[scale].Value()
			commit1PCRate := metrics.Commits1PC.Rates[scale].Value()
			abortRate := metrics.Aborts.Rates[scale].Value()
			abandonRate := metrics.Abandons.Rates[scale].Value()

			// Show transaction stats over the last minute. Maybe this should
			// be shorter in the future. We'll revisit if we get sufficient
			// feedback.
			totalRate := commitRate + abortRate + abandonRate

			var pCommitted, pCommitted1PC, pAbandoned, pAborted float64
			if totalRate > 0 {
				pCommitted = 100 * (commitRate / totalRate)
				pCommitted1PC = 100 * (commit1PCRate / totalRate)
				pAborted = 100 * (abortRate / totalRate)
				pAbandoned = 100 * (abandonRate / totalRate)
			}

			dMean := durations.Mean()
			dDev := durations.StdDev()
			dMax := durations.Max()
			rMean := restarts.Mean()
			rDev := restarts.StdDev()
			rMax := restarts.Max()
			num := durations.TotalCount()

			// We could skip calculating everything if !log.V(1) but we want to
			// make sure the code above doesn't silently break.
			if log.V(1) {
				log.Infof(ctx,
					"txn coordinator: %.2f txn/sec, %.2f/%.2f/%.2f/%.2f %%cmmt/cmmt1pc/abrt/abnd, "+
						"%s/%s/%s avg/σ/max duration, %.1f/%.1f/%d avg/σ/max restarts (%d samples over %s)",
					totalRate, pCommitted, pCommitted1PC, pAborted, pAbandoned,
					util.TruncateDuration(time.Duration(dMean), res),
					util.TruncateDuration(time.Duration(dDev), res),
					util.TruncateDuration(time.Duration(dMax), res),
					rMean, rDev, rMax, num, restartsWindow,
				)
			}
		case <-tc.stopper.ShouldStop():
			return
		}
	}
}
// processQueue opens a Raft client stream and sends messages from the
// designated queue (ch) via that stream, exiting when an error is received or
// when it idles out. All messages remaining in the queue at that point are
// lost and a new instance of processQueue will be started by the next message
// to be sent.
func (t *RaftTransport) processQueue(
	nodeID roachpb.NodeID,
	ch chan *RaftMessageRequest,
	stats *raftTransportStats,
	stream MultiRaft_RaftMessageBatchClient,
) error {
	errCh := make(chan error, 1)
	// Starting workers in a task prevents data races during shutdown.
	if err := t.rpcContext.Stopper.RunTask(func() {
		t.rpcContext.Stopper.RunWorker(func() {
			ctx := t.AnnotateCtx(context.Background())
			errCh <- func() error {
				for {
					resp, err := stream.Recv()
					if err != nil {
						return err
					}
					atomic.AddInt64(&stats.clientRecv, 1)
					t.recvMu.Lock()
					handler, ok := t.recvMu.handlers[resp.ToReplica.StoreID]
					t.recvMu.Unlock()
					if !ok {
						log.Warningf(ctx, "no handler found for store %s in response %s",
							resp.ToReplica.StoreID, resp)
						continue
					}
					if err := handler.HandleRaftResponse(stream.Context(), resp); err != nil {
						return err
					}
				}
			}()
		})
	}); err != nil {
		return err
	}

	var raftIdleTimer timeutil.Timer
	defer raftIdleTimer.Stop()
	batch := &RaftMessageRequestBatch{}
	for {
		raftIdleTimer.Reset(raftIdleTimeout)
		select {
		case <-t.rpcContext.Stopper.ShouldStop():
			return nil
		case <-raftIdleTimer.C:
			raftIdleTimer.Read = true
			return nil
		case err := <-errCh:
			return err
		case req := <-ch:
			batch.Requests = append(batch.Requests, *req)

			// Pull off as many queued requests as possible.
			//
			// TODO(peter): Think about limiting the size of the batch we send.
			for done := false; !done; {
				select {
				case req = <-ch:
					batch.Requests = append(batch.Requests, *req)
				default:
					done = true
				}
			}

			err := stream.Send(batch)
			batch.Requests = batch.Requests[:0]
			atomic.AddInt64(&stats.clientSent, 1)
			if err != nil {
				return err
			}
		}
	}
}
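// The inner loop of processQueue batches requests by draining the channel
// with a non-blocking select until a receive would block. drainInto is a
// hypothetical helper showing that pattern on its own; the RaftTransport code
// above does the same thing inline.
func drainInto(batch *RaftMessageRequestBatch, ch <-chan *RaftMessageRequest) {
	for {
		select {
		case req := <-ch:
			batch.Requests = append(batch.Requests, *req)
		default:
			// No more requests are immediately available; stop draining.
			return
		}
	}
}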
// Run performs a continuous load test with the given parameters, checking the
// health of the cluster regularly. Any failure of a CockroachDB node or load
// generator constitutes a test failure. The test runs for the duration given
// by the `test.timeout` flag, minus the time it takes to reliably tear down
// the test cluster.
func (cl continuousLoadTest) Run(t *testing.T) {
	f := farmer(t, cl.Prefix+cl.shortTestTimeout())
	ctx, err := WithClusterTimeout(context.Background())
	if err != nil {
		t.Fatal(err)
	}
	if deadline, ok := ctx.Deadline(); ok {
		log.Infof(ctx, "load test will end at %s", deadline)
	} else {
		log.Infof(ctx, "load test will run indefinitely")
	}

	// Start cluster and load.
	defer func() {
		if r := recover(); r != nil {
			t.Errorf("recovered from panic to destroy cluster: %v", r)
		}
		f.MustDestroy(t)
	}()
	f.AddVars["benchmark_name"] = cl.BenchmarkPrefix + cl.shortTestTimeout()
	if cl.CockroachDiskSizeGB != 0 {
		f.AddVars["cockroach_disk_size"] = strconv.Itoa(cl.CockroachDiskSizeGB)
	}
	if err := f.Resize(cl.NumNodes); err != nil {
		t.Fatal(err)
	}
	checkGossip(t, f, longWaitTime, hasPeers(cl.NumNodes))
	start := timeutil.Now()
	if err := cl.startLoad(f); err != nil {
		t.Fatal(err)
	}
	cl.assert(t, f)

	// Run load, checking the health of the cluster periodically.
	const healthCheckInterval = 2 * time.Minute
	var healthCheckTimer timeutil.Timer
	healthCheckTimer.Reset(healthCheckInterval)
	lastQueryCount := 0.0
	lastHealthCheck := timeutil.Now()
	for {
		select {
		case <-healthCheckTimer.C:
			// Check that all nodes are up and that queries are being processed.
			healthCheckTimer.Read = true
			cl.assert(t, f)
			queryCount, err := cl.queryCount(f)
			if err != nil {
				t.Fatal(err)
			}
			if lastQueryCount == queryCount {
				t.Fatalf("no queries in the last %s", healthCheckInterval)
			}
			qps := (queryCount - lastQueryCount) / timeutil.Now().Sub(lastHealthCheck).Seconds()
			if qps < *flagCLTMinQPS {
				t.Fatalf("qps (%.1f) < min (%.1f)", qps, *flagCLTMinQPS)
			}
			lastQueryCount = queryCount
			lastHealthCheck = timeutil.Now()
			healthCheckTimer.Reset(healthCheckInterval)
			log.Infof(ctx, "health check ok %s (%.1f qps)", timeutil.Now().Sub(start), qps)
		case <-ctx.Done():
			log.Infof(ctx, "load test finished")
			cl.assert(t, f)
			if err := f.Stop(0, cl.Process); err != nil {
				t.Error(err)
			}
			return
		case <-stopper:
			t.Fatal("interrupted")
		}
	}
}