// WaitForRebalance waits until there's been no recent range adds, removes, and
// splits. We wait until the cluster is balanced or until `StableInterval`
// elapses, whichever comes first. Then, it prints stats about the rebalancing
// process.
//
// This method is crude but necessary. If we were to wait until range counts
// were just about even, we'd miss potential post-rebalance thrashing.
func (at *allocatorTest) WaitForRebalance() error {
	db, err := gosql.Open("postgres", at.f.PGUrl(1))
	if err != nil {
		return err
	}
	defer func() {
		_ = db.Close()
	}()

	var timer timeutil.Timer
	defer timer.Stop()
	timer.Reset(0)
	for {
		select {
		case <-timer.C:
			timer.Read = true
			stable, err := at.checkAllocatorStable(db)
			if err != nil {
				return err
			}
			if stable {
				host := at.f.Nodes()[0]
				if err := at.printRebalanceStats(db, host, 8080); err != nil {
					return err
				}
				return nil
			}
		case <-stopper:
			return errors.New("interrupted")
		}
		timer.Reset(10 * time.Second)
	}
}
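// Every snippet in this collection follows the same timeutil.Timer idiom:
// Reset is called immediately before each select, and Read is set to true
// whenever the timer's channel fires so that the next Reset (or the deferred
// Stop) knows the channel has already been drained. Below is a minimal,
// hedged sketch of that loop; the pollLoop name, the work callback, and the
// stop channel are illustrative assumptions, not part of the code above. It
// assumes the usual imports (time, plus the cockroach util/timeutil package).
func pollLoop(interval time.Duration, work func(), stop <-chan struct{}) {
	var t timeutil.Timer
	defer t.Stop()
	for {
		// Re-arm the timer at the top of every iteration, mirroring the
		// Reset-before-select pattern used throughout these snippets.
		t.Reset(interval)
		select {
		case <-t.C:
			// Mark the channel as read so the next Reset does not try to
			// drain an already-consumed value.
			t.Read = true
			work()
		case <-stop:
			return
		}
	}
}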
// processQueue creates a client and sends messages from its designated queue
// via that client, exiting when the client fails or when it idles out. All
// messages remaining in the queue at that point are lost and a new instance of
// processQueue should be started by the next message to be sent.
// TODO(tschottdorf) should let raft know if the node is down;
// need a feedback mechanism for that. Potentially easiest is to arrange for
// the next call to Send() to fail appropriately.
func (t *RaftTransport) processQueue(ch chan *RaftMessageRequest, toReplica roachpb.ReplicaDescriptor) error {
	addr, err := t.resolver(toReplica.NodeID)
	if err != nil {
		return err
	}
	conn, err := t.rpcContext.GRPCDial(addr.String())
	if err != nil {
		return err
	}
	client := NewMultiRaftClient(conn)
	ctx, cancel := context.WithCancel(context.TODO())
	defer cancel()
	stream, err := client.RaftMessage(ctx)
	if err != nil {
		return err
	}

	errCh := make(chan error, 1)

	// Starting workers in a task prevents data races during shutdown.
	if err := t.rpcContext.Stopper.RunTask(func() {
		t.rpcContext.Stopper.RunWorker(func() {
			errCh <- stream.RecvMsg(new(RaftMessageResponse))
		})
	}); err != nil {
		return err
	}

	var raftIdleTimer timeutil.Timer
	defer raftIdleTimer.Stop()
	for {
		raftIdleTimer.Reset(raftIdleTimeout)
		select {
		case <-t.rpcContext.Stopper.ShouldStop():
			return nil
		case <-raftIdleTimer.C:
			raftIdleTimer.Read = true
			return nil
		case err := <-errCh:
			return err
		case req := <-ch:
			err := stream.Send(req)
			if req.Message.Type == raftpb.MsgSnap {
				select {
				case <-t.rpcContext.Stopper.ShouldStop():
					return nil
				case t.SnapshotStatusChan <- RaftSnapshotStatus{req, err}:
				}
			}
			if err != nil {
				return err
			}
		}
	}
}
func (ctx *Context) runHeartbeat(cc *grpc.ClientConn, remoteAddr string) error {
	request := PingRequest{Addr: ctx.Addr}
	heartbeatClient := NewHeartbeatClient(cc)

	var heartbeatTimer timeutil.Timer
	defer heartbeatTimer.Stop()
	for {
		// If we should stop, return immediately. Note that we check this
		// at the beginning and end of the loop because we may 'continue'
		// before reaching the end.
		select {
		case <-ctx.Stopper.ShouldStop():
			return nil
		default:
		}
		sendTime := ctx.localClock.PhysicalTime()
		response, err := ctx.heartbeat(heartbeatClient, request)
		ctx.setConnHealthy(remoteAddr, err == nil)
		if err != nil {
			if grpc.Code(err) == codes.DeadlineExceeded {
				continue
			}
			return err
		}
		receiveTime := ctx.localClock.PhysicalTime()

		// Only update the clock offset measurement if we actually got a
		// successful response from the server.
		if pingDuration := receiveTime.Sub(sendTime); pingDuration > maximumPingDurationMult*ctx.localClock.MaxOffset() {
			request.Offset.Reset()
		} else {
			// Offset and error are measured using the remote clock reading
			// technique described in
			// http://se.inf.tu-dresden.de/pubs/papers/SRDS1994.pdf, page 6.
			// However, we assume that drift and min message delay are 0, for
			// now.
			request.Offset.MeasuredAt = receiveTime.UnixNano()
			request.Offset.Uncertainty = (pingDuration / 2).Nanoseconds()
			remoteTimeNow := time.Unix(0, response.ServerTime).Add(pingDuration / 2)
			request.Offset.Offset = remoteTimeNow.Sub(receiveTime).Nanoseconds()
		}
		ctx.RemoteClocks.UpdateOffset(remoteAddr, request.Offset)

		if cb := ctx.HeartbeatCB; cb != nil {
			cb()
		}

		// Wait after the heartbeat so that the first iteration gets a wait-free
		// heartbeat attempt.
		heartbeatTimer.Reset(ctx.HeartbeatInterval)
		select {
		case <-ctx.Stopper.ShouldStop():
			return nil
		case <-heartbeatTimer.C:
			heartbeatTimer.Read = true
		}
	}
}
// startStats blocks and periodically logs transaction statistics (throughput,
// success rates, durations, ...). Note that this only captures write txns,
// since read-only txns are stateless as far as TxnCoordSender is concerned.
func (tc *TxnCoordSender) startStats() {
	res := time.Millisecond // for duration logging resolution
	var statusLogTimer timeutil.Timer
	defer statusLogTimer.Stop()
	scale := metric.Scale1M
	for {
		statusLogTimer.Reset(statusLogInterval)
		select {
		case <-statusLogTimer.C:
			statusLogTimer.Read = true
			if !log.V(1) {
				continue
			}

			// Take a snapshot of metrics. There's some chance of skew, since the snapshots are
			// not done atomically, but that should be fine for these debug stats.
			metrics := tc.metrics
			durations := metrics.Durations[scale].Current()
			restarts := metrics.Restarts.Current()
			commitRate := metrics.Commits.Rates[scale].Value()
			commit1PCRate := metrics.Commits1PC.Rates[scale].Value()
			abortRate := metrics.Aborts.Rates[scale].Value()
			abandonRate := metrics.Abandons.Rates[scale].Value()

			// Show transaction stats over the last minute. Maybe this should be shorter in the future.
			// We'll revisit if we get sufficient feedback.
			totalRate := commitRate + abortRate + abandonRate
			var pCommitted, pCommitted1PC, pAbandoned, pAborted float64
			if totalRate > 0 {
				pCommitted = 100 * (commitRate / totalRate)
				pCommitted1PC = 100 * (commit1PCRate / totalRate)
				pAborted = 100 * (abortRate / totalRate)
				pAbandoned = 100 * (abandonRate / totalRate)
			}

			dMean := durations.Mean()
			dDev := durations.StdDev()
			dMax := durations.Max()
			rMean := restarts.Mean()
			rDev := restarts.StdDev()
			rMax := restarts.Max()
			num := durations.TotalCount()

			log.Infof(tc.ctx,
				"txn coordinator: %.2f txn/sec, %.2f/%.2f/%.2f/%.2f %%cmmt/cmmt1pc/abrt/abnd, %s/%s/%s avg/σ/max duration, %.1f/%.1f/%d avg/σ/max restarts (%d samples)",
				totalRate, pCommitted, pCommitted1PC, pAborted, pAbandoned,
				util.TruncateDuration(time.Duration(dMean), res),
				util.TruncateDuration(time.Duration(dDev), res),
				util.TruncateDuration(time.Duration(dMax), res),
				rMean, rDev, rMax, num,
			)
		case <-tc.stopper.ShouldStop():
			return
		}
	}
}
func (ctx *Context) runHeartbeat(cc *grpc.ClientConn, remoteAddr string) error {
	request := PingRequest{Addr: ctx.Addr}
	heartbeatClient := NewHeartbeatClient(cc)

	var heartbeatTimer timeutil.Timer
	defer heartbeatTimer.Stop()

	// Give the first iteration a wait-free heartbeat attempt.
	nextHeartbeat := 0 * time.Nanosecond
	for {
		heartbeatTimer.Reset(nextHeartbeat)
		select {
		case <-ctx.Stopper.ShouldStop():
			return nil
		case <-heartbeatTimer.C:
			heartbeatTimer.Read = true
		}

		sendTime := ctx.localClock.PhysicalTime()
		response, err := ctx.heartbeat(heartbeatClient, request)
		ctx.setConnHealthy(remoteAddr, err == nil)
		if err == nil {
			receiveTime := ctx.localClock.PhysicalTime()

			// Only update the clock offset measurement if we actually got a
			// successful response from the server.
			if pingDuration := receiveTime.Sub(sendTime); pingDuration > maximumPingDurationMult*ctx.localClock.MaxOffset() {
				request.Offset.Reset()
			} else {
				// Offset and error are measured using the remote clock reading
				// technique described in
				// http://se.inf.tu-dresden.de/pubs/papers/SRDS1994.pdf, page 6.
				// However, we assume that drift and min message delay are 0, for
				// now.
				request.Offset.MeasuredAt = receiveTime.UnixNano()
				request.Offset.Uncertainty = (pingDuration / 2).Nanoseconds()
				remoteTimeNow := time.Unix(0, response.ServerTime).Add(pingDuration / 2)
				request.Offset.Offset = remoteTimeNow.Sub(receiveTime).Nanoseconds()
			}
			ctx.RemoteClocks.UpdateOffset(remoteAddr, request.Offset)

			if cb := ctx.HeartbeatCB; cb != nil {
				cb()
			}
		}

		// If the heartbeat timed out, run the next one immediately. Otherwise,
		// wait out the heartbeat interval on the next iteration.
		if grpc.Code(err) == codes.DeadlineExceeded {
			nextHeartbeat = 0
		} else {
			nextHeartbeat = ctx.HeartbeatInterval
		}
	}
}
// bootstrap connects the node to the gossip network. Bootstrapping
// commences in the event there are no connected clients or the
// sentinel gossip info is not available. After a successful bootstrap
// connection, this method will block on the stalled condvar, which
// receives notifications that gossip network connectivity has been
// lost and requires re-bootstrapping.
func (g *Gossip) bootstrap() {
	g.server.stopper.RunWorker(func() {
		ctx := log.WithLogTag(g.ctx, "bootstrap", nil)
		var bootstrapTimer timeutil.Timer
		defer bootstrapTimer.Stop()
		for {
			if g.server.stopper.RunTask(func() {
				g.mu.Lock()
				defer g.mu.Unlock()
				haveClients := g.outgoing.len() > 0
				haveSentinel := g.mu.is.getInfo(KeySentinel) != nil
				log.Eventf(ctx, "have clients: %t, have sentinel: %t", haveClients, haveSentinel)
				if !haveClients || !haveSentinel {
					// Try to get another bootstrap address from the resolvers.
					if addr := g.getNextBootstrapAddress(); addr != nil {
						g.startClient(addr, g.mu.is.NodeID)
					} else {
						bootstrapAddrs := make([]string, 0, len(g.bootstrapping))
						for addr := range g.bootstrapping {
							bootstrapAddrs = append(bootstrapAddrs, addr)
						}
						log.Eventf(ctx, "no next bootstrap address; currently bootstrapping: %v", bootstrapAddrs)
						// We couldn't start a client, signal that we're stalled so that
						// we'll retry.
						g.maybeSignalStatusChangeLocked()
					}
				}
			}) != nil {
				return
			}

			// Pause an interval before next possible bootstrap.
			bootstrapTimer.Reset(g.bootstrapInterval)
			log.Eventf(ctx, "sleeping %s until bootstrap", g.bootstrapInterval)
			select {
			case <-bootstrapTimer.C:
				bootstrapTimer.Read = true
				// break
			case <-g.server.stopper.ShouldStop():
				return
			}

			log.Eventf(ctx, "idling until bootstrap required")
			// Block until we need bootstrapping again.
			select {
			case <-g.stalledCh:
				log.Eventf(ctx, "detected stall; commencing bootstrap")
				// break
			case <-g.server.stopper.ShouldStop():
				return
			}
		}
	})
}
// WaitForRebalance waits until there's been no recent range adds, removes, and
// splits. We wait until the cluster is balanced or until `StableInterval`
// elapses, whichever comes first. Then, it prints stats about the rebalancing
// process. If the replica count appears unbalanced, an error is returned.
//
// This method is crude but necessary. If we were to wait until range counts
// were just about even, we'd miss potential post-rebalance thrashing.
func (at *allocatorTest) WaitForRebalance(t *testing.T) error {
	const statsInterval = 20 * time.Second

	db, err := gosql.Open("postgres", at.f.PGUrl(0))
	if err != nil {
		return err
	}
	defer func() {
		_ = db.Close()
	}()

	var statsTimer timeutil.Timer
	var assertTimer timeutil.Timer
	defer statsTimer.Stop()
	defer assertTimer.Stop()
	statsTimer.Reset(statsInterval)
	assertTimer.Reset(0)
	for {
		select {
		case <-statsTimer.C:
			statsTimer.Read = true
			stats, err := at.allocatorStats(db)
			if err != nil {
				return err
			}

			log.Info(context.Background(), stats)
			if StableInterval <= stats.ElapsedSinceLastEvent {
				host := at.f.Nodes()[0]
				log.Infof(context.Background(), "replica count = %f, max = %f", stats.ReplicaCountStdDev, *flagATMaxStdDev)
				if stats.ReplicaCountStdDev > *flagATMaxStdDev {
					_ = at.printRebalanceStats(db, host)
					return errors.Errorf(
						"%s elapsed without changes, but replica count standard "+
							"deviation is %.2f (>%.2f)", stats.ElapsedSinceLastEvent,
						stats.ReplicaCountStdDev, *flagATMaxStdDev)
				}
				return at.printRebalanceStats(db, host)
			}

			statsTimer.Reset(statsInterval)
		case <-assertTimer.C:
			assertTimer.Read = true
			at.f.Assert(t)
			assertTimer.Reset(time.Minute)
		case <-stopper:
			return errors.New("interrupted")
		}
	}
}
// bootstrap connects the node to the gossip network. Bootstrapping
// commences in the event there are no connected clients or the
// sentinel gossip info is not available. After a successful bootstrap
// connection, this method will block on the stalled condvar, which
// receives notifications that gossip network connectivity has been
// lost and requires re-bootstrapping.
func (g *Gossip) bootstrap() {
	stopper := g.server.stopper

	stopper.RunWorker(func() {
		var bootstrapTimer timeutil.Timer
		defer bootstrapTimer.Stop()
		for {
			if stopper.RunTask(func() {
				g.mu.Lock()
				defer g.mu.Unlock()
				haveClients := g.outgoing.len() > 0
				haveSentinel := g.is.getInfo(KeySentinel) != nil
				if !haveClients || !haveSentinel {
					// Try to get another bootstrap address from the resolvers.
					if addr := g.getNextBootstrapAddress(); addr != nil {
						g.startClient(addr, stopper)
					} else {
						// We couldn't start a client, signal that we're stalled so that
						// we'll retry.
						g.maybeSignalStalledLocked()
					}
				}
			}) != nil {
				return
			}

			// Pause an interval before next possible bootstrap.
			bootstrapTimer.Reset(g.bootstrapInterval)
			select {
			case <-bootstrapTimer.C:
				bootstrapTimer.Read = true
				// break
			case <-stopper.ShouldStop():
				return
			}

			// Block until we need bootstrapping again.
			select {
			case <-g.stalled:
				// break
			case <-stopper.ShouldStop():
				return
			}
		}
	})
}
// start will run continuously and expire old reservations.
func (b *bookie) start(stopper *stop.Stopper) {
	stopper.RunWorker(func() {
		var timeoutTimer timeutil.Timer
		defer timeoutTimer.Stop()
		for {
			var timeout time.Duration
			b.mu.Lock()
			nextExpiration := b.mu.queue.peek()
			if nextExpiration == nil {
				// No reservations to expire.
				timeout = b.reservationTimeout
			} else {
				now := b.clock.Now()
				if now.GoTime().After(nextExpiration.expireAt.GoTime()) {
					// We have a reservation expiration, remove it.
					expiredReservation := b.mu.queue.dequeue()
					// Is it an active reservation?
					if b.mu.reservationsByRangeID[expiredReservation.RangeID] == expiredReservation {
						b.fillReservationLocked(expiredReservation)
					} else if log.V(2) {
						log.Infof(context.TODO(), "the reservation for rangeID %d has already been filled.",
							expiredReservation.RangeID)
					}
					// Set the timeout to 0 to force another peek.
					timeout = 0
				} else {
					timeout = nextExpiration.expireAt.GoTime().Sub(now.GoTime())
				}
			}
			b.mu.Unlock()
			timeoutTimer.Reset(timeout)
			select {
			case <-timeoutTimer.C:
				timeoutTimer.Read = true
			case <-stopper.ShouldStop():
				return
			}
		}
	})
}
// start will run continuously and mark stores as offline if they haven't been
// heard from in longer than timeUntilStoreDead.
func (sp *StorePool) start(stopper *stop.Stopper) {
	stopper.RunWorker(func() {
		var timeoutTimer timeutil.Timer
		defer timeoutTimer.Stop()
		for {
			var timeout time.Duration
			sp.mu.Lock()
			detail := sp.mu.queue.peek()
			if detail == nil {
				// No stores yet, wait the full timeout.
				timeout = sp.timeUntilStoreDead
			} else {
				// Check to see if the store should be marked as dead.
				deadAsOf := detail.lastUpdatedTime.GoTime().Add(sp.timeUntilStoreDead)
				now := sp.clock.Now()
				if now.GoTime().After(deadAsOf) {
					deadDetail := sp.mu.queue.dequeue()
					deadDetail.markDead(now)
					// The next store might be dead as well, set the timeout to
					// 0 to process it immediately.
					timeout = 0
				} else {
					// Store is still alive, schedule the next check for when
					// it should timeout.
					timeout = deadAsOf.Sub(now.GoTime())
				}
			}
			sp.mu.Unlock()
			timeoutTimer.Reset(timeout)
			select {
			case <-timeoutTimer.C:
				timeoutTimer.Read = true
			case <-stopper.ShouldStop():
				return
			}
		}
	})
}
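// Both expiration loops above (bookie.start and StorePool.start) derive the
// timer duration from a deadline rather than a fixed interval: sleep until the
// head of the queue expires, and use a zero timeout to force an immediate
// re-peek after processing an item. A hedged, stripped-down sketch of that
// calculation follows; the deadlines slice, the idleWait parameter, and the
// nextTimeout name are illustrative assumptions, not part of the code above.
func nextTimeout(deadlines []time.Time, now time.Time, idleWait time.Duration) time.Duration {
	if len(deadlines) == 0 {
		// Nothing queued: wait a full idle interval before checking again.
		return idleWait
	}
	if !now.Before(deadlines[0]) {
		// The head of the queue has already expired; the caller should process
		// it and reset its timer to 0 so it peeks again right away.
		return 0
	}
	// Otherwise sleep exactly until the head of the queue expires.
	return deadlines[0].Sub(now)
}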
// sendToReplicas sends one or more RPCs to clients specified by the slice of
// replicas. On success, Send returns the first successful reply. Otherwise,
// Send returns an error if and as soon as the number of failed RPCs exceeds
// the available endpoints less the number of required replies.
func (ds *DistSender) sendToReplicas(
	opts SendOptions,
	rangeID roachpb.RangeID,
	replicas ReplicaSlice,
	args roachpb.BatchRequest,
	rpcContext *rpc.Context,
) (*roachpb.BatchResponse, error) {
	if len(replicas) < 1 {
		return nil, roachpb.NewSendError(
			fmt.Sprintf("insufficient replicas (%d) to satisfy send request of %d",
				len(replicas), 1))
	}

	done := make(chan BatchCall, len(replicas))

	transportFactory := opts.transportFactory
	if transportFactory == nil {
		transportFactory = grpcTransportFactory
	}
	transport, err := transportFactory(opts, rpcContext, replicas, args)
	if err != nil {
		return nil, err
	}
	defer transport.Close()
	if transport.IsExhausted() {
		return nil, roachpb.NewSendError(
			fmt.Sprintf("sending to all %d replicas failed", len(replicas)))
	}

	// Send the first request.
	pending := 1
	transport.SendNext(done)

	// Wait for completions. This loop will retry operations that fail
	// with errors that reflect per-replica state and may succeed on
	// other replicas.
	var sendNextTimer timeutil.Timer
	defer sendNextTimer.Stop()
	for {
		sendNextTimer.Reset(opts.SendNextTimeout)
		select {
		case <-sendNextTimer.C:
			sendNextTimer.Read = true
			// On successive RPC timeouts, send to additional replicas if available.
			if !transport.IsExhausted() {
				log.Trace(opts.Context, "timeout, trying next peer")
				pending++
				transport.SendNext(done)
			}

		case call := <-done:
			pending--
			err := call.Err
			if err == nil {
				if log.V(2) {
					log.Infof(opts.Context, "RPC reply: %+v", call.Reply)
				} else if log.V(1) && call.Reply.Error != nil {
					log.Infof(opts.Context, "application error: %s", call.Reply.Error)
				}

				if !ds.handlePerReplicaError(rangeID, call.Reply.Error) {
					return call.Reply, nil
				}

				// Extract the detail so it can be included in the error
				// message if this is our last replica.
				//
				// TODO(bdarnell): The last error is not necessarily the best
				// one to return; we may want to remember the "best" error
				// we've seen (for example, a NotLeaseHolderError conveys more
				// information than a RangeNotFound).
				err = call.Reply.Error.GoError()
			} else if log.V(1) {
				log.Warningf(opts.Context, "RPC error: %s", err)
			}

			// Send to additional replicas if available.
			if !transport.IsExhausted() {
				log.Tracef(opts.Context, "error, trying next peer: %s", err)
				pending++
				transport.SendNext(done)
			}
			if pending == 0 {
				return nil, roachpb.NewSendError(
					fmt.Sprintf("sending to all %d replicas failed; last error: %v",
						len(replicas), err))
			}
		}
	}
}
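// sendToReplicas above is a speculative fan-out: if no reply arrives within
// SendNextTimeout, the request is also sent to the next replica while earlier
// attempts stay pending, and the first successful reply wins. Below is a
// hedged sketch of that shape over a generic list of attempts; the names
// (firstSuccess, tries, perTryTimeout) are illustrative assumptions, not the
// DistSender API, and tries is assumed to be non-empty.
func firstSuccess(tries []func() (string, error), perTryTimeout time.Duration) (string, error) {
	type result struct {
		s   string
		err error
	}
	done := make(chan result, len(tries))
	next := 0
	send := func() {
		f := tries[next]
		next++
		go func() {
			s, err := f()
			done <- result{s, err}
		}()
	}
	// Kick off the first attempt; further attempts are hedged in below.
	send()
	pending := 1

	var t timeutil.Timer
	defer t.Stop()
	for {
		t.Reset(perTryTimeout)
		select {
		case <-t.C:
			t.Read = true
			// Timed out waiting: also try the next candidate, if any remain,
			// without abandoning the attempts already in flight.
			if next < len(tries) {
				pending++
				send()
			}
		case r := <-done:
			pending--
			if r.err == nil {
				return r.s, nil
			}
			// Failure: move on to the next candidate, if any remain.
			if next < len(tries) {
				pending++
				send()
			}
			if pending == 0 {
				return "", r.err
			}
		}
	}
}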
// processQueue opens a Raft client stream and sends messages from the
// designated queue (ch) via that stream, exiting when an error is received or
// when it idles out. All messages remaining in the queue at that point are
// lost and a new instance of processQueue will be started by the next message
// to be sent.
func (t *RaftTransport) processQueue(
	nodeID roachpb.NodeID,
	ch chan *RaftMessageRequest,
	stats *raftTransportStats,
	stream MultiRaft_RaftMessageClient,
) error {
	errCh := make(chan error, 1)

	// Starting workers in a task prevents data races during shutdown.
	if err := t.rpcContext.Stopper.RunTask(func() {
		t.rpcContext.Stopper.RunWorker(func() {
			errCh <- func() error {
				for {
					resp, err := stream.Recv()
					if err != nil {
						return err
					}
					atomic.AddInt64(&stats.clientRecv, 1)
					t.mu.Lock()
					handler, ok := t.mu.handlers[resp.ToReplica.StoreID]
					t.mu.Unlock()
					if !ok {
						log.Warningf(context.TODO(), "no handler found for store %s in response %s",
							resp.ToReplica.StoreID, resp)
						continue
					}
					handler.HandleRaftResponse(stream.Context(), resp)
				}
			}()
		})
	}); err != nil {
		return err
	}

	var raftIdleTimer timeutil.Timer
	defer raftIdleTimer.Stop()
	for {
		raftIdleTimer.Reset(raftIdleTimeout)
		select {
		case <-t.rpcContext.Stopper.ShouldStop():
			return nil
		case <-raftIdleTimer.C:
			raftIdleTimer.Read = true
			return nil
		case err := <-errCh:
			return err
		case req := <-ch:
			err := stream.Send(req)
			atomic.AddInt64(&stats.clientSent, 1)
			if req.Message.Type == raftpb.MsgSnap {
				select {
				case <-t.rpcContext.Stopper.ShouldStop():
					return nil
				case t.SnapshotStatusChan <- RaftSnapshotStatus{req, err}:
				}
			}
			if err != nil {
				return err
			}
		}
	}
}
// processQueue creates a client and sends messages from its designated queue
// via that client, exiting when the client fails or when it idles out. All
// messages remaining in the queue at that point are lost and a new instance of
// processQueue should be started by the next message to be sent.
// TODO(tschottdorf) should let raft know if the node is down;
// need a feedback mechanism for that. Potentially easiest is to arrange for
// the next call to Send() to fail appropriately.
func (t *RaftTransport) processQueue(nodeID roachpb.NodeID) {
	t.mu.Lock()
	ch, ok := t.mu.queues[nodeID]
	t.mu.Unlock()
	if !ok {
		return
	}
	// Clean-up when the loop below shuts down.
	defer func() {
		t.mu.Lock()
		delete(t.mu.queues, nodeID)
		t.mu.Unlock()
	}()

	addr, err := t.resolver(nodeID)
	if err != nil {
		if log.V(1) {
			log.Errorf("failed to get address for node %d: %s", nodeID, err)
		}
		return
	}
	if log.V(1) {
		log.Infof("dialing node %d at %s", nodeID, addr)
	}
	conn, err := t.rpcContext.GRPCDial(addr.String())
	if err != nil {
		if log.V(1) {
			log.Errorf("failed to dial: %s", err)
		}
		return
	}
	client := NewMultiRaftClient(conn)
	ctx, cancel := context.WithCancel(context.TODO())
	defer cancel()
	if log.V(1) {
		log.Infof("establishing Raft transport stream to node %d at %s", nodeID, addr)
	}

	// We start two streams; one will be used for snapshots, the other for all
	// other traffic. This is done to prevent snapshots from blocking other
	// traffic.
	streams := make([]MultiRaft_RaftMessageClient, 2)
	for i := range streams {
		stream, err := client.RaftMessage(ctx)
		if err != nil {
			if log.V(1) {
				log.Errorf("failed to establish Raft transport stream to node %d at %s: %s", nodeID, addr, err)
			}
			return
		}
		streams[i] = stream
	}

	errCh := make(chan error, len(streams))

	// Starting workers in a task prevents data races during shutdown.
	t.rpcContext.Stopper.RunTask(func() {
		for i := range streams {
			// Avoid closing over a `range` binding.
			stream := streams[i]
			t.rpcContext.Stopper.RunWorker(func() {
				// NB: only one error will ever be read from this channel. That's fine,
				// given that the channel is buffered to the maximum number of errors
				// that will be written to it.
				errCh <- stream.RecvMsg(new(RaftMessageResponse))
			})
		}
	})

	snapStream := streams[0]
	restStream := streams[1]

	var raftIdleTimer timeutil.Timer
	defer raftIdleTimer.Stop()
	for {
		raftIdleTimer.Reset(raftIdleTimeout)
		select {
		case <-t.rpcContext.Stopper.ShouldStop():
			return
		case <-raftIdleTimer.C:
			raftIdleTimer.Read = true
			if log.V(1) {
				log.Infof("closing Raft transport to %d at %s due to inactivity", nodeID, addr)
			}
			return
		case err := <-errCh:
			if log.V(1) {
				if err != nil {
					log.Infof("remote node %d at %s closed Raft transport with error: %s", nodeID, addr, err)
				} else {
					log.Infof("remote node %d at %s closed Raft transport", nodeID, addr)
				}
			}
			return
		case req := <-ch:
			if req.Message.Type == raftpb.MsgSnap {
				t.rpcContext.Stopper.RunAsyncTask(func() {
					err := snapStream.Send(req)
					if err != nil {
						log.Errorf("failed to send Raft snapshot to node %d at %s: %s", nodeID, addr, err)
					} else if log.V(1) {
						log.Infof("successfully sent a Raft snapshot to node %d at %s", nodeID, addr)
					}
					t.SnapshotStatusChan <- RaftSnapshotStatus{req, err}
				})
			} else {
				if err := restStream.Send(req); err != nil {
					log.Error(err)
					return
				}
			}
		}
	}
}