// startStats blocks and periodically logs transaction statistics (throughput,
// success rates, durations, ...). Note that this only captures write txns,
// since read-only txns are stateless as far as TxnCoordSender is concerned.
func (tc *TxnCoordSender) startStats() {
	res := time.Millisecond // for duration logging resolution
	var statusLogTimer util.Timer
	defer statusLogTimer.Stop()
	scale := metric.Scale1M
	for {
		statusLogTimer.Reset(statusLogInterval)
		select {
		case <-statusLogTimer.C:
			statusLogTimer.Read = true
			if !log.V(1) {
				continue
			}

			// Take a snapshot of metrics. There's some chance of skew, since the
			// snapshots are not done atomically, but that should be fine for
			// these debug stats.
			metrics := tc.metrics
			durations := metrics.Durations[scale].Current()
			restarts := metrics.Restarts.Current()
			commitRate := metrics.Commits.Rates[scale].Value()
			commit1PCRate := metrics.Commits1PC.Rates[scale].Value()
			abortRate := metrics.Aborts.Rates[scale].Value()
			abandonRate := metrics.Abandons.Rates[scale].Value()

			// Show transaction stats over the last minute. Maybe this should be
			// shorter in the future. We'll revisit if we get sufficient feedback.
			totalRate := commitRate + abortRate + abandonRate
			var pCommitted, pCommitted1PC, pAbandoned, pAborted float64
			if totalRate > 0 {
				pCommitted = 100 * (commitRate / totalRate)
				pCommitted1PC = 100 * (commit1PCRate / totalRate)
				pAborted = 100 * (abortRate / totalRate)
				pAbandoned = 100 * (abandonRate / totalRate)
			}

			dMean := durations.Mean()
			dDev := durations.StdDev()
			dMax := durations.Max()
			rMean := restarts.Mean()
			rDev := restarts.StdDev()
			rMax := restarts.Max()
			num := durations.TotalCount()

			log.Infof(
				"txn coordinator: %.2f txn/sec, %.2f/%.2f/%.2f/%.2f %%cmmt/cmmt1pc/abrt/abnd, %s/%s/%s avg/σ/max duration, %.1f/%.1f/%d avg/σ/max restarts (%d samples)",
				totalRate, pCommitted, pCommitted1PC, pAborted, pAbandoned,
				util.TruncateDuration(time.Duration(dMean), res),
				util.TruncateDuration(time.Duration(dDev), res),
				util.TruncateDuration(time.Duration(dMax), res),
				rMean, rDev, rMax, num,
			)
		case <-tc.stopper.ShouldStop():
			return
		}
	}
}
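// Every loop in these excerpts follows the same util.Timer idiom: Reset the timer
// at the top of each iteration, and set Read = true whenever its channel fires so
// that the next Reset knows the channel has already been drained. The function
// below is a minimal, hypothetical sketch of that pattern, not code from any of
// these packages; doWork and interval are placeholders, and the Timer semantics
// are assumed to match CockroachDB's util Timer (Read must be set to true after
// reading from C).
func timerLoopSketch(stopper *stop.Stopper, interval time.Duration, doWork func()) {
	var t util.Timer
	defer t.Stop()
	for {
		t.Reset(interval)
		select {
		case <-t.C:
			// Mark the channel as read so the next Reset does not attempt to
			// drain an already-consumed value.
			t.Read = true
			doWork()
		case <-stopper.ShouldStop():
			return
		}
	}
}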
func (ctx *Context) runHeartbeat(cc *grpc.ClientConn, remoteAddr string) error {
	request := PingRequest{Addr: ctx.localAddr}
	heartbeatClient := NewHeartbeatClient(cc)

	var heartbeatTimer util.Timer
	defer heartbeatTimer.Stop()
	for {
		sendTime := ctx.localClock.PhysicalTime()
		response, err := ctx.heartbeat(heartbeatClient, request)
		if err != nil {
			if grpc.Code(err) == codes.DeadlineExceeded {
				continue
			}
			return err
		}
		receiveTime := ctx.localClock.PhysicalTime()

		// Only update the clock offset measurement if we actually got a
		// successful response from the server.
		if pingDuration := receiveTime.Sub(sendTime); pingDuration > maximumPingDurationMult*ctx.localClock.MaxOffset() {
			request.Offset.Reset()
		} else {
			// Offset and error are measured using the remote clock reading
			// technique described in
			// http://se.inf.tu-dresden.de/pubs/papers/SRDS1994.pdf, page 6.
			// However, we assume that drift and min message delay are 0, for
			// now.
			request.Offset.MeasuredAt = receiveTime.UnixNano()
			request.Offset.Uncertainty = (pingDuration / 2).Nanoseconds()
			remoteTimeNow := time.Unix(0, response.ServerTime).Add(pingDuration / 2)
			request.Offset.Offset = remoteTimeNow.Sub(receiveTime).Nanoseconds()
		}
		ctx.RemoteClocks.UpdateOffset(remoteAddr, request.Offset)

		if cb := ctx.HeartbeatCB; cb != nil {
			cb()
		}

		// Wait after the heartbeat so that the first iteration gets a wait-free
		// heartbeat attempt.
		heartbeatTimer.Reset(ctx.HeartbeatInterval)
		select {
		case <-ctx.Stopper.ShouldStop():
			return nil
		case <-heartbeatTimer.C:
			heartbeatTimer.Read = true
		}
	}
}
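// The offset math above can be read in isolation: the remote clock is sampled
// once, and the round-trip time bounds how far off that sample can be. The helper
// below is a hypothetical, self-contained restatement of the formula used in both
// runHeartbeat variants (the name and signature are illustrative, not part of the
// rpc package). Given the local send/receive times and the server's reported
// time, it returns the estimated offset of the remote clock and the uncertainty
// of that estimate, both in nanoseconds.
func estimateOffsetSketch(sendTime, receiveTime time.Time, serverTimeNanos int64) (offset, uncertainty int64) {
	pingDuration := receiveTime.Sub(sendTime)
	// Assume the response was generated halfway through the round trip.
	remoteTimeNow := time.Unix(0, serverTimeNanos).Add(pingDuration / 2)
	offset = remoteTimeNow.Sub(receiveTime).Nanoseconds()
	// The true remote reading could fall anywhere within the round trip, so the
	// estimate is uncertain by half the ping duration.
	uncertainty = (pingDuration / 2).Nanoseconds()
	return offset, uncertainty
}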
func (ctx *Context) runHeartbeat(cc *grpc.ClientConn, remoteAddr string) error {
	request := PingRequest{Addr: ctx.localAddr}
	heartbeatClient := NewHeartbeatClient(cc)

	var heartbeatTimer util.Timer
	defer heartbeatTimer.Stop()
	for {
		sendTime := ctx.localClock.PhysicalNow()
		goCtx, cancel := context.WithTimeout(context.Background(), ctx.HeartbeatTimeout)
		response, err := heartbeatClient.Ping(goCtx, &request)
		// Release the timeout context's resources on every path; deferring
		// inside the loop would accumulate deferred calls.
		cancel()
		if err != nil {
			return err
		}
		receiveTime := ctx.localClock.PhysicalNow()

		// Only update the clock offset measurement if we actually got a
		// successful response from the server.
		if receiveTime > sendTime+maximumClockReadingDelay.Nanoseconds() {
			request.Offset.Reset()
		} else {
			// Offset and error are measured using the remote clock reading
			// technique described in
			// http://se.inf.tu-dresden.de/pubs/papers/SRDS1994.pdf, page 6.
			// However, we assume that drift and min message delay are 0, for
			// now.
			request.Offset.MeasuredAt = receiveTime
			request.Offset.Uncertainty = (receiveTime - sendTime) / 2
			remoteTimeNow := response.ServerTime + request.Offset.Uncertainty
			request.Offset.Offset = remoteTimeNow - receiveTime
			ctx.RemoteClocks.UpdateOffset(remoteAddr, request.Offset)
		}

		// Wait after the heartbeat so that the first iteration gets a wait-free
		// heartbeat attempt.
		heartbeatTimer.Reset(ctx.HeartbeatInterval)
		select {
		case <-ctx.Stopper.ShouldStop():
			return nil
		case <-heartbeatTimer.C:
			heartbeatTimer.Read = true
		}
	}
}
// MonitorRemoteOffsets periodically checks that the offset of this server's
// clock from the true cluster time is within MaxOffset. If the offset exceeds
// MaxOffset, then this method will trigger a fatal error, causing the node to
// suicide.
func (r *RemoteClockMonitor) MonitorRemoteOffsets(stopper *stop.Stopper) error {
	if log.V(1) {
		log.Infof("monitoring cluster offset every %s", r.monitorInterval)
	}
	var monitorTimer util.Timer
	defer monitorTimer.Stop()
	for {
		monitorTimer.Reset(r.monitorInterval)
		select {
		case <-stopper.ShouldStop():
			return nil
		case <-monitorTimer.C:
			monitorTimer.Read = true
			offsetInterval, err := r.findOffsetInterval()
			// By the contract of the hlc, if the value is 0, then safety checking
			// of the max offset is disabled. However we may still want to
			// propagate the information to a status node.
			// TODO(embark): once there is a framework for collecting timeseries
			// data about the db, propagate the offset status to that.
			if maxOffset := r.clock.MaxOffset(); maxOffset != 0 {
				if err != nil {
					return util.Errorf("clock offset could not be determined: %s", err)
				}

				if !isHealthyOffsetInterval(offsetInterval, maxOffset) {
					return util.Errorf(
						"clock offset is in interval: %s, which indicates that the true offset is greater than the max offset: %s",
						offsetInterval, maxOffset,
					)
				}
				if log.V(1) {
					log.Infof("healthy cluster offset: %s", offsetInterval)
				}
			}
			r.metrics.clusterOffsetLowerBound.Update(int64(offsetInterval.lowerbound))
			r.metrics.clusterOffsetUpperBound.Update(int64(offsetInterval.upperbound))

			r.mu.Lock()
			r.mu.lastMonitoredAt = r.clock.PhysicalTime()
			r.mu.Unlock()
		}
	}
}
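// isHealthyOffsetInterval is called above but not shown here. A minimal sketch of
// the check it plausibly performs follows: an interval of possible true offsets is
// "healthy" only if it lies entirely within (-maxOffset, +maxOffset), since any
// interval that pokes outside that range admits a true offset beyond the bound.
// The offsetIntervalSketch type, its field names, and the function name are
// illustrative assumptions, not the package's actual definitions.
type offsetIntervalSketch struct {
	lowerbound, upperbound int64 // nanoseconds
}

func isHealthyOffsetIntervalSketch(i offsetIntervalSketch, maxOffset time.Duration) bool {
	return i.lowerbound > -maxOffset.Nanoseconds() && i.upperbound < maxOffset.Nanoseconds()
}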
// bootstrap connects the node to the gossip network. Bootstrapping
// commences in the event there are no connected clients or the
// sentinel gossip info is not available. After a successful bootstrap
// connection, this method will block on the stalled condvar, which
// receives notifications that gossip network connectivity has been
// lost and requires re-bootstrapping.
func (g *Gossip) bootstrap() {
	stopper := g.server.stopper

	stopper.RunWorker(func() {
		var bootstrapTimer util.Timer
		defer bootstrapTimer.Stop()
		for {
			stopper.RunTask(func() {
				g.mu.Lock()
				defer g.mu.Unlock()
				haveClients := g.outgoing.len() > 0
				haveSentinel := g.is.getInfo(KeySentinel) != nil
				if !haveClients || !haveSentinel {
					// Try to get another bootstrap address from the resolvers.
					if addr := g.getNextBootstrapAddress(); addr != nil {
						g.startClient(addr, stopper)
					} else {
						// We couldn't start a client, signal that we're stalled
						// so that we'll retry.
						g.maybeSignalStalledLocked()
					}
				}
			})

			// Pause an interval before next possible bootstrap.
			bootstrapTimer.Reset(g.bootstrapInterval)
			select {
			case <-bootstrapTimer.C:
				bootstrapTimer.Read = true
				// break
			case <-stopper.ShouldStop():
				return
			}

			// Block until we need bootstrapping again.
			select {
			case <-g.stalled:
				// break
			case <-stopper.ShouldStop():
				return
			}
		}
	})
}
// start will run continuously and mark stores as offline if they haven't been
// heard from in longer than timeUntilStoreDead.
func (sp *StorePool) start(stopper *stop.Stopper) {
	stopper.RunWorker(func() {
		var timeoutTimer util.Timer
		defer timeoutTimer.Stop()
		for {
			var timeout time.Duration
			sp.mu.Lock()
			detail := sp.queue.peek()
			if detail == nil {
				// No stores yet, wait the full timeout.
				timeout = sp.timeUntilStoreDead
			} else {
				// Check to see if the store should be marked as dead.
				deadAsOf := detail.lastUpdatedTime.GoTime().Add(sp.timeUntilStoreDead)
				now := sp.clock.Now()
				if now.GoTime().After(deadAsOf) {
					deadDetail := sp.queue.dequeue()
					deadDetail.markDead(now)
					// The next store might be dead as well, set the timeout to
					// 0 to process it immediately.
					timeout = 0
				} else {
					// Store is still alive, schedule the next check for when
					// it should timeout.
					timeout = deadAsOf.Sub(now.GoTime())
				}
			}
			sp.mu.Unlock()
			timeoutTimer.Reset(timeout)
			select {
			case <-timeoutTimer.C:
				timeoutTimer.Read = true
			case <-stopper.ShouldStop():
				return
			}
		}
	})
}
// runHeartbeat sends periodic heartbeats over the client's connection, marking
// the client healthy or unhealthy and reconnecting appropriately until either
// the Client or the supplied channel is closed.
func (c *Client) runHeartbeat(retryOpts retry.Options) {
	healthReceived := c.healthReceived
	setHealthReceived := func() {
		if healthReceived != nil {
			close(healthReceived)
			healthReceived = nil
		}
	}

	isHealthy := false
	setHealthy := func() {
		if isHealthy {
			return
		}
		isHealthy = true
		close(c.healthy.Load().(chan struct{}))
		setHealthReceived()
	}
	setUnhealthy := func() {
		if isHealthy {
			isHealthy = false
			c.healthy.Store(make(chan struct{}))
		}
		setHealthReceived()
	}

	var err = errUnstarted // initial condition
	var heartbeatTimer util.Timer
	defer heartbeatTimer.Stop()
	for {
		for r := retry.Start(retryOpts); r.Next(); {
			if c.maybeClose(retryOpts.Closer) {
				return
			}
			// Reconnect on failure.
			if err != nil {
				// If reconnects are disabled, and we already have a failed
				// connection, return now.
				if c.disableReconnects && c.internalConn() != nil {
					return
				}
				if err = c.connect(); err != nil {
					setUnhealthy()
					log.Warning(err)
					continue
				}
			}
			// Heartbeat regardless of failure.
			if err = c.heartbeat(retryOpts.Closer); err != nil {
				setUnhealthy()
				log.Warning(err)
				if c.maybeClose(retryOpts.Closer) {
					return
				}
				continue
			}
			setHealthy()
			break
		}

		// Wait after the heartbeat so that the first iteration gets a wait-free
		// heartbeat attempt.
		heartbeatTimer.Reset(c.heartbeatInterval)
		select {
		case <-c.closer:
			return
		case <-retryOpts.Closer:
			c.close()
			return
		case <-heartbeatTimer.C:
			heartbeatTimer.Read = true
			// TODO(tamird): Perhaps retry more aggressively when the client is
			// unhealthy.
		}
	}
}
// Send sends one or more RPCs to clients specified by the slice of
// replicas. On success, Send returns the first successful reply. Otherwise,
// Send returns an error if and as soon as the number of failed RPCs exceeds
// the available endpoints less the number of required replies.
//
// TODO(pmattis): Get rid of the getArgs function which requires the caller to
// maintain a map from address to replica. Instead, pass in the list of
// replicas instead of a list of addresses and use that to populate the
// requests.
func send(opts SendOptions, replicas ReplicaSlice, args roachpb.BatchRequest, context *rpc.Context) (proto.Message, error) {
	sp := opts.Trace
	if sp == nil {
		sp = tracing.NilSpan()
	}

	if len(replicas) < 1 {
		return nil, roachpb.NewSendError(
			fmt.Sprintf("insufficient replicas (%d) to satisfy send request of %d",
				len(replicas), 1), false)
	}

	done := make(chan *netrpc.Call, len(replicas))

	clients := make([]batchClient, 0, len(replicas))
	for i, replica := range replicas {
		clients = append(clients, batchClient{
			Client:  rpc.NewClient(&replica.NodeDesc.Address, context),
			replica: &replicas[i],
			args:    args,
		})
	}

	var orderedClients []batchClient
	switch opts.Ordering {
	case orderStable:
		orderedClients = clients
	case orderRandom:
		// Randomly permute order, but keep known-unhealthy clients last.
		var nHealthy int
		for i, client := range clients {
			select {
			case <-client.Healthy():
				clients[i], clients[nHealthy] = clients[nHealthy], clients[i]
				nHealthy++
			default:
			}
		}

		shuffleClients(clients[:nHealthy])
		shuffleClients(clients[nHealthy:])

		orderedClients = clients
	}
	// TODO(spencer): going to need to also sort by affinity; closest
	// ping time should win. Makes sense to have the rpc client/server
	// heartbeat measure ping times. With a bit of seasoning, each
	// node will be able to order the healthy replicas based on latency.

	// Send the first request.
	sendOneFn(&orderedClients[0], opts.Timeout, context, sp, done)
	orderedClients = orderedClients[1:]

	var errors, retryableErrors int

	// Wait for completions.
	var sendNextTimer util.Timer
	defer sendNextTimer.Stop()
	for {
		sendNextTimer.Reset(opts.SendNextTimeout)
		select {
		case <-sendNextTimer.C:
			sendNextTimer.Read = true
			// On successive RPC timeouts, send to additional replicas if available.
			if len(orderedClients) > 0 {
				sp.LogEvent("timeout, trying next peer")
				sendOneFn(&orderedClients[0], opts.Timeout, context, sp, done)
				orderedClients = orderedClients[1:]
			}

		case call := <-done:
			if call.Error == nil {
				// Verify response data integrity if this is a proto response.
				if req, reqOk := call.Args.(roachpb.Request); reqOk {
					if resp, respOk := call.Reply.(roachpb.Response); respOk {
						if err := resp.Verify(req); err != nil {
							call.Error = err
						}
					} else {
						call.Error = util.Errorf("response to proto request must be a proto")
					}
				}
			}
			err := call.Error
			if err == nil {
				if log.V(2) {
					log.Infof("successful reply: %+v", call.Reply)
				}
				return call.Reply.(proto.Message), nil
			}

			// Error handling.
			if log.V(1) {
				log.Warningf("error reply: %s", err)
			}
			errors++

			// Since we have a reconnecting client here, disconnect errors are retryable.
			disconnected := err == netrpc.ErrShutdown || err == io.ErrUnexpectedEOF
			if retryErr, ok := err.(retry.Retryable); disconnected || (ok && retryErr.CanRetry()) {
				retryableErrors++
			}

			if remainingNonErrorRPCs := len(replicas) - errors; remainingNonErrorRPCs < 1 {
				return nil, roachpb.NewSendError(
					fmt.Sprintf("too many errors encountered (%d of %d total): %v",
						errors, len(clients), err), remainingNonErrorRPCs+retryableErrors >= 1)
			}
			// Send to additional replicas if available.
			if len(orderedClients) > 0 {
				sp.LogEvent("error, trying next peer")
				sendOneFn(&orderedClients[0], opts.Timeout, context, sp, done)
				orderedClients = orderedClients[1:]
			}
		}
	}
}
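// The loop above (and its gRPC successor further below) implements a simple form
// of hedged requests: the first replica gets the RPC immediately, and each time
// SendNextTimeout elapses without a reply, or a reply comes back as an error, the
// next replica in the ordered slice is tried while earlier RPCs stay outstanding.
// Below is a stripped-down, hypothetical sketch of just that control flow; the
// function name, the sendTo callback, and the string addresses are placeholders
// and the error accounting is simplified relative to the real send().
func hedgedSendSketch(ordered []string, sendNextTimeout time.Duration,
	sendTo func(addr string, replies chan<- error)) error {
	total := len(ordered)
	replies := make(chan error, total)
	// Assumes len(ordered) >= 1, which send() checks before reaching its loop.
	sendTo(ordered[0], replies)
	ordered = ordered[1:]

	errCount := 0
	var t util.Timer
	defer t.Stop()
	for {
		t.Reset(sendNextTimeout)
		select {
		case <-t.C:
			t.Read = true
			// Timeout: speculatively try the next replica while the earlier
			// RPCs remain outstanding.
			if len(ordered) > 0 {
				sendTo(ordered[0], replies)
				ordered = ordered[1:]
			}
		case err := <-replies:
			if err == nil {
				return nil // first successful reply wins
			}
			if errCount++; errCount == total {
				return err // every replica failed
			}
			// Error: fall back to the next replica, if any remain.
			if len(ordered) > 0 {
				sendTo(ordered[0], replies)
				ordered = ordered[1:]
			}
		}
	}
}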
// processQueue creates a client and sends messages from its designated queue
// via that client, exiting when the client fails or when it idles out. All
// messages remaining in the queue at that point are lost and a new instance of
// processQueue should be started by the next message to be sent.
// TODO(tschottdorf) should let raft know if the node is down;
// need a feedback mechanism for that. Potentially easiest is to arrange for
// the next call to Send() to fail appropriately.
func (t *RaftTransport) processQueue(nodeID roachpb.NodeID) {
	t.mu.Lock()
	ch, ok := t.mu.queues[nodeID]
	t.mu.Unlock()
	if !ok {
		return
	}
	// Clean-up when the loop below shuts down.
	defer func() {
		t.mu.Lock()
		delete(t.mu.queues, nodeID)
		t.mu.Unlock()
	}()

	addr, err := t.resolver(nodeID)
	if err != nil {
		if log.V(1) {
			log.Errorf("failed to get address for node %d: %s", nodeID, err)
		}
		return
	}

	if log.V(1) {
		log.Infof("dialing node %d at %s", nodeID, addr)
	}
	conn, err := t.rpcContext.GRPCDial(addr.String())
	if err != nil {
		if log.V(1) {
			log.Errorf("failed to dial: %s", err)
		}
		return
	}
	client := NewMultiRaftClient(conn)
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	if log.V(1) {
		log.Infof("establishing Raft transport stream to node %d at %s", nodeID, addr)
	}
	stream, err := client.RaftMessage(ctx)
	if err != nil {
		if log.V(1) {
			log.Errorf("failed to establish Raft transport stream to node %d at %s: %s", nodeID, addr, err)
		}
		return
	}

	errCh := make(chan error, 1)

	// Starting workers in a task prevents data races during shutdown.
	t.rpcContext.Stopper.RunTask(func() {
		t.rpcContext.Stopper.RunWorker(func() {
			errCh <- stream.RecvMsg(&RaftMessageResponse{})
		})
	})

	var raftIdleTimer util.Timer
	defer raftIdleTimer.Stop()
	for {
		raftIdleTimer.Reset(raftIdleTimeout)
		select {
		case <-t.rpcContext.Stopper.ShouldStop():
			return
		case <-raftIdleTimer.C:
			raftIdleTimer.Read = true
			if log.V(1) {
				log.Infof("closing Raft transport to %d at %s due to inactivity", nodeID, addr)
			}
			return
		case err := <-errCh:
			if log.V(1) {
				if err != nil {
					log.Infof("remote node %d at %s closed Raft transport with error: %s", nodeID, addr, err)
				} else {
					log.Infof("remote node %d at %s closed Raft transport", nodeID, addr)
				}
			}
			return
		case req := <-ch:
			if err := stream.Send(req); err != nil {
				log.Error(err)
				return
			}
		}
	}
}
// processQueue creates a client and sends messages from its designated queue
// via that client, exiting when the client fails or when it idles out. All
// messages remaining in the queue at that point are lost and a new instance of
// processQueue should be started by the next message to be sent.
// TODO(tschottdorf) should let raft know if the node is down;
// need a feedback mechanism for that. Potentially easiest is to arrange for
// the next call to Send() to fail appropriately.
func (t *rpcTransport) processQueue(nodeID roachpb.NodeID, storeID roachpb.StoreID) {
	t.mu.Lock()
	ch, ok := t.queues[storeID]
	t.mu.Unlock()
	if !ok {
		return
	}
	// Clean-up when the loop below shuts down.
	defer func() {
		t.mu.Lock()
		delete(t.queues, storeID)
		t.mu.Unlock()
	}()

	addr, err := t.gossip.GetNodeIDAddress(nodeID)
	if err != nil {
		if log.V(1) {
			log.Errorf("could not get address for node %d: %s", nodeID, err)
		}
		return
	}
	var dialOpt grpc.DialOption
	if t.rpcContext.Insecure {
		dialOpt = grpc.WithInsecure()
	} else {
		tlsConfig, err := t.rpcContext.GetClientTLSConfig()
		if err != nil {
			log.Error(err)
			return
		}
		dialOpt = grpc.WithTransportCredentials(credentials.NewTLS(tlsConfig))
	}
	conn, err := grpc.Dial(addr.String(), dialOpt)
	if err != nil {
		log.Errorf("failed to dial: %v", err)
		return
	}
	defer func() {
		if err := conn.Close(); err != nil {
			log.Error(err)
		}
	}()
	client := storage.NewMultiRaftClient(conn)
	ctx := grpcutil.NewContextWithStopper(context.Background(), t.rpcContext.Stopper)
	stream, err := client.RaftMessage(ctx)
	if err != nil {
		log.Error(err)
		return
	}
	defer func() {
		if err := stream.CloseSend(); err != nil {
			log.Error(err)
		}
	}()

	var raftIdleTimer util.Timer
	defer raftIdleTimer.Stop()
	for {
		raftIdleTimer.Reset(raftIdleTimeout)
		select {
		case <-ctx.Done():
			return
		case <-raftIdleTimer.C:
			raftIdleTimer.Read = true
			if log.V(1) {
				log.Infof("closing Raft transport to %d due to inactivity", nodeID)
			}
			return
		case req := <-ch:
			if err := stream.Send(req); err != nil {
				log.Error(err)
				return
			}
		}
	}
}
// startStats blocks and periodically logs transaction statistics (throughput,
// success rates, durations, ...). Note that this only captures write txns,
// since read-only txns are stateless as far as TxnCoordSender is concerned.
// TODO(mrtracy): Add this to TimeSeries.
func (tc *TxnCoordSender) startStats() {
	res := time.Millisecond // for duration logging resolution
	lastNow := tc.clock.PhysicalNow()
	var statusLogTimer util.Timer
	defer statusLogTimer.Stop()
	for {
		statusLogTimer.Reset(statusLogInterval)
		select {
		case <-statusLogTimer.C:
			statusLogTimer.Read = true
			if !log.V(1) {
				continue
			}

			tc.Lock()
			curStats := tc.txnStats
			tc.txnStats = txnCoordStats{}
			tc.Unlock()

			now := tc.clock.PhysicalNow()

			// Tests have weird clocks.
			if now-lastNow <= 0 {
				continue
			}

			num := len(curStats.durations)
			// Only compute on non-empty input.
			var dMax, dMean, dDev, rMax, rMean, rDev float64
			var err error
			if num > 0 {
				// There should never be an error in the below computations.
				dMax, err = stats.Max(curStats.durations)
				if err != nil {
					panic(err)
				}
				dMean, err = stats.Mean(curStats.durations)
				if err != nil {
					panic(err)
				}
				dDev, err = stats.StdDevP(curStats.durations)
				if err != nil {
					panic(err)
				}
				rMax, err = stats.Max(curStats.restarts)
				if err != nil {
					panic(err)
				}
				rMean, err = stats.Mean(curStats.restarts)
				if err != nil {
					panic(err)
				}
				rDev, err = stats.StdDevP(curStats.restarts)
				if err != nil {
					panic(err)
				}
			}

			rate := float64(int64(num)*int64(time.Second)) / float64(now-lastNow)
			var pCommitted, pAbandoned, pAborted float32

			if fNum := float32(num); fNum > 0 {
				pCommitted = 100 * float32(curStats.committed) / fNum
				pAbandoned = 100 * float32(curStats.abandoned) / fNum
				pAborted = 100 * float32(curStats.aborted) / fNum
			}
			log.Infof(
				"txn coordinator: %.2f txn/sec, %.2f/%.2f/%.2f %%cmmt/abrt/abnd, %s/%s/%s avg/σ/max duration, %.1f/%.1f/%.1f avg/σ/max restarts (%d samples)",
				rate, pCommitted, pAborted, pAbandoned,
				util.TruncateDuration(time.Duration(dMean), res),
				util.TruncateDuration(time.Duration(dDev), res),
				util.TruncateDuration(time.Duration(dMax), res),
				rMean, rDev, rMax, num,
			)
			lastNow = now
		case <-tc.stopper.ShouldStop():
			return
		}
	}
}
// Send sends one or more RPCs to clients specified by the slice of
// replicas. On success, Send returns the first successful reply. Otherwise,
// Send returns an error if and as soon as the number of failed RPCs exceeds
// the available endpoints less the number of required replies.
func send(opts SendOptions, replicas ReplicaSlice, args roachpb.BatchRequest, rpcContext *rpc.Context) (*roachpb.BatchResponse, error) {
	if len(replicas) < 1 {
		return nil, roachpb.NewSendError(
			fmt.Sprintf("insufficient replicas (%d) to satisfy send request of %d",
				len(replicas), 1), false)
	}

	done := make(chan batchCall, len(replicas))

	clients := make([]batchClient, 0, len(replicas))
	for _, replica := range replicas {
		conn, err := rpcContext.GRPCDial(replica.NodeDesc.Address.String())
		if err != nil {
			return nil, err
		}
		argsCopy := args
		argsCopy.Replica = replica.ReplicaDescriptor
		clients = append(clients, batchClient{
			remoteAddr: replica.NodeDesc.Address.String(),
			conn:       conn,
			client:     roachpb.NewInternalClient(conn),
			args:       argsCopy,
		})
	}

	// Put known-unhealthy clients last.
	nHealthy, err := splitHealthy(clients)
	if err != nil {
		return nil, err
	}

	var orderedClients []batchClient
	switch opts.Ordering {
	case orderStable:
		orderedClients = clients
	case orderRandom:
		// Randomly permute order, but keep known-unhealthy clients last.
		shuffleClients(clients[:nHealthy])
		shuffleClients(clients[nHealthy:])

		orderedClients = clients
	}
	// TODO(spencer): going to need to also sort by affinity; closest
	// ping time should win. Makes sense to have the rpc client/server
	// heartbeat measure ping times. With a bit of seasoning, each
	// node will be able to order the healthy replicas based on latency.

	// Send the first request.
	sendOneFn(opts, rpcContext, orderedClients[0], done)
	orderedClients = orderedClients[1:]

	var errors, retryableErrors int

	// Wait for completions.
	var sendNextTimer util.Timer
	defer sendNextTimer.Stop()
	for {
		sendNextTimer.Reset(opts.SendNextTimeout)
		select {
		case <-sendNextTimer.C:
			sendNextTimer.Read = true
			// On successive RPC timeouts, send to additional replicas if available.
			if len(orderedClients) > 0 {
				log.Trace(opts.Context, "timeout, trying next peer")
				sendOneFn(opts, rpcContext, orderedClients[0], done)
				orderedClients = orderedClients[1:]
			}

		case call := <-done:
			err := call.err
			if err == nil {
				if log.V(2) {
					log.Infof("successful reply: %+v", call.reply)
				}
				return call.reply, nil
			}

			// Error handling.
			if log.V(1) {
				log.Warningf("error reply: %s", err)
			}
			errors++

			// Since we have a reconnecting client here, disconnect errors are retryable.
			disconnected := err == io.ErrUnexpectedEOF
			if retryErr, ok := err.(retry.Retryable); disconnected || (ok && retryErr.CanRetry()) {
				retryableErrors++
			}

			if remainingNonErrorRPCs := len(replicas) - errors; remainingNonErrorRPCs < 1 {
				return nil, roachpb.NewSendError(
					fmt.Sprintf("too many errors encountered (%d of %d total): %v",
						errors, len(clients), err), remainingNonErrorRPCs+retryableErrors >= 1)
			}
			// Send to additional replicas if available.
			if len(orderedClients) > 0 {
				log.Trace(opts.Context, "error, trying next peer")
				sendOneFn(opts, rpcContext, orderedClients[0], done)
				orderedClients = orderedClients[1:]
			}
		}
	}
}
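// splitHealthy is called above but not shown. A minimal sketch of what it
// plausibly does follows: partition the slice so that known-healthy clients come
// first and return how many there are; relative order within each half does not
// matter because the caller shuffles both halves separately. The isHealthy
// predicate, the function name, and the omission of the error return are
// illustrative assumptions, not the package's actual implementation, which
// presumably inspects the gRPC connection state.
func splitHealthySketch(clients []batchClient, isHealthy func(batchClient) bool) int {
	nHealthy := 0
	for i := range clients {
		if isHealthy(clients[i]) {
			// Swap the healthy client into the front section.
			clients[i], clients[nHealthy] = clients[nHealthy], clients[i]
			nHealthy++
		}
	}
	return nHealthy
}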
// processQueue creates a client and sends messages from its designated queue
// via that client, exiting when the client fails or when it idles out. All
// messages remaining in the queue at that point are lost and a new instance of
// processQueue should be started by the next message to be sent.
// TODO(tschottdorf) should let raft know if the node is down;
// need a feedback mechanism for that. Potentially easiest is to arrange for
// the next call to Send() to fail appropriately.
func (t *RaftTransport) processQueue(nodeID roachpb.NodeID) {
	t.mu.Lock()
	ch, ok := t.mu.queues[nodeID]
	t.mu.Unlock()
	if !ok {
		return
	}
	// Clean-up when the loop below shuts down.
	defer func() {
		t.mu.Lock()
		delete(t.mu.queues, nodeID)
		t.mu.Unlock()
	}()

	addr, err := t.resolver(nodeID)
	if err != nil {
		if log.V(1) {
			log.Errorf("failed to get address for node %d: %s", nodeID, err)
		}
		return
	}

	if log.V(1) {
		log.Infof("dialing node %d at %s", nodeID, addr)
	}
	conn, err := t.rpcContext.GRPCDial(addr.String())
	if err != nil {
		if log.V(1) {
			log.Errorf("failed to dial: %s", err)
		}
		return
	}
	client := NewMultiRaftClient(conn)
	ctx, cancel := context.WithCancel(context.TODO())
	defer cancel()

	if log.V(1) {
		log.Infof("establishing Raft transport stream to node %d at %s", nodeID, addr)
	}

	// We start two streams; one will be used for snapshots, the other for all
	// other traffic. This is done to prevent snapshots from blocking other
	// traffic.
	streams := make([]MultiRaft_RaftMessageClient, 2)
	for i := range streams {
		stream, err := client.RaftMessage(ctx)
		if err != nil {
			if log.V(1) {
				log.Errorf("failed to establish Raft transport stream to node %d at %s: %s", nodeID, addr, err)
			}
			return
		}
		streams[i] = stream
	}

	errCh := make(chan error, len(streams))

	// Starting workers in a task prevents data races during shutdown.
	t.rpcContext.Stopper.RunTask(func() {
		for i := range streams {
			// Avoid closing over a `range` binding.
			stream := streams[i]
			t.rpcContext.Stopper.RunWorker(func() {
				// NB: only one error will ever be read from this channel. That's fine,
				// given that the channel is buffered to the maximum number of errors
				// that will be written to it.
				errCh <- stream.RecvMsg(new(RaftMessageResponse))
			})
		}
	})

	snapStream := streams[0]
	restStream := streams[1]

	var raftIdleTimer util.Timer
	defer raftIdleTimer.Stop()
	for {
		raftIdleTimer.Reset(raftIdleTimeout)
		select {
		case <-t.rpcContext.Stopper.ShouldStop():
			return
		case <-raftIdleTimer.C:
			raftIdleTimer.Read = true
			if log.V(1) {
				log.Infof("closing Raft transport to %d at %s due to inactivity", nodeID, addr)
			}
			return
		case err := <-errCh:
			if log.V(1) {
				if err != nil {
					log.Infof("remote node %d at %s closed Raft transport with error: %s", nodeID, addr, err)
				} else {
					log.Infof("remote node %d at %s closed Raft transport", nodeID, addr)
				}
			}
			return
		case req := <-ch:
			if req.Message.Type == raftpb.MsgSnap {
				t.rpcContext.Stopper.RunAsyncTask(func() {
					err := snapStream.Send(req)
					if err != nil {
						log.Errorf("failed to send Raft snapshot to node %d at %s: %s", nodeID, addr, err)
					} else if log.V(1) {
						log.Infof("successfully sent a Raft snapshot to node %d at %s", nodeID, addr)
					}
					t.SnapshotStatusChan <- RaftSnapshotStatus{req, err}
				})
			} else {
				if err := restStream.Send(req); err != nil {
					log.Error(err)
					return
				}
			}
		}
	}
}