示例#1
0
// startStats blocks until the stopper signals shutdown. Every
// statusLogInterval it wakes up and, provided verbosity level 1 is enabled,
// logs a one-line summary of transaction statistics (throughput, outcome
// percentages, durations and restart counts). Note that this only captures
// write txns, since read-only txns are stateless as far as TxnCoordSender is
// concerned.
func (tc *TxnCoordSender) startStats() {
	const resolution = time.Millisecond // granularity used when printing durations
	// All windowed metrics are read at the one-minute scale. Maybe this should
	// be shorter in the future; to be revisited given sufficient feedback.
	window := metric.Scale1M
	var logTimer util.Timer
	defer logTimer.Stop()
	for {
		logTimer.Reset(statusLogInterval)
		select {
		case <-tc.stopper.ShouldStop():
			return

		case <-logTimer.C:
			logTimer.Read = true
			if !log.V(1) {
				continue
			}

			// The metrics are read one after another rather than atomically,
			// so they may be slightly skewed relative to each other. That is
			// acceptable for debug output.
			m := tc.metrics
			durHist := m.Durations[window].Current()
			restartHist := m.Restarts.Current()
			commits := m.Commits.Rates[window].Value()
			commits1PC := m.Commits1PC.Rates[window].Value()
			aborts := m.Aborts.Rates[window].Value()
			abandons := m.Abandons.Rates[window].Value()

			total := commits + aborts + abandons
			// percent expresses rate as a percentage of total, yielding zero
			// when no transactions finished in the window.
			percent := func(rate float64) float64 {
				if total > 0 {
					return 100 * (rate / total)
				}
				return 0
			}

			log.Infof(
				"txn coordinator: %.2f txn/sec, %.2f/%.2f/%.2f/%.2f %%cmmt/cmmt1pc/abrt/abnd, %s/%s/%s avg/σ/max duration, %.1f/%.1f/%d avg/σ/max restarts (%d samples)",
				total, percent(commits), percent(commits1PC), percent(aborts), percent(abandons),
				util.TruncateDuration(time.Duration(durHist.Mean()), resolution),
				util.TruncateDuration(time.Duration(durHist.StdDev()), resolution),
				util.TruncateDuration(time.Duration(durHist.Max()), resolution),
				restartHist.Mean(), restartHist.StdDev(), restartHist.Max(), durHist.TotalCount(),
			)
		}
	}
}
示例#2
0
// runHeartbeat pings the remote reachable through cc in a loop, recording the
// measured clock offset for remoteAddr after every successful ping and
// invoking ctx.HeartbeatCB if one is set. A ping that fails with
// codes.DeadlineExceeded is retried immediately; any other error is returned.
// The loop returns nil once the stopper signals shutdown.
func (ctx *Context) runHeartbeat(cc *grpc.ClientConn, remoteAddr string) error {
	ping := PingRequest{Addr: ctx.localAddr}
	client := NewHeartbeatClient(cc)

	var waitTimer util.Timer
	defer waitTimer.Stop()
	for {
		sentAt := ctx.localClock.PhysicalTime()
		reply, err := ctx.heartbeat(client, ping)
		if err != nil {
			if grpc.Code(err) != codes.DeadlineExceeded {
				return err
			}
			continue
		}
		receivedAt := ctx.localClock.PhysicalTime()
		roundTrip := receivedAt.Sub(sentAt)

		// The offset is only measured when the round trip was short enough
		// to be trusted; otherwise the previous measurement is cleared.
		if roundTrip <= maximumPingDurationMult*ctx.localClock.MaxOffset() {
			// Offset and error are measured using the remote clock reading
			// technique described in
			// http://se.inf.tu-dresden.de/pubs/papers/SRDS1994.pdf, page 6.
			// However, we assume that drift and min message delay are 0, for
			// now.
			half := roundTrip / 2
			ping.Offset.MeasuredAt = receivedAt.UnixNano()
			ping.Offset.Uncertainty = half.Nanoseconds()
			remoteNow := time.Unix(0, reply.ServerTime).Add(half)
			ping.Offset.Offset = remoteNow.Sub(receivedAt).Nanoseconds()
		} else {
			ping.Offset.Reset()
		}
		ctx.RemoteClocks.UpdateOffset(remoteAddr, ping.Offset)

		if notify := ctx.HeartbeatCB; notify != nil {
			notify()
		}

		// The pause comes after the ping so that the very first heartbeat is
		// attempted without any delay.
		waitTimer.Reset(ctx.HeartbeatInterval)
		select {
		case <-waitTimer.C:
			waitTimer.Read = true
		case <-ctx.Stopper.ShouldStop():
			return nil
		}
	}
}
示例#3
0
// runHeartbeat pings the remote reachable through cc in a loop, with each
// ping bounded by ctx.HeartbeatTimeout. After a ping whose round trip was
// short enough to be trusted, the measured clock offset is recorded for
// remoteAddr. The first ping error is returned to the caller; nil is returned
// once the stopper signals shutdown.
func (ctx *Context) runHeartbeat(cc *grpc.ClientConn, remoteAddr string) error {
	request := PingRequest{Addr: ctx.localAddr}
	heartbeatClient := NewHeartbeatClient(cc)

	var heartbeatTimer util.Timer
	defer heartbeatTimer.Stop()
	for {
		sendTime := ctx.localClock.PhysicalNow()
		goCtx, cancel := context.WithTimeout(context.Background(), ctx.HeartbeatTimeout)
		response, err := heartbeatClient.Ping(goCtx, &request)
		// Release the timeout context as soon as the call has returned, on
		// the success path as well as the error path. Otherwise every
		// successful ping would keep its context (and timer) alive until the
		// deadline fires.
		cancel()
		if err != nil {
			return err
		}
		receiveTime := ctx.localClock.PhysicalNow()

		// Only update the clock offset measurement if the response arrived
		// quickly enough for the reading to be meaningful; otherwise clear
		// the offset carried in the next request.
		if receiveTime > sendTime+maximumClockReadingDelay.Nanoseconds() {
			request.Offset.Reset()
		} else {
			// Offset and error are measured using the remote clock reading
			// technique described in
			// http://se.inf.tu-dresden.de/pubs/papers/SRDS1994.pdf, page 6.
			// However, we assume that drift and min message delay are 0, for
			// now.
			request.Offset.MeasuredAt = receiveTime
			request.Offset.Uncertainty = (receiveTime - sendTime) / 2
			remoteTimeNow := response.ServerTime + request.Offset.Uncertainty
			request.Offset.Offset = remoteTimeNow - receiveTime
			ctx.RemoteClocks.UpdateOffset(remoteAddr, request.Offset)
		}

		// Wait after the heartbeat so that the first iteration gets a wait-free
		// heartbeat attempt.
		heartbeatTimer.Reset(ctx.HeartbeatInterval)
		select {
		case <-ctx.Stopper.ShouldStop():
			return nil
		case <-heartbeatTimer.C:
			heartbeatTimer.Read = true
		}
	}
}
示例#4
0
// MonitorRemoteOffsets wakes up every r.monitorInterval and checks that the
// offset of this server's clock from the cluster is within the clock's
// MaxOffset. When MaxOffset is non-zero and the offset either cannot be
// determined or is found to be unhealthy, an error describing the problem is
// returned. The method returns nil once the stopper signals shutdown.
// NOTE(review): presumably the caller treats a returned error as fatal —
// confirm against the call site.
func (r *RemoteClockMonitor) MonitorRemoteOffsets(stopper *stop.Stopper) error {
	if log.V(1) {
		log.Infof("monitoring cluster offset every %s", r.monitorInterval)
	}
	var tick util.Timer
	defer tick.Stop()
	for {
		tick.Reset(r.monitorInterval)
		select {
		case <-stopper.ShouldStop():
			return nil
		case <-tick.C:
			tick.Read = true
		}

		interval, findErr := r.findOffsetInterval()
		// By the contract of the hlc, a MaxOffset of 0 disables safety
		// checking of the max offset. The measured interval is still
		// published below in that case.
		// TODO(embark): once there is a framework for collecting timeseries
		// data about the db, propagate the offset status to that.
		if limit := r.clock.MaxOffset(); limit != 0 {
			if findErr != nil {
				return util.Errorf("clock offset could not be determined: %s", findErr)
			}
			if healthy := isHealthyOffsetInterval(interval, limit); !healthy {
				return util.Errorf(
					"clock offset is in interval: %s, which indicates that the true offset is greater than the max offset: %s",
					interval, limit,
				)
			}
			if log.V(1) {
				log.Infof("healthy cluster offset: %s", interval)
			}
		}

		r.metrics.clusterOffsetLowerBound.Update(int64(interval.lowerbound))
		r.metrics.clusterOffsetUpperBound.Update(int64(interval.upperbound))

		// Remember when this monitoring pass completed.
		r.mu.Lock()
		r.mu.lastMonitoredAt = r.clock.PhysicalTime()
		r.mu.Unlock()
	}
}
示例#5
0
// bootstrap starts a worker that connects the node to the gossip network.
// A bootstrap attempt is made whenever there are no outgoing clients or the
// sentinel gossip info is missing. After each attempt the worker pauses for
// g.bootstrapInterval and then blocks on g.stalled, which is signalled when
// gossip connectivity has been lost and bootstrapping is needed again. The
// worker exits when the stopper signals shutdown.
func (g *Gossip) bootstrap() {
	stopper := g.server.stopper

	stopper.RunWorker(func() {
		var pause util.Timer
		defer pause.Stop()
		for {
			stopper.RunTask(func() {
				g.mu.Lock()
				defer g.mu.Unlock()
				connected := g.outgoing.len() > 0
				sentinelSeen := g.is.getInfo(KeySentinel) != nil
				if connected && sentinelSeen {
					return
				}
				// Try to get another bootstrap address from the resolvers.
				addr := g.getNextBootstrapAddress()
				if addr == nil {
					// No client could be started; flag the stall so that
					// another attempt is made.
					g.maybeSignalStalledLocked()
					return
				}
				g.startClient(addr, stopper)
			})

			// Hold off for one interval before the next possible attempt.
			pause.Reset(g.bootstrapInterval)
			select {
			case <-stopper.ShouldStop():
				return
			case <-pause.C:
				pause.Read = true
			}

			// Then sleep until bootstrapping is required again.
			select {
			case <-stopper.ShouldStop():
				return
			case <-g.stalled:
			}
		}
	})
}
示例#6
0
// start launches a worker that runs until the stopper signals shutdown,
// marking stores as dead once they have not been updated for longer than
// sp.timeUntilStoreDead.
func (sp *StorePool) start(stopper *stop.Stopper) {
	stopper.RunWorker(func() {
		var wakeup util.Timer
		defer wakeup.Stop()
		for {
			// wait stays zero when a store was just marked dead, so that the
			// next entry in the queue is examined immediately.
			var wait time.Duration
			sp.mu.Lock()
			if oldest := sp.queue.peek(); oldest == nil {
				// Nothing is queued yet; sleep for the full timeout.
				wait = sp.timeUntilStoreDead
			} else {
				expiry := oldest.lastUpdatedTime.GoTime().Add(sp.timeUntilStoreDead)
				current := sp.clock.Now()
				if !current.GoTime().After(expiry) {
					// Still alive: wake up again at the moment it would
					// expire.
					wait = expiry.Sub(current.GoTime())
				} else {
					expired := sp.queue.dequeue()
					expired.markDead(current)
				}
			}
			sp.mu.Unlock()

			wakeup.Reset(wait)
			select {
			case <-stopper.ShouldStop():
				return
			case <-wakeup.C:
				wakeup.Read = true
			}
		}
	})
}
示例#7
0
// runHeartbeat sends periodic heartbeats to client, marking the client healthy
// or unhealthy and reconnecting appropriately until either the Client or the
// supplied channel is closed.
//
// retryOpts governs the inner connect/heartbeat retry loop; retryOpts.Closer
// is the "supplied channel" whose closing terminates the loop. The method
// returns nothing; its effects are the health state published through
// c.healthy and c.healthReceived.
func (c *Client) runHeartbeat(retryOpts retry.Options) {
	// healthReceived is closed exactly once, the first time the client's
	// health becomes known (either healthy or unhealthy). The local copy is
	// set to nil afterwards so that it is never closed twice.
	healthReceived := c.healthReceived
	setHealthReceived := func() {
		if healthReceived != nil {
			close(healthReceived)
			healthReceived = nil
		}
	}

	// isHealthy tracks the last published state so that the c.healthy
	// channel is only closed or replaced on an actual transition.
	isHealthy := false
	setHealthy := func() {
		if isHealthy {
			return
		}
		isHealthy = true
		// Closing the current channel signals "healthy" to anyone selecting
		// on it.
		close(c.healthy.Load().(chan struct{}))
		setHealthReceived()
	}
	setUnhealthy := func() {
		if isHealthy {
			isHealthy = false
			// Swap in a fresh, open channel: readers now block until the
			// client becomes healthy again.
			c.healthy.Store(make(chan struct{}))
		}
		setHealthReceived()
	}

	// err carries the outcome of the previous connect/heartbeat attempt
	// across iterations of both loops; a non-nil value forces a (re)connect.
	var err = errUnstarted // initial condition
	var heartbeatTimer util.Timer
	defer heartbeatTimer.Stop()
	for {
		// Retry connect + heartbeat under retryOpts until one heartbeat
		// succeeds or the retry loop gives up.
		for r := retry.Start(retryOpts); r.Next(); {
			if c.maybeClose(retryOpts.Closer) {
				return
			}

			// Reconnect on failure.
			if err != nil {
				// If reconnects are disabled, and we already have a failed connection, return now.
				if c.disableReconnects && c.internalConn() != nil {
					return
				}
				if err = c.connect(); err != nil {
					setUnhealthy()
					log.Warning(err)
					continue
				}
			}

			// Heartbeat regardless of failure.
			if err = c.heartbeat(retryOpts.Closer); err != nil {
				setUnhealthy()
				log.Warning(err)
				if c.maybeClose(retryOpts.Closer) {
					return
				}
				continue
			}

			setHealthy()
			break
		}

		// Wait after the heartbeat so that the first iteration gets a wait-free
		// heartbeat attempt.
		heartbeatTimer.Reset(c.heartbeatInterval)
		select {
		case <-c.closer:
			return
		case <-retryOpts.Closer:
			// The supplied channel was closed: close the client before
			// exiting.
			c.close()
			return
		case <-heartbeatTimer.C:
			heartbeatTimer.Read = true
			// TODO(tamird): Perhaps retry more aggressively when the client is unhealthy.
		}
	}
}
示例#8
0
// send sends one or more RPCs to clients specified by the slice of
// replicas. On success, send returns the first successful reply. Otherwise,
// send returns an error if and as soon as the number of failed RPCs exceeds
// the available endpoints less the number of required replies.
//
// The first RPC is sent immediately. A further replica is tried each time
// opts.SendNextTimeout elapses without a reply, and each time an RPC fails,
// for as long as untried replicas remain. With opts.Ordering == orderRandom
// the replicas are tried in random order with known-unhealthy clients last;
// any other ordering tries them in the order given.
//
// An empty replicas slice yields a non-retryable send error. When every
// replica has failed, the returned send error is marked retryable if at
// least one of the failures was retryable.
//
// TODO(pmattis): Get rid of the getArgs function which requires the caller to
// maintain a map from address to replica. Instead, pass in the list of
// replicas instead of a list of addresses and use that to populate the
// requests.
func send(opts SendOptions, replicas ReplicaSlice,
	args roachpb.BatchRequest, context *rpc.Context) (proto.Message, error) {
	sp := opts.Trace
	if sp == nil {
		sp = tracing.NilSpan()
	}

	if len(replicas) < 1 {
		return nil, roachpb.NewSendError(
			fmt.Sprintf("insufficient replicas (%d) to satisfy send request of %d",
				len(replicas), 1), false)
	}

	// Buffered to the number of replicas so that late replies never block
	// their senders after send has returned.
	done := make(chan *netrpc.Call, len(replicas))

	clients := make([]batchClient, 0, len(replicas))
	for i, replica := range replicas {
		clients = append(clients, batchClient{
			Client:  rpc.NewClient(&replica.NodeDesc.Address, context),
			replica: &replicas[i],
			args:    args,
		})
	}

	// Default to the given (stable) order. Previously an unrecognized
	// opts.Ordering left the client list nil and the first send below
	// panicked with an index out of range.
	orderedClients := clients
	if opts.Ordering == orderRandom {
		// Randomly permute order, but keep known-unhealthy clients last.
		var nHealthy int
		for i, client := range clients {
			select {
			case <-client.Healthy():
				clients[i], clients[nHealthy] = clients[nHealthy], clients[i]
				nHealthy++
			default:
			}
		}

		shuffleClients(clients[:nHealthy])
		shuffleClients(clients[nHealthy:])
	}
	// TODO(spencer): going to need to also sort by affinity; closest
	// ping time should win. Makes sense to have the rpc client/server
	// heartbeat measure ping times. With a bit of seasoning, each
	// node will be able to order the healthy replicas based on latency.

	// Send the first request.
	sendOneFn(&orderedClients[0], opts.Timeout, context, sp, done)
	orderedClients = orderedClients[1:]

	var errors, retryableErrors int

	// Wait for completions.
	var sendNextTimer util.Timer
	defer sendNextTimer.Stop()
	for {
		sendNextTimer.Reset(opts.SendNextTimeout)
		select {
		case <-sendNextTimer.C:
			sendNextTimer.Read = true
			// On successive RPC timeouts, send to additional replicas if available.
			if len(orderedClients) > 0 {
				sp.LogEvent("timeout, trying next peer")
				sendOneFn(&orderedClients[0], opts.Timeout, context, sp, done)
				orderedClients = orderedClients[1:]
			}

		case call := <-done:
			if call.Error == nil {
				// Verify response data integrity if this is a proto response.
				if req, reqOk := call.Args.(roachpb.Request); reqOk {
					if resp, respOk := call.Reply.(roachpb.Response); respOk {
						if err := resp.Verify(req); err != nil {
							call.Error = err
						}
					} else {
						call.Error = util.Errorf("response to proto request must be a proto")
					}
				}
			}
			err := call.Error
			if err == nil {
				if log.V(2) {
					log.Infof("successful reply: %+v", call.Reply)
				}

				return call.Reply.(proto.Message), nil
			}

			// Error handling.
			if log.V(1) {
				log.Warningf("error reply: %s", err)
			}

			errors++

			// Since we have a reconnecting client here, disconnect errors are retryable.
			disconnected := err == netrpc.ErrShutdown || err == io.ErrUnexpectedEOF
			if retryErr, ok := err.(retry.Retryable); disconnected || (ok && retryErr.CanRetry()) {
				retryableErrors++
			}

			// Give up once every replica has produced an error.
			if remainingNonErrorRPCs := len(replicas) - errors; remainingNonErrorRPCs < 1 {
				return nil, roachpb.NewSendError(
					fmt.Sprintf("too many errors encountered (%d of %d total): %v",
						errors, len(clients), err), remainingNonErrorRPCs+retryableErrors >= 1)
			}
			// Send to additional replicas if available.
			if len(orderedClients) > 0 {
				sp.LogEvent("error, trying next peer")
				sendOneFn(&orderedClients[0], opts.Timeout, context, sp, done)
				orderedClients = orderedClients[1:]
			}
		}
	}
}
示例#9
0
// processQueue dials the node identified by nodeID, opens a Raft message
// stream to it and forwards every message from that node's queue over the
// stream. It returns when the stopper signals shutdown, when the stream fails
// or is closed by the remote, or when no message has been sent for
// raftIdleTimeout. The queue is removed from t.mu.queues on exit, so messages
// still queued at that point are lost and the next message to be sent should
// start a new processQueue.
// TODO(tschottdorf) should let raft know if the node is down;
// need a feedback mechanism for that. Potentially easiest is to arrange for
// the next call to Send() to fail appropriately.
func (t *RaftTransport) processQueue(nodeID roachpb.NodeID) {
	t.mu.Lock()
	queue, found := t.mu.queues[nodeID]
	t.mu.Unlock()
	if !found {
		return
	}
	// Unregister the queue however this function ends up returning.
	defer func() {
		t.mu.Lock()
		delete(t.mu.queues, nodeID)
		t.mu.Unlock()
	}()

	addr, resolveErr := t.resolver(nodeID)
	if resolveErr != nil {
		if log.V(1) {
			log.Errorf("failed to get address for node %d: %s", nodeID, resolveErr)
		}
		return
	}

	if log.V(1) {
		log.Infof("dialing node %d at %s", nodeID, addr)
	}
	conn, dialErr := t.rpcContext.GRPCDial(addr.String())
	if dialErr != nil {
		if log.V(1) {
			log.Errorf("failed to dial: %s", dialErr)
		}
		return
	}
	raftClient := NewMultiRaftClient(conn)
	// Cancelling the stream context on return tears the stream down.
	streamCtx, cancelStream := context.WithCancel(context.Background())
	defer cancelStream()
	if log.V(1) {
		log.Infof("establishing Raft transport stream to node %d at %s", nodeID, addr)
	}
	stream, streamErr := raftClient.RaftMessage(streamCtx)
	if streamErr != nil {
		if log.V(1) {
			log.Errorf("failed to establish Raft transport stream to node %d at %s: %s", nodeID, addr, streamErr)
		}
		return
	}

	// recvDone receives the result of the single RecvMsg call below. It is
	// buffered so that the worker can always deliver its result and exit.
	recvDone := make(chan error, 1)

	// Starting workers in a task prevents data races during shutdown.
	t.rpcContext.Stopper.RunTask(func() {
		t.rpcContext.Stopper.RunWorker(func() {
			recvDone <- stream.RecvMsg(&RaftMessageResponse{})
		})
	})

	var idle util.Timer
	defer idle.Stop()
	for {
		// Re-arming on every pass means the timer only fires after a full
		// raftIdleTimeout without any message being sent.
		idle.Reset(raftIdleTimeout)
		select {
		case msg := <-queue:
			if sendErr := stream.Send(msg); sendErr != nil {
				log.Error(sendErr)
				return
			}
		case recvErr := <-recvDone:
			if log.V(1) {
				if recvErr == nil {
					log.Infof("remote node %d at %s closed Raft transport", nodeID, addr)
				} else {
					log.Infof("remote node %d at %s closed Raft transport with error: %s", nodeID, addr, recvErr)
				}
			}
			return
		case <-idle.C:
			idle.Read = true
			if log.V(1) {
				log.Infof("closing Raft transport to %d at %s due to inactivity", nodeID, addr)
			}
			return
		case <-t.rpcContext.Stopper.ShouldStop():
			return
		}
	}
}
示例#10
0
// processQueue dials the node identified by nodeID, opens a Raft message
// stream to it and forwards every message from the queue registered for
// storeID over the stream. It returns when the stream's context is done, when
// a send fails, or when no message has been sent for raftIdleTimeout. The
// queue is removed from t.queues on exit, so messages still queued at that
// point are lost and the next message to be sent should start a new
// processQueue.
// TODO(tschottdorf) should let raft know if the node is down;
// need a feedback mechanism for that. Potentially easiest is to arrange for
// the next call to Send() to fail appropriately.
func (t *rpcTransport) processQueue(nodeID roachpb.NodeID, storeID roachpb.StoreID) {
	t.mu.Lock()
	queue, found := t.queues[storeID]
	t.mu.Unlock()
	if !found {
		return
	}
	// Unregister the queue however this function ends up returning.
	defer func() {
		t.mu.Lock()
		delete(t.queues, storeID)
		t.mu.Unlock()
	}()

	addr, addrErr := t.gossip.GetNodeIDAddress(nodeID)
	if addrErr != nil {
		if log.V(1) {
			log.Errorf("could not get address for node %d: %s", nodeID, addrErr)
		}
		return
	}

	// Choose transport security according to the RPC context.
	var security grpc.DialOption
	if !t.rpcContext.Insecure {
		tlsConfig, tlsErr := t.rpcContext.GetClientTLSConfig()
		if tlsErr != nil {
			log.Error(tlsErr)
			return
		}
		security = grpc.WithTransportCredentials(credentials.NewTLS(tlsConfig))
	} else {
		security = grpc.WithInsecure()
	}

	conn, dialErr := grpc.Dial(addr.String(), security)
	if dialErr != nil {
		log.Errorf("failed to dial: %v", dialErr)
		return
	}
	defer func() {
		if closeErr := conn.Close(); closeErr != nil {
			log.Error(closeErr)
		}
	}()
	raftClient := storage.NewMultiRaftClient(conn)
	streamCtx := grpcutil.NewContextWithStopper(context.Background(), t.rpcContext.Stopper)
	stream, streamErr := raftClient.RaftMessage(streamCtx)
	if streamErr != nil {
		log.Error(streamErr)
		return
	}
	defer func() {
		if closeErr := stream.CloseSend(); closeErr != nil {
			log.Error(closeErr)
		}
	}()

	var idle util.Timer
	defer idle.Stop()
	for {
		// Re-arming on every pass means the timer only fires after a full
		// raftIdleTimeout without any message being sent.
		idle.Reset(raftIdleTimeout)
		select {
		case msg := <-queue:
			if sendErr := stream.Send(msg); sendErr != nil {
				log.Error(sendErr)
				return
			}
		case <-idle.C:
			idle.Read = true
			if log.V(1) {
				log.Infof("closing Raft transport to %d due to inactivity", nodeID)
			}
			return
		case <-streamCtx.Done():
			return
		}
	}
}
示例#11
0
// startStats blocks until the stopper signals shutdown. Every
// statusLogInterval it wakes up and, provided verbosity level 1 is enabled,
// swaps out the statistics accumulated in tc.txnStats and logs a one-line
// summary of them (throughput, outcome percentages, durations and restart
// counts). Note that this only captures write txns, since read-only txns are
// stateless as far as TxnCoordSender is concerned.
// TODO(mrtracy): Add this to TimeSeries.
func (tc *TxnCoordSender) startStats() {
	const resolution = time.Millisecond // granularity used when printing durations
	// must unwraps the result of a statistics computation. Those are never
	// expected to fail on non-empty input, so an error panics.
	must := func(value float64, err error) float64 {
		if err != nil {
			panic(err)
		}
		return value
	}

	previous := tc.clock.PhysicalNow()
	var logTimer util.Timer
	defer logTimer.Stop()
	for {
		logTimer.Reset(statusLogInterval)
		select {
		case <-tc.stopper.ShouldStop():
			return

		case <-logTimer.C:
			logTimer.Read = true
			if !log.V(1) {
				continue
			}

			// Take the accumulated stats and start a fresh accumulation
			// period.
			tc.Lock()
			snapshot := tc.txnStats
			tc.txnStats = txnCoordStats{}
			tc.Unlock()

			current := tc.clock.PhysicalNow()

			// Tests have weird clocks; skip intervals that did not advance.
			if current-previous <= 0 {
				continue
			}

			samples := len(snapshot.durations)
			// The aggregates stay at zero when there were no samples.
			var dMax, dMean, dDev, rMax, rMean, rDev float64
			if samples > 0 {
				dMax = must(stats.Max(snapshot.durations))
				dMean = must(stats.Mean(snapshot.durations))
				dDev = must(stats.StdDevP(snapshot.durations))
				rMax = must(stats.Max(snapshot.restarts))
				rMean = must(stats.Mean(snapshot.restarts))
				rDev = must(stats.StdDevP(snapshot.restarts))
			}

			throughput := float64(int64(samples)*int64(time.Second)) / float64(current-previous)

			var pctCommitted, pctAbandoned, pctAborted float32
			if total := float32(samples); total > 0 {
				pctCommitted = 100 * float32(snapshot.committed) / total
				pctAbandoned = 100 * float32(snapshot.abandoned) / total
				pctAborted = 100 * float32(snapshot.aborted) / total
			}

			log.Infof(
				"txn coordinator: %.2f txn/sec, %.2f/%.2f/%.2f %%cmmt/abrt/abnd, %s/%s/%s avg/σ/max duration, %.1f/%.1f/%.1f avg/σ/max restarts (%d samples)",
				throughput, pctCommitted, pctAborted, pctAbandoned,
				util.TruncateDuration(time.Duration(dMean), resolution),
				util.TruncateDuration(time.Duration(dDev), resolution),
				util.TruncateDuration(time.Duration(dMax), resolution),
				rMean, rDev, rMax, samples,
			)
			previous = current
		}
	}
}
示例#12
0
// send sends one or more RPCs to clients specified by the slice of
// replicas. On success, send returns the first successful reply. Otherwise,
// send returns an error if and as soon as the number of failed RPCs exceeds
// the available endpoints less the number of required replies.
//
// The first RPC is sent immediately. A further replica is tried each time
// opts.SendNextTimeout elapses without a reply, and each time an RPC fails,
// for as long as untried replicas remain. Known-unhealthy clients are always
// placed last; with opts.Ordering == orderRandom the healthy and unhealthy
// groups are additionally shuffled, and any other ordering keeps the order
// produced by splitHealthy.
//
// An empty replicas slice yields a non-retryable send error. A failure to
// dial any replica, or an error from splitHealthy, is returned as is. When
// every replica has failed, the returned send error is marked retryable if
// at least one of the failures was retryable.
func send(opts SendOptions, replicas ReplicaSlice,
	args roachpb.BatchRequest, rpcContext *rpc.Context) (*roachpb.BatchResponse, error) {

	if len(replicas) < 1 {
		return nil, roachpb.NewSendError(
			fmt.Sprintf("insufficient replicas (%d) to satisfy send request of %d",
				len(replicas), 1), false)
	}

	// Buffered to the number of replicas so that late replies never block
	// their senders after send has returned.
	done := make(chan batchCall, len(replicas))

	clients := make([]batchClient, 0, len(replicas))
	for _, replica := range replicas {
		conn, err := rpcContext.GRPCDial(replica.NodeDesc.Address.String())
		if err != nil {
			return nil, err
		}
		// Each client gets its own copy of the request, addressed to its
		// replica.
		argsCopy := args
		argsCopy.Replica = replica.ReplicaDescriptor
		clients = append(clients, batchClient{
			remoteAddr: replica.NodeDesc.Address.String(),
			conn:       conn,
			client:     roachpb.NewInternalClient(conn),
			args:       argsCopy,
		})
	}

	// Put known-unhealthy clients last.
	nHealthy, err := splitHealthy(clients)
	if err != nil {
		return nil, err
	}

	// Default to the order left by splitHealthy. Previously an unrecognized
	// opts.Ordering left the client list nil and the first send below
	// panicked with an index out of range.
	orderedClients := clients
	if opts.Ordering == orderRandom {
		// Randomly permute order, but keep known-unhealthy clients last.
		shuffleClients(clients[:nHealthy])
		shuffleClients(clients[nHealthy:])
	}
	// TODO(spencer): going to need to also sort by affinity; closest
	// ping time should win. Makes sense to have the rpc client/server
	// heartbeat measure ping times. With a bit of seasoning, each
	// node will be able to order the healthy replicas based on latency.

	// Send the first request.
	sendOneFn(opts, rpcContext, orderedClients[0], done)
	orderedClients = orderedClients[1:]

	var errors, retryableErrors int

	// Wait for completions.
	var sendNextTimer util.Timer
	defer sendNextTimer.Stop()
	for {
		sendNextTimer.Reset(opts.SendNextTimeout)
		select {
		case <-sendNextTimer.C:
			sendNextTimer.Read = true
			// On successive RPC timeouts, send to additional replicas if available.
			if len(orderedClients) > 0 {
				log.Trace(opts.Context, "timeout, trying next peer")
				sendOneFn(opts, rpcContext, orderedClients[0], done)
				orderedClients = orderedClients[1:]
			}

		case call := <-done:
			err := call.err
			if err == nil {
				if log.V(2) {
					log.Infof("successful reply: %+v", call.reply)
				}

				return call.reply, nil
			}

			// Error handling.
			if log.V(1) {
				log.Warningf("error reply: %s", err)
			}

			errors++

			// Since we have a reconnecting client here, disconnect errors are retryable.
			disconnected := err == io.ErrUnexpectedEOF
			if retryErr, ok := err.(retry.Retryable); disconnected || (ok && retryErr.CanRetry()) {
				retryableErrors++
			}

			// Give up once every replica has produced an error.
			if remainingNonErrorRPCs := len(replicas) - errors; remainingNonErrorRPCs < 1 {
				return nil, roachpb.NewSendError(
					fmt.Sprintf("too many errors encountered (%d of %d total): %v",
						errors, len(clients), err), remainingNonErrorRPCs+retryableErrors >= 1)
			}
			// Send to additional replicas if available.
			if len(orderedClients) > 0 {
				log.Trace(opts.Context, "error, trying next peer")
				sendOneFn(opts, rpcContext, orderedClients[0], done)
				orderedClients = orderedClients[1:]
			}
		}
	}
}
示例#13
0
// processQueue creates a client and sends messages from its designated queue
// via that client, exiting when the client fails or when it idles out. All
// messages remaining in the queue at that point are lost and a new instance of
// processQueue should be started by the next message to be sent.
//
// Two streams are opened to the node identified by nodeID: snapshots
// (raftpb.MsgSnap) are sent asynchronously on one, all other messages are
// sent synchronously on the other. The outcome of every snapshot send is
// reported on t.SnapshotStatusChan.
// TODO(tschottdorf) should let raft know if the node is down;
// need a feedback mechanism for that. Potentially easiest is to arrange for
// the next call to Send() to fail appropriately.
func (t *RaftTransport) processQueue(nodeID roachpb.NodeID) {
	t.mu.Lock()
	ch, ok := t.mu.queues[nodeID]
	t.mu.Unlock()
	if !ok {
		// No queue is registered for this node; nothing to process.
		return
	}
	// Clean-up when the loop below shuts down.
	defer func() {
		t.mu.Lock()
		delete(t.mu.queues, nodeID)
		t.mu.Unlock()
	}()

	addr, err := t.resolver(nodeID)
	if err != nil {
		if log.V(1) {
			log.Errorf("failed to get address for node %d: %s", nodeID, err)
		}
		return
	}

	if log.V(1) {
		log.Infof("dialing node %d at %s", nodeID, addr)
	}
	conn, err := t.rpcContext.GRPCDial(addr.String())
	if err != nil {
		if log.V(1) {
			log.Errorf("failed to dial: %s", err)
		}
		return
	}
	client := NewMultiRaftClient(conn)
	// Both streams share this context; the deferred cancel runs when
	// processQueue returns.
	// NOTE(review): a snapshot send still in flight at that point presumably
	// gets aborted by the cancellation — confirm this is intended.
	ctx, cancel := context.WithCancel(context.TODO())
	defer cancel()
	if log.V(1) {
		log.Infof("establishing Raft transport stream to node %d at %s", nodeID, addr)
	}
	// We start two streams; one will be used for snapshots, the other for all
	// other traffic. This is done to prevent snapshots from blocking other
	// traffic.
	streams := make([]MultiRaft_RaftMessageClient, 2)
	for i := range streams {
		stream, err := client.RaftMessage(ctx)
		if err != nil {
			if log.V(1) {
				log.Errorf("failed to establish Raft transport stream to node %d at %s: %s", nodeID, addr, err)
			}
			return
		}
		streams[i] = stream
	}

	// One slot per stream, so that every receiver worker can deliver its
	// result without blocking.
	errCh := make(chan error, len(streams))

	// Starting workers in a task prevents data races during shutdown.
	t.rpcContext.Stopper.RunTask(func() {
		for i := range streams {
			// Avoid closing over a `range` binding.
			stream := streams[i]

			t.rpcContext.Stopper.RunWorker(func() {
				// NB: only one error will ever be read from this channel. That's fine,
				// given that the channel is buffered to the maximum number of errors
				// that will be written to it.
				errCh <- stream.RecvMsg(new(RaftMessageResponse))
			})
		}
	})

	snapStream := streams[0]
	restStream := streams[1]

	var raftIdleTimer util.Timer
	defer raftIdleTimer.Stop()
	for {
		// The timer is re-armed on every pass, so it only fires after a full
		// raftIdleTimeout without a message being taken from the queue.
		raftIdleTimer.Reset(raftIdleTimeout)
		select {
		case <-t.rpcContext.Stopper.ShouldStop():
			return
		case <-raftIdleTimer.C:
			raftIdleTimer.Read = true
			if log.V(1) {
				log.Infof("closing Raft transport to %d at %s due to inactivity", nodeID, addr)
			}
			return
		case err := <-errCh:
			// The first RecvMsg to return on either stream ends the
			// transport.
			if log.V(1) {
				if err != nil {
					log.Infof("remote node %d at %s closed Raft transport with error: %s", nodeID, addr, err)
				} else {
					log.Infof("remote node %d at %s closed Raft transport", nodeID, addr)
				}
			}
			return
		case req := <-ch:
			if req.Message.Type == raftpb.MsgSnap {
				// Snapshots are sent from an async task so that the loop can
				// keep forwarding other messages in the meantime. A failed
				// snapshot send is logged and reported but does not end the
				// loop.
				// NOTE(review): two queued snapshots would lead to concurrent
				// Send calls on snapStream — confirm the stream tolerates
				// that or that callers never queue more than one at a time.
				t.rpcContext.Stopper.RunAsyncTask(func() {
					err := snapStream.Send(req)
					if err != nil {
						log.Errorf("failed to send Raft snapshot to node %d at %s: %s", nodeID, addr, err)
					} else if log.V(1) {
						log.Infof("successfully sent a Raft snapshot to node %d at %s", nodeID, addr)
					}
					t.SnapshotStatusChan <- RaftSnapshotStatus{req, err}
				})
			} else {
				// A failed send of a regular message ends the transport.
				if err := restStream.Send(req); err != nil {
					log.Error(err)
					return
				}
			}
		}
	}
}