Beispiel #1
0
// start dials the remote addr and commences gossip once connected. Upon exit,
// the client is sent on the disconnected channel. This method starts client
// processing in a goroutine and returns immediately.
func (c *client) start(g *Gossip, disconnected chan *client, ctx *rpc.Context, stopper *stop.Stopper) {
	stopper.RunWorker(func() {
		defer func() {
			disconnected <- c
		}()

		// Note: avoid using `grpc.WithBlock` here. This code is already
		// asynchronous from the caller's perspective, so the only effect of
		// `WithBlock` here is blocking shutdown - at the time of this writing,
		// that ends ups up making `kv` tests take twice as long.
		conn, err := ctx.GRPCDial(c.addr.String())
		if err != nil {
			log.Errorf("failed to dial: %v", err)
			return
		}

		// Start gossiping.
		if err := c.gossip(g, NewGossipClient(conn), stopper); err != nil {
			if !grpcutil.IsClosedConnection(err) {
				g.mu.Lock()
				peerID := c.peerID
				g.mu.Unlock()
				if peerID != 0 {
					log.Infof("closing client to node %d (%s): %s", peerID, c.addr, err)
				} else {
					log.Infof("closing client to %s: %s", c.addr, err)
				}
			}
		}
	})
}
Beispiel #2
0
// start dials the remote addr and commences gossip once connected. Upon exit,
// the client is sent on the disconnected channel. This method starts client
// processing in a goroutine and returns immediately.
func (c *client) start(g *Gossip, disconnected chan *client, ctx *rpc.Context, stopper *stop.Stopper) {
	stopper.RunWorker(func() {
		defer func() {
			disconnected <- c
		}()

		conn, err := ctx.GRPCDial(c.addr.String(), grpc.WithBlock())
		if err != nil {
			log.Errorf("failed to dial: %v", err)
			return
		}

		// Start gossiping.
		if err := c.gossip(g, NewGossipClient(conn), stopper); err != nil {
			if !grpcutil.IsClosedConnection(err) {
				g.mu.Lock()
				peerID := c.peerID
				g.mu.Unlock()
				if peerID != 0 {
					log.Infof("closing client to node %d (%s): %s", peerID, c.addr, err)
				} else {
					log.Infof("closing client to %s: %s", c.addr, err)
				}
			}
		}
	})
}
Beispiel #3
0
// grpcTransportFactory is the default TransportFactory, using GRPC.
func grpcTransportFactory(
	opts SendOptions,
	rpcContext *rpc.Context,
	replicas ReplicaSlice,
	args roachpb.BatchRequest,
) (Transport, error) {
	clients := make([]batchClient, 0, len(replicas))
	for _, replica := range replicas {
		conn, err := rpcContext.GRPCDial(replica.NodeDesc.Address.String())
		if err != nil {
			return nil, err
		}
		argsCopy := args
		argsCopy.Replica = replica.ReplicaDescriptor
		remoteAddr := replica.NodeDesc.Address.String()
		clients = append(clients, batchClient{
			remoteAddr: remoteAddr,
			conn:       conn,
			client:     roachpb.NewInternalClient(conn),
			args:       argsCopy,
			healthy:    rpcContext.IsConnHealthy(remoteAddr),
		})
	}

	// Put known-unhealthy clients last.
	splitHealthy(clients)

	return &grpcTransport{
		opts:           opts,
		rpcContext:     rpcContext,
		orderedClients: clients,
	}, nil
}
Beispiel #4
0
// NewSender returns an implementation of Sender which exposes the Key-Value
// database provided by a Cockroach cluster by connecting via RPC to a
// Cockroach node.
func NewSender(ctx *rpc.Context, target string) (Sender, error) {
	conn, err := ctx.GRPCDial(target)
	if err != nil {
		return nil, err
	}
	return sender{roachpb.NewExternalClient(conn)}, nil
}
Beispiel #5
0
// Send sends one or more RPCs to clients specified by the slice of
// replicas. On success, Send returns the first successful reply. Otherwise,
// Send returns an error if and as soon as the number of failed RPCs exceeds
// the available endpoints less the number of required replies.
func send(opts SendOptions, replicas ReplicaSlice,
	args roachpb.BatchRequest, rpcContext *rpc.Context) (*roachpb.BatchResponse, error) {

	if len(replicas) < 1 {
		return nil, roachpb.NewSendError(
			fmt.Sprintf("insufficient replicas (%d) to satisfy send request of %d",
				len(replicas), 1), false)
	}

	done := make(chan batchCall, len(replicas))

	clients := make([]batchClient, 0, len(replicas))
	for _, replica := range replicas {
		conn, err := rpcContext.GRPCDial(replica.NodeDesc.Address.String())
		if err != nil {
			return nil, err
		}
		argsCopy := args
		argsCopy.Replica = replica.ReplicaDescriptor
		clients = append(clients, batchClient{
			remoteAddr: replica.NodeDesc.Address.String(),
			conn:       conn,
			client:     roachpb.NewInternalClient(conn),
			args:       argsCopy,
		})
	}

	// Put known-unhealthy clients last.
	nHealthy, err := splitHealthy(clients)
	if err != nil {
		return nil, err
	}

	var orderedClients []batchClient
	switch opts.Ordering {
	case orderStable:
		orderedClients = clients
	case orderRandom:
		// Randomly permute order, but keep known-unhealthy clients last.
		shuffleClients(clients[:nHealthy])
		shuffleClients(clients[nHealthy:])

		orderedClients = clients
	}
	// TODO(spencer): going to need to also sort by affinity; closest
	// ping time should win. Makes sense to have the rpc client/server
	// heartbeat measure ping times. With a bit of seasoning, each
	// node will be able to order the healthy replicas based on latency.

	// Send the first request.
	sendOneFn(opts, rpcContext, orderedClients[0], done)
	orderedClients = orderedClients[1:]

	var errors, retryableErrors int

	// Wait for completions.
	var sendNextTimer util.Timer
	defer sendNextTimer.Stop()
	for {
		sendNextTimer.Reset(opts.SendNextTimeout)
		select {
		case <-sendNextTimer.C:
			sendNextTimer.Read = true
			// On successive RPC timeouts, send to additional replicas if available.
			if len(orderedClients) > 0 {
				log.Trace(opts.Context, "timeout, trying next peer")
				sendOneFn(opts, rpcContext, orderedClients[0], done)
				orderedClients = orderedClients[1:]
			}

		case call := <-done:
			err := call.err
			if err == nil {
				if log.V(2) {
					log.Infof("successful reply: %+v", call.reply)
				}

				return call.reply, nil
			}

			// Error handling.
			if log.V(1) {
				log.Warningf("error reply: %s", err)
			}

			errors++

			// Since we have a reconnecting client here, disconnect errors are retryable.
			disconnected := err == io.ErrUnexpectedEOF
			if retryErr, ok := err.(retry.Retryable); disconnected || (ok && retryErr.CanRetry()) {
				retryableErrors++
			}

			if remainingNonErrorRPCs := len(replicas) - errors; remainingNonErrorRPCs < 1 {
				return nil, roachpb.NewSendError(
					fmt.Sprintf("too many errors encountered (%d of %d total): %v",
						errors, len(clients), err), remainingNonErrorRPCs+retryableErrors >= 1)
			}
			// Send to additional replicas if available.
			if len(orderedClients) > 0 {
				log.Trace(opts.Context, "error, trying next peer")
				sendOneFn(opts, rpcContext, orderedClients[0], done)
				orderedClients = orderedClients[1:]
			}
		}
	}
}
Beispiel #6
0
// start dials the remote addr and commences gossip once connected. Upon exit,
// the client is sent on the disconnected channel. This method starts client
// processing in a goroutine and returns immediately.
func (c *client) start(
	g *Gossip,
	disconnected chan *client,
	rpcCtx *rpc.Context,
	stopper *stop.Stopper,
	nodeID roachpb.NodeID,
	breaker *circuit.Breaker,
) {
	stopper.RunWorker(func() {
		ctx, cancel := context.WithCancel(c.ctx)
		var wg sync.WaitGroup
		defer func() {
			// This closes the outgoing stream, causing any attempt to send or
			// receive to return an error.
			//
			// Note: it is still possible for incoming gossip to be processed after
			// this point.
			cancel()

			// The stream is closed, but there may still be some incoming gossip
			// being processed. Wait until that is complete to avoid racing the
			// client's removal against the discovery of its remote's node ID.
			wg.Wait()
			disconnected <- c
		}()

		consecFailures := breaker.ConsecFailures()
		var stream Gossip_GossipClient
		if err := breaker.Call(func() error {
			// Note: avoid using `grpc.WithBlock` here. This code is already
			// asynchronous from the caller's perspective, so the only effect of
			// `WithBlock` here is blocking shutdown - at the time of this writing,
			// that ends ups up making `kv` tests take twice as long.
			conn, err := rpcCtx.GRPCDial(c.addr.String())
			if err != nil {
				return err
			}
			if stream, err = NewGossipClient(conn).Gossip(ctx); err != nil {
				return err
			}
			return c.requestGossip(g, stream)
		}, 0); err != nil {
			if consecFailures == 0 {
				log.Warningf(ctx, "node %d: failed to start gossip client: %s", nodeID, err)
			}
			return
		}

		// Start gossiping.
		log.Infof(ctx, "node %d: started gossip client to %s", nodeID, c.addr)
		if err := c.gossip(ctx, g, stream, stopper, &wg); err != nil {
			if !grpcutil.IsClosedConnection(err) {
				g.mu.Lock()
				peerID := c.peerID
				g.mu.Unlock()
				if peerID != 0 {
					log.Infof(ctx, "node %d: closing client to node %d (%s): %s", nodeID, peerID, c.addr, err)
				} else {
					log.Infof(ctx, "node %d: closing client to %s: %s", nodeID, c.addr, err)
				}
			}
		}
	})
}