// grpcTransportFactory is the default TransportFactory, using GRPC.
func grpcTransportFactory(
	opts SendOptions,
	rpcContext *rpc.Context,
	replicas ReplicaSlice,
	args roachpb.BatchRequest,
) (Transport, error) {
	clients := make([]batchClient, 0, len(replicas))
	for _, replica := range replicas {
		conn, err := rpcContext.GRPCDial(replica.NodeDesc.Address.String())
		if err != nil {
			return nil, err
		}
		argsCopy := args
		argsCopy.Replica = replica.ReplicaDescriptor
		remoteAddr := replica.NodeDesc.Address.String()
		clients = append(clients, batchClient{
			remoteAddr: remoteAddr,
			conn:       conn,
			client:     roachpb.NewInternalClient(conn),
			args:       argsCopy,
			healthy:    rpcContext.IsConnHealthy(remoteAddr),
		})
	}

	// Put known-unhealthy clients last.
	splitHealthy(clients)

	return &grpcTransport{
		opts:           opts,
		rpcContext:     rpcContext,
		orderedClients: clients,
	}, nil
}
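// splitHealthy is defined elsewhere in this package; the factory above only
// relies on it to move known-unhealthy clients to the back of the slice. As a
// rough sketch of that intent (an assumption, not the actual implementation;
// byHealthSketch and splitHealthySketch are hypothetical names and "sort" is
// assumed to be imported), a stable sort on the healthy flag is enough to put
// healthy clients first while preserving the incoming order within each group:

type byHealthSketch []batchClient

func (h byHealthSketch) Len() int           { return len(h) }
func (h byHealthSketch) Swap(i, j int)      { h[i], h[j] = h[j], h[i] }
func (h byHealthSketch) Less(i, j int) bool { return h[i].healthy && !h[j].healthy }

// splitHealthySketch reorders clients in place so that healthy clients come
// first and known-unhealthy clients come last.
func splitHealthySketch(clients []batchClient) {
	sort.Stable(byHealthSketch(clients))
}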
// reserve sends a reservation request RPC to the node and store identified
// by toStoreID. It returns an error if the reservation was not successfully
// booked. When unsuccessful, the store is marked as having a declined
// reservation so it will not be considered for up-replication or rebalancing
// until after the configured timeout period has passed.
// TODO(bram): consider moving the nodeID to the store pool during
// NewStorePool.
func (sp *StorePool) reserve(
	curIdent roachpb.StoreIdent,
	toStoreID roachpb.StoreID,
	rangeID roachpb.RangeID,
	rangeSize int64,
) error {
	if !sp.reservationsEnabled {
		return nil
	}
	sp.mu.Lock()
	defer sp.mu.Unlock()
	detail, ok := sp.mu.stores[toStoreID]
	if !ok {
		return fmt.Errorf("store does not exist in the store pool")
	}
	conn, err := sp.rpcContext.GRPCDial(detail.desc.Node.Address.String())
	if err != nil {
		return err
	}
	client := roachpb.NewInternalClient(conn)
	req := &roachpb.ReservationRequest{
		StoreRequestHeader: roachpb.StoreRequestHeader{
			NodeID:  detail.desc.Node.NodeID,
			StoreID: toStoreID,
		},
		FromNodeID:  curIdent.NodeID,
		FromStoreID: curIdent.StoreID,
		RangeSize:   rangeSize,
		RangeID:     rangeID,
	}

	if log.V(2) {
		log.Infof("proposing new reservation:%+v", req)
	}

	ctxWithTimeout, cancel := context.WithTimeout(context.TODO(), sp.reserveRPCTimeout)
	defer cancel()
	resp, err := client.Reserve(ctxWithTimeout, req)

	// If a reservation is declined, be it due to an error or because it was
	// rejected, we mark the store detail as having been rejected so it won't
	// be considered as a candidate for new replicas until after the configured
	// timeout period has passed.
	if err != nil {
		detail.unavailableUntil = sp.clock.Now().GoTime().Add(sp.failedReservationsTimeout)
		if log.V(2) {
			log.Infof("reservation failed, store:%s will be unavailable for %s until %s",
				toStoreID, sp.failedReservationsTimeout, detail.unavailableUntil)
		}
		return fmt.Errorf("reservation failed:%+v due to error:%s", req, err)
	}
	if !resp.Reserved {
		detail.unavailableUntil = sp.clock.Now().GoTime().Add(sp.declinedReservationsTimeout)
		if log.V(2) {
			log.Infof("reservation failed, store:%s will be unavailable for %s until %s",
				toStoreID, sp.declinedReservationsTimeout, detail.unavailableUntil)
		}
		return fmt.Errorf("reservation declined:%+v", req)
	}
	if log.V(2) {
		log.Infof("reservation was approved:%+v", req)
	}
	return nil
}
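// The unavailableUntil timestamp set above is what keeps a declined store out
// of consideration. A minimal sketch of how a candidate filter might consult
// it (isStoreAvailableSketch is a hypothetical helper, not part of this
// package; it assumes the same StorePool and storeDetail fields used by
// reserve above):
func (sp *StorePool) isStoreAvailableSketch(toStoreID roachpb.StoreID) bool {
	sp.mu.Lock()
	defer sp.mu.Unlock()
	detail, ok := sp.mu.stores[toStoreID]
	if !ok {
		// Unknown stores are never candidates.
		return false
	}
	// A store stays off the candidate list until its penalty window
	// (failedReservationsTimeout or declinedReservationsTimeout) has passed.
	return !sp.clock.Now().GoTime().Before(detail.unavailableUntil)
}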
// waitForStoreFrozen polls the given stores until they all report having no
// unfrozen Replicas (or an error or timeout occurs).
func (s *adminServer) waitForStoreFrozen(
	stream serverpb.Admin_ClusterFreezeServer,
	stores map[roachpb.StoreID]roachpb.NodeID,
	wantFrozen bool,
) error {
	mu := struct {
		sync.Mutex
		oks map[roachpb.StoreID]bool
	}{
		oks: make(map[roachpb.StoreID]bool),
	}

	opts := base.DefaultRetryOptions()
	opts.Closer = s.server.stopper.ShouldDrain()
	opts.MaxRetries = 20
	sem := make(chan struct{}, 256)
	errChan := make(chan error, 1)
	sendErr := func(err error) {
		select {
		case errChan <- err:
		default:
		}
	}

	numWaiting := len(stores) // loop until this drops to zero
	var err error
	for r := retry.Start(opts); r.Next(); {
		mu.Lock()
		for storeID, nodeID := range stores {
			storeID, nodeID := storeID, nodeID // loop-local copies for goroutine

			var nodeDesc roachpb.NodeDescriptor
			if err := s.server.gossip.GetInfoProto(gossip.MakeNodeIDKey(nodeID), &nodeDesc); err != nil {
				sendErr(err)
				break
			}
			addr := nodeDesc.Address.String()

			if _, inflightOrSucceeded := mu.oks[storeID]; inflightOrSucceeded {
				continue
			}
			mu.oks[storeID] = false // mark as inflight

			action := func() (err error) {
				var resp *roachpb.PollFrozenResponse
				defer func() {
					message := fmt.Sprintf("node %d, store %d: ", nodeID, storeID)
					if err != nil {
						message += err.Error()
					} else {
						numMismatching := len(resp.Results)
						mu.Lock()
						if numMismatching == 0 {
							// If the Store is in the right state, mark it as such.
							// This means we won't try it again.
							message += "ready"
							mu.oks[storeID] = true
						} else {
							// Otherwise, forget that we tried the Store so that
							// the retry loop picks it up again.
							message += fmt.Sprintf("%d replicas report wrong status", numMismatching)
							if limit := 10; numMismatching > limit {
								message += " [truncated]: "
								resp.Results = resp.Results[:limit]
							} else {
								message += ": "
							}
							message += fmt.Sprintf("%+v", resp.Results)
							delete(mu.oks, storeID)
						}
						mu.Unlock()
						err = stream.Send(&serverpb.ClusterFreezeResponse{
							Message: message,
						})
					}
				}()
				conn, err := s.server.rpcContext.GRPCDial(addr)
				if err != nil {
					return err
				}
				client := roachpb.NewInternalClient(conn)
				resp, err = client.PollFrozen(context.Background(),
					&roachpb.PollFrozenRequest{
						StoreRequestHeader: roachpb.StoreRequestHeader{
							NodeID:  nodeID,
							StoreID: storeID,
						},
						// If we are looking to freeze everything, we want to
						// collect thawed Replicas, and vice versa.
						CollectFrozen: !wantFrozen,
					})
				return err
			}
			// Run a limited, non-blocking task. That means the task simply
			// won't run if the semaphore is full (or the node is draining).
			// Both are handled by the surrounding retry loop.
			if !s.server.stopper.RunLimitedAsyncTask(sem, func() {
				if err := action(); err != nil {
					sendErr(err)
				}
			}) {
				// Node draining.
				sendErr(errors.New("node is shutting down"))
				break
			}
		}

		numWaiting = len(stores)
		for _, ok := range mu.oks {
			if ok {
				// Store has reported that it is frozen.
				numWaiting--
				continue
			}
		}
		mu.Unlock()

		select {
		case err = <-errChan:
		default:
		}

		// Keep going unless there's been an error or everyone's frozen.
		if err != nil || numWaiting == 0 {
			break
		}
		if err := stream.Send(&serverpb.ClusterFreezeResponse{
			Message: fmt.Sprintf("waiting for %d store%s to apply operation",
				numWaiting, util.Pluralize(int64(numWaiting))),
		}); err != nil {
			return err
		}
	}
	if err != nil {
		return err
	}
	if numWaiting > 0 {
		err = fmt.Errorf("timed out waiting for %d store%s to report freeze",
			numWaiting, util.Pluralize(int64(numWaiting)))
	}
	return err
}
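// RunLimitedAsyncTask above bounds the number of in-flight PollFrozen RPCs
// with the sem channel and, per its comment, refuses rather than blocks when
// the semaphore is full, leaving the retry loop to pick the store up again.
// A self-contained sketch of that pattern (runLimitedSketch is a hypothetical
// stand-in, not the stopper's real implementation):
func runLimitedSketch(sem chan struct{}, task func()) bool {
	select {
	case sem <- struct{}{}: // acquire a slot without blocking
	default:
		return false // semaphore full: the task simply does not run
	}
	go func() {
		defer func() { <-sem }() // release the slot when the task finishes
		task()
	}()
	return true
}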
// send sends one or more RPCs to clients specified by the slice of replicas.
// On success, send returns the first successful reply. Otherwise, send
// returns an error if and as soon as the number of failed RPCs exceeds the
// available endpoints less the number of required replies.
func send(
	opts SendOptions,
	replicas ReplicaSlice,
	args roachpb.BatchRequest,
	rpcContext *rpc.Context,
) (*roachpb.BatchResponse, error) {
	if len(replicas) < 1 {
		return nil, roachpb.NewSendError(
			fmt.Sprintf("insufficient replicas (%d) to satisfy send request of %d",
				len(replicas), 1), false)
	}

	done := make(chan batchCall, len(replicas))

	clients := make([]batchClient, 0, len(replicas))
	for _, replica := range replicas {
		conn, err := rpcContext.GRPCDial(replica.NodeDesc.Address.String())
		if err != nil {
			return nil, err
		}
		argsCopy := args
		argsCopy.Replica = replica.ReplicaDescriptor
		clients = append(clients, batchClient{
			remoteAddr: replica.NodeDesc.Address.String(),
			conn:       conn,
			client:     roachpb.NewInternalClient(conn),
			args:       argsCopy,
		})
	}

	// Put known-unhealthy clients last.
	nHealthy, err := splitHealthy(clients)
	if err != nil {
		return nil, err
	}

	var orderedClients []batchClient
	switch opts.Ordering {
	case orderStable:
		orderedClients = clients
	case orderRandom:
		// Randomly permute order, but keep known-unhealthy clients last.
		shuffleClients(clients[:nHealthy])
		shuffleClients(clients[nHealthy:])
		orderedClients = clients
	}
	// TODO(spencer): going to need to also sort by affinity; closest
	// ping time should win. Makes sense to have the rpc client/server
	// heartbeat measure ping times. With a bit of seasoning, each
	// node will be able to order the healthy replicas based on latency.

	// Send the first request.
	sendOneFn(opts, rpcContext, orderedClients[0], done)
	orderedClients = orderedClients[1:]

	var errors, retryableErrors int

	// Wait for completions.
	var sendNextTimer util.Timer
	defer sendNextTimer.Stop()
	for {
		sendNextTimer.Reset(opts.SendNextTimeout)
		select {
		case <-sendNextTimer.C:
			sendNextTimer.Read = true
			// On successive RPC timeouts, send to additional replicas if available.
			if len(orderedClients) > 0 {
				log.Trace(opts.Context, "timeout, trying next peer")
				sendOneFn(opts, rpcContext, orderedClients[0], done)
				orderedClients = orderedClients[1:]
			}

		case call := <-done:
			err := call.err
			if err == nil {
				if log.V(2) {
					log.Infof("successful reply: %+v", call.reply)
				}
				return call.reply, nil
			}

			// Error handling.
			if log.V(1) {
				log.Warningf("error reply: %s", err)
			}
			errors++

			// Since we have a reconnecting client here, disconnect errors are retryable.
			disconnected := err == io.ErrUnexpectedEOF
			if retryErr, ok := err.(retry.Retryable); disconnected || (ok && retryErr.CanRetry()) {
				retryableErrors++
			}

			if remainingNonErrorRPCs := len(replicas) - errors; remainingNonErrorRPCs < 1 {
				return nil, roachpb.NewSendError(
					fmt.Sprintf("too many errors encountered (%d of %d total): %v",
						errors, len(clients), err),
					remainingNonErrorRPCs+retryableErrors >= 1)
			}
			// Send to additional replicas if available.
			if len(orderedClients) > 0 {
				log.Trace(opts.Context, "error, trying next peer")
				sendOneFn(opts, rpcContext, orderedClients[0], done)
				orderedClients = orderedClients[1:]
			}
		}
	}
}
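// The loop above implements a "send next" hedge: if the outstanding RPC has
// not replied within SendNextTimeout, the next replica is tried while earlier
// requests stay in flight, and the first successful reply wins. A stripped-down,
// self-contained sketch of that pattern (sendNextSketch and its issue callback
// are hypothetical; "time" is assumed to be imported):
func sendNextSketch(
	issue func(target int) (string, error), // hypothetical per-target RPC
	numTargets int,
	perTryTimeout time.Duration,
) (string, error) {
	type result struct {
		reply string
		err   error
	}
	done := make(chan result, numTargets)
	start := func(target int) {
		go func() {
			reply, err := issue(target)
			done <- result{reply: reply, err: err}
		}()
	}
	next := 0
	start(next)
	next++
	errCount := 0
	timer := time.NewTimer(perTryTimeout)
	defer timer.Stop()
	for {
		select {
		case <-timer.C:
			// No reply yet: hedge by also trying the next target, if any.
			if next < numTargets {
				start(next)
				next++
			}
			timer.Reset(perTryTimeout)
		case res := <-done:
			if res.err == nil {
				return res.reply, nil // first successful reply wins
			}
			errCount++
			if errCount == numTargets {
				return "", res.err // every target has been tried and failed
			}
			// On error, move on to the next target immediately if one remains.
			if next < numTargets {
				start(next)
				next++
			}
		}
	}
}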