func (s *adminServer) ClusterFreeze(
	req *serverpb.ClusterFreezeRequest, stream serverpb.Admin_ClusterFreezeServer,
) error {
	var totalAffected int64
	stores := make(map[roachpb.StoreID]roachpb.NodeID)
	process := func(from, to roachpb.Key) (roachpb.Key, error) {
		b := &client.Batch{}
		fa := roachpb.NewChangeFrozen(from, to, req.Freeze, build.GetInfo().Tag)
		b.AddRawRequest(fa)
		if err := s.server.db.Run(b); err != nil {
			return nil, err
		}
		fr := b.RawResponse().Responses[0].GetInner().(*roachpb.ChangeFrozenResponse)
		totalAffected += fr.RangesAffected
		for storeID, nodeID := range fr.Stores {
			stores[storeID] = nodeID
		}
		return fr.MinStartKey.AsRawKey(), nil
	}

	task := "thaw"
	if req.Freeze {
		task = "freeze"
		// When freezing, we save the meta2 and meta1 range for last to avoid
		// interfering with command routing.
		// Note that we freeze only Ranges whose StartKey is included. In
		// particular, a Range which contains some meta keys will not be frozen
		// by the request that begins at Meta2KeyMax. ChangeFreeze gives us the
		// leftmost covered Range back, which we use for the next request to
		// avoid split-related races.
		freezeTo := roachpb.KeyMax // updated as we go along
		freezeFroms := []roachpb.Key{
			keys.Meta2KeyMax, // freeze userspace
			keys.Meta1KeyMax, // freeze all meta2 ranges
			keys.LocalMax,    // freeze first range (meta1)
		}

		for _, freezeFrom := range freezeFroms {
			var err error
			if freezeTo, err = process(freezeFrom, freezeTo); err != nil {
				return err
			}
		}
	} else {
		// When unfreezing, we walk in opposite order and try the first range
		// first. We should be able to get there if the first range manages to
		// gossip. From that, we can talk to the second level replicas, and
		// then to everyone else. Because ChangeFrozen works in forward order,
		// we can simply hit the whole keyspace at once.
		// TODO(tschottdorf): make the first range replicas gossip their
		// descriptor unconditionally or we won't always be able to unfreeze
		// (except by restarting a node which holds the first range).
		if _, err := process(keys.LocalMax, roachpb.KeyMax); err != nil {
			return err
		}
	}
	if err := stream.Send(&serverpb.ClusterFreezeResponse{
		RangesAffected: totalAffected,
		Message:        fmt.Sprintf("proposed %s to %d ranges", task, totalAffected),
	}); err != nil {
		return err
	}
	return s.waitForStoreFrozen(stream, stores, req.Freeze)
}
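
// NOTE: The following example is an editorial sketch, not part of the
// original server code. It shows how a caller might drive the ClusterFreeze
// RPC defined above, assuming the standard gRPC-generated
// serverpb.AdminClient and that "io" and "google.golang.org/grpc" are
// imported; the *grpc.ClientConn is supplied by the caller and is a
// placeholder here.
func exampleClusterFreeze(ctx context.Context, conn *grpc.ClientConn, freeze bool) error {
	c := serverpb.NewAdminClient(conn)
	// freeze == true proposes a freeze; false proposes a thaw.
	stream, err := c.ClusterFreeze(ctx, &serverpb.ClusterFreezeRequest{Freeze: freeze})
	if err != nil {
		return err
	}
	// The server streams progress messages until every store reports the
	// desired state, an error occurs, or the wait times out.
	for {
		resp, err := stream.Recv()
		if err == io.EOF {
			return nil
		}
		if err != nil {
			return err
		}
		fmt.Println(resp.Message)
	}
}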
// waitForStoreFrozen polls the given stores until they all report having no
// unfrozen Replicas (or an error or timeout occurs).
func (s *adminServer) waitForStoreFrozen(
	stream serverpb.Admin_ClusterFreezeServer,
	stores map[roachpb.StoreID]roachpb.NodeID,
	wantFrozen bool,
) error {
	mu := struct {
		sync.Mutex
		oks map[roachpb.StoreID]bool
	}{
		oks: make(map[roachpb.StoreID]bool),
	}
	opts := base.DefaultRetryOptions()
	opts.Closer = s.server.stopper.ShouldDrain()
	opts.MaxRetries = 20
	sem := make(chan struct{}, 256)
	errChan := make(chan error, 1)
	sendErr := func(err error) {
		select {
		case errChan <- err:
		default:
		}
	}
	numWaiting := len(stores) // loop until this drops to zero
	var err error
	for r := retry.Start(opts); r.Next(); {
		mu.Lock()
		for storeID, nodeID := range stores {
			storeID, nodeID := storeID, nodeID // loop-local copies for goroutine
			var nodeDesc roachpb.NodeDescriptor
			if err := s.server.gossip.GetInfoProto(gossip.MakeNodeIDKey(nodeID), &nodeDesc); err != nil {
				sendErr(err)
				break
			}
			addr := nodeDesc.Address.String()

			if _, inflightOrSucceeded := mu.oks[storeID]; inflightOrSucceeded {
				continue
			}
			mu.oks[storeID] = false // mark as inflight
			action := func() (err error) {
				var resp *roachpb.PollFrozenResponse
				defer func() {
					message := fmt.Sprintf("node %d, store %d: ", nodeID, storeID)
					if err != nil {
						message += err.Error()
					} else {
						numMismatching := len(resp.Results)
						mu.Lock()
						if numMismatching == 0 {
							// If the Store is in the right state, mark it as such.
							// This means we won't try it again.
							message += "ready"
							mu.oks[storeID] = true
						} else {
							// Otherwise, forget that we tried the Store so that
							// the retry loop picks it up again.
							message += fmt.Sprintf("%d replicas report wrong status", numMismatching)
							if limit := 10; numMismatching > limit {
								message += " [truncated]: "
								resp.Results = resp.Results[:limit]
							} else {
								message += ": "
							}
							message += fmt.Sprintf("%+v", resp.Results)
							delete(mu.oks, storeID)
						}
						mu.Unlock()
						err = stream.Send(&serverpb.ClusterFreezeResponse{
							Message: message,
						})
					}
				}()
				conn, err := s.server.rpcContext.GRPCDial(addr)
				if err != nil {
					return err
				}
				client := roachpb.NewInternalClient(conn)
				resp, err = client.PollFrozen(context.Background(),
					&roachpb.PollFrozenRequest{
						StoreRequestHeader: roachpb.StoreRequestHeader{
							NodeID:  nodeID,
							StoreID: storeID,
						},
						// If we are looking to freeze everything, we want to
						// collect thawed Replicas, and vice versa.
						CollectFrozen: !wantFrozen,
					})
				return err
			}
			// Run a limited, non-blocking task. That means the task simply
			// won't run if the semaphore is full (or the node is draining).
			// Both are handled by the surrounding retry loop.
			if !s.server.stopper.RunLimitedAsyncTask(sem, func() {
				if err := action(); err != nil {
					sendErr(err)
				}
			}) {
				// Node draining.
				sendErr(errors.New("node is shutting down"))
				break
			}
		}
		numWaiting = len(stores)
		for _, ok := range mu.oks {
			if ok {
				// Store has reported that it is frozen.
				numWaiting--
				continue
			}
		}
		mu.Unlock()

		select {
		case err = <-errChan:
		default:
		}

		// Keep going unless there's been an error or everyone's frozen.
		if err != nil || numWaiting == 0 {
			break
		}
		if err := stream.Send(&serverpb.ClusterFreezeResponse{
			Message: fmt.Sprintf("waiting for %d store%s to apply operation",
				numWaiting, util.Pluralize(int64(numWaiting))),
		}); err != nil {
			return err
		}
	}
	if err != nil {
		return err
	}
	if numWaiting > 0 {
		err = fmt.Errorf("timed out waiting for %d store%s to report freeze",
			numWaiting, util.Pluralize(int64(numWaiting)))
	}
	return err
}
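
// NOTE: The following helper is an editorial sketch, not part of the original
// server code. It distills the polling pattern used by waitForStoreFrozen
// above: a retry loop that fans out a bounded number of asynchronous checks,
// tracks per-target progress in a mutex-protected map, and captures the first
// asynchronous error in a buffered channel without ever blocking. The target
// strings and check function are hypothetical stand-ins; it assumes "sync"
// and "time" are imported and uses a fixed sleep in place of the retry
// package's backoff.
func examplePollUntilReady(targets []string, check func(string) (bool, error), maxAttempts int) error {
	var mu sync.Mutex
	oks := make(map[string]bool) // false = check in flight, true = confirmed ready

	sem := make(chan struct{}, 16) // bound the number of concurrent checks
	errChan := make(chan error, 1)
	sendErr := func(err error) {
		// Keep only the first error; never block the sender.
		select {
		case errChan <- err:
		default:
		}
	}

	numWaiting := len(targets)
	for attempt := 0; attempt < maxAttempts; attempt++ {
		mu.Lock()
		for _, t := range targets {
			t := t // loop-local copy for the goroutine
			if _, inflightOrDone := oks[t]; inflightOrDone {
				continue
			}
			select {
			case sem <- struct{}{}: // acquire a slot
			default:
				continue // semaphore full; leave this target for a later pass
			}
			oks[t] = false // mark as in flight
			go func() {
				defer func() { <-sem }()
				ready, err := check(t)
				mu.Lock()
				defer mu.Unlock()
				if err != nil {
					sendErr(err)
					delete(oks, t) // allow a later pass to retry
				} else if ready {
					oks[t] = true // done; never checked again
				} else {
					delete(oks, t) // wrong state; retry on a later pass
				}
			}()
		}
		numWaiting = len(targets)
		for _, ok := range oks {
			if ok {
				numWaiting--
			}
		}
		mu.Unlock()

		select {
		case err := <-errChan:
			return err
		default:
		}
		if numWaiting == 0 {
			return nil
		}
		time.Sleep(time.Second)
	}
	return fmt.Errorf("timed out waiting for %d target(s)", numWaiting)
}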