// FindHealthyRdonlyEndPoint returns a random healthy endpoint.
// Since we don't want to use them all, we require at least
// minHealthyEndPoints servers to be healthy.
// May block up to -wait_for_healthy_rdonly_endpoints_timeout.
func FindHealthyRdonlyEndPoint(ctx context.Context, wr *wrangler.Wrangler, cell, keyspace, shard string) (*topodatapb.TabletAlias, error) {
	busywaitCtx, busywaitCancel := context.WithTimeout(ctx, *WaitForHealthyEndPointsTimeout)
	defer busywaitCancel()

	// Create a discovery healthcheck and wait for it to see at least
	// one rdonly endpoint at this point.
	healthCheck := discovery.NewHealthCheck(*remoteActionsTimeout, *healthcheckRetryDelay, *healthCheckTimeout, "" /* statsSuffix */)
	watcher := discovery.NewShardReplicationWatcher(wr.TopoServer(), healthCheck, cell, keyspace, shard, *healthCheckTopologyRefresh, 5 /* topoReadConcurrency */)
	defer watcher.Stop()
	defer healthCheck.Close()
	if err := discovery.WaitForEndPoints(ctx, healthCheck, cell, keyspace, shard, []topodatapb.TabletType{topodatapb.TabletType_RDONLY}); err != nil {
		return nil, fmt.Errorf("error waiting for rdonly endpoints for (%v,%v/%v): %v", cell, keyspace, shard, err)
	}

	var healthyEndpoints []*topodatapb.EndPoint
	for {
		select {
		case <-busywaitCtx.Done():
			return nil, fmt.Errorf("not enough endpoints to choose from in (%v,%v/%v), have %v healthy ones, need at least %v, context error: %v", cell, keyspace, shard, len(healthyEndpoints), *minHealthyEndPoints, busywaitCtx.Err())
		default:
		}

		addrs := healthCheck.GetEndPointStatsFromTarget(keyspace, shard, topodatapb.TabletType_RDONLY)
		healthyEndpoints = make([]*topodatapb.EndPoint, 0, len(addrs))
		for _, addr := range addrs {
			// Note we do not check the 'Serving' flag here.
			// This is mainly to avoid the case where we run a
			// Diff between a source and a destination, and the
			// source is not serving (disabled by TabletControl).
			// When we switch the tablet to 'worker', it will
			// go back to the serving state.
			if addr.Stats == nil || addr.Stats.HealthError != "" || addr.Stats.SecondsBehindMaster > 30 {
				continue
			}
			healthyEndpoints = append(healthyEndpoints, addr.EndPoint)
		}
		if len(healthyEndpoints) >= *minHealthyEndPoints {
			break
		}

		deadlineForLog, _ := busywaitCtx.Deadline()
		wr.Logger().Infof("Waiting for enough endpoints to become available. Available: %v, required: %v. Waiting up to %.1f more seconds.", len(healthyEndpoints), *minHealthyEndPoints, deadlineForLog.Sub(time.Now()).Seconds())
		// Block for 1 second because 2 seconds is the -health_check_interval flag value in integration tests.
		timer := time.NewTimer(1 * time.Second)
		select {
		case <-busywaitCtx.Done():
			timer.Stop()
		case <-timer.C:
		}
	}

	// A random server in the list is what we want.
	index := rand.Intn(len(healthyEndpoints))
	return &topodatapb.TabletAlias{
		Cell: cell,
		Uid:  healthyEndpoints[index].Uid,
	}, nil
}
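
// The sketch below is a hypothetical caller, shown only to illustrate how
// FindHealthyRdonlyEndPoint might be used from worker code. The function name
// pickRdonlyTablet and the logged message are assumptions, not part of the
// original source; it assumes the same package and imports as the code above.
func pickRdonlyTablet(ctx context.Context, wr *wrangler.Wrangler, cell, keyspace, shard string) (*topodatapb.TabletAlias, error) {
	// Block until at least *minHealthyEndPoints rdonly tablets are healthy,
	// then receive one of them picked at random.
	alias, err := FindHealthyRdonlyEndPoint(ctx, wr, cell, keyspace, shard)
	if err != nil {
		return nil, fmt.Errorf("no healthy rdonly tablet available in %v/%v (cell %v): %v", keyspace, shard, cell, err)
	}
	wr.Logger().Infof("%v: using rdonly tablet %v/%v as source", cell, alias.Cell, alias.Uid)
	return alias, nil
}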
// Iteration is a single iteration for the player: get the current status,
// try to play, and play until interrupted or until an error occurs.
func (bpc *BinlogPlayerController) Iteration() (err error) {
	defer func() {
		if x := recover(); x != nil {
			log.Errorf("%v: caught panic: %v\n%s", bpc, x, tb.Stack(4))
			err = fmt.Errorf("panic: %v", x)
		}
	}()

	// Apply any special settings necessary for playback of binlogs.
	// We do it on every iteration to be sure, in case MySQL was restarted.
	if err := bpc.mysqld.EnableBinlogPlayback(); err != nil {
		// We failed to apply the required settings, so we shouldn't keep going.
		return err
	}

	// Create the db connection and connect it.
	vtClient := bpc.vtClientFactory()
	if err := vtClient.Connect(); err != nil {
		return fmt.Errorf("can't connect to database: %v", err)
	}
	defer vtClient.Close()

	// Read the start position.
	startPosition, flags, err := binlogplayer.ReadStartPosition(vtClient, bpc.sourceShard.Uid)
	if err != nil {
		return fmt.Errorf("can't read startPosition: %v", err)
	}

	// If we shouldn't start, we just error out and try again later.
	if strings.Contains(flags, binlogplayer.BlpFlagDontStart) {
		return fmt.Errorf("not starting because flag '%v' is set", binlogplayer.BlpFlagDontStart)
	}

	// Wait for the endpoints to be available (useful for the first run at
	// least, fast for subsequent runs).
	if err := discovery.WaitForEndPoints(bpc.ctx, bpc.healthCheck, bpc.cell, bpc.sourceShard.Keyspace, bpc.sourceShard.Shard, []topodatapb.TabletType{topodatapb.TabletType_REPLICA}); err != nil {
		return fmt.Errorf("error waiting for endpoints for %v %v %v: %v", bpc.cell, bpc.sourceShard.String(), topodatapb.TabletType_REPLICA, err)
	}

	// Find the server list from the health check.
	addrs := bpc.healthCheck.GetEndPointStatsFromTarget(bpc.sourceShard.Keyspace, bpc.sourceShard.Shard, topodatapb.TabletType_REPLICA)
	if len(addrs) == 0 {
		return fmt.Errorf("can't find any source tablet for %v %v %v", bpc.cell, bpc.sourceShard.String(), topodatapb.TabletType_REPLICA)
	}
	newServerIndex := rand.Intn(len(addrs))
	endPoint := addrs[newServerIndex].EndPoint

	// Save our current server.
	bpc.playerMutex.Lock()
	bpc.sourceTablet = &topodatapb.TabletAlias{
		Cell: bpc.cell,
		Uid:  endPoint.Uid,
	}
	bpc.lastError = nil
	bpc.playerMutex.Unlock()

	// Check which kind of replication we're doing: tables or keyrange.
	if len(bpc.sourceShard.Tables) > 0 {
		// Tables: first resolve wildcards in the table list.
		tables, err := mysqlctl.ResolveTables(bpc.mysqld, bpc.dbName, bpc.sourceShard.Tables)
		if err != nil {
			return fmt.Errorf("failed to resolve table names: %v", err)
		}
		// Then replay only those tables.
		player, err := binlogplayer.NewBinlogPlayerTables(vtClient, endPoint, tables, bpc.sourceShard.Uid, startPosition, bpc.stopPosition, bpc.binlogPlayerStats)
		if err != nil {
			return fmt.Errorf("NewBinlogPlayerTables failed: %v", err)
		}
		return player.ApplyBinlogEvents(bpc.ctx)
	}

	// The data we have to replicate is the intersection of the
	// source keyrange and our keyrange.
	overlap, err := key.KeyRangesOverlap(bpc.sourceShard.KeyRange, bpc.keyRange)
	if err != nil {
		return fmt.Errorf("source shard %v doesn't overlap destination shard %v", bpc.sourceShard.KeyRange, bpc.keyRange)
	}
	player, err := binlogplayer.NewBinlogPlayerKeyRange(vtClient, endPoint, overlap, bpc.sourceShard.Uid, startPosition, bpc.stopPosition, bpc.binlogPlayerStats)
	if err != nil {
		return fmt.Errorf("NewBinlogPlayerKeyRange failed: %v", err)
	}
	return player.ApplyBinlogEvents(bpc.ctx)
}
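
// The sketch below is a hypothetical driver loop, included only to show how
// Iteration is meant to be retried until the controller's context is canceled.
// The method name retryLoopSketch and the retryDelay parameter are
// assumptions, not part of the original source.
func (bpc *BinlogPlayerController) retryLoopSketch(retryDelay time.Duration) {
	for {
		err := bpc.Iteration()
		if err == nil {
			// A nil error means we reached the stop position: we are done.
			return
		}

		// Log the error and retry after a delay, unless the controller's
		// context has been canceled in the meantime.
		log.Errorf("%v: iteration failed, retrying in %v: %v", bpc, retryDelay, err)
		select {
		case <-bpc.ctx.Done():
			return
		case <-time.After(retryDelay):
		}
	}
}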
func (wr *Wrangler) waitForDrainInCell(ctx context.Context, cell, keyspace, shard string, servedType topodatapb.TabletType, retryDelay, healthCheckTopologyRefresh, healthcheckRetryDelay, healthCheckTimeout time.Duration) error {
	hc := discovery.NewHealthCheck(healthCheckTimeout /* connectTimeout */, healthcheckRetryDelay, healthCheckTimeout, cell)
	defer hc.Close()
	watcher := discovery.NewShardReplicationWatcher(wr.TopoServer(), hc, cell, keyspace, shard, healthCheckTopologyRefresh, 5 /* topoReadConcurrency */)
	defer watcher.Stop()

	if err := discovery.WaitForEndPoints(ctx, hc, cell, keyspace, shard, []topodatapb.TabletType{servedType}); err != nil {
		return fmt.Errorf("%v: error waiting for initial %v endpoints for %v/%v: %v", cell, servedType, keyspace, shard, err)
	}

	wr.Logger().Infof("%v: Waiting for %.1f seconds to make sure that the discovery module retrieves healthcheck information from all tablets.", cell, healthCheckTimeout.Seconds())
	// Wait at least for -vtctl_healthcheck_timeout to elapse to make sure that
	// we see all healthy tablets. Otherwise, we might miss some tablets.
	// It's safe to not wait longer than this because we would only miss slow
	// tablets, and vtgate would not serve from such tablets anyway.
	time.Sleep(healthCheckTimeout)

	// Now check the QPS rate of all tablets until the timeout expires.
	startTime := time.Now()
	for {
		healthyTabletsCount := 0
		// map key: tablet uid
		drainedHealthyTablets := make(map[uint32]*discovery.EndPointStats)
		notDrainedHealthyTablets := make(map[uint32]*discovery.EndPointStats)

		addrs := hc.GetEndPointStatsFromTarget(keyspace, shard, servedType)
		for _, addr := range addrs {
			// TODO(mberlin): Move this health check logic into a common function
			// because other code uses it as well e.g. go/vt/worker/topo_utils.go.
			if addr.Stats == nil || addr.Stats.HealthError != "" || addr.Stats.SecondsBehindMaster > 30 {
				// not healthy
				continue
			}

			healthyTabletsCount++
			if addr.Stats.Qps == 0.0 {
				drainedHealthyTablets[addr.EndPoint.Uid] = addr
			} else {
				notDrainedHealthyTablets[addr.EndPoint.Uid] = addr
			}
		}

		if len(drainedHealthyTablets) == healthyTabletsCount {
			wr.Logger().Infof("%v: All %d healthy tablets were drained after %.1f seconds (not counting %.1f seconds for the initial wait).", cell, healthyTabletsCount, time.Since(startTime).Seconds(), healthCheckTimeout.Seconds())
			break
		}

		// Continue waiting, sleep in between.
		deadlineString := ""
		if d, ok := ctx.Deadline(); ok {
			deadlineString = fmt.Sprintf(" up to %.1f more seconds", d.Sub(time.Now()).Seconds())
		}
		wr.Logger().Infof("%v: Waiting%v for all healthy tablets to be drained (%d/%d done).", cell, deadlineString, len(drainedHealthyTablets), healthyTabletsCount)

		timer := time.NewTimer(retryDelay)
		select {
		case <-ctx.Done():
			timer.Stop()
			var l []string
			for _, eps := range notDrainedHealthyTablets {
				l = append(l, formatEndpointStats(eps))
			}
			return fmt.Errorf("%v: WaitForDrain failed for %v tablets in %v/%v. Only %d/%d tablets were drained. err: %v. List of tablets which were not drained:\n%v", cell, servedType, keyspace, shard, len(drainedHealthyTablets), healthyTabletsCount, ctx.Err(), strings.Join(l, "\n"))
		case <-timer.C:
		}
	}

	return nil
}
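
// The sketch below is a hypothetical wrapper, included only to show how the
// per-cell helper above might be driven for several cells under one overall
// timeout. The method name waitForDrainSketch, the cells slice, and the
// timeout parameter are assumptions, not part of the original source.
func (wr *Wrangler) waitForDrainSketch(ctx context.Context, cells []string, keyspace, shard string, servedType topodatapb.TabletType, retryDelay, healthCheckTopologyRefresh, healthcheckRetryDelay, healthCheckTimeout, timeout time.Duration) error {
	// Bound the whole drain wait so a stuck cell cannot block forever.
	ctx, cancel := context.WithTimeout(ctx, timeout)
	defer cancel()

	// Drain cells one after another; the first failing cell aborts the wait.
	for _, cell := range cells {
		if err := wr.waitForDrainInCell(ctx, cell, keyspace, shard, servedType, retryDelay, healthCheckTopologyRefresh, healthcheckRetryDelay, healthCheckTimeout); err != nil {
			return err
		}
	}
	return nil
}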