Esempio n. 1
0
// FindHealthyRdonlyEndPoint returns a random healthy endpoint.
// Since we don't want to use them all, we require at least
// minHealthyEndPoints servers to be healthy.
// May block up to -wait_for_healthy_rdonly_endpoints_timeout.
func FindHealthyRdonlyEndPoint(ctx context.Context, wr *wrangler.Wrangler, cell, keyspace, shard string) (*topodatapb.TabletAlias, error) {
	busywaitCtx, busywaitCancel := context.WithTimeout(ctx, *WaitForHealthyEndPointsTimeout)
	defer busywaitCancel()

	// create a discovery healthcheck, wait for it to have one rdonly
	// endpoints at this point
	healthCheck := discovery.NewHealthCheck(*remoteActionsTimeout, *healthcheckRetryDelay, *healthCheckTimeout, "" /* statsSuffix */)
	watcher := discovery.NewShardReplicationWatcher(wr.TopoServer(), healthCheck, cell, keyspace, shard, *healthCheckTopologyRefresh, 5 /*topoReadConcurrency*/)
	defer watcher.Stop()
	defer healthCheck.Close()
	if err := discovery.WaitForEndPoints(ctx, healthCheck, cell, keyspace, shard, []topodatapb.TabletType{topodatapb.TabletType_RDONLY}); err != nil {
		return nil, fmt.Errorf("error waiting for rdonly endpoints for (%v,%v/%v): %v", cell, keyspace, shard, err)
	}

	var healthyEndpoints []*topodatapb.EndPoint
	for {
		select {
		case <-busywaitCtx.Done():
			return nil, fmt.Errorf("Not enough endpoints to choose from in (%v,%v/%v), have %v healthy ones, need at least %v Context Error: %v", cell, keyspace, shard, len(healthyEndpoints), *minHealthyEndPoints, busywaitCtx.Err())
		default:
		}

		addrs := healthCheck.GetEndPointStatsFromTarget(keyspace, shard, topodatapb.TabletType_RDONLY)
		healthyEndpoints = make([]*topodatapb.EndPoint, 0, len(addrs))
		for _, addr := range addrs {
			// Note we do not check the 'Serving' flag here.
			// This is mainly to avoid the case where we run a
			// Diff between a source and destination, and the source
			// is not serving (disabled by TabletControl).
			// When we switch the tablet to 'worker', it will
			// go back to serving state.
			if addr.Stats == nil || addr.Stats.HealthError != "" || addr.Stats.SecondsBehindMaster > 30 {
				continue
			}
			healthyEndpoints = append(healthyEndpoints, addr.EndPoint)
		}

		if len(healthyEndpoints) >= *minHealthyEndPoints {
			break
		}

		deadlineForLog, _ := busywaitCtx.Deadline()
		wr.Logger().Infof("Waiting for enough endpoints to become available. available: %v required: %v Waiting up to %.1f more seconds.", len(healthyEndpoints), *minHealthyEndPoints, deadlineForLog.Sub(time.Now()).Seconds())
		// Block for 1 second because 2 seconds is the -health_check_interval flag value in integration tests.
		timer := time.NewTimer(1 * time.Second)
		select {
		case <-busywaitCtx.Done():
			timer.Stop()
		case <-timer.C:
		}
	}

	// random server in the list is what we want
	index := rand.Intn(len(healthyEndpoints))
	return &topodatapb.TabletAlias{
		Cell: cell,
		Uid:  healthyEndpoints[index].Uid,
	}, nil
}
Esempio n. 2
0
// Iteration is a single iteration for the player: get the current status,
// try to play, and plays until interrupted, or until an error occurs.
func (bpc *BinlogPlayerController) Iteration() (err error) {
	defer func() {
		if x := recover(); x != nil {
			log.Errorf("%v: caught panic: %v\n%s", bpc, x, tb.Stack(4))
			err = fmt.Errorf("panic: %v", x)
		}
	}()

	// Apply any special settings necessary for playback of binlogs.
	// We do it on every iteration to be sure, in case MySQL was restarted.
	if err := bpc.mysqld.EnableBinlogPlayback(); err != nil {
		// We failed to apply the required settings, so we shouldn't keep going.
		return err
	}

	// create the db connection, connect it
	vtClient := bpc.vtClientFactory()
	if err := vtClient.Connect(); err != nil {
		return fmt.Errorf("can't connect to database: %v", err)
	}
	defer vtClient.Close()

	// Read the start position
	startPosition, flags, err := binlogplayer.ReadStartPosition(vtClient, bpc.sourceShard.Uid)
	if err != nil {
		return fmt.Errorf("can't read startPosition: %v", err)
	}

	// if we shouldn't start, we just error out and try again later
	if strings.Index(flags, binlogplayer.BlpFlagDontStart) != -1 {
		return fmt.Errorf("not starting because flag '%v' is set", binlogplayer.BlpFlagDontStart)
	}

	// wait for the endpoint set (usefull for the first run at least, fast for next runs)
	if err := discovery.WaitForEndPoints(bpc.healthCheck, bpc.cell, bpc.sourceShard.Keyspace, bpc.sourceShard.Shard, []topodatapb.TabletType{topodatapb.TabletType_REPLICA}); err != nil {
		return fmt.Errorf("error waiting for endpoints for %v %v %v: %v", bpc.cell, bpc.sourceShard.String(), topodatapb.TabletType_REPLICA, err)
	}

	// Find the server list from the health check
	addrs := bpc.healthCheck.GetEndPointStatsFromTarget(bpc.sourceShard.Keyspace, bpc.sourceShard.Shard, topodatapb.TabletType_REPLICA)
	if len(addrs) == 0 {
		return fmt.Errorf("can't find any source tablet for %v %v %v", bpc.cell, bpc.sourceShard.String(), topodatapb.TabletType_REPLICA)
	}
	newServerIndex := rand.Intn(len(addrs))
	endPoint := addrs[newServerIndex].EndPoint

	// save our current server
	bpc.playerMutex.Lock()
	bpc.sourceTablet = &topodatapb.TabletAlias{
		Cell: bpc.cell,
		Uid:  endPoint.Uid,
	}
	bpc.lastError = nil
	bpc.playerMutex.Unlock()

	// check which kind of replication we're doing, tables or keyrange
	if len(bpc.sourceShard.Tables) > 0 {
		// tables, first resolve wildcards
		tables, err := mysqlctl.ResolveTables(bpc.mysqld, bpc.dbName, bpc.sourceShard.Tables)
		if err != nil {
			return fmt.Errorf("failed to resolve table names: %v", err)
		}

		// tables, just get them
		player, err := binlogplayer.NewBinlogPlayerTables(vtClient, endPoint, tables, bpc.sourceShard.Uid, startPosition, bpc.stopPosition, bpc.binlogPlayerStats)
		if err != nil {
			return fmt.Errorf("NewBinlogPlayerTables failed: %v", err)
		}
		return player.ApplyBinlogEvents(bpc.ctx)
	}
	// the data we have to replicate is the intersection of the
	// source keyrange and our keyrange
	overlap, err := key.KeyRangesOverlap(bpc.sourceShard.KeyRange, bpc.keyRange)
	if err != nil {
		return fmt.Errorf("Source shard %v doesn't overlap destination shard %v", bpc.sourceShard.KeyRange, bpc.keyRange)
	}

	player, err := binlogplayer.NewBinlogPlayerKeyRange(vtClient, endPoint, overlap, bpc.sourceShard.Uid, startPosition, bpc.stopPosition, bpc.binlogPlayerStats)
	if err != nil {
		return fmt.Errorf("NewBinlogPlayerKeyRange failed: %v", err)
	}
	return player.ApplyBinlogEvents(bpc.ctx)
}
Esempio n. 3
0
func (wr *Wrangler) waitForDrainInCell(ctx context.Context, cell, keyspace, shard string, servedType topodatapb.TabletType,
	retryDelay, healthCheckTopologyRefresh, healthcheckRetryDelay, healthCheckTimeout time.Duration) error {
	hc := discovery.NewHealthCheck(healthCheckTimeout /* connectTimeout */, healthcheckRetryDelay, healthCheckTimeout, cell)
	defer hc.Close()
	watcher := discovery.NewShardReplicationWatcher(wr.TopoServer(), hc, cell, keyspace, shard, healthCheckTopologyRefresh, 5 /* topoReadConcurrency */)
	defer watcher.Stop()

	if err := discovery.WaitForEndPoints(ctx, hc, cell, keyspace, shard, []topodatapb.TabletType{servedType}); err != nil {
		return fmt.Errorf("%v: error waiting for initial %v endpoints for %v/%v: %v", cell, servedType, keyspace, shard, err)
	}

	wr.Logger().Infof("%v: Waiting for %.1f seconds to make sure that the discovery module retrieves healthcheck information from all tablets.",
		cell, healthCheckTimeout.Seconds())
	// Wait at least for -vtctl_healthcheck_timeout to elapse to make sure that we
	// see all healthy tablets. Otherwise, we might miss some tablets.
	// It's safe to wait not longer for this because we would only miss slow
	// tablets and vtgate would not serve from such tablets anyway.
	time.Sleep(healthCheckTimeout)

	// Now check the QPS rate of all tablets until the timeout expires.
	startTime := time.Now()
	for {
		healthyTabletsCount := 0
		// map key: tablet uid
		drainedHealthyTablets := make(map[uint32]*discovery.EndPointStats)
		notDrainedHealtyTablets := make(map[uint32]*discovery.EndPointStats)

		addrs := hc.GetEndPointStatsFromTarget(keyspace, shard, servedType)
		healthyTabletsCount = 0
		for _, addr := range addrs {
			// TODO(mberlin): Move this health check logic into a common function
			// because other code uses it as well e.g. go/vt/worker/topo_utils.go.
			if addr.Stats == nil || addr.Stats.HealthError != "" || addr.Stats.SecondsBehindMaster > 30 {
				// not healthy
				continue
			}

			healthyTabletsCount++
			if addr.Stats.Qps == 0.0 {
				drainedHealthyTablets[addr.EndPoint.Uid] = addr
			} else {
				notDrainedHealtyTablets[addr.EndPoint.Uid] = addr
			}
		}

		if len(drainedHealthyTablets) == healthyTabletsCount {
			wr.Logger().Infof("%v: All %d healthy tablets were drained after %.1f seconds (not counting %.1f seconds for the initial wait).",
				cell, healthyTabletsCount, time.Now().Sub(startTime).Seconds(), healthCheckTimeout.Seconds())
			break
		}

		// Continue waiting, sleep in between.
		deadlineString := ""
		if d, ok := ctx.Deadline(); ok {
			deadlineString = fmt.Sprintf(" up to %.1f more seconds", d.Sub(time.Now()).Seconds())
		}
		wr.Logger().Infof("%v: Waiting%v for all healthy tablets to be drained (%d/%d done).",
			cell, deadlineString, len(drainedHealthyTablets), healthyTabletsCount)

		timer := time.NewTimer(retryDelay)
		select {
		case <-ctx.Done():
			timer.Stop()

			var l []string
			for _, eps := range notDrainedHealtyTablets {
				l = append(l, formatEndpointStats(eps))
			}
			return fmt.Errorf("%v: WaitForDrain failed for %v tablets in %v/%v. Only %d/%d tablets were drained. err: %v List of tablets which were not drained:\n%v",
				cell, servedType, keyspace, shard, len(drainedHealthyTablets), healthyTabletsCount, ctx.Err(), strings.Join(l, "\n"))
		case <-timer.C:
		}
	}

	return nil
}