Example #1
// WaitBlpPosition waits for filtered replication to reach at least
// the provided position.
func WaitBlpPosition(mysqld MysqlDaemon, bp *blproto.BlpPosition, waitTimeout time.Duration) error {
	timeOut := time.Now().Add(waitTimeout)
	for {
		if time.Now().After(timeOut) {
			break
		}

		cmd := binlogplayer.QueryBlpCheckpoint(bp.Uid)
		qr, err := mysqld.FetchSuperQuery(cmd)
		if err != nil {
			return err
		}
		if len(qr.Rows) != 1 {
			return fmt.Errorf("QueryBlpCheckpoint(%v) returned unexpected row count: %v", bp.Uid, len(qr.Rows))
		}
		var pos proto.ReplicationPosition
		if !qr.Rows[0][0].IsNull() {
			pos, err = proto.DecodeReplicationPosition(qr.Rows[0][0].String())
			if err != nil {
				return err
			}
		}
		if pos.AtLeast(bp.Position) {
			return nil
		}

		log.Infof("Sleeping 1 second waiting for binlog replication(%v) to catch up: %v != %v", bp.Uid, pos, bp.Position)
		time.Sleep(1 * time.Second)
	}

	return fmt.Errorf("WaitBlpPosition(%v) timed out", bp.Uid)
}
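WaitBlpPosition above is a plain poll-until-deadline loop: compute a deadline, check the checkpoint about once a second, and give up once the deadline passes. Below is a minimal standalone sketch of the same pattern; waitUntil and the caughtUp predicate are hypothetical stand-ins for the checkpoint query, not part of the code above.

package main

import (
	"fmt"
	"time"
)

// waitUntil polls caughtUp roughly once per second until it reports true or
// the deadline passes.
func waitUntil(caughtUp func() (bool, error), waitTimeout time.Duration) error {
	deadline := time.Now().Add(waitTimeout)
	for time.Now().Before(deadline) {
		ok, err := caughtUp()
		if err != nil {
			return err
		}
		if ok {
			return nil
		}
		time.Sleep(1 * time.Second)
	}
	return fmt.Errorf("timed out after %v", waitTimeout)
}

func main() {
	start := time.Now()
	// Hypothetical predicate: pretend replication catches up after 3 seconds.
	err := waitUntil(func() (bool, error) {
		return time.Since(start) > 3*time.Second, nil
	}, 10*time.Second)
	fmt.Println(err)
}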
Example #2
// BinlogInfo returns the filename and position for a Google MySQL group_id.
// This command only exists in Google MySQL.
func (mysqld *Mysqld) BinlogInfo(pos proto.ReplicationPosition) (fileName string, filePos uint, err error) {
	if pos.IsZero() {
		return fileName, filePos, fmt.Errorf("input position for BinlogInfo is uninitialized")
	}
	// Extract the group_id from the GoogleGTID. We can't just use String() on the
	// ReplicationPosition, because that includes the server_id.
	gtid, ok := pos.GTIDSet.(proto.GoogleGTID)
	if !ok {
		return "", 0, fmt.Errorf("Non-Google GTID in BinlogInfo(%#v), which is only supported on Google MySQL", pos)
	}
	info, err := mysqld.fetchSuperQueryMap(fmt.Sprintf("SHOW BINLOG INFO FOR %v", gtid.GroupID))
	if err != nil {
		return "", 0, err
	}
	fileName = info["Log_name"]
	temp, err := strconv.ParseUint(info["Pos"], 10, 32)
	if err != nil {
		return fileName, filePos, err
	}
	filePos = uint(temp)
	return fileName, filePos, err
}
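The key guard in BinlogInfo is the checked type assertion pos.GTIDSet.(proto.GoogleGTID): pull the flavor-specific GTID out of the interface, or fail cleanly when the concrete type is not the one this command supports. A self-contained sketch of that guard follows; the GTIDSet and GoogleGTID types here are simplified stand-ins, not the real Vitess definitions.

package main

import "fmt"

// GTIDSet stands in for the flavor-agnostic GTID interface.
type GTIDSet interface {
	String() string
}

// GoogleGTID stands in for the Google MySQL implementation.
type GoogleGTID struct {
	ServerID uint32
	GroupID  uint64
}

func (g GoogleGTID) String() string {
	return fmt.Sprintf("%d-%d", g.ServerID, g.GroupID)
}

// groupID extracts the group_id, rejecting any non-Google GTID set.
func groupID(set GTIDSet) (uint64, error) {
	gtid, ok := set.(GoogleGTID)
	if !ok {
		return 0, fmt.Errorf("non-Google GTID %v: only supported on Google MySQL", set)
	}
	return gtid.GroupID, nil
}

func main() {
	fmt.Println(groupID(GoogleGTID{ServerID: 41983, GroupID: 1758283}))
}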
Example #3
// ReparentPosition returns the replication state needed to reparent a slave
// to the correct master for the specified position.
func (mysqld *Mysqld) ReparentPosition(slavePosition *proto.ReplicationPosition) (rs *proto.ReplicationState, waitPosition *proto.ReplicationPosition, reparentTime int64, err error) {
	qr, err := mysqld.fetchSuperQuery(fmt.Sprintf("SELECT time_created_ns, new_addr, new_position, wait_position FROM _vt.reparent_log WHERE last_position = '%v'", slavePosition.MapKey()))
	if err != nil {
		return
	}
	if len(qr.Rows) != 1 {
		err = fmt.Errorf("no reparent for position: %v", slavePosition.MapKey())
		return
	}

	reparentTime, err = qr.Rows[0][0].ParseInt64()
	if err != nil {
		err = fmt.Errorf("bad reparent time: %v %v %v", slavePosition.MapKey(), qr.Rows[0][0], err)
		return
	}

	file, pos, err := parseReplicationPosition(qr.Rows[0][2].String())
	if err != nil {
		return
	}
	rs, err = proto.NewReplicationState(qr.Rows[0][1].String())
	if err != nil {
		return
	}
	rs.ReplicationPosition.MasterLogFile = file
	rs.ReplicationPosition.MasterLogPosition = uint(pos)

	file, pos, err = parseReplicationPosition(qr.Rows[0][3].String())
	if err != nil {
		return
	}
	waitPosition = new(proto.ReplicationPosition)
	waitPosition.MasterLogFile = file
	waitPosition.MasterLogPosition = pos
	return
}
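ReparentPosition relies on named return values: every early exit just sets err (sometimes wrapping it) and issues a bare return, so the caller gets zero values for anything that was never filled in. The sketch below shows that style in isolation; parseRow and its field names are hypothetical, chosen only to mirror the file/position parsing above.

package main

import (
	"fmt"
	"strconv"
)

// parseRow sets err and bare-returns on failure, leaving the other named
// results at their zero values.
func parseRow(row map[string]string) (file string, pos uint, err error) {
	file, ok := row["Log_name"]
	if !ok {
		err = fmt.Errorf("missing Log_name")
		return
	}
	var temp uint64
	temp, err = strconv.ParseUint(row["Pos"], 10, 32)
	if err != nil {
		err = fmt.Errorf("bad Pos %q: %v", row["Pos"], err)
		return
	}
	pos = uint(temp)
	return
}

func main() {
	// Hypothetical row values, just to exercise the success and error paths.
	fmt.Println(parseRow(map[string]string{"Log_name": "vt-bin.000001", "Pos": "83"}))
	fmt.Println(parseRow(map[string]string{"Log_name": "vt-bin.000001", "Pos": "not-a-number"}))
}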
Example #4
// SlaveStatus returns the slave's current replication position, built from
// the SHOW SLAVE STATUS fields.
func (mysqld *Mysqld) SlaveStatus() (*proto.ReplicationPosition, error) {
	fields, err := mysqld.slaveStatus()
	if err != nil {
		return nil, err
	}
	pos := new(proto.ReplicationPosition)
	// Use Relay_Master_Log_File for the SQL thread position.
	pos.MasterLogFile = fields["Relay_Master_Log_File"]
	pos.MasterLogFileIo = fields["Master_Log_File"]
	temp, _ := strconv.ParseUint(fields["Exec_Master_Log_Pos"], 10, 0)
	pos.MasterLogPosition = uint(temp)
	temp, _ = strconv.ParseUint(fields["Read_Master_Log_Pos"], 10, 0)
	pos.MasterLogPositionIo = uint(temp)
	pos.MasterLogGroupId, _ = strconv.ParseInt(fields["Exec_Master_Group_ID"], 10, 0)

	if fields["Slave_IO_Running"] == "Yes" && fields["Slave_SQL_Running"] == "Yes" {
		temp, _ = strconv.ParseUint(fields["Seconds_Behind_Master"], 10, 0)
		pos.SecondsBehindMaster = uint(temp)
	} else {
		// replication isn't running - report it as invalid since it won't resolve itself.
		pos.SecondsBehindMaster = proto.InvalidLagSeconds
	}
	return pos, nil
}
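SlaveStatus is essentially a translation layer: copy the string fields out of the SHOW SLAVE STATUS map and parse the numeric ones with strconv, falling back to an "invalid lag" sentinel when replication is stopped. Here is a trimmed-down, self-contained sketch of that translation; slavePosition and invalidLag are hypothetical stand-ins for proto.ReplicationPosition and proto.InvalidLagSeconds.

package main

import (
	"fmt"
	"strconv"
)

// invalidLag stands in for proto.InvalidLagSeconds.
const invalidLag = ^uint(0)

type slavePosition struct {
	MasterLogFile       string
	MasterLogPosition   uint
	SecondsBehindMaster uint
}

// parseSlaveStatus fills a position struct from SHOW SLAVE STATUS-style fields.
func parseSlaveStatus(fields map[string]string) *slavePosition {
	pos := &slavePosition{MasterLogFile: fields["Relay_Master_Log_File"]}
	temp, _ := strconv.ParseUint(fields["Exec_Master_Log_Pos"], 10, 0)
	pos.MasterLogPosition = uint(temp)
	if fields["Slave_IO_Running"] == "Yes" && fields["Slave_SQL_Running"] == "Yes" {
		temp, _ = strconv.ParseUint(fields["Seconds_Behind_Master"], 10, 0)
		pos.SecondsBehindMaster = uint(temp)
	} else {
		// Replication isn't running - report lag as invalid since it won't resolve itself.
		pos.SecondsBehindMaster = invalidLag
	}
	return pos
}

func main() {
	fmt.Printf("%+v\n", parseSlaveStatus(map[string]string{
		"Relay_Master_Log_File": "vt-bin.000001",
		"Exec_Master_Log_Pos":   "83",
		"Slave_IO_Running":      "Yes",
		"Slave_SQL_Running":     "Yes",
		"Seconds_Behind_Master": "0",
	}))
}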
Example #5
// Check all the tablets to see if we can proceed with reparenting.
// masterPosition is supplied from the demoted master if we are doing
// this gracefully.
func (wr *Wrangler) checkSlaveConsistency(tabletMap map[uint32]*topo.TabletInfo, masterPosition *myproto.ReplicationPosition) error {
	log.V(6).Infof("checkSlaveConsistency %v %#v", mapKeys(tabletMap), masterPosition)

	// FIXME(msolomon) Something still feels clumsy here and I can't put my finger on it.
	calls := make(chan *rpcContext, len(tabletMap))
	f := func(ti *topo.TabletInfo) {
		ctx := &rpcContext{tablet: ti}
		defer func() {
			calls <- ctx
		}()

		var args *myproto.ReplicationPosition
		if masterPosition != nil {
			// If the master position is known, do our best to wait for replication to catch up.
			args = masterPosition
		} else {
			// In the case where a master is down, look for the last bit of data copied and wait
			// for that to apply. That gives us a chance to wait for all data.
			replPos, err := wr.ai.SlavePosition(ti, wr.actionTimeout())
			if err != nil {
				ctx.err = err
				return
			}
			args = &myproto.ReplicationPosition{
				MasterLogFile:       replPos.MasterLogFileIo,
				MasterLogPositionIo: replPos.MasterLogPositionIo,
			}
		}

		// This option waits for the SQL thread to apply all changes to this instance.
		rp, err := wr.ai.WaitSlavePosition(ti, args, wr.actionTimeout())
		if err != nil {
			ctx.err = err
			return
		}
		ctx.position = rp
	}

	for _, tablet := range tabletMap {
		// Pass loop variable explicitly so we don't have a concurrency issue.
		go f(tablet)
	}

	// map positions to tablets
	positionMap := make(map[string][]uint32)
	for i := 0; i < len(tabletMap); i++ {
		ctx := <-calls
		mapKey := "unavailable-tablet-error"
		if ctx.err == nil {
			mapKey = ctx.position.MapKey()
		}
		if _, ok := positionMap[mapKey]; !ok {
			positionMap[mapKey] = make([]uint32, 0, 32)
		}
		positionMap[mapKey] = append(positionMap[mapKey], ctx.tablet.Alias.Uid)
	}

	if len(positionMap) == 1 {
		// great, everyone agrees
		// masterPosition is nil if demotion failed
		if masterPosition != nil {
			demotedMapKey := masterPosition.MapKey()
			if _, ok := positionMap[demotedMapKey]; !ok {
				for slaveMapKey := range positionMap {
					return fmt.Errorf("slave position doesn't match demoted master: %v != %v", demotedMapKey,
						slaveMapKey)
				}
			}
		}
	} else {
		// FIXME(msolomon) in the event of a crash, do you pick replica that is
		// furthest along or do you promote the majority? data loss vs availability
		// sounds like you pick the latest group and reclone.
		items := make([]string, 0, 32)
		for slaveMapKey, uids := range positionMap {
			tabletPaths := make([]string, len(uids))
			for i, uid := range uids {
				tabletPaths[i] = tabletMap[uid].Alias.String()
			}
			items = append(items, fmt.Sprintf("  %v\n    %v", slaveMapKey, strings.Join(tabletPaths, "\n    ")))
		}
		sort.Strings(items)
		return fmt.Errorf("inconsistent slaves, mark some offline with vtctl ScrapTablet\n%v", strings.Join(items, "\n"))
	}
	return nil
}
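The consistency check is a fan-out/fan-in: one goroutine per tablet pushes its result onto a buffered channel, the main goroutine drains exactly len(tabletMap) results, and positions are bucketed in a map so that a single bucket means everyone agrees. The sketch below reproduces that shape with only the standard library; fetchPosition is a hypothetical stand-in for the SlavePosition/WaitSlavePosition RPCs.

package main

import (
	"fmt"
	"sort"
	"strings"
)

type result struct {
	uid      uint32
	position string
	err      error
}

// checkConsistency fans out fetchPosition per uid, collects results on a
// buffered channel, and groups uids by the position they report.
func checkConsistency(uids []uint32, fetchPosition func(uint32) (string, error)) error {
	calls := make(chan result, len(uids))
	for _, uid := range uids {
		// Pass the loop variable explicitly so each goroutine gets its own copy.
		go func(uid uint32) {
			pos, err := fetchPosition(uid)
			calls <- result{uid: uid, position: pos, err: err}
		}(uid)
	}

	positionMap := make(map[string][]uint32)
	for range uids {
		r := <-calls
		key := "unavailable-tablet-error"
		if r.err == nil {
			key = r.position
		}
		positionMap[key] = append(positionMap[key], r.uid)
	}

	if len(positionMap) == 1 {
		// Everyone agrees.
		return nil
	}
	items := make([]string, 0, len(positionMap))
	for key, group := range positionMap {
		items = append(items, fmt.Sprintf("  %v: %v", key, group))
	}
	sort.Strings(items)
	return fmt.Errorf("inconsistent slaves:\n%v", strings.Join(items, "\n"))
}

func main() {
	err := checkConsistency([]uint32{1, 2, 3}, func(uid uint32) (string, error) {
		// Hypothetical positions: tablet 3 lags behind the others.
		if uid == 3 {
			return "file.000001:80", nil
		}
		return "file.000001:83", nil
	})
	fmt.Println(err)
}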
Example #6
// Check all the tablets to see if we can proceed with reparenting.
// masterPosition is supplied from the demoted master if we are doing
// this gracefully.
func (wr *Wrangler) checkSlaveConsistency(tabletMap map[uint32]*topo.TabletInfo, masterPosition myproto.ReplicationPosition) error {
	wr.logger.Infof("checkSlaveConsistency %v %#v", topotools.MapKeys(tabletMap), masterPosition)

	// FIXME(msolomon) Something still feels clumsy here and I can't put my finger on it.
	calls := make(chan *rpcContext, len(tabletMap))
	f := func(ti *topo.TabletInfo) {
		ctx := &rpcContext{tablet: ti}
		defer func() {
			calls <- ctx
		}()

		if !masterPosition.IsZero() {
			// If the master position is known, do our best to wait for replication to catch up.
			status, err := wr.tmc.WaitSlavePosition(ti, masterPosition, wr.ActionTimeout())
			if err != nil {
				ctx.err = err
				return
			}
			ctx.status = status
		} else {
			// If the master is down, just get the slave status.
			status, err := wr.tmc.SlaveStatus(ti, wr.ActionTimeout())
			if err != nil {
				ctx.err = err
				return
			}
			ctx.status = status
		}
	}

	for _, tablet := range tabletMap {
		// Pass loop variable explicitly so we don't have a concurrency issue.
		go f(tablet)
	}

	// map positions to tablets
	positionMap := make(map[string][]uint32)
	for i := 0; i < len(tabletMap); i++ {
		ctx := <-calls
		mapKey := "unavailable-tablet-error"
		if ctx.err == nil {
			mapKey = ctx.status.Position.String()
		}
		if _, ok := positionMap[mapKey]; !ok {
			positionMap[mapKey] = make([]uint32, 0, 32)
		}
		positionMap[mapKey] = append(positionMap[mapKey], ctx.tablet.Alias.Uid)
	}

	if len(positionMap) == 1 {
		// great, everyone agrees
		// masterPosition is zero if demotion failed
		if !masterPosition.IsZero() {
			demotedMapKey := masterPosition.String()
			if _, ok := positionMap[demotedMapKey]; !ok {
				for slaveMapKey := range positionMap {
					return fmt.Errorf("slave position doesn't match demoted master: %v != %v", demotedMapKey,
						slaveMapKey)
				}
			}
		}
	} else {
		// FIXME(msolomon) in the event of a crash, do you pick replica that is
		// furthest along or do you promote the majority? data loss vs availability
		// sounds like you pick the latest group and reclone.
		items := make([]string, 0, 32)
		for slaveMapKey, uids := range positionMap {
			tabletPaths := make([]string, len(uids))
			for i, uid := range uids {
				tabletPaths[i] = tabletMap[uid].Alias.String()
			}
			items = append(items, fmt.Sprintf("  %v\n    %v", slaveMapKey, strings.Join(tabletPaths, "\n    ")))
		}
		sort.Strings(items)
		return fmt.Errorf("inconsistent slaves, mark some offline with vtctl ScrapTablet\n%v", strings.Join(items, "\n"))
	}
	return nil
}
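The significant change from the previous version is that masterPosition is now passed by value and tested with IsZero() rather than comparing a pointer against nil, and the result channel carries the full slave status. A minimal sketch of the IsZero convention, with a hypothetical Position type standing in for myproto.ReplicationPosition:

package main

import "fmt"

// Position stands in for myproto.ReplicationPosition.
type Position struct {
	GTIDSet string
}

// IsZero reports whether the position was never filled in; it plays the role
// the nil-pointer check played in the earlier version.
func (p Position) IsZero() bool {
	return p.GTIDSet == ""
}

func describe(masterPosition Position) string {
	if !masterPosition.IsZero() {
		return fmt.Sprintf("wait for slaves to reach %v", masterPosition.GTIDSet)
	}
	return "master is down: just collect slave status"
}

func main() {
	fmt.Println(describe(Position{}))
	fmt.Println(describe(Position{GTIDSet: "41983-1758283"}))
}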