Example #1
0
// ReparentPosition returns the replication state that will reparent a slave
// to the correct master for the specified position.
//
// It looks up the row of _vt.reparent_log whose last_position equals
// slavePosition.MapKey() and requires exactly one match. From that row:
//   - rs is built from new_addr, with its replication position (file and
//     position) taken from new_position;
//   - waitPosition is parsed from wait_position;
//   - reparentTime is time_created_ns parsed as an int64.
//
// On any failure err is non-nil and every other result is its zero value, so
// a caller never observes a half-populated state.
func (mysqld *Mysqld) ReparentPosition(slavePosition *proto.ReplicationPosition) (rs *proto.ReplicationState, waitPosition *proto.ReplicationPosition, reparentTime int64, err error) {
	positionKey := slavePosition.MapKey()

	// NOTE(review): the key is interpolated directly into the SQL text; this
	// presumably relies on MapKey() never producing a quote character - confirm.
	qr, err := mysqld.fetchSuperQuery(fmt.Sprintf("SELECT time_created_ns, new_addr, new_position, wait_position FROM _vt.reparent_log WHERE last_position = '%v'", positionKey))
	if err != nil {
		return nil, nil, 0, err
	}

	// Exactly one log entry must describe the reparent for this position.
	switch n := len(qr.Rows); {
	case n == 0:
		return nil, nil, 0, fmt.Errorf("no reparent for position: %v", positionKey)
	case n > 1:
		return nil, nil, 0, fmt.Errorf("ambiguous reparent for position: %v (%v rows)", positionKey, n)
	}
	// Column order follows the SELECT: time_created_ns, new_addr,
	// new_position, wait_position.
	row := qr.Rows[0]

	createdNs, err := row[0].ParseInt64()
	if err != nil {
		return nil, nil, 0, fmt.Errorf("bad reparent time: %v %v %v", positionKey, row[0], err)
	}

	// Parse new_position before building the state so that a malformed
	// position is reported ahead of a malformed address.
	newFile, newPos, err := parseReplicationPosition(row[2].String())
	if err != nil {
		return nil, nil, 0, err
	}
	state, err := proto.NewReplicationState(row[1].String())
	if err != nil {
		return nil, nil, 0, err
	}
	state.ReplicationPosition.MasterLogFile = newFile
	state.ReplicationPosition.MasterLogPosition = uint(newPos)

	waitFile, waitPos, err := parseReplicationPosition(row[3].String())
	if err != nil {
		return nil, nil, 0, err
	}
	wait := &proto.ReplicationPosition{
		MasterLogFile:     waitFile,
		MasterLogPosition: waitPos,
	}

	return state, wait, createdNs, nil
}
Example #2
0
// checkSlaveConsistency checks all the tablets to see if we can proceed with
// reparenting. masterPosition is supplied from the demoted master if we are
// doing this gracefully; it is nil otherwise.
//
// Every tablet in tabletMap is queried concurrently: each one waits for
// either masterPosition or, when that is nil, the position it reports via
// SlavePosition. The resulting positions are grouped by their MapKey().
//
// It returns nil only when all tablets report one and the same position and,
// if masterPosition is given, that position matches it. It returns an error
// when:
//   - every tablet failed to report a position (the individual failures are
//     listed in the error);
//   - the tablets' single shared position differs from masterPosition;
//   - the tablets do not all share one position (tablets that failed form a
//     group of their own), which also covers an empty tabletMap.
func (wr *Wrangler) checkSlaveConsistency(tabletMap map[uint32]*topo.TabletInfo, masterPosition *myproto.ReplicationPosition) error {
	log.V(6).Infof("checkSlaveConsistency %v %#v", mapKeys(tabletMap), masterPosition)

	// Group key shared by all tablets whose position could not be obtained.
	const unavailableKey = "unavailable-tablet-error"

	// FIXME(msolomon) Something still feels clumsy here and I can't put my finger on it.
	// The channel is buffered for one result per tablet so that no goroutine
	// blocks when reporting.
	calls := make(chan *rpcContext, len(tabletMap))
	f := func(ti *topo.TabletInfo) {
		ctx := &rpcContext{tablet: ti}
		// Always report back, on success and on failure alike, so the
		// collection loop below receives exactly len(tabletMap) results.
		defer func() {
			calls <- ctx
		}()

		var args *myproto.ReplicationPosition
		if masterPosition != nil {
			// If the master position is known, do our best to wait for replication to catch up.
			args = masterPosition
		} else {
			// In the case where a master is down, look for the last bit of data copied and wait
			// for that to apply. That gives us a chance to wait for all data.
			replPos, err := wr.ai.SlavePosition(ti, wr.actionTimeout())
			if err != nil {
				ctx.err = err
				return
			}
			// NOTE(review): the file goes into MasterLogFile but the offset
			// goes into MasterLogPositionIo rather than MasterLogPosition.
			// Confirm which fields WaitSlavePosition actually reads.
			args = &myproto.ReplicationPosition{
				MasterLogFile:       replPos.MasterLogFileIo,
				MasterLogPositionIo: replPos.MasterLogPositionIo,
			}
		}

		// This option waits for the SQL thread to apply all changes to this instance.
		rp, err := wr.ai.WaitSlavePosition(ti, args, wr.actionTimeout())
		if err != nil {
			ctx.err = err
			return
		}
		ctx.position = rp
	}

	for _, tablet := range tabletMap {
		// Pass loop variable explicitly so we don't have a concurrency issue.
		go f(tablet)
	}

	// Map positions to tablets, remembering why each failed tablet failed.
	positionMap := make(map[string][]uint32)
	var failures []string
	for i := 0; i < len(tabletMap); i++ {
		ctx := <-calls
		mapKey := unavailableKey
		if ctx.err == nil {
			mapKey = ctx.position.MapKey()
		} else {
			failures = append(failures, fmt.Sprintf("%v: %v", ctx.tablet.Alias.String(), ctx.err))
		}
		positionMap[mapKey] = append(positionMap[mapKey], ctx.tablet.Alias.Uid)
	}

	if len(positionMap) != 1 {
		// FIXME(msolomon) in the event of a crash, do you pick replica that is
		// furthest along or do you promote the majority? data loss vs availability
		// sounds like you pick the latest group and reclone.
		items := make([]string, 0, len(positionMap))
		for slaveMapKey, uids := range positionMap {
			tabletPaths := make([]string, len(uids))
			for i, uid := range uids {
				tabletPaths[i] = tabletMap[uid].Alias.String()
			}
			items = append(items, fmt.Sprintf("  %v\n    %v", slaveMapKey, strings.Join(tabletPaths, "\n    ")))
		}
		// Map iteration order is random; sort for a stable message.
		sort.Strings(items)
		return fmt.Errorf("inconsistent slaves, mark some offline with vtctl ScrapTablet\n%v", strings.Join(items, "\n"))
	}

	// A single group made up only of failures is not agreement: no tablet
	// reported a position at all, so there is nothing to be consistent with.
	if _, allFailed := positionMap[unavailableKey]; allFailed {
		// Results arrive in nondeterministic order; sort for a stable message.
		sort.Strings(failures)
		return fmt.Errorf("no slave position available, every tablet failed:\n  %v", strings.Join(failures, "\n  "))
	}

	// Great, everyone agrees.
	// masterPosition is nil if demotion failed, in which case there is
	// nothing further to compare against.
	if masterPosition != nil {
		demotedMapKey := masterPosition.MapKey()
		if _, ok := positionMap[demotedMapKey]; !ok {
			// positionMap holds exactly one key here; the loop only fetches it.
			for slaveMapKey := range positionMap {
				return fmt.Errorf("slave position doesn't match demoted master: %v != %v", demotedMapKey,
					slaveMapKey)
			}
		}
	}
	return nil
}