Exemplo n.º 1
0
// finishReparent wraps up a reparent once the master-elect has been chosen.
//
// It optionally switches the master-elect to read-write, records the
// master-elect as the master in the shard record, and rebuilds the shard
// serving graph.
//
// The master-elect is only switched to read-write when majorityRestart is
// true and leaveMasterReadOnly is false. That switch is best-effort: if it
// fails, the failure is logged (together with the command to fix it by hand)
// and the function carries on.
//
// The returned error comes from saving the shard record or from rebuilding
// the serving graph.
func (wr *Wrangler) finishReparent(si *topo.ShardInfo, masterElect *topo.TabletInfo, majorityRestart, leaveMasterReadOnly bool) error {
	// If the majority of slaves restarted, move ahead.
	if majorityRestart {
		if leaveMasterReadOnly {
			log.Warningf("leaving master-elect read-only, change with: vtctl SetReadWrite %v", masterElect.Alias)
		} else {
			log.Infof("marking master-elect read-write %v", masterElect.Alias)
			actionPath, err := wr.ai.SetReadWrite(masterElect.Alias)
			if err == nil {
				err = wr.ai.WaitForCompletion(actionPath, wr.actionTimeout())
			}
			if err != nil {
				// Deliberately not returned: the reparent itself is done, so
				// we only report the reason and how to recover manually.
				log.Warningf("marking master-elect read-write failed: %v, leaving master-elect read-only, change with: vtctl SetReadWrite %v", err, masterElect.Alias)
			}
		}
	} else {
		log.Warningf("minority reparent, manual fixes are needed, leaving master-elect read-only, change with: vtctl SetReadWrite %v", masterElect.Alias)
	}

	// Save the new master in the shard info.
	si.MasterAlias = masterElect.Alias
	if err := wr.ts.UpdateShard(si); err != nil {
		log.Errorf("Failed to save new master into shard: %v", err)
		return err
	}

	// We rebuild all the cells, as we may have taken tablets in and
	// out of the graph.
	log.Infof("rebuilding shard serving graph data")
	return topo.RebuildShard(wr.ts, masterElect.Keyspace, masterElect.Shard, topo.RebuildShardOptions{IgnorePartialResult: false})
}
Exemplo n.º 2
0
// changeTypeInternal does the same work as ChangeType, but assumes the
// caller already holds the shard lock and offers no way to force anything.
//
// It changes the type of the tablet identified by tabletAlias to dbType,
// either through an RPC or through a queued action depending on wr.UseRPCs.
// If the tablet was in the serving graph before the change, the shard is
// then rebuilt for the tablet's cell only.
func (wr *Wrangler) changeTypeInternal(tabletAlias topo.TabletAlias, dbType topo.TabletType) error {
	ti, err := wr.ts.GetTablet(tabletAlias)
	if err != nil {
		return err
	}

	// Sampled before the type change is issued; this decides whether a
	// rebuild is needed afterwards.
	wasServing := ti.Tablet.IsInServingGraph()

	// Apply the type change.
	if wr.UseRPCs {
		if rpcErr := wr.ai.RpcChangeType(ti, dbType, wr.actionTimeout()); rpcErr != nil {
			return rpcErr
		}
	} else {
		actionPath, actionErr := wr.ai.ChangeType(ti.Alias, dbType)
		if actionErr != nil {
			return actionErr
		}
		if waitErr := wr.ai.WaitForCompletion(actionPath, wr.actionTimeout()); waitErr != nil {
			return waitErr
		}
	}

	// Nothing to rebuild if the tablet was not being served.
	if !wasServing {
		return nil
	}

	rebuildOptions := topo.RebuildShardOptions{
		Cells:               []string{ti.Alias.Cell},
		IgnorePartialResult: false,
	}
	return topo.RebuildShard(wr.ts, ti.Keyspace, ti.Shard, rebuildOptions)
}
Exemplo n.º 3
0
// shardExternallyReparentedLocked records masterElectTabletAlias as the
// master of keyspace/shard after a reparent that happened outside of this
// code. The comments in the body rely on the shard being locked by the
// caller.
//
// It returns an error if the master-elect is already the recorded master, if
// it is not present in the shard's tablet map, or if any of the reparent,
// shard update or serving graph rebuild steps fail.
func (wr *Wrangler) shardExternallyReparentedLocked(keyspace, shard string, masterElectTabletAlias topo.TabletAlias) error {
	// Critical read: we want up to date info (and the shard is locked).
	// Bail out if the recorded master is already the master-elect.
	si, err := wr.ts.GetShardCritical(keyspace, shard)
	if err != nil {
		return err
	}
	if si.MasterAlias == masterElectTabletAlias {
		return fmt.Errorf("master-elect tablet %v is already master", masterElectTabletAlias)
	}

	// Load the tablets and make sure the master-elect is one we know about.
	// A partial tablet map is tolerated; it usually means a cell is not
	// reachable. Once the checks below have passed we know that:
	// - the global cell is reachable (we just locked and read the shard)
	// - the local cell holding the new master is reachable
	//   (the new master has to show up in the map)
	// That should be enough.
	tablets, err := topo.GetTabletMapForShard(wr.ts, keyspace, shard)
	partial := false
	if err != nil {
		if err != topo.ErrPartialResult {
			return err
		}
		partial = true
		log.Warningf("Got topo.ErrPartialResult from GetTabletMapForShard, may need to re-init some tablets")
	}
	elect, found := tablets[masterElectTabletAlias]
	if !found {
		return fmt.Errorf("master-elect tablet %v not found in replication graph %v/%v %v", masterElectTabletAlias, keyspace, shard, mapKeys(tablets))
	}

	// Split the tablets into slaves and masters, then process them.
	slaves, masters := sortedTabletMap(tablets)
	if err = wr.reparentShardExternal(slaves, masters, elect); err != nil {
		log.Infof("Skipping shard rebuild with failed reparent")
		return err
	}

	// Store the new master in the shard object.
	log.Infof("Updating Shard's MasterAlias record")
	si.MasterAlias = masterElectTabletAlias
	if err = wr.ts.UpdateShard(si); err != nil {
		return err
	}

	// Finally rebuild the shard serving graph (the master record is left
	// alone, it was just written above). Partial results are only ignored
	// if the tablet map itself was partial.
	log.Infof("Rebuilding shard serving graph data")
	rebuildOptions := topo.RebuildShardOptions{IgnorePartialResult: partial}
	return topo.RebuildShard(wr.ts, elect.Keyspace, elect.Shard, rebuildOptions)
}
Exemplo n.º 4
0
// RebuildShardGraph rebuilds the serving and replication rollup data of
// keyspace/shard for the given cells, holding the shard lock for the duration
// so that other changes are locked out.
//
// If the lock cannot be taken, that error is returned and nothing is rebuilt.
// Otherwise the rebuild's error (possibly nil) is handed to unlockShard, and
// whatever unlockShard returns is the result.
func (wr *Wrangler) RebuildShardGraph(keyspace, shard string, cells []string) error {
	node := actionnode.RebuildShard()
	lockPath, lockErr := wr.lockShard(keyspace, shard, node)
	if lockErr != nil {
		return lockErr
	}

	rebuildOptions := topo.RebuildShardOptions{
		Cells:               cells,
		IgnorePartialResult: false,
	}
	rebuildErr := topo.RebuildShard(wr.ts, keyspace, shard, rebuildOptions)
	return wr.unlockShard(keyspace, shard, node, lockPath, rebuildErr)
}
Exemplo n.º 5
0
// RunHealthCheck runs a single health check while holding the action mutex
// and updates the tablet's state when the outcome requires it.
//
// If the tablet is currently a master, the check is run for the master type.
// When the check fails, a tablet whose type is targetTabletType is switched
// to spare; a tablet of any other type is left as it is. When the check
// succeeds, a spare tablet is switched to targetTabletType.
//
// The topo record is only written when the type or the health details
// changed. After a successful write, the serving graph is rebuilt for the
// tablet's own cell and the post-action callbacks are run.
func (agent *ActionAgent) RunHealthCheck(targetTabletType topo.TabletType) {
	agent.actionMutex.Lock()
	defer agent.actionMutex.Unlock()

	// Grab the current tablet record under the agent mutex.
	agent.mutex.Lock()
	current := agent._tablet
	agent.mutex.Unlock()

	// Run the health check; a master is always checked as a master.
	checkedType := targetTabletType
	if current.Type == topo.TYPE_MASTER {
		checkedType = topo.TYPE_MASTER
	}
	details, checkErr := health.Run(checkedType)
	if len(details) == 0 {
		// NOTE(review): presumably normalized to nil so that an empty result
		// compares equal to a nil current.Health below — confirm.
		details = nil
	}

	// Start from "no change" and work out the type we should end up with.
	newTabletType := current.Type
	switch {
	case checkErr != nil && current.Type != targetTabletType:
		log.Infof("Tablet not healthy and in state %v, not changing it: %v", current.Type, checkErr)
		return

	case checkErr != nil:
		log.Infof("Tablet not healthy, converting it from %v to spare: %v", targetTabletType, checkErr)
		newTabletType = topo.TYPE_SPARE

	default:
		// Healthy, possibly with health details. The only type transition
		// made here is from spare to the target type.
		if current.Type == topo.TYPE_SPARE {
			newTabletType = targetTabletType
		}
		if current.Type == newTabletType && reflect.DeepEqual(details, current.Health) {
			// Neither type nor health changed: stay silent, we're done.
			return
		}

		// The record has to be updated.
		log.Infof("Updating tablet record as healthy type %v -> %v with health details %v -> %v", current.Type, newTabletType, current.Health, details)
	}

	// Write the new type and health details.
	if err := ChangeType(agent.TopoServer, current.Alias, newTabletType, details, true /*runHooks*/); err != nil {
		log.Infof("Error updating tablet record: %v", err)
		return
	}

	// Rebuild the serving graph in our cell, under the shard lock.
	// TODO: timeout should be configurable
	// TODO: interrupted may need to be a global one closed when we exit
	interrupted := make(chan struct{})
	node := actionnode.RebuildShard()
	lockPath, lockErr := node.LockShard(agent.TopoServer, current.Keyspace, current.Shard, 5*time.Second, interrupted)
	if lockErr != nil {
		log.Warningf("Cannot lock shard for rebuild: %v", lockErr)
		return
	}
	rebuildOptions := topo.RebuildShardOptions{
		Cells:               []string{current.Alias.Cell},
		IgnorePartialResult: true,
	}
	rebuildErr := topo.RebuildShard(agent.TopoServer, current.Keyspace, current.Shard, rebuildOptions)
	// The rebuild result is passed to UnlockShard; only what UnlockShard
	// returns is examined.
	if unlockErr := node.UnlockShard(agent.TopoServer, current.Keyspace, current.Shard, lockPath, rebuildErr); unlockErr != nil {
		log.Warningf("UnlockShard returned an error: %v", unlockErr)
		return
	}

	// Run the post action callbacks.
	agent.afterAction("healthcheck", false /* reloadSchema */)
}