func (wr *Wrangler) finishReparent(si *topo.ShardInfo, masterElect *topo.TabletInfo, majorityRestart, leaveMasterReadOnly bool) error { // If the majority of slaves restarted, move ahead. if majorityRestart { if leaveMasterReadOnly { log.Warningf("leaving master-elect read-only, change with: vtctl SetReadWrite %v", masterElect.Alias) } else { log.Infof("marking master-elect read-write %v", masterElect.Alias) actionPath, err := wr.ai.SetReadWrite(masterElect.Alias) if err == nil { err = wr.ai.WaitForCompletion(actionPath, wr.actionTimeout()) } if err != nil { log.Warningf("master master-elect read-write failed, leaving master-elect read-only, change with: vtctl SetReadWrite %v", masterElect.Alias) } } } else { log.Warningf("minority reparent, manual fixes are needed, leaving master-elect read-only, change with: vtctl SetReadWrite %v", masterElect.Alias) } // save the new master in the shard info si.MasterAlias = masterElect.Alias if err := wr.ts.UpdateShard(si); err != nil { log.Errorf("Failed to save new master into shard: %v", err) return err } // We rebuild all the cells, as we may have taken tablets in and // out of the graph. log.Infof("rebuilding shard serving graph data") return topo.RebuildShard(wr.ts, masterElect.Keyspace, masterElect.Shard, topo.RebuildShardOptions{IgnorePartialResult: false}) }
// same as ChangeType, but assume we already have the shard lock, // and do not have the option to force anything. func (wr *Wrangler) changeTypeInternal(tabletAlias topo.TabletAlias, dbType topo.TabletType) error { ti, err := wr.ts.GetTablet(tabletAlias) if err != nil { return err } rebuildRequired := ti.Tablet.IsInServingGraph() // change the type if wr.UseRPCs { if err := wr.ai.RpcChangeType(ti, dbType, wr.actionTimeout()); err != nil { return err } } else { actionPath, err := wr.ai.ChangeType(ti.Alias, dbType) if err != nil { return err } err = wr.ai.WaitForCompletion(actionPath, wr.actionTimeout()) if err != nil { return err } } // rebuild if necessary if rebuildRequired { err = topo.RebuildShard(wr.ts, ti.Keyspace, ti.Shard, topo.RebuildShardOptions{ Cells: []string{ti.Alias.Cell}, IgnorePartialResult: false, }) if err != nil { return err } } return nil }
func (wr *Wrangler) shardExternallyReparentedLocked(keyspace, shard string, masterElectTabletAlias topo.TabletAlias) error { // read the shard, make sure the master is not already good. // critical read, we want up to date info (and the shard is locked). shardInfo, err := wr.ts.GetShardCritical(keyspace, shard) if err != nil { return err } if shardInfo.MasterAlias == masterElectTabletAlias { return fmt.Errorf("master-elect tablet %v is already master", masterElectTabletAlias) } // Read the tablets, make sure the master elect is known to us. // Note we will keep going with a partial tablet map, which usually // happens when a cell is not reachable. After these checks, the // guarantees we'll have are: // - global cell is reachable (we just locked and read the shard) // - the local cell that contains the new master is reachable // (as we're going to check the new master is in the list) // That should be enough. tabletMap, err := topo.GetTabletMapForShard(wr.ts, keyspace, shard) partialTopology := false switch err { case nil: // keep going case topo.ErrPartialResult: partialTopology = true log.Warningf("Got topo.ErrPartialResult from GetTabletMapForShard, may need to re-init some tablets") default: return err } masterElectTablet, ok := tabletMap[masterElectTabletAlias] if !ok { return fmt.Errorf("master-elect tablet %v not found in replication graph %v/%v %v", masterElectTabletAlias, keyspace, shard, mapKeys(tabletMap)) } // sort the tablets, and handle them slaveTabletMap, masterTabletMap := sortedTabletMap(tabletMap) err = wr.reparentShardExternal(slaveTabletMap, masterTabletMap, masterElectTablet) if err != nil { log.Infof("Skipping shard rebuild with failed reparent") return err } // now update the master record in the shard object log.Infof("Updating Shard's MasterAlias record") shardInfo.MasterAlias = masterElectTabletAlias if err = wr.ts.UpdateShard(shardInfo); err != nil { return err } // and rebuild the shard serving graph (but do not change the // master 
record, we already did it) log.Infof("Rebuilding shard serving graph data") return topo.RebuildShard(wr.ts, masterElectTablet.Keyspace, masterElectTablet.Shard, topo.RebuildShardOptions{IgnorePartialResult: partialTopology}) }
// Rebuild the serving and replication rollup data data while locking // out other changes. func (wr *Wrangler) RebuildShardGraph(keyspace, shard string, cells []string) error { actionNode := actionnode.RebuildShard() lockPath, err := wr.lockShard(keyspace, shard, actionNode) if err != nil { return err } err = topo.RebuildShard(wr.ts, keyspace, shard, topo.RebuildShardOptions{Cells: cells, IgnorePartialResult: false}) return wr.unlockShard(keyspace, shard, actionNode, lockPath, err) }
// RunHealthCheck takes the action mutex, runs the health check,
// and if we need to change our state, do it.
// If we are the master, we don't change our type, healthy or not.
// If we are not the master, we change to spare if not healthy,
// or to the passed in targetTabletType if healthy.
//
// Note we only update the topo record if we need to, that is if our type or
// health details changed.
func (agent *ActionAgent) RunHealthCheck(targetTabletType topo.TabletType) {
	// Serialize with any other action this agent may be running.
	agent.actionMutex.Lock()
	defer agent.actionMutex.Unlock()

	// read the current tablet record
	// (grab the pointer under the agent mutex for a consistent snapshot)
	agent.mutex.Lock()
	tablet := agent._tablet
	agent.mutex.Unlock()

	// run the health check: a master is always checked as a master,
	// any other type is checked against the target type it would become
	typeForHealthCheck := targetTabletType
	if tablet.Type == topo.TYPE_MASTER {
		typeForHealthCheck = topo.TYPE_MASTER
	}
	// NOTE: the local variable shadows the health package from here on
	health, err := health.Run(typeForHealthCheck)
	// normalize an empty details map to nil so the DeepEqual comparison
	// below treats "no details" consistently
	if len(health) == 0 {
		health = nil
	}

	// start with no change
	newTabletType := tablet.Type
	if err != nil {
		// Not healthy: we only demote to spare if we are currently at
		// the target type; anything else (including master) is left as is.
		if tablet.Type != targetTabletType {
			log.Infof("Tablet not healthy and in state %v, not changing it: %v", tablet.Type, err)
			return
		}
		log.Infof("Tablet not healthy, converting it from %v to spare: %v", targetTabletType, err)
		newTabletType = topo.TYPE_SPARE
	} else {
		// We are healthy, maybe with health, see if we need
		// to update the record. We only change from spare to
		// our target type.
		if tablet.Type == topo.TYPE_SPARE {
			newTabletType = targetTabletType
		}
		if tablet.Type == newTabletType && reflect.DeepEqual(health, tablet.Health) {
			// no change in health, not logging anything,
			// and we're done
			return
		}

		// we need to update our state
		log.Infof("Updating tablet record as healthy type %v -> %v with health details %v -> %v", tablet.Type, newTabletType, tablet.Health, health)
	}

	// Change the Type, update the health
	if err := ChangeType(agent.TopoServer, tablet.Alias, newTabletType, health, true /*runHooks*/); err != nil {
		log.Infof("Error updating tablet record: %v", err)
		return
	}

	// Rebuild the serving graph in our cell.
	// TODO: timeout should be configurable
	// TODO: interrupted may need to be a global one closed when we exit
	interrupted := make(chan struct{})
	actionNode := actionnode.RebuildShard()
	lockPath, err := actionNode.LockShard(agent.TopoServer, tablet.Keyspace, tablet.Shard, 5*time.Second, interrupted)
	if err != nil {
		log.Warningf("Cannot lock shard for rebuild: %v", err)
		return
	}
	err = topo.RebuildShard(agent.TopoServer, tablet.Keyspace, tablet.Shard, topo.RebuildShardOptions{Cells: []string{tablet.Alias.Cell}, IgnorePartialResult: true})
	// the rebuild error is threaded through UnlockShard — presumably it
	// returns whichever of the two errors occurred (NOTE(review): confirm)
	err = actionNode.UnlockShard(agent.TopoServer, tablet.Keyspace, tablet.Shard, lockPath, err)
	if err != nil {
		log.Warningf("UnlockShard returned an error: %v", err)
		return
	}

	// run the post action callbacks
	agent.afterAction("healthcheck", false /* reloadSchema */)
}