// Make this external, since these transitions need to be forced from time to time. func ChangeType(ts topo.Server, tabletAlias topo.TabletAlias, newType topo.TabletType, runHooks bool) error { tablet, err := ts.GetTablet(tabletAlias) if err != nil { return err } if !topo.IsTrivialTypeChange(tablet.Type, newType) || !topo.IsValidTypeChange(tablet.Type, newType) { return fmt.Errorf("cannot change tablet type %v -> %v %v", tablet.Type, newType, tabletAlias) } if runHooks { // Only run the preflight_serving_type hook when // transitioning from non-serving to serving. if !topo.IsInServingGraph(tablet.Type) && topo.IsInServingGraph(newType) { if err := hook.NewSimpleHook("preflight_serving_type").ExecuteOptional(); err != nil { return err } } } tablet.Type = newType if newType == topo.TYPE_IDLE { if tablet.Parent.IsZero() { si, err := ts.GetShard(tablet.Keyspace, tablet.Shard) if err != nil { return err } rec := concurrency.AllErrorRecorder{} wg := sync.WaitGroup{} for _, cell := range si.Cells { wg.Add(1) go func(cell string) { defer wg.Done() sri, err := ts.GetShardReplication(cell, tablet.Keyspace, tablet.Shard) if err != nil { log.Warningf("Cannot check cell %v for extra replication paths, assuming it's good", cell) return } for _, rl := range sri.ReplicationLinks { if rl.Parent == tabletAlias { rec.RecordError(fmt.Errorf("Still have a ReplicationLink in cell %v", cell)) } } }(cell) } wg.Wait() if rec.HasErrors() { return rec.Error() } } tablet.Parent = topo.TabletAlias{} tablet.Keyspace = "" tablet.Shard = "" tablet.KeyRange = key.KeyRange{} } return topo.UpdateTablet(ts, tablet) }
// ChangeSlaveType changes the type of tablet and recomputes all // necessary derived paths in the serving graph, if necessary. // // Note we don't update the master record in the Shard here, as we // can't ChangeType from and out of master anyway. func (wr *Wrangler) ChangeSlaveType(ctx context.Context, tabletAlias *topodatapb.TabletAlias, tabletType topodatapb.TabletType) error { // Load tablet to find endpoint, and keyspace and shard assignment. ti, err := wr.ts.GetTablet(ctx, tabletAlias) if err != nil { return err } if !topo.IsTrivialTypeChange(ti.Type, tabletType) { return fmt.Errorf("tablet %v type change %v -> %v is not an allowed transition for ChangeSlaveType", tabletAlias, ti.Type, tabletType) } // ask the tablet to make the change if err := wr.tmc.ChangeType(ctx, ti, tabletType); err != nil { return err } // if the tablet was or is serving, rebuild the serving graph if ti.IsInServingGraph() || topo.IsInServingGraph(tabletType) { if _, err := wr.RebuildShardGraph(ctx, ti.Tablet.Keyspace, ti.Tablet.Shard, []string{ti.Tablet.Alias.Cell}); err != nil { return err } } return nil }
// rebuildShardIfNeeded will rebuild the serving graph if we need to func (agent *ActionAgent) rebuildShardIfNeeded(tablet *topo.TabletInfo, targetTabletType topo.TabletType, lockTimeout time.Duration) error { if topo.IsInServingGraph(targetTabletType) { // TODO: interrupted may need to be a global one closed when we exit interrupted := make(chan struct{}) if *topotools.UseSrvShardLocks { // no need to take the shard lock in this case if err := topotools.RebuildShard(agent.TopoServer, tablet.Keyspace, tablet.Shard, topotools.RebuildShardOptions{Cells: []string{tablet.Alias.Cell}, IgnorePartialResult: true}, lockTimeout, interrupted); err != nil { return fmt.Errorf("topotools.RebuildShard returned an error: %v", err) } } else { actionNode := actionnode.RebuildShard() lockPath, err := actionNode.LockShard(agent.TopoServer, tablet.Keyspace, tablet.Shard, lockTimeout, interrupted) if err != nil { return fmt.Errorf("cannot lock shard for rebuild: %v", err) } err = topotools.RebuildShard(agent.TopoServer, tablet.Keyspace, tablet.Shard, topotools.RebuildShardOptions{Cells: []string{tablet.Alias.Cell}, IgnorePartialResult: true}, lockTimeout, interrupted) err = actionNode.UnlockShard(agent.TopoServer, tablet.Keyspace, tablet.Shard, lockPath, err) if err != nil { return fmt.Errorf("UnlockShard returned an error: %v", err) } } } return nil }
// updateServingGraph will update the serving graph if we need to. func (agent *ActionAgent) updateServingGraph(tablet *topo.TabletInfo, targetTabletType pbt.TabletType) error { if topo.IsInServingGraph(targetTabletType) { if err := topotools.UpdateTabletEndpoints(agent.batchCtx, agent.TopoServer, tablet.Tablet); err != nil { return fmt.Errorf("UpdateTabletEndpoints failed: %v", err) } } return nil }
// DeleteShard will do all the necessary changes in the topology server // to entirely remove a shard. func (wr *Wrangler) DeleteShard(ctx context.Context, keyspace, shard string, recursive bool) error { shardInfo, err := wr.ts.GetShard(ctx, keyspace, shard) if err != nil { return err } tabletMap, err := topo.GetTabletMapForShard(ctx, wr.ts, keyspace, shard) if err != nil { return err } if recursive { wr.Logger().Infof("Deleting all tablets in shard %v/%v", keyspace, shard) for tabletAlias := range tabletMap { // We don't care about scrapping or updating the replication graph, // because we're about to delete the entire replication graph. wr.Logger().Infof("Deleting tablet %v", tabletAlias) if err := wr.TopoServer().DeleteTablet(ctx, tabletAlias); err != nil && err != topo.ErrNoNode { // Unlike the errors below in non-recursive steps, we don't want to // continue if a DeleteTablet fails. If we continue and delete the // replication graph, the tablet record will be orphaned, since we'll // no longer know it belongs to this shard. // // If the problem is temporary, or resolved externally, re-running // DeleteShard will skip over tablets that were already deleted. return fmt.Errorf("can't delete tablet %v: %v", tabletAlias, err) } } } else if len(tabletMap) > 0 { return fmt.Errorf("shard %v/%v still has %v tablets; use -recursive or remove them manually", keyspace, shard, len(tabletMap)) } // remove the replication graph and serving graph in each cell for _, cell := range shardInfo.Cells { if err := wr.ts.DeleteShardReplication(ctx, cell, keyspace, shard); err != nil && err != topo.ErrNoNode { wr.Logger().Warningf("Cannot delete ShardReplication in cell %v for %v/%v: %v", cell, keyspace, shard, err) } for _, t := range topo.AllTabletTypes { if !topo.IsInServingGraph(t) { continue } if err := wr.ts.DeleteEndPoints(ctx, cell, keyspace, shard, t, -1); err != nil && err != topo.ErrNoNode { wr.Logger().Warningf("Cannot delete EndPoints in cell %v for %v/%v/%v: %v", cell, keyspace, shard, t, err) } } if err := wr.ts.DeleteSrvShard(ctx, cell, keyspace, shard); err != nil && err != topo.ErrNoNode { wr.Logger().Warningf("Cannot delete SrvShard in cell %v for %v/%v: %v", cell, keyspace, shard, err) } } return wr.ts.DeleteShard(ctx, keyspace, shard) }
// rebuildShardIfNeeded will rebuild the serving graph if we need to func (agent *ActionAgent) rebuildShardIfNeeded(tablet *topo.TabletInfo, targetTabletType topo.TabletType) error { if topo.IsInServingGraph(targetTabletType) { // TODO: interrupted may need to be a global one closed when we exit interrupted := make(chan struct{}) // no need to take the shard lock in this case if err := topotools.RebuildShard(logutil.NewConsoleLogger(), agent.TopoServer, tablet.Keyspace, tablet.Shard, []string{tablet.Alias.Cell}, agent.LockTimeout, interrupted); err != nil { return fmt.Errorf("topotools.RebuildShard returned an error: %v", err) } } return nil }
// VtctldSrvType returns the tablet type, possibly linked to the // EndPoints page in vtctld. func VtctldSrvType(cell, keyspace, shard string, tabletType topo.TabletType) template.HTML { if !topo.IsInServingGraph(tabletType) { return template.HTML(tabletType) } return MakeVtctldRedirect(string(tabletType), map[string]string{ "type": "srv_type", "cell": cell, "keyspace": keyspace, "shard": shard, "tablet_type": string(tabletType), }) }
// VtctldSrvType returns the tablet type, possibly linked to the // EndPoints page in vtctld. func VtctldSrvType(cell, keyspace, shard string, tabletType pb.TabletType) template.HTML { strTabletType := strings.ToLower(tabletType.String()) if !topo.IsInServingGraph(tabletType) { return template.HTML(strTabletType) } return MakeVtctldRedirect(strTabletType, map[string]string{ "type": "srv_type", "cell": cell, "keyspace": keyspace, "shard": shard, "tablet_type": strTabletType, }) }
// UpdateTabletEndpoints fixes up any entries in the serving graph that relate // to a given tablet. func UpdateTabletEndpoints(ctx context.Context, ts topo.Server, tablet *pb.Tablet) (err error) { srvTypes, err := ts.GetSrvTabletTypesPerShard(ctx, tablet.Alias.Cell, tablet.Keyspace, tablet.Shard) if err != nil { if err != topo.ErrNoNode { return err } // It's fine if there are no existing types. srvTypes = nil } wg := sync.WaitGroup{} errs := concurrency.AllErrorRecorder{} // Update the list that the tablet is supposed to be in (if any). if topo.IsInServingGraph(tablet.Type) { endpoint, err := topo.TabletEndPoint(tablet) if err != nil { return err } wg.Add(1) go func() { defer wg.Done() errs.RecordError( updateEndpoint(ctx, ts, tablet.Alias.Cell, tablet.Keyspace, tablet.Shard, tablet.Type, endpoint)) }() } // Remove it from any other lists it isn't supposed to be in. for _, srvType := range srvTypes { if srvType != tablet.Type { wg.Add(1) go func(tabletType pb.TabletType) { defer wg.Done() errs.RecordError( removeEndpoint(ctx, ts, tablet.Alias.Cell, tablet.Keyspace, tablet.Shard, tabletType, tablet.Alias.Uid)) }(srvType) } } wg.Wait() return errs.Error() }
// ChangeSlaveType changes the type of tablet and recomputes all // necessary derived paths in the serving graph, if necessary. // // Note we don't update the master record in the Shard here, as we // can't ChangeType from and out of master anyway. func (wr *Wrangler) ChangeSlaveType(ctx context.Context, tabletAlias *pb.TabletAlias, tabletType pb.TabletType) error { // Load tablet to find endpoint, and keyspace and shard assignment. ti, err := wr.ts.GetTablet(ctx, tabletAlias) if err != nil { return err } // ask the tablet to make the change if err := wr.tmc.ChangeType(ctx, ti, tabletType); err != nil { return err } // if the tablet was or is serving, rebuild the serving graph if ti.IsInServingGraph() || topo.IsInServingGraph(tabletType) { if _, err := wr.RebuildShardGraph(ctx, ti.Tablet.Keyspace, ti.Tablet.Shard, []string{ti.Tablet.Alias.Cell}); err != nil { return err } } return nil }
// DeleteShard will do all the necessary changes in the topology server // to entirely remove a shard. It can only work if there are no tablets // in that shard. func (wr *Wrangler) DeleteShard(keyspace, shard string) error { shardInfo, err := wr.ts.GetShard(keyspace, shard) if err != nil { return err } tabletMap, err := GetTabletMapForShard(wr.ts, keyspace, shard) if err != nil { return err } if len(tabletMap) > 0 { return fmt.Errorf("shard %v/%v still has %v tablets", keyspace, shard, len(tabletMap)) } // remove the replication graph and serving graph in each cell for _, cell := range shardInfo.Cells { if err := wr.ts.DeleteShardReplication(cell, keyspace, shard); err != nil { log.Warningf("Cannot delete ShardReplication in cell %v for %v/%v: %v", cell, keyspace, shard, err) } for _, t := range topo.AllTabletTypes { if !topo.IsInServingGraph(t) { continue } if err := wr.ts.DeleteSrvTabletType(cell, keyspace, shard, t); err != nil && err != topo.ErrNoNode { log.Warningf("Cannot delete EndPoints in cell %v for %v/%v/%v: %v", cell, keyspace, shard, t, err) } } if err := wr.ts.DeleteSrvShard(cell, keyspace, shard); err != nil && err != topo.ErrNoNode { log.Warningf("Cannot delete SrvShard in cell %v for %v/%v: %v", cell, keyspace, shard, err) } } return wr.ts.DeleteShard(keyspace, shard) }
// RunHealthCheck takes the action mutex, runs the health check, // and if we need to change our state, do it. // If we are the master, we don't change our type, healthy or not. // If we are not the master, we change to spare if not healthy, // or to the passed in targetTabletType if healthy. // // Note we only update the topo record if we need to, that is if our type or // health details changed. func (agent *ActionAgent) RunHealthCheck(targetTabletType topo.TabletType, lockTimeout time.Duration) { agent.actionMutex.Lock() defer agent.actionMutex.Unlock() // read the current tablet record agent.mutex.Lock() tablet := agent._tablet agent.mutex.Unlock() // run the health check typeForHealthCheck := targetTabletType if tablet.Type == topo.TYPE_MASTER { typeForHealthCheck = topo.TYPE_MASTER } health, err := health.Run(typeForHealthCheck) // start with no change newTabletType := tablet.Type if err != nil { if tablet.Type != targetTabletType { log.Infof("Tablet not healthy and in state %v, not changing it: %v", tablet.Type, err) return } log.Infof("Tablet not healthy, converting it from %v to spare: %v", targetTabletType, err) newTabletType = topo.TYPE_SPARE } else { // We are healthy, maybe with health, see if we need // to update the record. We only change from spare to // our target type. if tablet.Type == topo.TYPE_SPARE { newTabletType = targetTabletType } if tablet.Type == newTabletType && tablet.IsHealthEqual(health) { // no change in health, not logging anything, // and we're done return } // we need to update our state log.Infof("Updating tablet record as healthy type %v -> %v with health details %v -> %v", tablet.Type, newTabletType, tablet.Health, health) } // Change the Type, update the health. Note we pass in a map // that's not nil, meaning if it's empty, we will clear it. if err := topotools.ChangeType(agent.TopoServer, tablet.Alias, newTabletType, health, true /*runHooks*/); err != nil { log.Infof("Error updating tablet record: %v", err) return } // Rebuild the serving graph in our cell, only if we're dealing with // a serving type if topo.IsInServingGraph(targetTabletType) { // TODO: interrupted may need to be a global one closed when we exit interrupted := make(chan struct{}) if *topotools.UseSrvShardLocks { // no need to take the shard lock in this case if err := topotools.RebuildShard(agent.TopoServer, tablet.Keyspace, tablet.Shard, topotools.RebuildShardOptions{Cells: []string{tablet.Alias.Cell}, IgnorePartialResult: true}, lockTimeout, interrupted); err != nil { log.Warningf("topotools.RebuildShard returned an error: %v", err) return } } else { actionNode := actionnode.RebuildShard() lockPath, err := actionNode.LockShard(agent.TopoServer, tablet.Keyspace, tablet.Shard, lockTimeout, interrupted) if err != nil { log.Warningf("Cannot lock shard for rebuild: %v", err) return } err = topotools.RebuildShard(agent.TopoServer, tablet.Keyspace, tablet.Shard, topotools.RebuildShardOptions{Cells: []string{tablet.Alias.Cell}, IgnorePartialResult: true}, lockTimeout, interrupted) err = actionNode.UnlockShard(agent.TopoServer, tablet.Keyspace, tablet.Shard, lockPath, err) if err != nil { log.Warningf("UnlockShard returned an error: %v", err) return } } } // run the post action callbacks agent.afterAction("healthcheck", false /* reloadSchema */) }