func updateReplicationGraphForPromotedSlave(ts topo.Server, tablet *topo.TabletInfo) error { // Remove tablet from the replication graph if this is not already the master. if tablet.Parent.Uid != topo.NO_TABLET { if err := topo.DeleteTabletReplicationData(ts, tablet.Tablet); err != nil && err != topo.ErrNoNode { return err } } // Update tablet regardless - trend towards consistency. tablet.State = topo.STATE_READ_WRITE tablet.Type = topo.TYPE_MASTER tablet.Parent.Cell = "" tablet.Parent.Uid = topo.NO_TABLET err := topo.UpdateTablet(ts, tablet) if err != nil { return err } // NOTE(msolomon) A serving graph update is required, but in // order for the shard to be consistent the old master must be // scrapped first. That is externally coordinated by the // wrangler reparent action. // Insert the new tablet location in the replication graph now that // we've updated the tablet. err = topo.CreateTabletReplicationData(ts, tablet.Tablet) if err != nil && err != topo.ErrNodeExists { return err } return nil }
func SlaveWasRestarted(ts topo.Server, tabletAlias topo.TabletAlias, swrd *actionnode.SlaveWasRestartedArgs) error { tablet, err := ts.GetTablet(tabletAlias) if err != nil { return err } // Once this action completes, update authoritive tablet node first. tablet.Parent = swrd.Parent if tablet.Type == topo.TYPE_MASTER { tablet.Type = topo.TYPE_SPARE tablet.State = topo.STATE_READ_ONLY } err = topo.UpdateTablet(ts, tablet) if err != nil { return err } // Update the new tablet location in the replication graph now that // we've updated the tablet. err = topo.CreateTabletReplicationData(ts, tablet.Tablet) if err != nil && err != topo.ErrNodeExists { return err } return nil }
func (ta *TabletActor) SlaveWasRestarted(actionNode *ActionNode, masterAddr string) error { swrd := actionNode.args.(*SlaveWasRestartedData) tablet, err := ta.ts.GetTablet(ta.tabletAlias) if err != nil { return err } // Remove tablet from the replication graph. if err := topo.DeleteTabletReplicationData(ta.ts, tablet.Tablet); err != nil && err != topo.ErrNoNode { // FIXME(alainjobart) once we don't have replication paths // any more, remove this extra check if err == topo.ErrNotEmpty { log.Infof("Failed to delete master replication path, will be caught later") } else { return err } } // now we can check the reparent actually worked if masterAddr == "" { masterAddr, err = ta.mysqlDaemon.GetMasterAddr() if err != nil { return err } } if masterAddr != swrd.ExpectedMasterAddr && masterAddr != swrd.ExpectedMasterIpAddr { log.Errorf("slaveWasRestarted found unexpected master %v for %v (was expecting %v or %v)", masterAddr, ta.tabletAlias, swrd.ExpectedMasterAddr, swrd.ExpectedMasterIpAddr) if swrd.ScrapStragglers { return Scrap(ta.ts, tablet.Alias(), false) } else { return fmt.Errorf("Unexpected master %v for %v (was expecting %v or %v)", masterAddr, ta.tabletAlias, swrd.ExpectedMasterAddr, swrd.ExpectedMasterIpAddr) } } // Once this action completes, update authoritive tablet node first. tablet.Parent = swrd.Parent if tablet.Type == topo.TYPE_MASTER { tablet.Type = topo.TYPE_SPARE tablet.State = topo.STATE_READ_ONLY } err = topo.UpdateTablet(ta.ts, tablet) if err != nil { return err } // Insert the new tablet location in the replication graph now that // we've updated the tablet. err = topo.CreateTabletReplicationData(ta.ts, tablet.Tablet) if err != nil && err != topo.ErrNodeExists { return err } return nil }
func (ta *TabletActor) slaveWasRestarted(actionNode *ActionNode) error { swrd := actionNode.args.(*SlaveWasRestartedData) tablet, err := ta.ts.GetTablet(ta.tabletAlias) if err != nil { return err } // Remove tablet from the replication graph. if err := topo.DeleteTabletReplicationData(ta.ts, tablet.Tablet, tablet.ReplicationPath()); err != nil && err != topo.ErrNoNode { return err } // now we can check the reparent actually worked masterAddr, err := ta.mysqld.GetMasterAddr() if err != nil { return err } if masterAddr != swrd.ExpectedMasterAddr && masterAddr != swrd.ExpectedMasterIpAddr { log.Errorf("slaveWasRestarted found unexpected master %v for %v (was expecting %v or %v)", masterAddr, ta.tabletAlias, swrd.ExpectedMasterAddr, swrd.ExpectedMasterIpAddr) if swrd.ScrapStragglers { return Scrap(ta.ts, tablet.Alias(), false) } else { return fmt.Errorf("Unexpected master %v for %v (was expecting %v or %v)", masterAddr, ta.tabletAlias, swrd.ExpectedMasterAddr, swrd.ExpectedMasterIpAddr) } } // Once this action completes, update authoritive tablet node first. tablet.Parent = swrd.Parent if tablet.Type == topo.TYPE_MASTER { tablet.Type = topo.TYPE_SPARE tablet.State = topo.STATE_READ_ONLY } err = topo.UpdateTablet(ta.ts, tablet) if err != nil { return err } // Insert the new tablet location in the replication graph now that // we've updated the tablet. err = topo.CreateTabletReplicationData(ta.ts, tablet.Tablet) if err != nil && err != topo.ErrNodeExists { return err } return nil }
func SlaveWasRestarted(ts topo.Server, mysqlDaemon mysqlctl.MysqlDaemon, tabletAlias topo.TabletAlias, swrd *actionnode.SlaveWasRestartedArgs) error { tablet, err := ts.GetTablet(tabletAlias) if err != nil { return err } // check the reparent actually worked masterAddr, err := mysqlDaemon.GetMasterAddr() if err != nil { return err } if masterAddr != swrd.ExpectedMasterAddr && masterAddr != swrd.ExpectedMasterIpAddr { log.Errorf("SlaveWasRestarted found unexpected master %v for %v (was expecting %v or %v)", masterAddr, tabletAlias, swrd.ExpectedMasterAddr, swrd.ExpectedMasterIpAddr) // Disabled for now // if swrd.ContinueOnUnexpectedMaster { // log.Errorf("ContinueOnUnexpectedMaster is set, we keep going anyway") // } else if swrd.ScrapStragglers { return Scrap(ts, tablet.Alias, false) } else { return fmt.Errorf("Unexpected master %v for %v (was expecting %v or %v)", masterAddr, tabletAlias, swrd.ExpectedMasterAddr, swrd.ExpectedMasterIpAddr) } } // Once this action completes, update authoritive tablet node first. tablet.Parent = swrd.Parent if tablet.Type == topo.TYPE_MASTER { tablet.Type = topo.TYPE_SPARE tablet.State = topo.STATE_READ_ONLY } err = topo.UpdateTablet(ts, tablet) if err != nil { return err } // Update the new tablet location in the replication graph now that // we've updated the tablet. err = topo.CreateTabletReplicationData(ts, tablet.Tablet) if err != nil && err != topo.ErrNodeExists { return err } return nil }
// change a tablet type to RESTORE and set all the other arguments. // from now on, we can go to: // - back to IDLE if we don't use the tablet at all (after for instance // a successful ReserveForRestore but a failed Snapshot) // - to SCRAP if something in the process on the target host fails // - to SPARE if the clone works func (ta *TabletActor) changeTypeToRestore(tablet, sourceTablet *topo.TabletInfo, parentAlias topo.TabletAlias, keyRange key.KeyRange) error { // run the optional preflight_assigned hook hk := hook.NewSimpleHook("preflight_assigned") topotools.ConfigureTabletHook(hk, ta.tabletAlias) if err := hk.ExecuteOptional(); err != nil { return err } // change the type tablet.Parent = parentAlias tablet.Keyspace = sourceTablet.Keyspace tablet.Shard = sourceTablet.Shard tablet.Type = topo.TYPE_RESTORE tablet.KeyRange = keyRange tablet.DbNameOverride = sourceTablet.DbNameOverride if err := topo.UpdateTablet(ta.ts, tablet); err != nil { return err } // and create the replication graph items return topo.CreateTabletReplicationData(ta.ts, tablet.Tablet) }
// This is a quick and dirty tool to resurrect the TopologyServer data from the // canonical data stored in the tablet nodes. // // cells: local vt cells to scan for all tablets // keyspaces: list of keyspaces to rebuild func (wr *Wrangler) RebuildReplicationGraph(cells []string, keyspaces []string) error { if cells == nil || len(cells) == 0 { return fmt.Errorf("must specify cells to rebuild replication graph") } if keyspaces == nil || len(keyspaces) == 0 { return fmt.Errorf("must specify keyspaces to rebuild replication graph") } allTablets := make([]*topo.TabletInfo, 0, 1024) for _, cell := range cells { tablets, err := GetAllTablets(wr.ts, cell) if err != nil { return err } allTablets = append(allTablets, tablets...) } for _, keyspace := range keyspaces { log.V(6).Infof("delete keyspace shards: %v", keyspace) if err := wr.ts.DeleteKeyspaceShards(keyspace); err != nil { return err } } keyspacesToRebuild := make(map[string]bool) shardsCreated := make(map[string]bool) hasErr := false mu := sync.Mutex{} wg := sync.WaitGroup{} for _, ti := range allTablets { wg.Add(1) go func(ti *topo.TabletInfo) { defer wg.Done() if !ti.IsInReplicationGraph() { return } if !strInList(keyspaces, ti.Keyspace) { return } mu.Lock() keyspacesToRebuild[ti.Keyspace] = true shardPath := ti.Keyspace + "/" + ti.Shard if !shardsCreated[shardPath] { if err := topo.CreateShard(wr.ts, ti.Keyspace, ti.Shard); err != nil && err != topo.ErrNodeExists { log.Warningf("failed re-creating shard %v: %v", shardPath, err) hasErr = true } else { shardsCreated[shardPath] = true } } mu.Unlock() err := topo.CreateTabletReplicationData(wr.ts, ti.Tablet) if err != nil { mu.Lock() hasErr = true mu.Unlock() log.Warningf("failed creating replication path: %v", err) } }(ti) } wg.Wait() for keyspace := range keyspacesToRebuild { wg.Add(1) go func(keyspace string) { defer wg.Done() if err := wr.RebuildKeyspaceGraph(keyspace, nil); err != nil { mu.Lock() hasErr = true mu.Unlock() log.Warningf("RebuildKeyspaceGraph(%v) failed: %v", keyspace, err) return } }(keyspace) } wg.Wait() if hasErr { return fmt.Errorf("some errors occurred rebuilding replication graph, consult log") } return nil }
// InitTablet creates or updates a tablet. If no parent is specified // in the tablet, and the tablet has a slave type, we will find the // appropriate parent. If createShardAndKeyspace is true and the // parent keyspace or shard don't exist, they will be created. If // update is true, and a tablet with the same ID exists, update it. // If Force is true, and a tablet with the same ID already exists, it // will be scrapped and deleted, and then recreated. func (wr *Wrangler) InitTablet(tablet *topo.Tablet, force, createShardAndKeyspace, update bool) error { if err := tablet.Complete(); err != nil { return err } if tablet.IsInReplicationGraph() { // create the parent keyspace and shard if needed if createShardAndKeyspace { if err := wr.ts.CreateKeyspace(tablet.Keyspace, &topo.Keyspace{}); err != nil && err != topo.ErrNodeExists { return err } if err := topo.CreateShard(wr.ts, tablet.Keyspace, tablet.Shard); err != nil && err != topo.ErrNodeExists { return err } } // get the shard, checks a couple things si, err := wr.ts.GetShard(tablet.Keyspace, tablet.Shard) if err != nil { return fmt.Errorf("missing parent shard, use -parent option to create it, or CreateKeyspace / CreateShard") } if si.KeyRange != tablet.KeyRange { return fmt.Errorf("shard %v/%v has a different KeyRange: %v != %v", tablet.Keyspace, tablet.Shard, si.KeyRange, tablet.KeyRange) } if tablet.Type == topo.TYPE_MASTER && !si.MasterAlias.IsZero() && si.MasterAlias != tablet.Alias && !force { return fmt.Errorf("creating this tablet would override old master %v in shard %v/%v", si.MasterAlias, tablet.Keyspace, tablet.Shard) } // see if we specified a parent, otherwise get it from the shard if tablet.Parent.IsZero() && tablet.Type.IsSlaveType() { if si.MasterAlias.IsZero() { return fmt.Errorf("trying to create tablet %v in shard %v/%v without a master", tablet.Alias, tablet.Keyspace, tablet.Shard) } tablet.Parent = si.MasterAlias } // update the shard record if needed if err := wr.updateShardCellsAndMaster(si, tablet.Alias, tablet.Type, force); err != nil { return err } } err := topo.CreateTablet(wr.ts, tablet) if err != nil && err == topo.ErrNodeExists { // Try to update nicely, but if it fails fall back to force behavior. if update || force { oldTablet, err := wr.ts.GetTablet(tablet.Alias) if err != nil { log.Warningf("failed reading tablet %v: %v", tablet.Alias, err) } else { if oldTablet.Keyspace == tablet.Keyspace && oldTablet.Shard == tablet.Shard { *(oldTablet.Tablet) = *tablet if err := topo.UpdateTablet(wr.ts, oldTablet); err != nil { log.Warningf("failed updating tablet %v: %v", tablet.Alias, err) // now fall through the Scrap case } else { if !tablet.IsInReplicationGraph() { return nil } if err := topo.CreateTabletReplicationData(wr.ts, tablet); err != nil { log.Warningf("failed updating tablet replication data for %v: %v", tablet.Alias, err) // now fall through the Scrap case } else { return nil } } } } } if force { if _, err = wr.Scrap(tablet.Alias, force, false); err != nil { log.Errorf("failed scrapping tablet %v: %v", tablet.Alias, err) return err } if err := wr.ts.DeleteTablet(tablet.Alias); err != nil { // we ignore this log.Errorf("failed deleting tablet %v: %v", tablet.Alias, err) } return topo.CreateTablet(wr.ts, tablet) } } return err }
func (ta *TabletActor) restartSlave(actionNode *actionnode.ActionNode) error { rsd := actionNode.Args.(*actionnode.RestartSlaveData) tablet, err := ta.ts.GetTablet(ta.tabletAlias) if err != nil { return err } // If this check fails, we seem reparented. The only part that // could have failed is the insert in the replication // graph. Do NOT try to reparent again. That will either wedge // replication or corrupt data. if tablet.Parent != rsd.Parent { log.V(6).Infof("restart with new parent") // Remove tablet from the replication graph. if err = topo.DeleteTabletReplicationData(ta.ts, tablet.Tablet); err != nil && err != topo.ErrNoNode { return err } // Move a lag slave into the orphan lag type so we can safely ignore // this reparenting until replication catches up. if tablet.Type == topo.TYPE_LAG { tablet.Type = topo.TYPE_LAG_ORPHAN } else { err = ta.mysqld.RestartSlave(rsd.ReplicationState, rsd.WaitPosition, rsd.TimePromoted) if err != nil { return err } } // Once this action completes, update authoritive tablet node first. tablet.Parent = rsd.Parent err = topo.UpdateTablet(ta.ts, tablet) if err != nil { return err } } else if rsd.Force { err = ta.mysqld.RestartSlave(rsd.ReplicationState, rsd.WaitPosition, rsd.TimePromoted) if err != nil { return err } // Complete the special orphan accounting. if tablet.Type == topo.TYPE_LAG_ORPHAN { tablet.Type = topo.TYPE_LAG err = topo.UpdateTablet(ta.ts, tablet) if err != nil { return err } } } else { // There is nothing to safely reparent, so check replication. If // either replication thread is not running, report an error. replicationPos, err := ta.mysqld.SlaveStatus() if err != nil { return fmt.Errorf("cannot verify replication for slave: %v", err) } if replicationPos.SecondsBehindMaster == myproto.InvalidLagSeconds { return fmt.Errorf("replication not running for slave") } } // Insert the new tablet location in the replication graph now that // we've updated the tablet. err = topo.CreateTabletReplicationData(ta.ts, tablet.Tablet) if err != nil && err != topo.ErrNodeExists { return err } return nil }
// InitTablet creates or updates a tablet. If no parent is specified // in the tablet, and the tablet has a slave type, we will find the // appropriate parent. If createShardAndKeyspace is true and the // parent keyspace or shard don't exist, they will be created. If // update is true, and a tablet with the same ID exists, update it. // If Force is true, and a tablet with the same ID already exists, it // will be scrapped and deleted, and then recreated. func (wr *Wrangler) InitTablet(tablet *topo.Tablet, force, createShardAndKeyspace, update bool) error { if err := tablet.Complete(); err != nil { return err } if tablet.IsInReplicationGraph() { // create the parent keyspace and shard if needed if createShardAndKeyspace { if err := wr.ts.CreateKeyspace(tablet.Keyspace); err != nil && err != topo.ErrNodeExists { return err } if err := topo.CreateShard(wr.ts, tablet.Keyspace, tablet.Shard); err != nil && err != topo.ErrNodeExists { return err } } // get the shard, checks a couple things si, err := wr.ts.GetShard(tablet.Keyspace, tablet.Shard) if err != nil { return fmt.Errorf("missing parent shard, use -parent option to create it, or CreateKeyspace / CreateShard") } if si.KeyRange != tablet.KeyRange { return fmt.Errorf("shard %v/%v has a different KeyRange: %v != %v", tablet.Keyspace, tablet.Shard, si.KeyRange, tablet.KeyRange) } if tablet.Type == topo.TYPE_MASTER && !si.MasterAlias.IsZero() && si.MasterAlias != tablet.Alias() && !force { return fmt.Errorf("creating this tablet would override old master %v in shard %v/%v", si.MasterAlias, tablet.Keyspace, tablet.Shard) } // see if we specified a parent, otherwise get it from the shard if tablet.Parent.IsZero() && tablet.Type.IsSlaveType() { if si.MasterAlias.IsZero() { return fmt.Errorf("trying to create tablet %v in shard %v/%v without a master", tablet.Alias(), tablet.Keyspace, tablet.Shard) } tablet.Parent = si.MasterAlias } // See if we need to update the Shard: // - add the tablet's cell to the shard's Cells if needed // - change the master if needed shardUpdateRequired := false if !si.HasCell(tablet.Cell) { shardUpdateRequired = true } if tablet.Type == topo.TYPE_MASTER && si.MasterAlias != tablet.Alias() { shardUpdateRequired = true } if shardUpdateRequired { actionNode := wr.ai.UpdateShard() lockPath, err := wr.lockShard(tablet.Keyspace, tablet.Shard, actionNode) if err != nil { return err } // re-read the shard with the lock si, err = wr.ts.GetShard(tablet.Keyspace, tablet.Shard) if err != nil { return wr.unlockShard(tablet.Keyspace, tablet.Shard, actionNode, lockPath, err) } // update it wasUpdated := false if !si.HasCell(tablet.Cell) { si.Cells = append(si.Cells, tablet.Cell) wasUpdated = true } if tablet.Type == topo.TYPE_MASTER && si.MasterAlias != tablet.Alias() { if !si.MasterAlias.IsZero() && !force { return wr.unlockShard(tablet.Keyspace, tablet.Shard, actionNode, lockPath, fmt.Errorf("creating this tablet would override old master %v in shard %v/%v", si.MasterAlias, tablet.Keyspace, tablet.Shard)) } si.MasterAlias = tablet.Alias() wasUpdated = true } if wasUpdated { // write it back if err := wr.ts.UpdateShard(si); err != nil { return wr.unlockShard(tablet.Keyspace, tablet.Shard, actionNode, lockPath, err) } } // and unlock if err := wr.unlockShard(tablet.Keyspace, tablet.Shard, actionNode, lockPath, err); err != nil { return err } // also create the cell's ShardReplication if err := wr.ts.CreateShardReplication(tablet.Cell, tablet.Keyspace, tablet.Shard, &topo.ShardReplication{}); err != nil && err != topo.ErrNodeExists { return err } } } err := topo.CreateTablet(wr.ts, tablet) if err != nil && err == topo.ErrNodeExists { // Try to update nicely, but if it fails fall back to force behavior. if update || force { oldTablet, err := wr.ts.GetTablet(tablet.Alias()) if err != nil { log.Warningf("failed reading tablet %v: %v", tablet.Alias(), err) } else { if oldTablet.Keyspace == tablet.Keyspace && oldTablet.Shard == tablet.Shard { *(oldTablet.Tablet) = *tablet if err := topo.UpdateTablet(wr.ts, oldTablet); err != nil { log.Warningf("failed updating tablet %v: %v", tablet.Alias(), err) // now fall through the Scrap case } else { if !tablet.IsInReplicationGraph() { return nil } if err := topo.CreateTabletReplicationData(wr.ts, tablet); err != nil { log.Warningf("failed updating tablet replication data for %v: %v", tablet.Alias(), err) // now fall through the Scrap case } else { return nil } } } } } if force { if _, err = wr.Scrap(tablet.Alias(), force, false); err != nil { log.Errorf("failed scrapping tablet %v: %v", tablet.Alias(), err) return err } if err := wr.ts.DeleteTablet(tablet.Alias()); err != nil { // we ignore this log.Errorf("failed deleting tablet %v: %v", tablet.Alias(), err) } return topo.CreateTablet(wr.ts, tablet) } } return err }