func (wr *Wrangler) finishReparent(si *topo.ShardInfo, masterElect *topo.TabletInfo, majorityRestart, leaveMasterReadOnly bool) error { // If the majority of slaves restarted, move ahead. if majorityRestart { if leaveMasterReadOnly { log.Warningf("leaving master-elect read-only, change with: vtctl SetReadWrite %v", masterElect.Alias) } else { log.Infof("marking master-elect read-write %v", masterElect.Alias) actionPath, err := wr.ai.SetReadWrite(masterElect.Alias) if err == nil { err = wr.ai.WaitForCompletion(actionPath, wr.actionTimeout()) } if err != nil { log.Warningf("master master-elect read-write failed, leaving master-elect read-only, change with: vtctl SetReadWrite %v", masterElect.Alias) } } } else { log.Warningf("minority reparent, manual fixes are needed, leaving master-elect read-only, change with: vtctl SetReadWrite %v", masterElect.Alias) } // save the new master in the shard info si.MasterAlias = masterElect.Alias if err := wr.ts.UpdateShard(si); err != nil { log.Errorf("Failed to save new master into shard: %v", err) return err } // We rebuild all the cells, as we may have taken tablets in and // out of the graph. log.Infof("rebuilding shard serving graph data") return topotools.RebuildShard(wr.ts, masterElect.Keyspace, masterElect.Shard, topotools.RebuildShardOptions{IgnorePartialResult: false}, wr.lockTimeout, interrupted) }
func (wr *Wrangler) finishReparent(si *topo.ShardInfo, masterElect *topo.TabletInfo, majorityRestart, leaveMasterReadOnly bool) error { // If the majority of slaves restarted, move ahead. if majorityRestart { if leaveMasterReadOnly { wr.logger.Warningf("leaving master-elect read-only, change with: vtctl SetReadWrite %v", masterElect.Alias) } else { wr.logger.Infof("marking master-elect read-write %v", masterElect.Alias) if err := wr.tmc.SetReadWrite(masterElect, wr.ActionTimeout()); err != nil { wr.logger.Warningf("master master-elect read-write failed, leaving master-elect read-only, change with: vtctl SetReadWrite %v", masterElect.Alias) } } } else { wr.logger.Warningf("minority reparent, manual fixes are needed, leaving master-elect read-only, change with: vtctl SetReadWrite %v", masterElect.Alias) } // save the new master in the shard info si.MasterAlias = masterElect.Alias if err := topo.UpdateShard(wr.ts, si); err != nil { wr.logger.Errorf("Failed to save new master into shard: %v", err) return err } // We rebuild all the cells, as we may have taken tablets in and // out of the graph. wr.logger.Infof("rebuilding shard serving graph data") _, err := topotools.RebuildShard(wr.logger, wr.ts, masterElect.Keyspace, masterElect.Shard, nil, wr.lockTimeout, interrupted) return err }
// updateShardCellsAndMaster will update the 'Cells' and possibly // MasterAlias records for the shard, if needed. func (wr *Wrangler) updateShardCellsAndMaster(ctx context.Context, si *topo.ShardInfo, tabletAlias topo.TabletAlias, tabletType topo.TabletType, force bool) error { // See if we need to update the Shard: // - add the tablet's cell to the shard's Cells if needed // - change the master if needed shardUpdateRequired := false if !si.HasCell(tabletAlias.Cell) { shardUpdateRequired = true } if tabletType == topo.TYPE_MASTER && si.MasterAlias != tabletAlias { shardUpdateRequired = true } if !shardUpdateRequired { return nil } actionNode := actionnode.UpdateShard() keyspace := si.Keyspace() shard := si.ShardName() lockPath, err := wr.lockShard(ctx, keyspace, shard, actionNode) if err != nil { return err } // re-read the shard with the lock si, err = wr.ts.GetShard(ctx, keyspace, shard) if err != nil { return wr.unlockShard(ctx, keyspace, shard, actionNode, lockPath, err) } // update it wasUpdated := false if !si.HasCell(tabletAlias.Cell) { si.Cells = append(si.Cells, tabletAlias.Cell) wasUpdated = true } if tabletType == topo.TYPE_MASTER && si.MasterAlias != tabletAlias { if !si.MasterAlias.IsZero() && !force { return wr.unlockShard(ctx, keyspace, shard, actionNode, lockPath, fmt.Errorf("creating this tablet would override old master %v in shard %v/%v", si.MasterAlias, keyspace, shard)) } si.MasterAlias = tabletAlias wasUpdated = true } if wasUpdated { // write it back if err := topo.UpdateShard(ctx, wr.ts, si); err != nil { return wr.unlockShard(ctx, keyspace, shard, actionNode, lockPath, err) } } // and unlock return wr.unlockShard(ctx, keyspace, shard, actionNode, lockPath, err) }
// finalizeTabletExternallyReparented performs slow, synchronized reconciliation // tasks that ensure topology is self-consistent, and then marks the reparent as // finished by updating the global shard record. func (agent *ActionAgent) finalizeTabletExternallyReparented(ctx context.Context, si *topo.ShardInfo, ev *events.Reparent) (err error) { var wg sync.WaitGroup var errs concurrency.AllErrorRecorder oldMasterAlias := si.MasterAlias // Update the tablet records concurrently. event.DispatchUpdate(ev, "updating old and new master tablet records") log.Infof("finalizeTabletExternallyReparented: updating tablet records") wg.Add(1) go func() { defer wg.Done() // Update our own record to master. _, err := agent.TopoServer.UpdateTabletFields(ctx, agent.TabletAlias, func(tablet *topodatapb.Tablet) error { tablet.Type = topodatapb.TabletType_MASTER return nil }) if err != nil { errs.RecordError(err) } }() if !topoproto.TabletAliasIsZero(oldMasterAlias) { wg.Add(1) go func() { // Forcibly demote the old master in topology, since we can't rely on the // old master to be up to change its own record. oldMasterTablet, err := agent.TopoServer.UpdateTabletFields(ctx, oldMasterAlias, func(tablet *topodatapb.Tablet) error { tablet.Type = topodatapb.TabletType_REPLICA return nil }) if err != nil { errs.RecordError(err) wg.Done() return } // We now know more about the old master, so add it to event data. ev.OldMaster = *oldMasterTablet wg.Done() // Tell the old master to re-read its tablet record and change its state. // We don't need to wait for it. tmc := tmclient.NewTabletManagerClient() tmc.RefreshState(ctx, oldMasterTablet) }() } tablet := agent.Tablet() // Wait for the tablet records to be updated. At that point, any rebuild will // see the new master, so we're ready to mark the reparent as done in the // global shard record. wg.Wait() if errs.HasErrors() { return errs.Error() } // Update the master field in the global shard record. We don't use a lock // here anymore. The lock was only to ensure that the global shard record // didn't get modified between the time when we read it and the time when we // write it back. Now we use an update loop pattern to do that instead. event.DispatchUpdate(ev, "updating global shard record") log.Infof("finalizeTabletExternallyReparented: updating global shard record if needed") _, err = agent.TopoServer.UpdateShardFields(ctx, tablet.Keyspace, tablet.Shard, func(si *topo.ShardInfo) error { if topoproto.TabletAliasEqual(si.MasterAlias, tablet.Alias) { return topo.ErrNoUpdateNeeded } si.MasterAlias = tablet.Alias return nil }) if err != nil { return err } event.DispatchUpdate(ev, "finished") return nil }