func (wr *Wrangler) SnapshotSourceEnd(tabletAlias topo.TabletAlias, slaveStartRequired, readWrite bool, originalType topo.TabletType) (err error) { var ti *topo.TabletInfo ti, err = wr.ts.GetTablet(tabletAlias) if err != nil { return } var actionPath string actionPath, err = wr.ai.SnapshotSourceEnd(tabletAlias, &tm.SnapshotSourceEndArgs{slaveStartRequired, !readWrite}) if err != nil { return } // wait for completion, and save the error err = wr.ai.WaitForCompletion(actionPath, wr.actionTimeout()) if err != nil { log.Errorf("SnapshotSourceEnd failed (%v), leaving tablet type alone", err) return } if ti.Tablet.Parent.Uid == topo.NO_TABLET { ti.Tablet.Type = topo.TYPE_MASTER err = topo.UpdateTablet(wr.ts, ti) } else { err = wr.ChangeType(ti.Alias(), originalType, false) } return err }
func (wr *Wrangler) restartSlavesExternal(slaveTabletMap map[topo.TabletAlias]*topo.TabletInfo, masterTablet, masterElectTablet *topo.TabletInfo, scrapStragglers bool) error { recorder := concurrency.AllErrorRecorder{} wg := sync.WaitGroup{} swrd := tm.SlaveWasRestartedData{ Parent: masterElectTablet.Alias(), ExpectedMasterAddr: masterElectTablet.MysqlAddr, ExpectedMasterIpAddr: masterElectTablet.MysqlIpAddr, ScrapStragglers: scrapStragglers, } // do all the slaves for _, ti := range slaveTabletMap { wg.Add(1) go func(ti *topo.TabletInfo) { recorder.RecordError(wr.slaveWasRestarted(ti, &swrd)) wg.Done() }(ti) } wg.Wait() // then do the master recorder.RecordError(wr.slaveWasRestarted(masterTablet, &swrd)) return recorder.Error() }
// rpcCallTablet wil execute the RPC on the remote server. func (client *GoRPCTabletManagerClient) rpcCallTablet(ctx context.Context, tablet *topo.TabletInfo, name string, args, reply interface{}) error { // create the RPC client, using ctx.Deadline if set, or no timeout. var connectTimeout time.Duration deadline, ok := ctx.Deadline() if ok { connectTimeout = deadline.Sub(time.Now()) if connectTimeout < 0 { return timeoutError{fmt.Errorf("timeout connecting to TabletManager.%v on %v", name, tablet.Alias)} } } rpcClient, err := bsonrpc.DialHTTP("tcp", tablet.Addr(), connectTimeout) if err != nil { return fmt.Errorf("RPC error for %v: %v", tablet.Alias, err.Error()) } defer rpcClient.Close() // use the context Done() channel. Will handle context timeout. call := rpcClient.Go(ctx, "TabletManager."+name, args, reply, nil) select { case <-ctx.Done(): if ctx.Err() == context.DeadlineExceeded { return timeoutError{fmt.Errorf("timeout waiting for TabletManager.%v to %v", name, tablet.Alias)} } return fmt.Errorf("interrupted waiting for TabletManager.%v to %v", name, tablet.Alias) case <-call.Done: if call.Error != nil { return fmt.Errorf("remote error for %v: %v", tablet.Alias, call.Error.Error()) } return nil } }
func (client *GoRpcTabletManagerClient) MultiSnapshot(tablet *topo.TabletInfo, sa *actionnode.MultiSnapshotArgs, waitTime time.Duration) (<-chan *logutil.LoggerEvent, tmclient.MultiSnapshotReplyFunc, error) { rpcClient, err := bsonrpc.DialHTTP("tcp", tablet.Addr(), waitTime, nil) if err != nil { return nil, nil, err } logstream := make(chan *logutil.LoggerEvent, 10) rpcstream := make(chan *gorpcproto.MultiSnapshotStreamingReply, 10) result := &actionnode.MultiSnapshotReply{} c := rpcClient.StreamGo("TabletManager.MultiSnapshot", sa, rpcstream) go func() { for ssr := range rpcstream { if ssr.Log != nil { logstream <- ssr.Log } if ssr.Result != nil { *result = *ssr.Result } } close(logstream) rpcClient.Close() }() return logstream, func() (*actionnode.MultiSnapshotReply, error) { return result, c.Error }, nil }
func updateReplicationGraphForPromotedSlave(ts topo.Server, tablet *topo.TabletInfo) error { // Remove tablet from the replication graph if this is not already the master. if tablet.Parent.Uid != topo.NO_TABLET { if err := topo.DeleteTabletReplicationData(ts, tablet.Tablet); err != nil && err != topo.ErrNoNode { return err } } // Update tablet regardless - trend towards consistency. tablet.State = topo.STATE_READ_WRITE tablet.Type = topo.TYPE_MASTER tablet.Parent.Cell = "" tablet.Parent.Uid = topo.NO_TABLET err := topo.UpdateTablet(ts, tablet) if err != nil { return err } // NOTE(msolomon) A serving graph update is required, but in // order for the shard to be consistent the old master must be // scrapped first. That is externally coordinated by the // wrangler reparent action. // Insert the new tablet location in the replication graph now that // we've updated the tablet. err = topo.CreateTabletReplicationData(ts, tablet.Tablet) if err != nil && err != topo.ErrNodeExists { return err } return nil }
func (client *fakeTabletManagerClient) GetSchema(ctx context.Context, tablet *topo.TabletInfo, tables, excludeTables []string, includeViews bool) (*proto.SchemaDefinition, error) { result, ok := client.schemaDefinitions[tablet.DbName()] if !ok { return nil, fmt.Errorf("unknown database: %s", tablet.DbName()) } return result, nil }
// UpdateTabletFields implements topo.Server. func (s *Server) UpdateTabletFields(ctx context.Context, tabletAlias topo.TabletAlias, updateFunc func(*topo.Tablet) error) error { var ti *topo.TabletInfo var err error for { if ti, err = s.GetTablet(ctx, tabletAlias); err != nil { return err } if err = updateFunc(ti.Tablet); err != nil { return err } if _, err = s.UpdateTablet(ctx, ti, ti.Version()); err != topo.ErrBadVersion { break } } if err != nil { return err } event.Dispatch(&events.TabletChange{ Tablet: *ti.Tablet, Status: "updated", }) return nil }
func (wr *Wrangler) restartSlave(ti *topo.TabletInfo, rsd *tm.RestartSlaveData) (err error) { log.Infof("restart slave %v", ti.Alias()) actionPath, err := wr.ai.RestartSlave(ti.Alias(), rsd) if err != nil { return err } return wr.ai.WaitForCompletion(actionPath, wr.actionTimeout()) }
func (wr *Wrangler) slaveWasRestarted(ti *topo.TabletInfo, swrd *tm.SlaveWasRestartedData) (err error) { log.Infof("slaveWasRestarted(%v)", ti.Alias()) actionPath, err := wr.ai.SlaveWasRestarted(ti.Alias(), swrd) if err != nil { return err } return wr.ai.WaitForCompletion(actionPath, wr.actionTimeout()) }
func (wr *Wrangler) checkMasterElect(ti *topo.TabletInfo) error { // Check the master-elect is fit for duty - call out for hardware checks. // if the server was already serving live traffic, it's probably good if ti.IsInServingGraph() { return nil } return wr.ExecuteOptionalTabletInfoHook(ti, hook.NewSimpleHook("preflight_serving_type")) }
func newTabletNodeFromTabletInfo(ti *topo.TabletInfo) *TabletNode { if err := ti.ValidatePortmap(); err != nil { log.Errorf("ValidatePortmap(%v): %v", ti.Alias, err) } return &TabletNode{ Host: ti.Hostname, Port: ti.Portmap["vt"], Alias: ti.Alias, } }
func (wr *Wrangler) getMasterPosition(ti *topo.TabletInfo) (*mysqlctl.ReplicationPosition, error) { actionPath, err := wr.ai.MasterPosition(ti.Alias()) if err != nil { return nil, err } result, err := wr.ai.WaitForCompletionReply(actionPath, wr.actionTimeout()) if err != nil { return nil, err } return result.(*mysqlctl.ReplicationPosition), nil }
// applySQLShard applies a given SQL change on a given tablet alias. It allows executing arbitrary // SQL statements, but doesn't return any results, so it's only useful for SQL statements // that would be run for their effects (e.g., CREATE). // It works by applying the SQL statement on the shard's master tablet with replication turned on. // Thus it should be used only for changes that can be applied on a live instance without causing issues; // it shouldn't be used for anything that will require a pivot. // The SQL statement string is expected to have {{.DatabaseName}} in place of the actual db name. func (wr *Wrangler) applySQLShard(ctx context.Context, tabletInfo *topo.TabletInfo, change string, reloadSchema bool) error { filledChange, err := fillStringTemplate(change, map[string]string{"DatabaseName": tabletInfo.DbName()}) if err != nil { return fmt.Errorf("fillStringTemplate failed: %v", err) } ctx, cancel := context.WithTimeout(ctx, 30*time.Second) defer cancel() // Need to make sure that we enable binlog, since we're only applying the statement on masters. _, err = wr.tmc.ExecuteFetchAsDba(ctx, tabletInfo, filledChange, 0, false, false, reloadSchema) return err }
func (wr *Wrangler) slaveWasPromoted(ti *topo.TabletInfo) error { log.Infof("slave was promoted %v", ti.Alias()) actionPath, err := wr.ai.SlaveWasPromoted(ti.Alias()) if err != nil { return err } err = wr.ai.WaitForCompletion(actionPath, wr.actionTimeout()) if err != nil { return err } return nil }
func (wr *Wrangler) demoteMaster(ti *topo.TabletInfo) (*mysqlctl.ReplicationPosition, error) { log.Infof("demote master %v", ti.Alias()) actionPath, err := wr.ai.DemoteMaster(ti.Alias()) if err != nil { return nil, err } err = wr.ai.WaitForCompletion(actionPath, wr.actionTimeout()) if err != nil { return nil, err } return wr.getMasterPosition(ti) }
func (tee *Tee) UpdateTablet(tablet *topo.TabletInfo, existingVersion int) (newVersion int, err error) { if newVersion, err = tee.primary.UpdateTablet(tablet, existingVersion); err != nil { // failed on primary, not updating secondary return } if _, err := tee.secondary.UpdateTablet(tablet, existingVersion); err != nil { // not critical enough to fail relog.Warning("secondary.UpdateTablet(%v) failed: %v", tablet.Alias(), err) } return }
func (wr *Wrangler) ExecuteTabletInfoHook(ti *topo.TabletInfo, hook *hk.Hook) (hookResult *hk.HookResult, err error) { actionPath, err := wr.ai.ExecuteHook(ti.Alias(), hook) if err != nil { return nil, err } var hr interface{} if hr, err = wr.ai.WaitForCompletionReply(actionPath, 10*time.Minute); err != nil { return nil, err } return hr.(*hk.HookResult), nil }
func (wr *Wrangler) promoteSlave(ti *topo.TabletInfo) (rsd *tm.RestartSlaveData, err error) { log.Infof("promote slave %v", ti.Alias()) actionPath, err := wr.ai.PromoteSlave(ti.Alias()) if err != nil { return } result, err := wr.ai.WaitForCompletionReply(actionPath, wr.actionTimeout()) if err != nil { return } rsd = result.(*tm.RestartSlaveData) return }
func (client *GoRpcTabletManagerClient) MultiRestore(tablet *topo.TabletInfo, sa *actionnode.MultiRestoreArgs, waitTime time.Duration) (<-chan *logutil.LoggerEvent, tmclient.ErrFunc, error) { rpcClient, err := bsonrpc.DialHTTP("tcp", tablet.Addr(), waitTime, nil) if err != nil { return nil, nil, err } logstream := make(chan *logutil.LoggerEvent, 10) c := rpcClient.StreamGo("TabletManager.MultiRestore", sa, logstream) return logstream, func() error { rpcClient.Close() return c.Error }, nil }
func (zkts *Server) UpdateTablet(tablet *topo.TabletInfo, existingVersion int) (int, error) { zkTabletPath := TabletPathForAlias(tablet.Alias()) stat, err := zkts.zconn.Set(zkTabletPath, tablet.Json(), existingVersion) if err != nil { if zookeeper.IsError(err, zookeeper.ZBADVERSION) { err = topo.ErrBadVersion } else if zookeeper.IsError(err, zookeeper.ZNONODE) { err = topo.ErrNoNode } return 0, err } return stat.Version(), nil }
// ExecuteFetchAsDba is part of the tmclient.TabletManagerClient interface func (client *GoRPCTabletManagerClient) ExecuteFetchAsDba(ctx context.Context, tablet *topo.TabletInfo, query string, maxRows int, wantFields, disableBinlogs, reloadSchema bool) (*mproto.QueryResult, error) { var qr mproto.QueryResult if err := client.rpcCallTablet(ctx, tablet, actionnode.TabletActionExecuteFetchAsDba, &gorpcproto.ExecuteFetchArgs{ Query: query, DbName: tablet.DbName(), MaxRows: maxRows, WantFields: wantFields, DisableBinlogs: disableBinlogs, ReloadSchema: reloadSchema, }, &qr); err != nil { return nil, err } return &qr, nil }
func (wr *Wrangler) reparentShardExternal(slaveTabletMap, masterTabletMap map[topo.TabletAlias]*topo.TabletInfo, masterElectTablet *topo.TabletInfo, scrapStragglers bool, acceptSuccessPercents int) error { // we fix the new master in the replication graph err := wr.slaveWasPromoted(masterElectTablet) if err != nil { // This suggests that the master-elect is dead. This is bad. return fmt.Errorf("slaveWasPromoted(%v) failed: %v", masterElectTablet, err) } // Once the slave is promoted, remove it from our maps delete(slaveTabletMap, masterElectTablet.Alias()) delete(masterTabletMap, masterElectTablet.Alias()) // then fix all the slaves, including the old master return wr.restartSlavesExternal(slaveTabletMap, masterTabletMap, masterElectTablet, scrapStragglers, acceptSuccessPercents) }
// Backup is part of the tmclient.TabletManagerClient interface func (client *GoRPCTabletManagerClient) Backup(ctx context.Context, tablet *topo.TabletInfo, concurrency int) (<-chan *logutil.LoggerEvent, tmclient.ErrFunc, error) { var connectTimeout time.Duration deadline, ok := ctx.Deadline() if ok { connectTimeout = deadline.Sub(time.Now()) if connectTimeout < 0 { return nil, nil, timeoutError{fmt.Errorf("timeout connecting to TabletManager.Backup on %v", tablet.Alias)} } } rpcClient, err := bsonrpc.DialHTTP("tcp", tablet.Addr(), connectTimeout) if err != nil { return nil, nil, err } logstream := make(chan *logutil.LoggerEvent, 10) rpcstream := make(chan *logutil.LoggerEvent, 10) c := rpcClient.StreamGo("TabletManager.Backup", &gorpcproto.BackupArgs{ Concurrency: concurrency, }, rpcstream) interrupted := false go func() { for { select { case <-ctx.Done(): // context is done interrupted = true close(logstream) rpcClient.Close() return case ssr, ok := <-rpcstream: if !ok { close(logstream) rpcClient.Close() return } logstream <- ssr } } }() return logstream, func() error { // this is only called after streaming is done if interrupted { return fmt.Errorf("TabletManager.Backup interrupted by context") } return c.Error }, nil }
// agentRPCTestIsTimeoutErrorDialTimeout verifies that client.IsTimeoutError() // returns true for RPCs failed due to a connect timeout during .Dial(). func agentRPCTestIsTimeoutErrorDialTimeout(ctx context.Context, t *testing.T, client tmclient.TabletManagerClient, ti *topo.TabletInfo) { // Connect to a non-existing tablet. // For example, this provokes gRPC to return error grpc.ErrClientConnTimeout. invalidTi := topo.NewTabletInfo(ti.Tablet, ti.Version()) invalidTi.Tablet = proto.Clone(invalidTi.Tablet).(*topodatapb.Tablet) invalidTi.Tablet.Hostname = "Non-Existent.Server" shortCtx, cancel := context.WithTimeout(ctx, time.Millisecond) defer cancel() err := client.Ping(shortCtx, invalidTi) if err == nil { t.Fatal("agentRPCTestIsTimeoutErrorDialTimeout: connect to non-existant tablet did not fail") } if !client.IsTimeoutError(err) { t.Errorf("agentRPCTestIsTimeoutErrorDialTimeout: want: IsTimeoutError() = true. error: %v", err) } }
// ExecuteOptionalTabletInfoHook executes a hook and returns an error // only if the hook failed, not if the hook doesn't exist. func (wr *Wrangler) ExecuteOptionalTabletInfoHook(ctx context.Context, ti *topo.TabletInfo, hook *hk.Hook) (err error) { hr, err := wr.ExecuteTabletInfoHook(ctx, ti, hook) if err != nil { return err } if hr.ExitStatus == hk.HOOK_DOES_NOT_EXIST { log.Infof("Hook %v doesn't exist on tablet %v", hook.Name, ti.AliasString()) return nil } if hr.ExitStatus != hk.HOOK_SUCCESS { return fmt.Errorf("Hook %v failed(%v): %v", hook.Name, hr.ExitStatus, hr.Stderr) } return nil }
// HealthStream is part of the tmclient.TabletManagerClient interface func (client *GoRPCTabletManagerClient) HealthStream(ctx context.Context, tablet *topo.TabletInfo) (<-chan *actionnode.HealthStreamReply, tmclient.ErrFunc, error) { var connectTimeout time.Duration deadline, ok := ctx.Deadline() if ok { connectTimeout = deadline.Sub(time.Now()) if connectTimeout < 0 { return nil, nil, timeoutError{fmt.Errorf("timeout connecting to TabletManager.HealthStream on %v", tablet.Alias)} } } rpcClient, err := bsonrpc.DialHTTP("tcp", tablet.Addr(), connectTimeout, nil) if err != nil { return nil, nil, err } logstream := make(chan *actionnode.HealthStreamReply, 10) rpcstream := make(chan *actionnode.HealthStreamReply, 10) c := rpcClient.StreamGo("TabletManager.HealthStream", "", rpcstream) interrupted := false go func() { for { select { case <-ctx.Done(): // context is done interrupted = true close(logstream) rpcClient.Close() return case hsr, ok := <-rpcstream: if !ok { close(logstream) rpcClient.Close() return } logstream <- hsr } } }() return logstream, func() error { // this is only called after streaming is done if interrupted { return fmt.Errorf("TabletManager.HealthStreamReply interrupted by context") } return c.Error }, nil }
// ExecuteFetchAsDba is part of the tmclient.TabletManagerClient interface func (client *Client) ExecuteFetchAsDba(ctx context.Context, tablet *topo.TabletInfo, query string, maxRows int, disableBinlogs, reloadSchema bool) (*querypb.QueryResult, error) { cc, c, err := client.dial(ctx, tablet) if err != nil { return nil, err } defer cc.Close() response, err := c.ExecuteFetchAsDba(ctx, &tabletmanagerdatapb.ExecuteFetchAsDbaRequest{ Query: query, DbName: tablet.DbName(), MaxRows: uint64(maxRows), DisableBinlogs: disableBinlogs, ReloadSchema: reloadSchema, }) if err != nil { return nil, err } return response.Result, nil }
// ExecuteFetchAsDba is part of the tmclient.TabletManagerClient interface func (client *Client) ExecuteFetchAsDba(ctx context.Context, tablet *topo.TabletInfo, query string, maxRows int, wantFields, disableBinlogs, reloadSchema bool) (*mproto.QueryResult, error) { cc, c, err := client.dial(ctx, tablet) if err != nil { return nil, err } defer cc.Close() response, err := c.ExecuteFetchAsDba(ctx, &pb.ExecuteFetchAsDbaRequest{ Query: query, DbName: tablet.DbName(), MaxRows: uint64(maxRows), WantFields: wantFields, DisableBinlogs: disableBinlogs, ReloadSchema: reloadSchema, }) if err != nil { return nil, err } return mproto.Proto3ToQueryResult(response.Result), nil }
// UpdateTablet is part of the topo.Server interface func (zkts *Server) UpdateTablet(ctx context.Context, tablet *topo.TabletInfo, existingVersion int64) (int64, error) { zkTabletPath := TabletPathForAlias(tablet.Alias) stat, err := zkts.zconn.Set(zkTabletPath, tablet.JSON(), int(existingVersion)) if err != nil { if zookeeper.IsError(err, zookeeper.ZBADVERSION) { err = topo.ErrBadVersion } else if zookeeper.IsError(err, zookeeper.ZNONODE) { err = topo.ErrNoNode } return 0, err } event.Dispatch(&events.TabletChange{ Tablet: *tablet.Tablet, Status: "updated", }) return int64(stat.Version()), nil }
// executeFetchLoop loops over the provided insertChannel // and sends the commands to the provided tablet. func executeFetchLoop(wr *wrangler.Wrangler, ti *topo.TabletInfo, insertChannel chan string, abort chan struct{}) error { for { select { case cmd, ok := <-insertChannel: if !ok { // no more to read, we're done return nil } cmd = "INSERT INTO `" + ti.DbName() + "`." + cmd _, err := wr.TabletManagerClient().ExecuteFetch(ti, cmd, 0, false, true, 30*time.Second) if err != nil { return fmt.Errorf("ExecuteFetch failed: %v", err) } case <-abort: // FIXME(alainjobart): note this select case // could be starved here, and we might miss // the abort in some corner cases. return nil } } }