Beispiel #1
0
// SnapshotSourceEnd asks the tablet to run the SnapshotSourceEnd action, waits
// for it to complete, and then restores the tablet's type.
//
// If the action cannot be queued or does not complete, the error is returned
// and the tablet type is left untouched. Otherwise a tablet with no parent is
// written back as TYPE_MASTER, and any other tablet is changed to originalType.
func (wr *Wrangler) SnapshotSourceEnd(tabletAlias topo.TabletAlias, slaveStartRequired, readWrite bool, originalType topo.TabletType) (err error) {
	tablet, err := wr.ts.GetTablet(tabletAlias)
	if err != nil {
		return err
	}

	// The second field is the inverse of readWrite.
	args := &tm.SnapshotSourceEndArgs{slaveStartRequired, !readWrite}
	path, err := wr.ai.SnapshotSourceEnd(tabletAlias, args)
	if err != nil {
		return err
	}

	// Block until the action finishes; on failure keep the current type.
	if err = wr.ai.WaitForCompletion(path, wr.actionTimeout()); err != nil {
		log.Errorf("SnapshotSourceEnd failed (%v), leaving tablet type alone", err)
		return err
	}

	if tablet.Tablet.Parent.Uid != topo.NO_TABLET {
		return wr.ChangeType(tablet.Alias(), originalType, false)
	}

	// No parent: record the tablet as a master directly in the topology.
	tablet.Tablet.Type = topo.TYPE_MASTER
	return topo.UpdateTablet(wr.ts, tablet)
}
Beispiel #2
0
// restartSlavesExternal sends a SlaveWasRestarted notification to every tablet
// in slaveTabletMap concurrently, waits for all of them, and then sends the
// same notification to masterTablet. All errors are collected and returned
// together through the error recorder.
func (wr *Wrangler) restartSlavesExternal(slaveTabletMap map[topo.TabletAlias]*topo.TabletInfo, masterTablet, masterElectTablet *topo.TabletInfo, scrapStragglers bool) error {
	// Payload shared by every notification: it describes the master-elect.
	notice := tm.SlaveWasRestartedData{
		Parent:               masterElectTablet.Alias(),
		ExpectedMasterAddr:   masterElectTablet.MysqlAddr,
		ExpectedMasterIpAddr: masterElectTablet.MysqlIpAddr,
		ScrapStragglers:      scrapStragglers,
	}

	var (
		errs    concurrency.AllErrorRecorder
		pending sync.WaitGroup
	)

	// Slaves first, one goroutine each.
	for _, slave := range slaveTabletMap {
		pending.Add(1)
		go func(slave *topo.TabletInfo) {
			defer pending.Done()
			errs.RecordError(wr.slaveWasRestarted(slave, &notice))
		}(slave)
	}
	pending.Wait()

	// The master is handled last, once every slave has been notified.
	errs.RecordError(wr.slaveWasRestarted(masterTablet, &notice))
	return errs.Error()
}
Beispiel #3
0
// rpcCallTablet will execute the RPC on the remote server.
//
// It dials tablet.Addr(), invokes "TabletManager."+name with args, and stores
// the response in reply. The connection is closed before returning.
//
// A timeoutError is returned when the context deadline has already passed
// before dialing, or when the context ends with context.DeadlineExceeded while
// waiting for the call. Any other context termination, a dial failure, or an
// error reported by the call is returned as a plain error.
func (client *GoRPCTabletManagerClient) rpcCallTablet(ctx context.Context, tablet *topo.TabletInfo, name string, args, reply interface{}) error {
	// create the RPC client, using ctx.Deadline if set, or no timeout.
	var connectTimeout time.Duration
	if deadline, ok := ctx.Deadline(); ok {
		connectTimeout = deadline.Sub(time.Now())
		// Use <= 0 rather than < 0: a zero Duration is the value passed when
		// there is no deadline at all, so an exactly-expired deadline must not
		// reach the dialer as zero.
		if connectTimeout <= 0 {
			return timeoutError{fmt.Errorf("timeout connecting to TabletManager.%v on %v", name, tablet.Alias)}
		}
	}
	rpcClient, err := bsonrpc.DialHTTP("tcp", tablet.Addr(), connectTimeout)
	if err != nil {
		return fmt.Errorf("RPC error for %v: %v", tablet.Alias, err.Error())
	}
	defer rpcClient.Close()

	// use the context Done() channel. Will handle context timeout.
	call := rpcClient.Go(ctx, "TabletManager."+name, args, reply, nil)
	select {
	case <-ctx.Done():
		if ctx.Err() == context.DeadlineExceeded {
			return timeoutError{fmt.Errorf("timeout waiting for TabletManager.%v to %v", name, tablet.Alias)}
		}
		return fmt.Errorf("interrupted waiting for TabletManager.%v to %v", name, tablet.Alias)
	case <-call.Done:
		if call.Error != nil {
			return fmt.Errorf("remote error for %v: %v", tablet.Alias, call.Error.Error())
		}
		return nil
	}
}
Beispiel #4
0
// MultiSnapshot starts the streaming TabletManager.MultiSnapshot RPC on the
// tablet, using waitTime for the dial.
//
// It returns a channel of log events and a function yielding the final reply
// together with the call's error. A background goroutine forwards log entries
// from the RPC stream to the returned channel and remembers the last result
// seen; when the RPC stream ends it closes the log channel and then the
// connection. The reply function reads state written by that goroutine, so it
// is presumably meant to be called only after the log channel is closed.
func (client *GoRpcTabletManagerClient) MultiSnapshot(tablet *topo.TabletInfo, sa *actionnode.MultiSnapshotArgs, waitTime time.Duration) (<-chan *logutil.LoggerEvent, tmclient.MultiSnapshotReplyFunc, error) {
	conn, err := bsonrpc.DialHTTP("tcp", tablet.Addr(), waitTime, nil)
	if err != nil {
		return nil, nil, err
	}

	events := make(chan *logutil.LoggerEvent, 10)
	replies := make(chan *gorpcproto.MultiSnapshotStreamingReply, 10)
	final := &actionnode.MultiSnapshotReply{}

	call := conn.StreamGo("TabletManager.MultiSnapshot", sa, replies)

	go func() {
		// Deferred in reverse order: the log channel is closed first, then
		// the connection.
		defer conn.Close()
		defer close(events)

		for reply := range replies {
			if reply.Log != nil {
				events <- reply.Log
			}
			if reply.Result != nil {
				*final = *reply.Result
			}
		}
	}()

	replyFunc := func() (*actionnode.MultiSnapshotReply, error) {
		return final, call.Error
	}
	return events, replyFunc, nil
}
Beispiel #5
0
// updateReplicationGraphForPromotedSlave rewrites the topology so that tablet
// is recorded as a read-write master with no parent.
//
// Steps, in order: drop the tablet's existing replication data if it still has
// a parent (a missing node is tolerated), save the tablet with its new
// state/type/parent, then create its replication data again (an already
// existing node is tolerated).
func updateReplicationGraphForPromotedSlave(ts topo.Server, tablet *topo.TabletInfo) error {
	// Only a tablet that is not already the master has an entry to remove.
	hasParent := tablet.Parent.Uid != topo.NO_TABLET
	if hasParent {
		err := topo.DeleteTabletReplicationData(ts, tablet.Tablet)
		if err != nil && err != topo.ErrNoNode {
			return err
		}
	}

	// The tablet record is rewritten in every case, to converge on a
	// consistent state.
	tablet.State = topo.STATE_READ_WRITE
	tablet.Type = topo.TYPE_MASTER
	tablet.Parent.Cell = ""
	tablet.Parent.Uid = topo.NO_TABLET
	if err := topo.UpdateTablet(ts, tablet); err != nil {
		return err
	}

	// A serving graph update is still needed afterwards, but the old master
	// has to be scrapped first for the shard to be consistent; the wrangler
	// reparent action coordinates that from outside this function.

	// With the tablet saved, publish its new place in the replication graph.
	if err := topo.CreateTabletReplicationData(ts, tablet.Tablet); err != nil && err != topo.ErrNodeExists {
		return err
	}
	return nil
}
Beispiel #6
0
// GetSchema returns the canned schema definition registered for the tablet's
// database name, or an "unknown database" error when none is registered. The
// tables, excludeTables and includeViews arguments are ignored by this fake.
func (client *fakeTabletManagerClient) GetSchema(ctx context.Context, tablet *topo.TabletInfo, tables, excludeTables []string, includeViews bool) (*proto.SchemaDefinition, error) {
	if definition, found := client.schemaDefinitions[tablet.DbName()]; found {
		return definition, nil
	}
	return nil, fmt.Errorf("unknown database: %s", tablet.DbName())
}
Beispiel #7
0
// UpdateTabletFields implements topo.Server.
//
// It reads the tablet, applies updateFunc to it, and writes it back at the
// version that was read. When the write reports topo.ErrBadVersion the whole
// read-modify-write cycle is repeated. After a successful write a TabletChange
// event with status "updated" is dispatched. Errors from the read, from
// updateFunc, or from the write (other than ErrBadVersion) are returned as-is.
func (s *Server) UpdateTabletFields(ctx context.Context, tabletAlias topo.TabletAlias, updateFunc func(*topo.Tablet) error) error {
	for {
		current, err := s.GetTablet(ctx, tabletAlias)
		if err != nil {
			return err
		}
		if err = updateFunc(current.Tablet); err != nil {
			return err
		}

		_, err = s.UpdateTablet(ctx, current, current.Version())
		switch err {
		case topo.ErrBadVersion:
			// Someone else wrote in between; start over from a fresh read.
			continue
		case nil:
			event.Dispatch(&events.TabletChange{
				Tablet: *current.Tablet,
				Status: "updated",
			})
			return nil
		default:
			return err
		}
	}
}
// restartSlave logs the request, queues a RestartSlave action carrying rsd for
// the tablet, and waits up to the wrangler's action timeout for it to
// complete. It returns the first error from queuing or waiting.
func (wr *Wrangler) restartSlave(ti *topo.TabletInfo, rsd *tm.RestartSlaveData) (err error) {
	log.Infof("restart slave %v", ti.Alias())

	path, err := wr.ai.RestartSlave(ti.Alias(), rsd)
	if err == nil {
		err = wr.ai.WaitForCompletion(path, wr.actionTimeout())
	}
	return err
}
Beispiel #9
0
// slaveWasRestarted logs the request, queues a SlaveWasRestarted action
// carrying swrd for the tablet, and waits up to the wrangler's action timeout
// for it to complete. It returns the first error from queuing or waiting.
func (wr *Wrangler) slaveWasRestarted(ti *topo.TabletInfo, swrd *tm.SlaveWasRestartedData) (err error) {
	log.Infof("slaveWasRestarted(%v)", ti.Alias())

	path, err := wr.ai.SlaveWasRestarted(ti.Alias(), swrd)
	if err == nil {
		err = wr.ai.WaitForCompletion(path, wr.actionTimeout())
	}
	return err
}
Beispiel #10
0
// checkMasterElect verifies that the master-elect tablet is fit for duty.
//
// A tablet that is not yet in the serving graph gets the optional
// "preflight_serving_type" hook run against it (a call-out for hardware
// checks). A tablet already in the serving graph is accepted without running
// the hook, on the reasoning that it was serving live traffic and is probably
// good.
func (wr *Wrangler) checkMasterElect(ti *topo.TabletInfo) error {
	if !ti.IsInServingGraph() {
		return wr.ExecuteOptionalTabletInfoHook(ti, hook.NewSimpleHook("preflight_serving_type"))
	}
	return nil
}
Beispiel #11
0
// newTabletNodeFromTabletInfo builds a TabletNode from the tablet's hostname,
// its "vt" port map entry and its alias. A port map validation failure is
// logged but does not prevent the node from being built.
func newTabletNodeFromTabletInfo(ti *topo.TabletInfo) *TabletNode {
	err := ti.ValidatePortmap()
	if err != nil {
		log.Errorf("ValidatePortmap(%v): %v", ti.Alias, err)
	}

	node := new(TabletNode)
	node.Host = ti.Hostname
	node.Port = ti.Portmap["vt"]
	node.Alias = ti.Alias
	return node
}
// getMasterPosition queues a MasterPosition action for the tablet, waits up to
// the wrangler's action timeout for its reply, and returns that reply as a
// replication position.
//
// An error is returned if the action cannot be queued, if waiting fails, or if
// the reply is not a *mysqlctl.ReplicationPosition.
func (wr *Wrangler) getMasterPosition(ti *topo.TabletInfo) (*mysqlctl.ReplicationPosition, error) {
	actionPath, err := wr.ai.MasterPosition(ti.Alias())
	if err != nil {
		return nil, err
	}
	result, err := wr.ai.WaitForCompletionReply(actionPath, wr.actionTimeout())
	if err != nil {
		return nil, err
	}

	// The reply is an untyped value; check the assertion so a nil or
	// unexpected reply becomes an error instead of a panic.
	position, ok := result.(*mysqlctl.ReplicationPosition)
	if !ok {
		return nil, fmt.Errorf("MasterPosition on %v returned unexpected reply type %T", ti.Alias(), result)
	}
	return position, nil
}
Beispiel #13
0
// applySQLShard runs one SQL statement on the given tablet and discards any
// result, which makes it suitable only for statements executed for their
// effect (CREATE, for instance).
//
// The statement is meant to be applied on the shard's master with replication
// turned on, so it must be safe to run against a live instance; changes that
// need a pivot do not belong here. Occurrences of {{.DatabaseName}} in change
// are replaced with the tablet's database name before execution. The call is
// bounded by a 30 second timeout layered on top of ctx.
func (wr *Wrangler) applySQLShard(ctx context.Context, tabletInfo *topo.TabletInfo, change string, reloadSchema bool) error {
	templateVars := map[string]string{"DatabaseName": tabletInfo.DbName()}
	statement, err := fillStringTemplate(change, templateVars)
	if err != nil {
		return fmt.Errorf("fillStringTemplate failed: %v", err)
	}

	execCtx, cancel := context.WithTimeout(ctx, 30*time.Second)
	defer cancel()

	// The statement only runs on masters, so binlogs must stay enabled for it
	// to reach the replicas.
	_, err = wr.tmc.ExecuteFetchAsDba(execCtx, tabletInfo, statement, 0, false, false, reloadSchema)
	return err
}
// slaveWasPromoted logs the request, queues a SlaveWasPromoted action for the
// tablet, and waits up to the wrangler's action timeout for it to complete.
// It returns the first error from queuing or waiting.
func (wr *Wrangler) slaveWasPromoted(ti *topo.TabletInfo) error {
	log.Infof("slave was promoted %v", ti.Alias())

	path, err := wr.ai.SlaveWasPromoted(ti.Alias())
	if err != nil {
		return err
	}
	return wr.ai.WaitForCompletion(path, wr.actionTimeout())
}
// demoteMaster logs the request, queues a DemoteMaster action for the tablet,
// waits up to the wrangler's action timeout for it to complete, and then
// returns the tablet's master position as reported by getMasterPosition.
func (wr *Wrangler) demoteMaster(ti *topo.TabletInfo) (*mysqlctl.ReplicationPosition, error) {
	log.Infof("demote master %v", ti.Alias())

	path, err := wr.ai.DemoteMaster(ti.Alias())
	if err != nil {
		return nil, err
	}
	if err = wr.ai.WaitForCompletion(path, wr.actionTimeout()); err != nil {
		return nil, err
	}

	// Only read the position once the demotion has completed.
	return wr.getMasterPosition(ti)
}
Beispiel #16
0
// UpdateTablet writes the tablet to the primary topology server and then, on
// success, to the secondary one using the same existingVersion.
//
// The version and error returned are those of the primary. A failure on the
// secondary is logged as a warning and otherwise ignored; the secondary is not
// touched at all when the primary write fails.
func (tee *Tee) UpdateTablet(tablet *topo.TabletInfo, existingVersion int) (newVersion int, err error) {
	newVersion, err = tee.primary.UpdateTablet(tablet, existingVersion)
	if err != nil {
		// Primary failed: leave the secondary alone.
		return newVersion, err
	}

	_, secondaryErr := tee.secondary.UpdateTablet(tablet, existingVersion)
	if secondaryErr != nil {
		// Not critical enough to fail the whole update.
		relog.Warning("secondary.UpdateTablet(%v) failed: %v", tablet.Alias(), secondaryErr)
	}
	return newVersion, nil
}
Beispiel #17
0
// ExecuteTabletInfoHook queues an ExecuteHook action for the tablet, waits up
// to ten minutes for its reply, and returns that reply as a hook result.
//
// An error is returned if the action cannot be queued, if waiting fails, or if
// the reply is not a *hk.HookResult.
func (wr *Wrangler) ExecuteTabletInfoHook(ti *topo.TabletInfo, hook *hk.Hook) (hookResult *hk.HookResult, err error) {
	actionPath, err := wr.ai.ExecuteHook(ti.Alias(), hook)
	if err != nil {
		return nil, err
	}

	reply, err := wr.ai.WaitForCompletionReply(actionPath, 10*time.Minute)
	if err != nil {
		return nil, err
	}

	// The reply is an untyped value; check the assertion so a nil or
	// unexpected reply becomes an error instead of a panic.
	hookResult, ok := reply.(*hk.HookResult)
	if !ok {
		return nil, fmt.Errorf("ExecuteHook on %v returned unexpected reply type %T", ti.Alias(), reply)
	}
	return hookResult, nil
}
// promoteSlave logs the request, queues a PromoteSlave action for the tablet,
// waits up to the wrangler's action timeout for its reply, and returns that
// reply as restart-slave data.
//
// An error is returned if the action cannot be queued, if waiting fails, or if
// the reply is not a *tm.RestartSlaveData.
func (wr *Wrangler) promoteSlave(ti *topo.TabletInfo) (rsd *tm.RestartSlaveData, err error) {
	log.Infof("promote slave %v", ti.Alias())
	actionPath, err := wr.ai.PromoteSlave(ti.Alias())
	if err != nil {
		return nil, err
	}
	result, err := wr.ai.WaitForCompletionReply(actionPath, wr.actionTimeout())
	if err != nil {
		return nil, err
	}

	// The reply is an untyped value; check the assertion so a nil or
	// unexpected reply becomes an error instead of a panic.
	rsd, ok := result.(*tm.RestartSlaveData)
	if !ok {
		return nil, fmt.Errorf("PromoteSlave on %v returned unexpected reply type %T", ti.Alias(), result)
	}
	return rsd, nil
}
Beispiel #19
0
// MultiRestore starts the streaming TabletManager.MultiRestore RPC on the
// tablet, using waitTime for the dial.
//
// It returns the channel the RPC streams log events into and a function that
// closes the connection and reports the call's error. The connection is only
// closed when that function is invoked.
func (client *GoRpcTabletManagerClient) MultiRestore(tablet *topo.TabletInfo, sa *actionnode.MultiRestoreArgs, waitTime time.Duration) (<-chan *logutil.LoggerEvent, tmclient.ErrFunc, error) {
	conn, err := bsonrpc.DialHTTP("tcp", tablet.Addr(), waitTime, nil)
	if err != nil {
		return nil, nil, err
	}

	events := make(chan *logutil.LoggerEvent, 10)
	call := conn.StreamGo("TabletManager.MultiRestore", sa, events)

	finish := func() error {
		conn.Close()
		return call.Error
	}
	return events, finish, nil
}
Beispiel #20
0
// UpdateTablet writes the tablet's JSON to its ZooKeeper node, conditional on
// existingVersion, and returns the node's new version.
//
// ZooKeeper's ZBADVERSION and ZNONODE errors are translated to
// topo.ErrBadVersion and topo.ErrNoNode; any other error is returned as-is.
// The version is 0 whenever an error is returned.
func (zkts *Server) UpdateTablet(tablet *topo.TabletInfo, existingVersion int) (int, error) {
	path := TabletPathForAlias(tablet.Alias())
	stat, err := zkts.zconn.Set(path, tablet.Json(), existingVersion)

	switch {
	case err == nil:
		return stat.Version(), nil
	case zookeeper.IsError(err, zookeeper.ZBADVERSION):
		return 0, topo.ErrBadVersion
	case zookeeper.IsError(err, zookeeper.ZNONODE):
		return 0, topo.ErrNoNode
	default:
		return 0, err
	}
}
Beispiel #21
0
// ExecuteFetchAsDba is part of the tmclient.TabletManagerClient interface.
//
// It forwards the query and its options, together with the tablet's database
// name, to the tablet through rpcCallTablet and returns the query result, or
// the error from the RPC.
func (client *GoRPCTabletManagerClient) ExecuteFetchAsDba(ctx context.Context, tablet *topo.TabletInfo, query string, maxRows int, wantFields, disableBinlogs, reloadSchema bool) (*mproto.QueryResult, error) {
	args := &gorpcproto.ExecuteFetchArgs{
		Query:          query,
		DbName:         tablet.DbName(),
		MaxRows:        maxRows,
		WantFields:     wantFields,
		DisableBinlogs: disableBinlogs,
		ReloadSchema:   reloadSchema,
	}

	result := new(mproto.QueryResult)
	err := client.rpcCallTablet(ctx, tablet, actionnode.TabletActionExecuteFetchAsDba, args, result)
	if err != nil {
		return nil, err
	}
	return result, nil
}
Beispiel #22
0
// reparentShardExternal records an externally performed reparent: it first
// tells the master-elect that it was promoted, then removes it from both
// tablet maps, and finally notifies every remaining tablet (the old master
// included) through restartSlavesExternal.
//
// A failure of the first step is returned immediately, since it suggests the
// master-elect is dead. Note that both maps passed in are modified.
func (wr *Wrangler) reparentShardExternal(slaveTabletMap, masterTabletMap map[topo.TabletAlias]*topo.TabletInfo, masterElectTablet *topo.TabletInfo, scrapStragglers bool, acceptSuccessPercents int) error {
	// Fix the new master in the replication graph first.
	if err := wr.slaveWasPromoted(masterElectTablet); err != nil {
		return fmt.Errorf("slaveWasPromoted(%v) failed: %v", masterElectTablet, err)
	}

	// The promoted tablet no longer belongs in either map.
	promoted := masterElectTablet.Alias()
	delete(slaveTabletMap, promoted)
	delete(masterTabletMap, promoted)

	// Everything left, old master included, gets restarted as a slave.
	return wr.restartSlavesExternal(slaveTabletMap, masterTabletMap, masterElectTablet, scrapStragglers, acceptSuccessPercents)
}
Beispiel #23
0
// Backup is part of the tmclient.TabletManagerClient interface.
//
// It dials the tablet (bounded by the context deadline, if any), starts the
// streaming TabletManager.Backup RPC with the requested concurrency, and
// returns a channel of log events plus a function reporting the final error.
//
// A background goroutine forwards events from the RPC stream to the returned
// channel until the stream ends or ctx is done; it then closes the returned
// channel and the connection. The error function must only be called after
// the returned channel has been closed: it reports an interruption error if
// ctx ended the stream, and the RPC call's error otherwise.
func (client *GoRPCTabletManagerClient) Backup(ctx context.Context, tablet *topo.TabletInfo, concurrency int) (<-chan *logutil.LoggerEvent, tmclient.ErrFunc, error) {
	var connectTimeout time.Duration
	if deadline, ok := ctx.Deadline(); ok {
		connectTimeout = deadline.Sub(time.Now())
		// Use <= 0 rather than < 0: a zero Duration is the value passed when
		// there is no deadline at all, so an exactly-expired deadline must not
		// reach the dialer as zero.
		if connectTimeout <= 0 {
			return nil, nil, timeoutError{fmt.Errorf("timeout connecting to TabletManager.Backup on %v", tablet.Alias)}
		}
	}
	rpcClient, err := bsonrpc.DialHTTP("tcp", tablet.Addr(), connectTimeout)
	if err != nil {
		return nil, nil, err
	}

	logstream := make(chan *logutil.LoggerEvent, 10)
	rpcstream := make(chan *logutil.LoggerEvent, 10)
	c := rpcClient.StreamGo("TabletManager.Backup", &gorpcproto.BackupArgs{
		Concurrency: concurrency,
	}, rpcstream)
	interrupted := false
	go func() {
		// Deferred in reverse order: the log channel is closed first, then
		// the connection, whichever way the loop ends.
		defer rpcClient.Close()
		defer close(logstream)
		for {
			select {
			case <-ctx.Done():
				// context is done
				interrupted = true
				return
			case ssr, ok := <-rpcstream:
				if !ok {
					return
				}
				// Also watch ctx while forwarding: if the consumer has
				// stopped reading, a bare send would block forever and leak
				// this goroutine together with the connection.
				select {
				case logstream <- ssr:
				case <-ctx.Done():
					interrupted = true
					return
				}
			}
		}
	}()
	return logstream, func() error {
		// this is only called after streaming is done
		if interrupted {
			return fmt.Errorf("TabletManager.Backup interrupted by context")
		}
		return c.Error
	}, nil
}
Beispiel #24
0
// agentRPCTestIsTimeoutErrorDialTimeout checks that client.IsTimeoutError()
// reports true for an RPC that fails because the connection attempt timed out.
//
// It pings a copy of ti whose hostname does not exist, under a one millisecond
// context timeout. The test fails fatally if the ping succeeds, and records an
// error if the resulting failure is not classified as a timeout.
func agentRPCTestIsTimeoutErrorDialTimeout(ctx context.Context, t *testing.T, client tmclient.TabletManagerClient, ti *topo.TabletInfo) {
	// Work on a clone so the caller's tablet record keeps its real hostname.
	// With gRPC, for example, an unreachable host provokes
	// grpc.ErrClientConnTimeout.
	unreachable := topo.NewTabletInfo(ti.Tablet, ti.Version())
	unreachable.Tablet = proto.Clone(unreachable.Tablet).(*topodatapb.Tablet)
	unreachable.Tablet.Hostname = "Non-Existent.Server"

	dialCtx, cancel := context.WithTimeout(ctx, time.Millisecond)
	defer cancel()

	pingErr := client.Ping(dialCtx, unreachable)
	if pingErr == nil {
		t.Fatal("agentRPCTestIsTimeoutErrorDialTimeout: connect to non-existant tablet did not fail")
	}
	if isTimeout := client.IsTimeoutError(pingErr); !isTimeout {
		t.Errorf("agentRPCTestIsTimeoutErrorDialTimeout: want: IsTimeoutError() = true. error: %v", pingErr)
	}
}
Beispiel #25
0
// ExecuteOptionalTabletInfoHook runs a hook on the tablet and treats a missing
// hook as success: an error is returned only when the hook could not be
// executed or when it ran and exited with a status other than success.
func (wr *Wrangler) ExecuteOptionalTabletInfoHook(ctx context.Context, ti *topo.TabletInfo, hook *hk.Hook) (err error) {
	result, err := wr.ExecuteTabletInfoHook(ctx, ti, hook)
	if err != nil {
		return err
	}

	switch {
	case result.ExitStatus == hk.HOOK_DOES_NOT_EXIST:
		// An absent hook is not a failure, but it is worth noting.
		log.Infof("Hook %v doesn't exist on tablet %v", hook.Name, ti.AliasString())
		return nil
	case result.ExitStatus == hk.HOOK_SUCCESS:
		return nil
	}
	return fmt.Errorf("Hook %v failed(%v): %v", hook.Name, result.ExitStatus, result.Stderr)
}
Beispiel #26
0
// HealthStream is part of the tmclient.TabletManagerClient interface.
//
// It dials the tablet (bounded by the context deadline, if any), starts the
// streaming TabletManager.HealthStream RPC, and returns a channel of health
// replies plus a function reporting the final error.
//
// A background goroutine forwards replies from the RPC stream to the returned
// channel until the stream ends or ctx is done; it then closes the returned
// channel and the connection. The error function must only be called after
// the returned channel has been closed: it reports an interruption error if
// ctx ended the stream, and the RPC call's error otherwise.
func (client *GoRPCTabletManagerClient) HealthStream(ctx context.Context, tablet *topo.TabletInfo) (<-chan *actionnode.HealthStreamReply, tmclient.ErrFunc, error) {
	var connectTimeout time.Duration
	if deadline, ok := ctx.Deadline(); ok {
		connectTimeout = deadline.Sub(time.Now())
		// Use <= 0 rather than < 0: a zero Duration is the value passed when
		// there is no deadline at all, so an exactly-expired deadline must not
		// reach the dialer as zero.
		if connectTimeout <= 0 {
			return nil, nil, timeoutError{fmt.Errorf("timeout connecting to TabletManager.HealthStream on %v", tablet.Alias)}
		}
	}
	rpcClient, err := bsonrpc.DialHTTP("tcp", tablet.Addr(), connectTimeout, nil)
	if err != nil {
		return nil, nil, err
	}

	logstream := make(chan *actionnode.HealthStreamReply, 10)
	rpcstream := make(chan *actionnode.HealthStreamReply, 10)
	c := rpcClient.StreamGo("TabletManager.HealthStream", "", rpcstream)
	interrupted := false
	go func() {
		// Deferred in reverse order: the reply channel is closed first, then
		// the connection, whichever way the loop ends.
		defer rpcClient.Close()
		defer close(logstream)
		for {
			select {
			case <-ctx.Done():
				// context is done
				interrupted = true
				return
			case hsr, ok := <-rpcstream:
				if !ok {
					return
				}
				// Also watch ctx while forwarding: if the consumer has
				// stopped reading, a bare send would block forever and leak
				// this goroutine together with the connection.
				select {
				case logstream <- hsr:
				case <-ctx.Done():
					interrupted = true
					return
				}
			}
		}
	}()
	return logstream, func() error {
		// this is only called after streaming is done
		if interrupted {
			return fmt.Errorf("TabletManager.HealthStreamReply interrupted by context")
		}
		return c.Error
	}, nil
}
Beispiel #27
0
// ExecuteFetchAsDba is part of the tmclient.TabletManagerClient interface.
//
// It dials the tablet, sends an ExecuteFetchAsDba request carrying the query,
// the tablet's database name and the given options, and returns the result
// field of the response. The connection is closed before returning.
func (client *Client) ExecuteFetchAsDba(ctx context.Context, tablet *topo.TabletInfo, query string, maxRows int, disableBinlogs, reloadSchema bool) (*querypb.QueryResult, error) {
	conn, stub, err := client.dial(ctx, tablet)
	if err != nil {
		return nil, err
	}
	defer conn.Close()

	request := &tabletmanagerdatapb.ExecuteFetchAsDbaRequest{
		Query:          query,
		DbName:         tablet.DbName(),
		MaxRows:        uint64(maxRows),
		DisableBinlogs: disableBinlogs,
		ReloadSchema:   reloadSchema,
	}
	reply, err := stub.ExecuteFetchAsDba(ctx, request)
	if err != nil {
		return nil, err
	}
	return reply.Result, nil
}
Beispiel #28
0
// ExecuteFetchAsDba is part of the tmclient.TabletManagerClient interface.
//
// It dials the tablet, sends an ExecuteFetchAsDba request carrying the query,
// the tablet's database name and the given options, and returns the response's
// result converted with mproto.Proto3ToQueryResult. The connection is closed
// before returning.
func (client *Client) ExecuteFetchAsDba(ctx context.Context, tablet *topo.TabletInfo, query string, maxRows int, wantFields, disableBinlogs, reloadSchema bool) (*mproto.QueryResult, error) {
	conn, stub, err := client.dial(ctx, tablet)
	if err != nil {
		return nil, err
	}
	defer conn.Close()

	request := &pb.ExecuteFetchAsDbaRequest{
		Query:          query,
		DbName:         tablet.DbName(),
		MaxRows:        uint64(maxRows),
		WantFields:     wantFields,
		DisableBinlogs: disableBinlogs,
		ReloadSchema:   reloadSchema,
	}
	reply, err := stub.ExecuteFetchAsDba(ctx, request)
	if err != nil {
		return nil, err
	}
	return mproto.Proto3ToQueryResult(reply.Result), nil
}
Beispiel #29
0
// UpdateTablet is part of the topo.Server interface.
//
// It writes the tablet's JSON to its ZooKeeper node, conditional on
// existingVersion, dispatches a TabletChange event with status "updated", and
// returns the node's new version.
//
// ZooKeeper's ZBADVERSION and ZNONODE errors are translated to
// topo.ErrBadVersion and topo.ErrNoNode; any other error is returned as-is.
// On error the version is 0 and no event is dispatched.
func (zkts *Server) UpdateTablet(ctx context.Context, tablet *topo.TabletInfo, existingVersion int64) (int64, error) {
	path := TabletPathForAlias(tablet.Alias)
	stat, err := zkts.zconn.Set(path, tablet.JSON(), int(existingVersion))

	switch {
	case err == nil:
		// Success: fall through to the event dispatch below.
	case zookeeper.IsError(err, zookeeper.ZBADVERSION):
		return 0, topo.ErrBadVersion
	case zookeeper.IsError(err, zookeeper.ZNONODE):
		return 0, topo.ErrNoNode
	default:
		return 0, err
	}

	change := &events.TabletChange{
		Tablet: *tablet.Tablet,
		Status: "updated",
	}
	event.Dispatch(change)
	return int64(stat.Version()), nil
}
Beispiel #30
0
// executeFetchLoop loops over the provided insertChannel
// and sends the commands to the provided tablet.
//
// Each command read from insertChannel is prefixed with
// "INSERT INTO `<db name>`." and executed through the wrangler's tablet
// manager client with a 30 second timeout. The loop returns nil when
// insertChannel is closed or when abort fires, and an error as soon as one
// ExecuteFetch call fails.
func executeFetchLoop(wr *wrangler.Wrangler, ti *topo.TabletInfo, insertChannel chan string, abort chan struct{}) error {
	for {
		// Give abort priority: select picks at random among ready cases, so
		// with a steady supply of commands a pending abort could otherwise
		// keep losing to insertChannel.
		select {
		case <-abort:
			return nil
		default:
		}

		select {
		case cmd, ok := <-insertChannel:
			if !ok {
				// no more to read, we're done
				return nil
			}
			cmd = "INSERT INTO `" + ti.DbName() + "`." + cmd
			_, err := wr.TabletManagerClient().ExecuteFetch(ti, cmd, 0, false, true, 30*time.Second)
			if err != nil {
				return fmt.Errorf("ExecuteFetch failed: %v", err)
			}
		case <-abort:
			return nil
		}
	}
}