Example #1
0
// NewQueryResultReaderForTablet creates a new QueryResultReader for
// the provided tablet / sql query
func NewQueryResultReaderForTablet(ctx context.Context, ts topo.Server, tabletAlias *pb.TabletAlias, sql string) (*QueryResultReader, error) {
	tablet, err := ts.GetTablet(ctx, tabletAlias)
	if err != nil {
		return nil, err
	}

	endPoint, err := topo.TabletEndPoint(tablet.Tablet)
	if err != nil {
		return nil, err
	}

	// use sessionId for now
	conn, err := tabletconn.GetDialer()(ctx, endPoint, tablet.Keyspace, tablet.Shard, pb.TabletType_UNKNOWN, *remoteActionsTimeout)
	if err != nil {
		return nil, err
	}

	sr, clientErrFn, err := conn.StreamExecute(ctx, sql, make(map[string]interface{}), 0)
	if err != nil {
		return nil, err
	}

	// read the columns, or grab the error
	cols, ok := <-sr
	if !ok {
		return nil, fmt.Errorf("Cannot read Fields for query '%v': %v", sql, clientErrFn())
	}

	return &QueryResultReader{
		Output:      sr,
		Fields:      cols.Fields,
		conn:        conn,
		clientErrFn: clientErrFn,
	}, nil
}
Example #2
0
// loadTablets reads all tablets from topology, converts to endpoints, and updates HealthCheck.
func (tw *TopologyWatcher) loadTablets() {
	var wg sync.WaitGroup
	newEndPoints := make(map[string]*tabletEndPoint)
	tabletAlias, err := tw.getTablets(tw)
	if err != nil {
		select {
		case <-tw.ctx.Done():
			return
		default:
		}
		log.Errorf("cannot get tablets for cell: %v: %v", tw.cell, err)
		return
	}
	for _, tAlias := range tabletAlias {
		wg.Add(1)
		go func(alias *topodatapb.TabletAlias) {
			defer wg.Done()
			tw.sem <- 1 // Wait for active queue to drain.
			tablet, err := tw.topoServer.GetTablet(tw.ctx, alias)
			<-tw.sem // Done; enable next request to run
			if err != nil {
				select {
				case <-tw.ctx.Done():
					return
				default:
				}
				log.Errorf("cannot get tablet for alias %v: %v", alias, err)
				return
			}
			endPoint, err := topo.TabletEndPoint(tablet.Tablet)
			if err != nil {
				log.Errorf("cannot get endpoint from tablet %v: %v", tablet, err)
				return
			}
			key := EndPointToMapKey(endPoint)
			tw.mu.Lock()
			newEndPoints[key] = &tabletEndPoint{
				alias:    topoproto.TabletAliasString(alias),
				endPoint: endPoint,
			}
			tw.mu.Unlock()
		}(tAlias)
	}

	wg.Wait()
	tw.mu.Lock()
	for key, tep := range newEndPoints {
		if _, ok := tw.endPoints[key]; !ok {
			tw.hc.AddEndPoint(tw.cell, tep.alias, tep.endPoint)
		}
	}
	for key, tep := range tw.endPoints {
		if _, ok := newEndPoints[key]; !ok {
			tw.hc.RemoveEndPoint(tep.endPoint)
		}
	}
	tw.endPoints = newEndPoints
	tw.mu.Unlock()
}
Example #3
0
// loadTablets reads all tablets from topology, converts to endpoints, and updates HealthCheck.
func (ctw *CellTabletsWatcher) loadTablets() {
	var wg sync.WaitGroup
	newEndPoints := make(map[string]*pbt.EndPoint)
	tabletAlias, err := ctw.topoServer.GetTabletsByCell(ctw.ctx, ctw.cell)
	if err != nil {
		select {
		case <-ctw.ctx.Done():
			return
		default:
		}
		log.Errorf("cannot get tablets for cell: %v: %v", ctw.cell, err)
		return
	}
	for _, tAlias := range tabletAlias {
		wg.Add(1)
		go func(alias *pbt.TabletAlias) {
			defer wg.Done()
			ctw.sem <- 1 // Wait for active queue to drain.
			tablet, err := ctw.topoServer.GetTablet(ctw.ctx, alias)
			<-ctw.sem // Done; enable next request to run
			if err != nil {
				select {
				case <-ctw.ctx.Done():
					return
				default:
				}
				log.Errorf("cannot get tablet for alias %v: %v", alias, err)
				return
			}
			endPoint, err := topo.TabletEndPoint(tablet.Tablet)
			if err != nil {
				log.Errorf("cannot get endpoint from tablet %v: %v", tablet, err)
				return
			}
			key := EndPointToMapKey(endPoint)
			ctw.mu.Lock()
			newEndPoints[key] = endPoint
			ctw.mu.Unlock()
		}(tAlias)
	}

	wg.Wait()
	ctw.mu.Lock()
	for key, ep := range newEndPoints {
		if _, ok := ctw.endPoints[key]; !ok {
			ctw.hc.AddEndPoint(ctw.cell, ep)
		}
	}
	for key, ep := range ctw.endPoints {
		if _, ok := newEndPoints[key]; !ok {
			ctw.hc.RemoveEndPoint(ep)
		}
	}
	ctw.endPoints = newEndPoints
	ctw.mu.Unlock()
}
Example #4
0
func (th *tabletHealth) stream(ctx context.Context, ts topo.Server, tabletAlias *topodatapb.TabletAlias) (err error) {
	defer func() {
		th.mu.Lock()
		th.err = err
		th.mu.Unlock()
		close(th.done)
	}()

	ti, err := ts.GetTablet(ctx, tabletAlias)
	if err != nil {
		return err
	}
	ep, err := topo.TabletEndPoint(ti.Tablet)
	if err != nil {
		return err
	}

	// Pass in a tablet type that is not UNKNOWN, so we don't ask
	// for sessionId.
	conn, err := tabletconn.GetDialer()(ctx, ep, "", "", topodatapb.TabletType_MASTER, 30*time.Second)
	if err != nil {
		return err
	}
	defer conn.Close()

	stream, err := conn.StreamHealth(ctx)
	if err != nil {
		return err
	}

	first := true
	for time.Since(th.lastAccessed()) < *tabletHealthKeepAlive {
		select {
		case <-ctx.Done():
			return ctx.Err()
		default:
		}

		result, err := stream.Recv()
		if err != nil {
			return err
		}

		th.mu.Lock()
		th.result = result
		th.mu.Unlock()

		if first {
			// We got the first result, so we're ready to be accessed.
			close(th.ready)
			first = false
		}
	}

	return nil
}
Example #5
0
func commandVtTabletExecute(ctx context.Context, wr *wrangler.Wrangler, subFlags *flag.FlagSet, args []string) error {
	transactionID := subFlags.Int("transaction_id", 0, "transaction id to use, if inside a transaction.")
	bindVariables := newBindvars(subFlags)
	keyspace := subFlags.String("keyspace", "", "keyspace the tablet belongs to")
	shard := subFlags.String("shard", "", "shard the tablet belongs to")
	tabletType := subFlags.String("tablet_type", "unknown", "tablet type we expect from the tablet (use unknown to use sessionId)")
	connectTimeout := subFlags.Duration("connect_timeout", 30*time.Second, "Connection timeout for vttablet client")
	json := subFlags.Bool("json", false, "Output JSON instead of human-readable table")

	if err := subFlags.Parse(args); err != nil {
		return err
	}
	if subFlags.NArg() != 2 {
		return fmt.Errorf("the <tablet_alias> and <sql> arguments are required for the VtTabletExecute command")
	}
	tt, err := topoproto.ParseTabletType(*tabletType)
	if err != nil {
		return err
	}
	tabletAlias, err := topoproto.ParseTabletAlias(subFlags.Arg(0))
	if err != nil {
		return err
	}
	tabletInfo, err := wr.TopoServer().GetTablet(ctx, tabletAlias)
	if err != nil {
		return err
	}
	ep, err := topo.TabletEndPoint(tabletInfo.Tablet)
	if err != nil {
		return fmt.Errorf("cannot get EndPoint from tablet record: %v", err)
	}

	conn, err := tabletconn.GetDialer()(ctx, ep, *keyspace, *shard, tt, *connectTimeout)
	if err != nil {
		return fmt.Errorf("cannot connect to tablet %v: %v", tabletAlias, err)
	}
	defer conn.Close()

	qr, err := conn.Execute(ctx, subFlags.Arg(1), *bindVariables, int64(*transactionID))
	if err != nil {
		return fmt.Errorf("Execute failed: %v", err)
	}
	if *json {
		return printJSON(wr.Logger(), qr)
	}
	printQueryResult(loggerWriter{wr.Logger()}, qr)
	return nil
}
Example #6
0
func commandVtTabletStreamHealth(ctx context.Context, wr *wrangler.Wrangler, subFlags *flag.FlagSet, args []string) error {
	count := subFlags.Int("count", 1, "number of responses to wait for")
	connectTimeout := subFlags.Duration("connect_timeout", 30*time.Second, "Connection timeout for vttablet client")
	if err := subFlags.Parse(args); err != nil {
		return err
	}
	if subFlags.NArg() != 1 {
		return fmt.Errorf("The <tablet alias> argument is required for the VtTabletStreamHealth command.")
	}
	tabletAlias, err := topoproto.ParseTabletAlias(subFlags.Arg(0))
	if err != nil {
		return err
	}
	tabletInfo, err := wr.TopoServer().GetTablet(ctx, tabletAlias)
	if err != nil {
		return err
	}

	ep, err := topo.TabletEndPoint(tabletInfo.Tablet)
	if err != nil {
		return fmt.Errorf("cannot get EndPoint from tablet record: %v", err)
	}

	// pass in a non-UNKNOWN tablet type to not use sessionId
	conn, err := tabletconn.GetDialer()(ctx, ep, "", "", pb.TabletType_MASTER, *connectTimeout)
	if err != nil {
		return fmt.Errorf("cannot connect to tablet %v: %v", tabletAlias, err)
	}

	stream, errFunc, err := conn.StreamHealth(ctx)
	if err != nil {
		return err
	}
	for i := 0; i < *count; i++ {
		shr, ok := <-stream
		if !ok {
			return fmt.Errorf("stream ended early: %v", errFunc())
		}
		data, err := json.Marshal(shr)
		if err != nil {
			wr.Logger().Errorf("cannot json-marshal structure: %v", err)
		} else {
			wr.Logger().Printf("%v\n", string(data))
		}
	}
	return nil
}
Example #7
0
// UpdateTabletEndpoints fixes up any entries in the serving graph that relate
// to a given tablet.
func UpdateTabletEndpoints(ctx context.Context, ts topo.Server, tablet *pb.Tablet) (err error) {
	srvTypes, err := ts.GetSrvTabletTypesPerShard(ctx, tablet.Alias.Cell, tablet.Keyspace, tablet.Shard)
	if err != nil {
		if err != topo.ErrNoNode {
			return err
		}
		// It's fine if there are no existing types.
		srvTypes = nil
	}

	wg := sync.WaitGroup{}
	errs := concurrency.AllErrorRecorder{}

	// Update the list that the tablet is supposed to be in (if any).
	if topo.IsInServingGraph(tablet.Type) {
		endpoint, err := topo.TabletEndPoint(tablet)
		if err != nil {
			return err
		}

		wg.Add(1)
		go func() {
			defer wg.Done()
			errs.RecordError(
				updateEndpoint(ctx, ts, tablet.Alias.Cell, tablet.Keyspace, tablet.Shard,
					tablet.Type, endpoint))
		}()
	}

	// Remove it from any other lists it isn't supposed to be in.
	for _, srvType := range srvTypes {
		if srvType != tablet.Type {
			wg.Add(1)
			go func(tabletType pb.TabletType) {
				defer wg.Done()
				errs.RecordError(
					removeEndpoint(ctx, ts, tablet.Alias.Cell, tablet.Keyspace, tablet.Shard,
						tabletType, tablet.Alias.Uid))
			}(srvType)
		}
	}

	wg.Wait()
	return errs.Error()
}
Example #8
0
func commandVtTabletBegin(ctx context.Context, wr *wrangler.Wrangler, subFlags *flag.FlagSet, args []string) error {
	keyspace := subFlags.String("keyspace", "", "keyspace the tablet belongs to")
	shard := subFlags.String("shard", "", "shard the tablet belongs to")
	tabletType := subFlags.String("tablet_type", "unknown", "tablet type we expect from the tablet (use unknown to use sessionId)")
	connectTimeout := subFlags.Duration("connect_timeout", 30*time.Second, "Connection timeout for vttablet client")
	if err := subFlags.Parse(args); err != nil {
		return err
	}
	if subFlags.NArg() != 1 {
		return fmt.Errorf("the <tablet_alias> argument is required for the VtTabletBegin command")
	}
	tt, err := topoproto.ParseTabletType(*tabletType)
	if err != nil {
		return err
	}
	tabletAlias, err := topoproto.ParseTabletAlias(subFlags.Arg(0))
	if err != nil {
		return err
	}
	tabletInfo, err := wr.TopoServer().GetTablet(ctx, tabletAlias)
	if err != nil {
		return err
	}
	ep, err := topo.TabletEndPoint(tabletInfo.Tablet)
	if err != nil {
		return fmt.Errorf("cannot get EndPoint from tablet record: %v", err)
	}

	conn, err := tabletconn.GetDialer()(ctx, ep, *keyspace, *shard, tt, *connectTimeout)
	if err != nil {
		return fmt.Errorf("cannot connect to tablet %v: %v", tabletAlias, err)
	}
	defer conn.Close()

	transactionID, err := conn.Begin(ctx)
	if err != nil {
		return fmt.Errorf("Begin failed: %v", err)
	}
	result := map[string]int64{
		"transaction_id": transactionID,
	}
	return printJSON(wr, result)
}
Example #9
0
func commandVtTabletRollback(ctx context.Context, wr *wrangler.Wrangler, subFlags *flag.FlagSet, args []string) error {
	keyspace := subFlags.String("keyspace", "", "keyspace the tablet belongs to")
	shard := subFlags.String("shard", "", "shard the tablet belongs to")
	tabletType := subFlags.String("tablet_type", "unknown", "tablet type we expect from the tablet (use unknown to use sessionId)")
	connectTimeout := subFlags.Duration("connect_timeout", 30*time.Second, "Connection timeout for vttablet client")
	if err := subFlags.Parse(args); err != nil {
		return err
	}
	if subFlags.NArg() != 2 {
		return fmt.Errorf("the <tablet_alias> and <transaction_id> arguments are required for the VtTabletRollback command")
	}
	transactionID, err := strconv.ParseInt(subFlags.Arg(1), 10, 64)
	if err != nil {
		return err
	}
	tt, err := topo.ParseTabletType(*tabletType)
	if err != nil {
		return err
	}
	tabletAlias, err := topo.ParseTabletAliasString(subFlags.Arg(0))
	if err != nil {
		return err
	}
	tabletInfo, err := wr.TopoServer().GetTablet(ctx, tabletAlias)
	if err != nil {
		return err
	}
	ep, err := topo.TabletEndPoint(tabletInfo.Tablet)
	if err != nil {
		return fmt.Errorf("cannot get EndPoint from tablet record: %v", err)
	}

	conn, err := tabletconn.GetDialer()(ctx, ep, *keyspace, *shard, tt, *connectTimeout)
	if err != nil {
		return fmt.Errorf("cannot connect to tablet %v: %v", tabletAlias, err)
	}
	defer conn.Close()

	return conn.Rollback(ctx, transactionID)
}
Example #10
0
// NewQueryResultReaderForTablet creates a new QueryResultReader for
// the provided tablet / sql query
func NewQueryResultReaderForTablet(ctx context.Context, ts topo.Server, tabletAlias *topodatapb.TabletAlias, sql string) (*QueryResultReader, error) {
	shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout)
	tablet, err := ts.GetTablet(shortCtx, tabletAlias)
	cancel()
	if err != nil {
		return nil, err
	}

	endPoint, err := topo.TabletEndPoint(tablet.Tablet)
	if err != nil {
		return nil, err
	}

	// use sessionId for now
	conn, err := tabletconn.GetDialer()(ctx, endPoint, tablet.Keyspace, tablet.Shard, topodatapb.TabletType_UNKNOWN, *remoteActionsTimeout)
	if err != nil {
		return nil, err
	}

	stream, err := conn.StreamExecute(ctx, sql, make(map[string]interface{}), 0)
	if err != nil {
		return nil, err
	}

	// read the columns, or grab the error
	cols, err := stream.Recv()
	if err != nil {
		return nil, fmt.Errorf("Cannot read Fields for query '%v': %v", sql, err)
	}

	return &QueryResultReader{
		Output: stream,
		Fields: cols.Fields,
		conn:   conn,
	}, nil
}
Example #11
0
// rebuildCellSrvShard computes and writes the serving graph data to a
// single cell
func rebuildCellSrvShard(ctx context.Context, log logutil.Logger, ts topo.Server, si *topo.ShardInfo, cell string) (err error) {
	log.Infof("rebuildCellSrvShard %v/%v in cell %v", si.Keyspace(), si.ShardName(), cell)

	for {
		select {
		case <-ctx.Done():
			return ctx.Err()
		default:
		}

		// Read existing EndPoints node versions, so we know if any
		// changes sneak in after we read the tablets.
		versions, err := getEndPointsVersions(ctx, ts, cell, si.Keyspace(), si.ShardName())

		// Get all tablets in this cell/shard.
		tablets, err := ts.GetTabletMapForShardByCell(ctx, si.Keyspace(), si.ShardName(), []string{cell})
		if err != nil {
			if err != topo.ErrPartialResult {
				return err
			}
			log.Warningf("Got ErrPartialResult from topo.GetTabletMapForShardByCell(%v), some tablets may not be added properly to serving graph", cell)
		}

		// Build up the serving graph from scratch.
		serving := make(map[pb.TabletType]*pb.EndPoints)
		for _, tablet := range tablets {
			// Only add serving types.
			if !tablet.IsInServingGraph() {
				continue
			}

			// Check the Keyspace and Shard for the tablet are right.
			if tablet.Keyspace != si.Keyspace() || tablet.Shard != si.ShardName() {
				return fmt.Errorf("CRITICAL: tablet %v is in replication graph for shard %v/%v but belongs to shard %v:%v", tablet.Alias, si.Keyspace(), si.ShardName(), tablet.Keyspace, tablet.Shard)
			}

			// Add the tablet to the list.
			endpoints, ok := serving[tablet.Type]
			if !ok {
				endpoints = topo.NewEndPoints()
				serving[tablet.Type] = endpoints
			}
			entry, err := topo.TabletEndPoint(tablet.Tablet)
			if err != nil {
				log.Warningf("EndPointForTablet failed for tablet %v: %v", tablet.Alias, err)
				continue
			}
			endpoints.Entries = append(endpoints.Entries, entry)
		}

		wg := sync.WaitGroup{}
		fatalErrs := concurrency.AllErrorRecorder{}
		retryErrs := concurrency.AllErrorRecorder{}

		// Write nodes that should exist.
		for tabletType, endpoints := range serving {
			wg.Add(1)
			go func(tabletType pb.TabletType, endpoints *pb.EndPoints) {
				defer wg.Done()

				log.Infof("saving serving graph for cell %v shard %v/%v tabletType %v", cell, si.Keyspace(), si.ShardName(), tabletType)

				version, ok := versions[tabletType]
				if !ok {
					// This type didn't exist when we first checked.
					// Try to create, but only if it still doesn't exist.
					if err := ts.CreateEndPoints(ctx, cell, si.Keyspace(), si.ShardName(), tabletType, endpoints); err != nil {
						log.Warningf("CreateEndPoints(%v, %v, %v) failed during rebuild: %v", cell, si, tabletType, err)
						switch err {
						case topo.ErrNodeExists:
							retryErrs.RecordError(err)
						default:
							fatalErrs.RecordError(err)
						}
					}
					return
				}

				// Update only if the version matches.
				if err := ts.UpdateEndPoints(ctx, cell, si.Keyspace(), si.ShardName(), tabletType, endpoints, version); err != nil {
					log.Warningf("UpdateEndPoints(%v, %v, %v) failed during rebuild: %v", cell, si, tabletType, err)
					switch err {
					case topo.ErrBadVersion, topo.ErrNoNode:
						retryErrs.RecordError(err)
					default:
						fatalErrs.RecordError(err)
					}
				}
			}(tabletType, endpoints)
		}

		// Delete nodes that shouldn't exist.
		for tabletType, version := range versions {
			if _, ok := serving[tabletType]; !ok {
				wg.Add(1)
				go func(tabletType pb.TabletType, version int64) {
					defer wg.Done()
					log.Infof("removing stale db type from serving graph: %v", tabletType)
					if err := ts.DeleteEndPoints(ctx, cell, si.Keyspace(), si.ShardName(), tabletType, version); err != nil && err != topo.ErrNoNode {
						log.Warningf("DeleteEndPoints(%v, %v, %v) failed during rebuild: %v", cell, si, tabletType, err)
						switch err {
						case topo.ErrNoNode:
							// Someone else deleted it, which is fine.
						case topo.ErrBadVersion:
							retryErrs.RecordError(err)
						default:
							fatalErrs.RecordError(err)
						}
					}
				}(tabletType, version)
			}
		}

		// Update srvShard object
		wg.Add(1)
		go func() {
			defer wg.Done()
			log.Infof("updating shard serving graph in cell %v for %v/%v", cell, si.Keyspace(), si.ShardName())
			if err := UpdateSrvShard(ctx, ts, cell, si); err != nil {
				fatalErrs.RecordError(err)
				log.Warningf("writing serving data in cell %v for %v/%v failed: %v", cell, si.Keyspace(), si.ShardName(), err)
			}
		}()

		wg.Wait()

		// If there are any fatal errors, give up.
		if fatalErrs.HasErrors() {
			return fatalErrs.Error()
		}
		// If there are any retry errors, try again.
		if retryErrs.HasErrors() {
			continue
		}
		// Otherwise, success!
		return nil
	}
}
Example #12
0
// TabletExternallyReparented updates all topo records so the current
// tablet is the new master for this shard.
// Should be called under RPCWrapLock.
func (agent *ActionAgent) TabletExternallyReparented(ctx context.Context, externalID string) error {
	startTime := time.Now()

	// If there is a finalize step running, wait for it to finish or time out
	// before checking the global shard record again.
	if agent.finalizeReparentCtx != nil {
		select {
		case <-agent.finalizeReparentCtx.Done():
			agent.finalizeReparentCtx = nil
		case <-ctx.Done():
			return ctx.Err()
		}
	}

	tablet := agent.Tablet()

	// Check the global shard record.
	si, err := agent.TopoServer.GetShard(ctx, tablet.Keyspace, tablet.Shard)
	if err != nil {
		log.Warningf("fastTabletExternallyReparented: failed to read global shard record for %v/%v: %v", tablet.Keyspace, tablet.Shard, err)
		return err
	}
	if topoproto.TabletAliasEqual(si.MasterAlias, tablet.Alias) {
		// We may get called on the current master even when nothing has changed.
		// If the global shard record is already updated, it means we successfully
		// finished a previous reparent to this tablet.
		return nil
	}

	// Remember when we were first told we're the master.
	// If another tablet claims to be master and offers a more recent time,
	// that tablet will be trusted over us.
	agent.mutex.Lock()
	agent._tabletExternallyReparentedTime = startTime
	agent._replicationDelay = 0
	agent.mutex.Unlock()

	// Create a reusable Reparent event with available info.
	ev := &events.Reparent{
		ShardInfo: *si,
		NewMaster: *tablet,
		OldMaster: topodatapb.Tablet{
			Alias: si.MasterAlias,
			Type:  topodatapb.TabletType_MASTER,
		},
		ExternalID: externalID,
	}
	defer func() {
		if err != nil {
			event.DispatchUpdate(ev, "failed: "+err.Error())
		}
	}()
	event.DispatchUpdate(ev, "starting external from tablet (fast)")

	var wg sync.WaitGroup
	var errs concurrency.AllErrorRecorder

	// Execute state change to master by force-updating only the local copy of the
	// tablet record. The actual record in topo will be updated later.
	log.Infof("fastTabletExternallyReparented: executing change callback for state change to MASTER")
	oldTablet := proto.Clone(tablet).(*topodatapb.Tablet)
	tablet.Type = topodatapb.TabletType_MASTER
	tablet.HealthMap = nil
	agent.setTablet(tablet)

	wg.Add(1)
	go func() {
		defer wg.Done()

		// This is where updateState will block for gracePeriod, while it gives
		// vtgate a chance to stop sending replica queries.
		if err := agent.updateState(ctx, oldTablet, "fastTabletExternallyReparented"); err != nil {
			errs.RecordError(fmt.Errorf("fastTabletExternallyReparented: failed to change tablet state to MASTER: %v", err))
		}
	}()

	wg.Add(1)
	go func() {
		defer wg.Done()

		// Directly write the new master endpoint in the serving graph.
		// We will do a true rebuild in the background soon, but in the meantime,
		// this will be enough for clients to re-resolve the new master.
		event.DispatchUpdate(ev, "writing new master endpoint")
		log.Infof("fastTabletExternallyReparented: writing new master endpoint to serving graph")
		ep, err := topo.TabletEndPoint(tablet)
		if err != nil {
			errs.RecordError(fmt.Errorf("fastTabletExternallyReparented: failed to generate EndPoint for tablet %v: %v", tablet.Alias, err))
			return
		}
		err = topo.UpdateEndPoints(ctx, agent.TopoServer, tablet.Alias.Cell,
			si.Keyspace(), si.ShardName(), topodatapb.TabletType_MASTER,
			&topodatapb.EndPoints{Entries: []*topodatapb.EndPoint{ep}}, -1)
		if err != nil {
			errs.RecordError(fmt.Errorf("fastTabletExternallyReparented: failed to update master endpoint: %v", err))
			return
		}
		externalReparentStats.Record("NewMasterVisible", startTime)
	}()

	// Wait for serving state grace period and serving graph update.
	wg.Wait()
	if errs.HasErrors() {
		return errs.Error()
	}

	// Start the finalize stage with a background context, but connect the trace.
	bgCtx, cancel := context.WithTimeout(agent.batchCtx, *finalizeReparentTimeout)
	bgCtx = trace.CopySpan(bgCtx, ctx)
	agent.finalizeReparentCtx = bgCtx
	go func() {
		err := agent.finalizeTabletExternallyReparented(bgCtx, si, ev)
		cancel()

		if err != nil {
			log.Warningf("finalizeTabletExternallyReparented error: %v", err)
			event.DispatchUpdate(ev, "failed: "+err.Error())
			return
		}
		externalReparentStats.Record("FullRebuild", startTime)
	}()

	return nil
}
Example #13
0
// TabletExternallyReparented updates all topo records so the current
// tablet is the new master for this shard.
// Should be called under RPCWrapLock.
func (agent *ActionAgent) TabletExternallyReparented(ctx context.Context, externalID string) error {
	startTime := time.Now()

	// If there is a finalize step running, wait for it to finish or time out
	// before checking the global shard record again.
	if agent.finalizeReparentCtx != nil {
		select {
		case <-agent.finalizeReparentCtx.Done():
			agent.finalizeReparentCtx = nil
		case <-ctx.Done():
			return ctx.Err()
		}
	}

	tablet := agent.Tablet()

	// Check the global shard record.
	si, err := topo.GetShard(ctx, agent.TopoServer, tablet.Keyspace, tablet.Shard)
	if err != nil {
		log.Warningf("fastTabletExternallyReparented: failed to read global shard record for %v/%v: %v", tablet.Keyspace, tablet.Shard, err)
		return err
	}
	if topo.TabletAliasEqual(si.MasterAlias, tablet.Alias) {
		// We may get called on the current master even when nothing has changed.
		// If the global shard record is already updated, it means we successfully
		// finished a previous reparent to this tablet.
		return nil
	}

	// Create a reusable Reparent event with available info.
	ev := &events.Reparent{
		ShardInfo: *si,
		NewMaster: *tablet.Tablet,
		OldMaster: pb.Tablet{
			Alias: si.MasterAlias,
			Type:  pb.TabletType_MASTER,
		},
		ExternalID: externalID,
	}
	defer func() {
		if err != nil {
			event.DispatchUpdate(ev, "failed: "+err.Error())
		}
	}()
	event.DispatchUpdate(ev, "starting external from tablet (fast)")

	// Execute state change to master by force-updating only the local copy of the
	// tablet record. The actual record in topo will be updated later.
	log.Infof("fastTabletExternallyReparented: executing change callback for state change to MASTER")
	oldTablet := *tablet.Tablet
	newTablet := oldTablet
	newTablet.Type = pb.TabletType_MASTER
	newTablet.HealthMap = nil
	agent.setTablet(topo.NewTabletInfo(&newTablet, -1))
	if err := agent.updateState(ctx, &oldTablet, "fastTabletExternallyReparented"); err != nil {
		return fmt.Errorf("fastTabletExternallyReparented: failed to change tablet state to MASTER: %v", err)
	}

	agent.mutex.Lock()
	agent._tabletExternallyReparentedTime = time.Now()
	agent.mutex.Unlock()

	// Directly write the new master endpoint in the serving graph.
	// We will do a true rebuild in the background soon, but in the meantime,
	// this will be enough for clients to re-resolve the new master.
	event.DispatchUpdate(ev, "writing new master endpoint")
	log.Infof("fastTabletExternallyReparented: writing new master endpoint to serving graph")
	ep, err := topo.TabletEndPoint(tablet.Tablet)
	if err != nil {
		return fmt.Errorf("fastTabletExternallyReparented: failed to generate EndPoint for tablet %v: %v", tablet.Alias, err)
	}
	err = topo.UpdateEndPoints(ctx, agent.TopoServer, tablet.Alias.Cell,
		si.Keyspace(), si.ShardName(), pb.TabletType_MASTER,
		&pb.EndPoints{Entries: []*pb.EndPoint{ep}}, -1)
	if err != nil {
		return fmt.Errorf("fastTabletExternallyReparented: failed to update master endpoint: %v", err)
	}
	externalReparentStats.Record("NewMasterVisible", startTime)

	// Start the finalize stage with a background context, but connect the trace.
	bgCtx, cancel := context.WithTimeout(agent.batchCtx, *finalizeReparentTimeout)
	bgCtx = trace.CopySpan(bgCtx, ctx)
	agent.finalizeReparentCtx = bgCtx
	go func() {
		err := agent.finalizeTabletExternallyReparented(bgCtx, si, ev)
		cancel()

		if err != nil {
			log.Warningf("finalizeTabletExternallyReparented error: %v", err)
			event.DispatchUpdate(ev, "failed: "+err.Error())
			return
		}
		externalReparentStats.Record("FullRebuild", startTime)
	}()

	return nil
}