Example #1
0
// unlockShard unlocks a previously locked shard.
func (l *Lock) unlockShard(ctx context.Context, ts Server, keyspace, shard string, lockPath string, actionError error) error {
	// Detach from the parent timeout, but copy the trace span.
	// We need to still release the lock even if the parent context timed out.
	ctx = trace.CopySpan(context.TODO(), ctx)
	ctx, cancel := context.WithTimeout(ctx, DefaultLockTimeout)
	defer cancel()

	span := trace.NewSpanFromContext(ctx)
	span.StartClient("TopoServer.UnlockShardForAction")
	span.Annotate("action", l.Action)
	span.Annotate("keyspace", keyspace)
	span.Annotate("shard", shard)
	defer span.Finish()

	// first update the actionNode
	if actionError != nil {
		log.Infof("Unlocking shard %v/%v for action %v with error %v", keyspace, shard, l.Action, actionError)
		l.Status = "Error: " + actionError.Error()
	} else {
		log.Infof("Unlocking shard %v/%v for successful action %v", keyspace, shard, l.Action)
		l.Status = "Done"
	}
	j, err := l.ToJSON()
	if err != nil {
		return err
	}
	return ts.UnlockShardForAction(ctx, keyspace, shard, lockPath, j)
}
Example #2
0
// UnlockShard unlocks a previously locked shard.
func (n *ActionNode) UnlockShard(ctx context.Context, ts topo.Server, keyspace, shard string, lockPath string, actionError error) error {
	// Detach from the parent timeout, but copy the trace span.
	// We need to still release the lock even if the parent context timed out.
	ctx = trace.CopySpan(context.TODO(), ctx)
	ctx, cancel := context.WithTimeout(ctx, DefaultLockTimeout)
	defer cancel()

	span := trace.NewSpanFromContext(ctx)
	span.StartClient("TopoServer.UnlockShardForAction")
	span.Annotate("action", n.Action)
	span.Annotate("keyspace", keyspace)
	span.Annotate("shard", shard)
	defer span.Finish()

	// first update the actionNode
	if actionError != nil {
		log.Infof("Unlocking shard %v/%v for action %v with error %v", keyspace, shard, n.Action, actionError)
		n.Error = actionError.Error()
		n.State = ActionStateFailed
	} else {
		log.Infof("Unlocking shard %v/%v for successful action %v", keyspace, shard, n.Action)
		n.Error = ""
		n.State = ActionStateDone
	}
	err := ts.UnlockShardForAction(ctx, keyspace, shard, lockPath, n.ToJSON())
	if actionError != nil {
		if err != nil {
			// this will be masked
			log.Warningf("UnlockShardForAction failed: %v", err)
		}
		return actionError
	}
	return err
}
Example #3
0
// CommitPrepared commits a prepared transaction. If the operation
// fails, an error counter is incremented and the transaction is
// marked as defunct in the redo log. If the marking fails, a
// different error counter is incremented indicating a more
// severe condition.
func (txe *TxExecutor) CommitPrepared(dtid string) error {
	defer txe.qe.queryServiceStats.QueryStats.Record("COMMIT_PREPARED", time.Now())
	conn := txe.qe.preparedPool.Get(dtid)
	if conn == nil {
		return nil
	}
	// We have to use a context that will never give up,
	// even if the original context expires.
	ctx := trace.CopySpan(context.Background(), txe.ctx)
	defer txe.qe.txPool.LocalConclude(ctx, conn)
	err := txe.qe.twoPC.DeleteRedo(ctx, conn, dtid)
	if err != nil {
		// TODO(sougou): raise alarms & mark as defunct.
		return err
	}
	err = txe.qe.txPool.LocalCommit(ctx, conn)
	if err != nil {
		// TODO(sougou): raise alarms & mark as defunct.
		return err
	}
	return nil
}
Example #4
0
// TabletExternallyReparented updates all topo records so the current
// tablet is the new master for this shard.
// Should be called under RPCWrapLock.
func (agent *ActionAgent) TabletExternallyReparented(ctx context.Context, externalID string) error {
	startTime := time.Now()

	// If there is a finalize step running, wait for it to finish or time out
	// before checking the global shard record again.
	if agent.finalizeReparentCtx != nil {
		select {
		case <-agent.finalizeReparentCtx.Done():
			agent.finalizeReparentCtx = nil
		case <-ctx.Done():
			return ctx.Err()
		}
	}

	tablet := agent.Tablet()

	// Check the global shard record.
	si, err := agent.TopoServer.GetShard(ctx, tablet.Keyspace, tablet.Shard)
	if err != nil {
		log.Warningf("fastTabletExternallyReparented: failed to read global shard record for %v/%v: %v", tablet.Keyspace, tablet.Shard, err)
		return err
	}
	if topoproto.TabletAliasEqual(si.MasterAlias, tablet.Alias) {
		// We may get called on the current master even when nothing has changed.
		// If the global shard record is already updated, it means we successfully
		// finished a previous reparent to this tablet.
		return nil
	}

	// Remember when we were first told we're the master.
	// If another tablet claims to be master and offers a more recent time,
	// that tablet will be trusted over us.
	agent.mutex.Lock()
	agent._tabletExternallyReparentedTime = startTime
	agent._replicationDelay = 0
	agent.mutex.Unlock()

	// Create a reusable Reparent event with available info.
	ev := &events.Reparent{
		ShardInfo: *si,
		NewMaster: *tablet,
		OldMaster: topodatapb.Tablet{
			Alias: si.MasterAlias,
			Type:  topodatapb.TabletType_MASTER,
		},
		ExternalID: externalID,
	}
	defer func() {
		if err != nil {
			event.DispatchUpdate(ev, "failed: "+err.Error())
		}
	}()
	event.DispatchUpdate(ev, "starting external from tablet (fast)")

	var wg sync.WaitGroup
	var errs concurrency.AllErrorRecorder

	// Execute state change to master by force-updating only the local copy of the
	// tablet record. The actual record in topo will be updated later.
	log.Infof("fastTabletExternallyReparented: executing change callback for state change to MASTER")
	oldTablet := proto.Clone(tablet).(*topodatapb.Tablet)
	tablet.Type = topodatapb.TabletType_MASTER
	tablet.HealthMap = nil
	agent.setTablet(tablet)

	wg.Add(1)
	go func() {
		defer wg.Done()

		// This is where updateState will block for gracePeriod, while it gives
		// vtgate a chance to stop sending replica queries.
		if err := agent.updateState(ctx, oldTablet, "fastTabletExternallyReparented"); err != nil {
			errs.RecordError(fmt.Errorf("fastTabletExternallyReparented: failed to change tablet state to MASTER: %v", err))
		}
	}()

	wg.Add(1)
	go func() {
		defer wg.Done()

		// Directly write the new master endpoint in the serving graph.
		// We will do a true rebuild in the background soon, but in the meantime,
		// this will be enough for clients to re-resolve the new master.
		event.DispatchUpdate(ev, "writing new master endpoint")
		log.Infof("fastTabletExternallyReparented: writing new master endpoint to serving graph")
		ep, err := topo.TabletEndPoint(tablet)
		if err != nil {
			errs.RecordError(fmt.Errorf("fastTabletExternallyReparented: failed to generate EndPoint for tablet %v: %v", tablet.Alias, err))
			return
		}
		err = topo.UpdateEndPoints(ctx, agent.TopoServer, tablet.Alias.Cell,
			si.Keyspace(), si.ShardName(), topodatapb.TabletType_MASTER,
			&topodatapb.EndPoints{Entries: []*topodatapb.EndPoint{ep}}, -1)
		if err != nil {
			errs.RecordError(fmt.Errorf("fastTabletExternallyReparented: failed to update master endpoint: %v", err))
			return
		}
		externalReparentStats.Record("NewMasterVisible", startTime)
	}()

	// Wait for serving state grace period and serving graph update.
	wg.Wait()
	if errs.HasErrors() {
		return errs.Error()
	}

	// Start the finalize stage with a background context, but connect the trace.
	bgCtx, cancel := context.WithTimeout(agent.batchCtx, *finalizeReparentTimeout)
	bgCtx = trace.CopySpan(bgCtx, ctx)
	agent.finalizeReparentCtx = bgCtx
	go func() {
		err := agent.finalizeTabletExternallyReparented(bgCtx, si, ev)
		cancel()

		if err != nil {
			log.Warningf("finalizeTabletExternallyReparented error: %v", err)
			event.DispatchUpdate(ev, "failed: "+err.Error())
			return
		}
		externalReparentStats.Record("FullRebuild", startTime)
	}()

	return nil
}
Example #5
0
// TabletExternallyReparented updates all topo records so the current
// tablet is the new master for this shard.
// Should be called under RPCWrapLock.
func (agent *ActionAgent) TabletExternallyReparented(ctx context.Context, externalID string) error {
	startTime := time.Now()

	// If there is a finalize step running, wait for it to finish or time out
	// before checking the global shard record again.
	if agent.finalizeReparentCtx != nil {
		select {
		case <-agent.finalizeReparentCtx.Done():
			agent.finalizeReparentCtx = nil
		case <-ctx.Done():
			return ctx.Err()
		}
	}

	tablet := agent.Tablet()

	// Check the global shard record.
	si, err := topo.GetShard(ctx, agent.TopoServer, tablet.Keyspace, tablet.Shard)
	if err != nil {
		log.Warningf("fastTabletExternallyReparented: failed to read global shard record for %v/%v: %v", tablet.Keyspace, tablet.Shard, err)
		return err
	}
	if si.MasterAlias == tablet.Alias {
		// We may get called on the current master even when nothing has changed.
		// If the global shard record is already updated, it means we successfully
		// finished a previous reparent to this tablet.
		return nil
	}

	// Create a reusable Reparent event with available info.
	ev := &events.Reparent{
		ShardInfo:  *si,
		NewMaster:  *tablet.Tablet,
		OldMaster:  topo.Tablet{Alias: si.MasterAlias, Type: topo.TYPE_MASTER},
		ExternalID: externalID,
	}
	defer func() {
		if err != nil {
			event.DispatchUpdate(ev, "failed: "+err.Error())
		}
	}()
	event.DispatchUpdate(ev, "starting external from tablet (fast)")

	// Execute state change to master by force-updating only the local copy of the
	// tablet record. The actual record in topo will be updated later.
	log.Infof("fastTabletExternallyReparented: executing change callback for state change to MASTER")
	oldTablet := *tablet.Tablet
	newTablet := oldTablet
	newTablet.Type = topo.TYPE_MASTER
	newTablet.Health = nil
	agent.setTablet(topo.NewTabletInfo(&newTablet, -1))
	if err := agent.updateState(ctx, &oldTablet, "fastTabletExternallyReparented"); err != nil {
		return fmt.Errorf("fastTabletExternallyReparented: failed to change tablet state to MASTER: %v", err)
	}

	// Directly write the new master endpoint in the serving graph.
	// We will do a true rebuild in the background soon, but in the meantime,
	// this will be enough for clients to re-resolve the new master.
	event.DispatchUpdate(ev, "writing new master endpoint")
	log.Infof("fastTabletExternallyReparented: writing new master endpoint to serving graph")
	ep, err := tablet.EndPoint()
	if err != nil {
		return fmt.Errorf("fastTabletExternallyReparented: failed to generate EndPoint for tablet %v: %v", tablet.Alias, err)
	}
	err = topo.UpdateEndPoints(ctx, agent.TopoServer, tablet.Alias.Cell,
		si.Keyspace(), si.ShardName(), topo.TYPE_MASTER,
		&topo.EndPoints{Entries: []topo.EndPoint{*ep}}, -1)
	if err != nil {
		return fmt.Errorf("fastTabletExternallyReparented: failed to update master endpoint: %v", err)
	}
	externalReparentStats.Record("NewMasterVisible", startTime)

	// Start the finalize stage with a background context, but connect the trace.
	bgCtx, cancel := context.WithTimeout(agent.batchCtx, *finalizeReparentTimeout)
	bgCtx = trace.CopySpan(bgCtx, ctx)
	agent.finalizeReparentCtx = bgCtx
	go func() {
		err := agent.finalizeTabletExternallyReparented(bgCtx, si, ev)
		cancel()

		if err != nil {
			log.Warningf("finalizeTabletExternallyReparented error: %v", err)
			event.DispatchUpdate(ev, "failed: "+err.Error())
			return
		}
		externalReparentStats.Record("FullRebuild", startTime)
	}()

	return nil
}
Example #6
0
// TabletExternallyReparented updates all topo records so the current
// tablet is the new master for this shard.
func (agent *ActionAgent) TabletExternallyReparented(ctx context.Context, externalID string) error {
	if err := agent.lock(ctx); err != nil {
		return err
	}
	defer agent.unlock()

	startTime := time.Now()

	// If there is a finalize step running, wait for it to finish or time out
	// before checking the global shard record again.
	if agent.finalizeReparentCtx != nil {
		select {
		case <-agent.finalizeReparentCtx.Done():
			agent.finalizeReparentCtx = nil
		case <-ctx.Done():
			return ctx.Err()
		}
	}

	tablet := agent.Tablet()

	// Check the global shard record.
	si, err := agent.TopoServer.GetShard(ctx, tablet.Keyspace, tablet.Shard)
	if err != nil {
		log.Warningf("fastTabletExternallyReparented: failed to read global shard record for %v/%v: %v", tablet.Keyspace, tablet.Shard, err)
		return err
	}
	if topoproto.TabletAliasEqual(si.MasterAlias, tablet.Alias) {
		// We may get called on the current master even when nothing has changed.
		// If the global shard record is already updated, it means we successfully
		// finished a previous reparent to this tablet.
		return nil
	}

	// Remember when we were first told we're the master.
	// If another tablet claims to be master and offers a more recent time,
	// that tablet will be trusted over us.
	agent.mutex.Lock()
	agent._tabletExternallyReparentedTime = startTime
	agent._replicationDelay = 0
	agent.mutex.Unlock()

	// Create a reusable Reparent event with available info.
	ev := &events.Reparent{
		ShardInfo: *si,
		NewMaster: *tablet,
		OldMaster: topodatapb.Tablet{
			Alias: si.MasterAlias,
			Type:  topodatapb.TabletType_MASTER,
		},
		ExternalID: externalID,
	}
	defer func() {
		if err != nil {
			event.DispatchUpdate(ev, "failed: "+err.Error())
		}
	}()
	event.DispatchUpdate(ev, "starting external from tablet (fast)")

	// Execute state change to master by force-updating only the local copy of the
	// tablet record. The actual record in topo will be updated later.
	log.Infof("fastTabletExternallyReparented: executing change callback for state change to MASTER")
	oldTablet := proto.Clone(tablet).(*topodatapb.Tablet)
	tablet.Type = topodatapb.TabletType_MASTER
	agent.setTablet(tablet)

	// This is where updateState will block for gracePeriod, while it gives
	// vtgate a chance to stop sending replica queries.
	agent.updateState(ctx, oldTablet, "fastTabletExternallyReparented")

	// Start the finalize stage with a background context, but connect the trace.
	bgCtx, cancel := context.WithTimeout(agent.batchCtx, *finalizeReparentTimeout)
	bgCtx = trace.CopySpan(bgCtx, ctx)
	agent.finalizeReparentCtx = bgCtx
	go func() {
		err := agent.finalizeTabletExternallyReparented(bgCtx, si, ev)
		cancel()

		if err != nil {
			log.Warningf("finalizeTabletExternallyReparented error: %v", err)
			event.DispatchUpdate(ev, "failed: "+err.Error())
			return
		}
		externalReparentStats.Record("FullRebuild", startTime)
	}()

	return nil
}