// UpdateStream is part of the vtgate service API. func (vtg *VTGate) UpdateStream(ctx context.Context, keyspace string, shard string, keyRange *topodatapb.KeyRange, tabletType topodatapb.TabletType, timestamp int64, event *querypb.EventToken, sendReply func(*querypb.StreamEvent, int64) error) error { startTime := time.Now() ltt := topoproto.TabletTypeLString(tabletType) statsKey := []string{"UpdateStream", keyspace, ltt} defer vtg.timings.Record(statsKey, startTime) err := vtg.resolver.UpdateStream( ctx, keyspace, shard, keyRange, tabletType, timestamp, event, sendReply, ) if err != nil { normalErrors.Add(statsKey, 1) query := map[string]interface{}{ "Keyspace": keyspace, "Shard": shard, "KeyRange": keyRange, "TabletType": ltt, "Timestamp": timestamp, } logError(err, query, vtg.logUpdateStream) } return formatError(err) }
// StatusAsHTML returns an HTML version of our status. // It works best if there is data in the cache. func (st *SrvKeyspaceCacheStatus) StatusAsHTML() template.HTML { if st.Value == nil { return template.HTML("No Data") } result := "<b>Partitions:</b><br>" for tabletType, keyspacePartition := range st.Value.Partitions { result += " <b>" + string(tabletType) + "</b>" for _, shard := range keyspacePartition.ShardReferences { result += " " + shard.Name } result += "<br>" } if st.Value.ShardingColumnName != "" { result += "<b>ShardingColumnName:</b> " + st.Value.ShardingColumnName + "<br>" result += "<b>ShardingColumnType:</b> " + string(st.Value.ShardingColumnType) + "<br>" } if len(st.Value.ServedFrom) > 0 { result += "<b>ServedFrom:</b><br>" for _, sf := range st.Value.ServedFrom { result += " <b>" + topoproto.TabletTypeLString(sf.TabletType) + "</b> " + sf.Keyspace + "<br>" } } return template.HTML(result) }
// ExecuteKeyRanges executes a non-streaming query based on the specified keyranges. func (vtg *VTGate) ExecuteKeyRanges(ctx context.Context, sql string, bindVariables map[string]interface{}, keyspace string, keyRanges []*topodatapb.KeyRange, tabletType topodatapb.TabletType, session *vtgatepb.Session, notInTransaction bool) (*sqltypes.Result, error) { startTime := time.Now() ltt := topoproto.TabletTypeLString(tabletType) statsKey := []string{"ExecuteKeyRanges", keyspace, ltt} defer vtg.timings.Record(statsKey, startTime) sql = sqlannotation.AddFilteredReplicationUnfriendlyIfDML(sql) qr, err := vtg.resolver.ExecuteKeyRanges(ctx, sql, bindVariables, keyspace, keyRanges, tabletType, session, notInTransaction) if err == nil { vtg.rowsReturned.Add(statsKey, int64(len(qr.Rows))) return qr, nil } query := map[string]interface{}{ "Sql": sql, "BindVariables": bindVariables, "Keyspace": keyspace, "KeyRanges": keyRanges, "TabletType": ltt, "Session": session, "NotInTransaction": notInTransaction, } handleExecuteError(err, statsKey, query, vtg.logExecuteKeyRanges) return nil, err }
// ExecuteBatchKeyspaceIds executes a group of queries based on the specified keyspace ids. func (vtg *VTGate) ExecuteBatchKeyspaceIds(ctx context.Context, queries []*vtgatepb.BoundKeyspaceIdQuery, tabletType topodatapb.TabletType, asTransaction bool, session *vtgatepb.Session, options *querypb.ExecuteOptions) ([]sqltypes.Result, error) { startTime := time.Now() ltt := topoproto.TabletTypeLString(tabletType) statsKey := []string{"ExecuteBatchKeyspaceIds", "", ltt} defer vtg.timings.Record(statsKey, startTime) annotateBoundKeyspaceIDQueries(queries) qrs, err := vtg.resolver.ExecuteBatchKeyspaceIds( ctx, queries, tabletType, asTransaction, session, options) if err == nil { var rowCount int64 for _, qr := range qrs { rowCount += int64(len(qr.Rows)) } vtg.rowsReturned.Add(statsKey, rowCount) return qrs, nil } query := map[string]interface{}{ "Queries": queries, "TabletType": ltt, "AsTransaction": asTransaction, "Session": session, "Options": options, } err = handleExecuteError(err, statsKey, query, vtg.logExecuteBatchKeyspaceIds) return nil, err }
// StreamExecuteShards executes a streaming query on the specified shards. func (vtg *VTGate) StreamExecuteShards(ctx context.Context, sql string, bindVariables map[string]interface{}, keyspace string, shards []string, tabletType topodatapb.TabletType, options *querypb.ExecuteOptions, sendReply func(*sqltypes.Result) error) error { startTime := time.Now() ltt := topoproto.TabletTypeLString(tabletType) statsKey := []string{"StreamExecuteShards", keyspace, ltt} defer vtg.timings.Record(statsKey, startTime) err := vtg.resolver.streamExecute( ctx, sql, bindVariables, keyspace, tabletType, func(keyspace string) (string, []string, error) { return keyspace, shards, nil }, options, func(reply *sqltypes.Result) error { vtg.rowsReturned.Add(statsKey, int64(len(reply.Rows))) return sendReply(reply) }) if err != nil { normalErrors.Add(statsKey, 1) query := map[string]interface{}{ "Sql": sql, "BindVariables": bindVariables, "Keyspace": keyspace, "Shards": shards, "TabletType": ltt, "Options": options, } logError(err, query, vtg.logStreamExecuteShards) } return formatError(err) }
// ExecuteEntityIds excutes a non-streaming query based on given KeyspaceId map. func (vtg *VTGate) ExecuteEntityIds(ctx context.Context, sql string, bindVariables map[string]interface{}, keyspace string, entityColumnName string, entityKeyspaceIDs []*vtgatepb.ExecuteEntityIdsRequest_EntityId, tabletType topodatapb.TabletType, session *vtgatepb.Session, notInTransaction bool, options *querypb.ExecuteOptions) (*sqltypes.Result, error) { startTime := time.Now() ltt := topoproto.TabletTypeLString(tabletType) statsKey := []string{"ExecuteEntityIds", keyspace, ltt} defer vtg.timings.Record(statsKey, startTime) sql = sqlannotation.AnnotateIfDML(sql, nil) qr, err := vtg.resolver.ExecuteEntityIds(ctx, sql, bindVariables, keyspace, entityColumnName, entityKeyspaceIDs, tabletType, session, notInTransaction, options) if err == nil { vtg.rowsReturned.Add(statsKey, int64(len(qr.Rows))) return qr, nil } query := map[string]interface{}{ "Sql": sql, "BindVariables": bindVariables, "Keyspace": keyspace, "EntityColumnName": entityColumnName, "EntityKeyspaceIDs": entityKeyspaceIDs, "TabletType": ltt, "Session": session, "NotInTransaction": notInTransaction, "Options": options, } err = handleExecuteError(err, statsKey, query, vtg.logExecuteEntityIds) return nil, err }
// StreamExecuteKeyRanges executes a streaming query on the specified KeyRanges. // The KeyRanges are resolved to shards using the serving graph. // This function currently temporarily enforces the restriction of executing on // one shard since it cannot merge-sort the results to guarantee ordering of // response which is needed for checkpointing. // The api supports supplying multiple keyranges to make it future proof. func (vtg *VTGate) StreamExecuteKeyRanges(ctx context.Context, sql string, bindVariables map[string]interface{}, keyspace string, keyRanges []*topodatapb.KeyRange, tabletType topodatapb.TabletType, sendReply func(*sqltypes.Result) error) error { startTime := time.Now() ltt := topoproto.TabletTypeLString(tabletType) statsKey := []string{"StreamExecuteKeyRanges", keyspace, ltt} defer vtg.timings.Record(statsKey, startTime) var rowCount int64 err := vtg.resolver.StreamExecuteKeyRanges( ctx, sql, bindVariables, keyspace, keyRanges, tabletType, func(reply *sqltypes.Result) error { rowCount += int64(len(reply.Rows)) vtg.rowsReturned.Add(statsKey, int64(len(reply.Rows))) return sendReply(reply) }) if err != nil { normalErrors.Add(statsKey, 1) query := map[string]interface{}{ "Sql": sql, "BindVariables": bindVariables, "Keyspace": keyspace, "KeyRanges": keyRanges, "TabletType": ltt, } logError(err, query, vtg.logStreamExecuteKeyRanges) } return formatError(err) }
// ExecuteBatchShards executes a group of queries on the specified shards. func (vtg *VTGate) ExecuteBatchShards(ctx context.Context, queries []*vtgatepb.BoundShardQuery, tabletType topodatapb.TabletType, asTransaction bool, session *vtgatepb.Session) ([]sqltypes.Result, error) { startTime := time.Now() ltt := topoproto.TabletTypeLString(tabletType) statsKey := []string{"ExecuteBatchShards", "", ltt} defer vtg.timings.Record(statsKey, startTime) annotateBoundShardQueriesAsUnfriendly(queries) qrs, err := vtg.resolver.ExecuteBatch( ctx, tabletType, asTransaction, session, func() (*scatterBatchRequest, error) { return boundShardQueriesToScatterBatchRequest(queries) }) if err == nil { var rowCount int64 for _, qr := range qrs { rowCount += int64(len(qr.Rows)) } vtg.rowsReturned.Add(statsKey, rowCount) return qrs, nil } query := map[string]interface{}{ "Queries": queries, "TabletType": ltt, "AsTransaction": asTransaction, "Session": session, } handleExecuteError(err, statsKey, query, vtg.logExecuteBatchShards) return nil, err }
func getKeyspaceShards(ctx context.Context, topoServ topo.SrvTopoServer, cell, keyspace string, tabletType topodatapb.TabletType) (string, *topodatapb.SrvKeyspace, []*topodatapb.ShardReference, error) { srvKeyspace, err := topoServ.GetSrvKeyspace(ctx, cell, keyspace) if err != nil { return "", nil, nil, vterrors.NewVitessError( vtrpcpb.ErrorCode_INTERNAL_ERROR, err, "keyspace %v fetch error: %v", keyspace, err, ) } // check if the keyspace has been redirected for this tabletType. for _, sf := range srvKeyspace.ServedFrom { if sf.TabletType == tabletType { keyspace = sf.Keyspace srvKeyspace, err = topoServ.GetSrvKeyspace(ctx, cell, keyspace) if err != nil { return "", nil, nil, vterrors.NewVitessError( vtrpcpb.ErrorCode_INTERNAL_ERROR, err, "keyspace %v fetch error: %v", keyspace, err, ) } } } partition := topoproto.SrvKeyspaceGetPartition(srvKeyspace, tabletType) if partition == nil { return "", nil, nil, vterrors.NewVitessError( vtrpcpb.ErrorCode_INTERNAL_ERROR, err, "No partition found for tabletType %v in keyspace %v", topoproto.TabletTypeLString(tabletType), keyspace, ) } return keyspace, srvKeyspace, partition.ShardReferences, nil }
// NewShardError returns a ShardError which preserves the original // error code if possible, adds the connection context and adds a bit // to determine whether the keyspace/shard needs to be re-resolved for // a potential sharding event (namely, if we were in a transaction). func NewShardError(in error, keyspace, shard string, tabletType topodatapb.TabletType, tablet *topodatapb.Tablet, inTransaction bool) error { if in == nil { return nil } var shardIdentifier string if tablet != nil { shardIdentifier = fmt.Sprintf("%s.%s.%s, %+v", keyspace, shard, topoproto.TabletTypeLString(tabletType), tablet) } else { shardIdentifier = fmt.Sprintf("%s.%s.%s", keyspace, shard, topoproto.TabletTypeLString(tabletType)) } return &ShardError{ ShardIdentifier: shardIdentifier, InTransaction: inTransaction, Err: in, ErrorCode: vterrors.RecoverVtErrorCode(in), } }
func (hc *HealthCheckImpl) checkHealthCheckTimeout() { hc.mu.RLock() list := make([]*healthCheckConn, 0, len(hc.addrToConns)) for _, hcc := range hc.addrToConns { list = append(list, hcc) } hc.mu.RUnlock() for _, hcc := range list { hcc.mu.RLock() if !hcc.tabletStats.Serving { // ignore non-serving tablet hcc.mu.RUnlock() continue } if time.Now().Sub(hcc.lastResponseTimestamp) < hc.healthCheckTimeout { // received a healthcheck response recently hcc.mu.RUnlock() continue } hcc.mu.RUnlock() // mark the tablet non-serving as we have not seen a health check response for a long time hcc.mu.Lock() // check again to avoid race condition if !hcc.tabletStats.Serving { // ignore non-serving tablet hcc.mu.Unlock() continue } if time.Now().Sub(hcc.lastResponseTimestamp) < hc.healthCheckTimeout { // received a healthcheck response recently hcc.mu.Unlock() continue } hcc.tabletStats.Serving = false hcc.tabletStats.LastError = fmt.Errorf("healthcheck timed out (latest %v)", hcc.lastResponseTimestamp) ts := hcc.tabletStats hcc.mu.Unlock() // notify downstream for serving status change if hc.listener != nil { hc.listener.StatsUpdate(&ts) } hcErrorCounters.Add([]string{ts.Target.Keyspace, ts.Target.Shard, topoproto.TabletTypeLString(ts.Target.TabletType)}, 1) } }
func (lg *l2VTGateGateway) getStatsAggregator(conn *l2VTGateConn, tabletType topodatapb.TabletType) *TabletStatusAggregator { key := fmt.Sprintf("%v:%v", conn.addr, topoproto.TabletTypeLString(tabletType)) // get existing aggregator lg.mu.RLock() aggr, ok := lg.statusAggregators[key] lg.mu.RUnlock() if ok { return aggr } // create a new one, but check again before the creation lg.mu.Lock() defer lg.mu.Unlock() aggr, ok = lg.statusAggregators[key] if ok { return aggr } aggr = NewTabletStatusAggregator(conn.keyspace, conn.shard, tabletType, key) lg.statusAggregators[key] = aggr return aggr }
// changeCallback is run after every action that might
// have changed something in the tablet record or in the topology.
//
// It owns making changes to the BinlogPlayerMap. The input for this is the
// tablet type (has to be master), and the shard's SourceShards.
//
// It owns updating the blacklisted tables.
//
// It owns updating the stats record for 'TabletType'.
//
// It owns starting and stopping the update stream service.
//
// It owns reading the TabletControl for the current tablet, and storing it.
//
// oldTablet and newTablet are the tablet records before and after the change.
// Both are dereferenced without a nil check.
// NOTE(review): presumably callers always pass non-nil records — confirm.
func (agent *ActionAgent) changeCallback(ctx context.Context, oldTablet, newTablet *topodatapb.Tablet) {
	span := trace.NewSpanFromContext(ctx)
	span.StartLocal("ActionAgent.changeCallback")
	defer span.Finish()

	// allowQuery starts from the tablet type alone and may be turned off below
	// by shard-level settings. runUpdateStream starts with the same value and
	// is only turned off later if the query service fails to start.
	allowQuery := topo.IsRunningQueryService(newTablet.Type)
	broadcastHealth := false
	runUpdateStream := allowQuery

	// Read the shard to get SourceShards / TabletControlMap if
	// we're going to use it.
	var shardInfo *topo.ShardInfo
	var err error
	var disallowQueryReason string
	var blacklistedTables []string
	updateBlacklistedTables := true
	if allowQuery {
		shardInfo, err = agent.TopoServer.GetShard(ctx, newTablet.Keyspace, newTablet.Shard)
		if err != nil {
			// Without the shard record the blacklist is left untouched and
			// allowQuery keeps its type-based value.
			log.Errorf("Cannot read shard for this tablet %v, might have inaccurate SourceShards and TabletControls: %v", newTablet.Alias, err)
			updateBlacklistedTables = false
		} else {
			// A master with SourceShards set does not serve queries.
			if newTablet.Type == topodatapb.TabletType_MASTER {
				if len(shardInfo.SourceShards) > 0 {
					allowQuery = false
					disallowQueryReason = "master tablet with filtered replication on"
				}
			}
			// A TabletControl for this type only applies if this tablet's
			// cell is in its cell list.
			if tc := shardInfo.GetTabletControl(newTablet.Type); tc != nil {
				if topo.InCellList(newTablet.Alias.Cell, tc.Cells) {
					if tc.DisableQueryService {
						allowQuery = false
						disallowQueryReason = "TabletControl.DisableQueryService set"
					}
					blacklistedTables = tc.BlacklistedTables
				}
			}
		}
	} else {
		disallowQueryReason = fmt.Sprintf("not a serving tablet type(%v)", newTablet.Type)
	}
	// NOTE(review): runUpdateStream is passed here with its initial value; it
	// can still be set to false further down if SetServingType fails.
	agent.setServicesDesiredState(disallowQueryReason, runUpdateStream)
	if updateBlacklistedTables {
		if err := agent.loadBlacklistRules(newTablet, blacklistedTables); err != nil {
			// FIXME(alainjobart) how to handle this error?
			log.Errorf("Cannot update blacklisted tables rule: %v", err)
		} else {
			agent.setBlacklistedTables(blacklistedTables)
		}
	}

	if allowQuery {
		// Query service should be running.
		if oldTablet.Type == topodatapb.TabletType_REPLICA && newTablet.Type == topodatapb.TabletType_MASTER {
			// When promoting from replica to master, allow both master and replica
			// queries to be served during gracePeriod.
			if _, err := agent.QueryServiceControl.SetServingType(newTablet.Type, true, []topodatapb.TabletType{oldTablet.Type}); err == nil {
				// If successful, broadcast to vtgate and then wait.
				// The sleep blocks this callback for the whole grace period.
				agent.broadcastHealth()
				time.Sleep(*gracePeriod)
			} else {
				log.Errorf("Can't start query service for MASTER+REPLICA mode: %v", err)
			}
		}

		// Switch to serving the new type only (no extra types allowed).
		if stateChanged, err := agent.QueryServiceControl.SetServingType(newTablet.Type, true, nil); err == nil {
			// If the state changed, broadcast to vtgate.
			// (e.g. this happens when the tablet was already master, but it just
			// changed from NOT_SERVING to SERVING due to
			// "vtctl MigrateServedFrom ... master".)
			if stateChanged {
				broadcastHealth = true
			}
		} else {
			// The update stream is not started when the query service
			// could not be started.
			runUpdateStream = false
			log.Errorf("Cannot start query service: %v", err)
		}
	} else {
		// Query service should be stopped.
		if topo.IsSubjectToLameduck(oldTablet.Type) && newTablet.Type == topodatapb.TabletType_SPARE && *gracePeriod > 0 {
			// When a non-MASTER serving type is going SPARE,
			// put query service in lameduck during gracePeriod.
			agent.lameduck(disallowQueryReason)
		}

		log.Infof("Disabling query service on type change, reason: %v", disallowQueryReason)
		if stateChanged, err := agent.QueryServiceControl.SetServingType(newTablet.Type, false, nil); err == nil {
			// If the state changed, broadcast to vtgate.
			// (e.g. this happens when the tablet was already master, but it just
			// changed from SERVING to NOT_SERVING because filtered replication was
			// enabled.)
			if stateChanged {
				broadcastHealth = true
			}
		} else {
			log.Errorf("SetServingType(serving=false) failed: %v", err)
		}
	}

	// update stream needs to be started or stopped too
	if topo.IsRunningUpdateStream(newTablet.Type) && runUpdateStream {
		agent.UpdateStream.Enable()
	} else {
		agent.UpdateStream.Disable()
	}

	// update the stats to our current type
	if agent.exportStats {
		agent.statsTabletType.Set(topoproto.TabletTypeLString(newTablet.Type))
	}

	// See if we need to start or stop any binlog player
	if agent.BinlogPlayerMap != nil {
		if newTablet.Type == topodatapb.TabletType_MASTER {
			// NOTE(review): shardInfo is nil here if GetShard failed above;
			// confirm RefreshMap tolerates a nil ShardInfo.
			agent.BinlogPlayerMap.RefreshMap(agent.batchCtx, newTablet, shardInfo)
		} else {
			agent.BinlogPlayerMap.StopAllPlayersAndReset()
		}
	}

	// Broadcast health changes to vtgate immediately.
	if broadcastHealth {
		agent.broadcastHealth()
	}
}
// checkConn performs health checking on the given tablet.
//
// It loops until hcc.ctx is done: connect, read health responses, and on
// failure wait hc.retryDelay before trying again. On return it signals hc.wg
// and closes any connection still held by hcc. The name parameter is not used
// in the body.
func (hc *HealthCheckImpl) checkConn(hcc *healthCheckConn, name string) {
	defer hc.wg.Done()
	// Close and clear the connection, if any, when this function exits.
	defer func() {
		hcc.mu.Lock()
		if hcc.conn != nil {
			hcc.conn.Close()
			hcc.conn = nil
		}
		hcc.mu.Unlock()
	}()
	// retry health check if it fails
	for {
		// Try to connect to the tablet.
		stream, err := hcc.connect(hc)
		if err != nil {
			// Exit without recording an error if the context is already done.
			select {
			case <-hcc.ctx.Done():
				return
			default:
			}
			// Record the failure. The listener is not notified on this path.
			hcc.mu.Lock()
			hcc.tabletStats.Serving = false
			hcc.tabletStats.LastError = err
			target := hcc.tabletStats.Target
			hcc.mu.Unlock()
			hcErrorCounters.Add([]string{target.Keyspace, target.Shard, topoproto.TabletTypeLString(target.TabletType)}, 1)

			// Sleep until the next retry is up or the context is done/canceled.
			// If the context is done, the next loop iteration returns via
			// the check above, provided connect fails again.
			select {
			case <-hcc.ctx.Done():
			case <-time.After(hc.retryDelay):
			}
			continue
		}

		// Read stream health responses.
		for {
			reconnect, err := hcc.processResponse(hc, stream)
			if err != nil {
				// Mark the tablet non-serving and take a copy of the stats
				// so the listener can be called without holding the lock.
				hcc.mu.Lock()
				hcc.tabletStats.Serving = false
				hcc.tabletStats.LastError = err
				ts := hcc.tabletStats
				hcc.mu.Unlock()
				// notify downstream for serving status change
				if hc.listener != nil {
					hc.listener.StatsUpdate(&ts)
				}
				// Exit if the context is done; the error counter is not
				// incremented in that case.
				select {
				case <-hcc.ctx.Done():
					return
				default:
				}
				hcErrorCounters.Add([]string{ts.Target.Keyspace, ts.Target.Shard, topoproto.TabletTypeLString(ts.Target.TabletType)}, 1)
				if reconnect {
					// Drop the connection and reset the target before going
					// back to the outer loop to connect again.
					// NOTE(review): hcc.conn is closed here without a nil
					// check — presumably it is always set after a successful
					// connect; confirm.
					hcc.mu.Lock()
					hcc.conn.Close()
					hcc.conn = nil
					hcc.tabletStats.Target = &querypb.Target{}
					hcc.mu.Unlock()
					// Sleep until the next retry is up or the context is done/canceled.
					select {
					case <-hcc.ctx.Done():
					case <-time.After(hc.retryDelay):
					}
					break
				}
				// When reconnect is false the same stream is read again.
			}
		}
	}
}
func (stc *ScatterConn) startAction(name, keyspace, shard string, tabletType topodatapb.TabletType) (time.Time, []string) { statsKey := []string{name, keyspace, shard, topoproto.TabletTypeLString(tabletType)} startTime := time.Now() return startTime, statsKey }
func (l *L2VTGate) startAction(name string, target *querypb.Target) (time.Time, []string) { statsKey := []string{name, target.Keyspace, target.Shard, topoproto.TabletTypeLString(target.TabletType)} startTime := time.Now() return startTime, statsKey }
// servingConnStats returns the number of serving tablets per keyspace/shard/tablet type. func (hc *HealthCheckImpl) servingConnStats() map[string]int64 { res := make(map[string]int64) hc.mu.RLock() defer hc.mu.RUnlock() for _, hcc := range hc.addrToConns { hcc.mu.RLock() if !hcc.tabletStats.Up || !hcc.tabletStats.Serving || hcc.tabletStats.LastError != nil { hcc.mu.RUnlock() continue } key := fmt.Sprintf("%s.%s.%s", hcc.tabletStats.Target.Keyspace, hcc.tabletStats.Target.Shard, topoproto.TabletTypeLString(hcc.tabletStats.Target.TabletType)) hcc.mu.RUnlock() res[key]++ } return res }