// synchronizeReplication phase:
// 1 - ask the subset slave to stop replication
// 2 - sleep for 5 seconds
// 3 - ask the superset slave to stop replication
// Note this is not 100% correct, but good enough for now
func (worker *SQLDiffWorker) synchronizeReplication() error {
    worker.setState(SQLDiffSynchronizeReplication)

    // stop replication on subset slave
    worker.wr.Logger().Infof("Stopping replication on subset slave %v", worker.subset.alias)
    subsetTablet, err := worker.wr.TopoServer().GetTablet(worker.subset.alias)
    if err != nil {
        return err
    }
    ctx, cancel := context.WithTimeout(context.TODO(), 30*time.Second)
    if err := worker.wr.TabletManagerClient().StopSlave(ctx, subsetTablet); err != nil {
        return fmt.Errorf("Cannot stop slave %v: %v", worker.subset.alias, err)
    }
    cancel()
    if worker.CheckInterrupted() {
        return topo.ErrInterrupted
    }

    // change the cleaner actions from ChangeSlaveType(rdonly)
    // to StartSlave() + ChangeSlaveType(spare)
    wrangler.RecordStartSlaveAction(worker.cleaner, subsetTablet, 30*time.Second)
    action, err := wrangler.FindChangeSlaveTypeActionByTarget(worker.cleaner, worker.subset.alias)
    if err != nil {
        return fmt.Errorf("cannot find ChangeSlaveType action for %v: %v", worker.subset.alias, err)
    }
    action.TabletType = topo.TYPE_SPARE

    // sleep for a few seconds
    time.Sleep(5 * time.Second)
    if worker.CheckInterrupted() {
        return topo.ErrInterrupted
    }

    // stop replication on superset slave
    worker.wr.Logger().Infof("Stopping replication on superset slave %v", worker.superset.alias)
    supersetTablet, err := worker.wr.TopoServer().GetTablet(worker.superset.alias)
    if err != nil {
        return err
    }
    ctx, cancel = context.WithTimeout(context.TODO(), 30*time.Second)
    if err := worker.wr.TabletManagerClient().StopSlave(ctx, supersetTablet); err != nil {
        return fmt.Errorf("Cannot stop slave %v: %v", worker.superset.alias, err)
    }
    cancel()

    // change the cleaner actions from ChangeSlaveType(rdonly)
    // to StartSlave() + ChangeSlaveType(spare)
    wrangler.RecordStartSlaveAction(worker.cleaner, supersetTablet, 30*time.Second)
    action, err = wrangler.FindChangeSlaveTypeActionByTarget(worker.cleaner, worker.superset.alias)
    if err != nil {
        return fmt.Errorf("cannot find ChangeSlaveType action for %v: %v", worker.superset.alias, err)
    }
    action.TabletType = topo.TYPE_SPARE

    return nil
}
// findTargets phase:
// - find one rdonly in the source shard
// - mark it as 'checker' pointing back to us
// - get the aliases of all the targets
func (vscw *VerticalSplitCloneWorker) findTargets() error {
    vscw.setState(stateVSCFindTargets)

    // find an appropriate endpoint in the source shard
    var err error
    vscw.sourceAlias, err = findChecker(vscw.wr, vscw.cleaner, vscw.cell, vscw.sourceKeyspace, "0")
    if err != nil {
        return fmt.Errorf("cannot find checker for %v/%v/0: %v", vscw.cell, vscw.sourceKeyspace, err)
    }
    vscw.wr.Logger().Infof("Using tablet %v as the source", vscw.sourceAlias)

    // get the tablet info for it
    vscw.sourceTablet, err = vscw.wr.TopoServer().GetTablet(vscw.sourceAlias)
    if err != nil {
        return fmt.Errorf("cannot read tablet %v: %v", vscw.sourceAlias, err)
    }

    // stop replication on it
    ctx, cancel := context.WithTimeout(context.TODO(), 30*time.Second)
    if err := vscw.wr.TabletManagerClient().StopSlave(ctx, vscw.sourceTablet); err != nil {
        return fmt.Errorf("cannot stop replication on tablet %v", vscw.sourceAlias)
    }
    cancel()

    wrangler.RecordStartSlaveAction(vscw.cleaner, vscw.sourceTablet, 30*time.Second)
    action, err := wrangler.FindChangeSlaveTypeActionByTarget(vscw.cleaner, vscw.sourceAlias)
    if err != nil {
        return fmt.Errorf("cannot find ChangeSlaveType action for %v: %v", vscw.sourceAlias, err)
    }
    action.TabletType = topo.TYPE_SPARE

    return vscw.findMasterTargets()
}
func main() {
    flag.Parse()

    client, err := zrpc.Dial("tcp://127.0.0.1:1337")
    if err != nil {
        log.Fatal(err)
    }

    start := time.Now()
    runs := 10000
    for i := 0; i < runs; i++ {
        // Create the request and response
        req := &pb.ReverseRequest{
            NormalString: proto.String("teststring"),
        }
        resp := &pb.ReverseResponse{}

        // Create the context and pass request timeout and service name
        ctx, _ := context.WithTimeout(context.Background(), time.Second*1)
        ctx = zrpc.NewServiceNameContext(ctx, "reverseservice")
        if err := client.Call(ctx, req, resp); err != nil {
            log.Println("error:", err)
        } else {
            log.Println("received:", resp)
        }
        log.Printf("%d goroutines", runtime.NumGoroutine())
        // time.Sleep(time.Millisecond * 500)
    }
    totalTime := time.Since(start)

    log.Printf("Performed %d reqs in %s (avg %s)", runs, totalTime, totalTime/time.Duration(runs))
}
// runSqlCommands will send the sql commands to the remote tablet.
func runSqlCommands(wr *wrangler.Wrangler, ti *topo.TabletInfo, commands []string, abort chan struct{}, disableBinLogs bool) error {
    for _, command := range commands {
        command, err := fillStringTemplate(command, map[string]string{"DatabaseName": ti.DbName()})
        if err != nil {
            return fmt.Errorf("fillStringTemplate failed: %v", err)
        }

        ctx, cancel := context.WithTimeout(context.TODO(), 30*time.Second)
        _, err = wr.TabletManagerClient().ExecuteFetch(ctx, ti, command, 0, false, disableBinLogs)
        if err != nil {
            return err
        }
        cancel()

        // check on abort
        select {
        case <-abort:
            return nil
        default:
            break
        }
    }

    return nil
}
func main() {
    const numRequests = 3

    // create a channel to capture our results
    forecasts := make(chan *openweathermap.Forecast, numRequests)

    // create our channel of requests
    requests := make(chan par.RequestFunc, numRequests)
    requests <- findById(4288809, forecasts) // Covington, VA
    requests <- findById(4288809, forecasts)
    requests <- findById(4140963, forecasts) // DC
    close(requests) // important to remember to close the channel

    // resolver := par.Requests(requests).WithRedundancy(1)
    resolver := par.Requests(requests).WithConcurrency(numRequests)
    ctx, cancel := context.WithTimeout(context.Background(), 500*time.Millisecond)
    // ctx, cancel := context.WithTimeout(context.Background(), 1*time.Second)
    err := resolver.DoWithContext(ctx)
    cancel()
    ok(err)

    // the forecasts channel now contains all our forecasts
    close(forecasts)
    cities := map[string]*openweathermap.Forecast{}
    for forecast := range forecasts {
        cities[forecast.Name] = forecast
    }
}
// BasicMngrHandler can be used in a "manager" ServeHTTP once the initial required interfaces
// (authmodel.UserManager, authmodel.GroupManager, conf.Configurator, etc.) are in place.
func BasicMngrHandler(authCtx *AuthContext, rw http.ResponseWriter, req *http.Request, cond *Condition, fn HandleFunc) {
    var cancel context.CancelFunc
    authCtx.Context, cancel = context.WithTimeout(context.Background(), HandleTimeout)
    defer cancel()

    authCtx.req = req
    token := strings.TrimPrefix(req.Header.Get("Authorization"), "Bearer ")
    authCtx.saveToken(token)
    authCtx.saveId(mux.Vars(req)["user_id"])

    authCtx.Notifications = DEFAULT_NOTIFICATOR
    authCtx.Logs = DEFAULT_LOGGER

    rw.Header().Set("Content-Type", "application/json; charset=utf-8")
    if cond.RequiredPri != nil || cond.Owner {
        _, err := authCtx.ValidCurrentUser(cond.Owner, cond.RequiredPri)
        if err != nil {
            JSONError(rw, err.Error(), http.StatusForbidden)
            return
        }
    }

    status, err := fn(authCtx, rw, req)
    if err != nil {
        authCtx.Logs.Errorf("HTTP %d: %q", status, err)
        JSONError(rw, err.Error(), status)
    }
}
func (h handler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
    ctx := context.Background()
    ctx, cancel := context.WithTimeout(ctx, RequestTimeout)
    defer cancel()

    ctx = reqid.NewContext(ctx, reqid.New())
    log.Printf(ctx, "http request: remote=%q method=%q url=%q", r.RemoteAddr, r.Method, r.URL)
    h(ctx, w, r)
}
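A minimal, self-contained sketch of how a context-aware handler type like the one above can be declared and mounted. The handler name and the per-request timeout mirror the snippet; the hello endpoint, the route, the RequestTimeout value, and the simplified ServeHTTP (which drops the request-ID and context-aware logging) are assumptions for illustration only.

package main

import (
    "context"
    "fmt"
    "net/http"
    "time"
)

// RequestTimeout is an assumed per-request deadline for this sketch.
const RequestTimeout = 5 * time.Second

// handler adapts a context-aware function to http.Handler; this is a
// simplified stand-in for the ServeHTTP shown above.
type handler func(ctx context.Context, w http.ResponseWriter, r *http.Request)

func (h handler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
    ctx, cancel := context.WithTimeout(context.Background(), RequestTimeout)
    defer cancel()
    h(ctx, w, r)
}

// hello is a hypothetical endpoint that respects the per-request deadline.
func hello(ctx context.Context, w http.ResponseWriter, r *http.Request) {
    select {
    case <-time.After(10 * time.Millisecond):
        fmt.Fprintln(w, "hello")
    case <-ctx.Done():
        http.Error(w, ctx.Err().Error(), http.StatusServiceUnavailable)
    }
}

func main() {
    http.Handle("/hello", handler(hello))
    http.ListenAndServe(":8080", nil)
}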
func (tm *TabletManager) TabletExternallyReparented(ctx context.Context, args *gorpcproto.TabletExternallyReparentedArgs, reply *rpc.Unused) error {
    // TODO(alainjobart) we should forward the RPC deadline from
    // the original gorpc call. Until we support that, use a
    // reasonable hard-coded value.
    ctx, cancel := context.WithTimeout(context.TODO(), 30*time.Second)
    defer cancel()
    return tm.agent.RpcWrapLock(ctx, actionnode.TABLET_ACTION_EXTERNALLY_REPARENTED, args, reply, false, func() error {
        return tm.agent.TabletExternallyReparented(ctx, args.ExternalID)
    })
}
// applySqlShard applies a given SQL change on a given tablet alias. It allows executing arbitrary
// SQL statements, but doesn't return any results, so it's only useful for SQL statements
// that would be run for their effects (e.g., CREATE).
// It works by applying the SQL statement on the shard's master tablet with replication turned on.
// Thus it should be used only for changes that can be applied to a live instance without causing issues;
// it shouldn't be used for anything that will require a pivot.
// The SQL statement string is expected to have {{.DatabaseName}} in place of the actual db name.
func (wr *Wrangler) applySqlShard(tabletInfo *topo.TabletInfo, change string) error {
    filledChange, err := fillStringTemplate(change, map[string]string{"DatabaseName": tabletInfo.DbName()})
    if err != nil {
        return fmt.Errorf("fillStringTemplate failed: %v", err)
    }
    ctx, cancel := context.WithTimeout(wr.ctx, 30*time.Second)
    defer cancel()
    // Need to make sure that we enable binlog, since we're only applying the statement on masters.
    _, err = wr.tmc.ExecuteFetch(ctx, tabletInfo, filledChange, 0, false, false)
    return err
}
// New creates a new Wrangler object.
//
// actionTimeout: how long should we wait for an action to complete?
// - if using wrangler for just one action, this is set properly
//   upon wrangler creation.
// - if re-using wrangler multiple times, call ResetActionTimeout before
//   every action. Do not use this too much, just for corner cases.
//   It is just much easier to create a new Wrangler object per action.
//
// lockTimeout: how long should we wait for the initial lock to start
// a complex action? This is distinct from actionTimeout because most
// of the time, we want to immediately know that our action will
// fail. However, automated actions will need some time to arbitrate
// the locks.
func New(logger logutil.Logger, ts topo.Server, actionTimeout, lockTimeout time.Duration) *Wrangler {
    ctx, cancel := context.WithTimeout(context.Background(), actionTimeout)
    return &Wrangler{
        logger:      logger,
        ts:          ts,
        tmc:         tmclient.NewTabletManagerClient(),
        ctx:         ctx,
        cancel:      cancel,
        deadline:    time.Now().Add(actionTimeout),
        lockTimeout: lockTimeout,
    }
}
// handleSearch handles URLs like /search?q=golang&timeout=1s by forwarding the
// query to google.Search. If the query param includes timeout, the search is
// canceled after that duration elapses.
func handleSearch(w http.ResponseWriter, req *http.Request) {
    // ctx is the Context for this handler. Calling cancel closes the
    // ctx.Done channel, which is the cancellation signal for requests
    // started by this handler.
    var (
        ctx    context.Context
        cancel context.CancelFunc
    )
    timeout, err := time.ParseDuration(req.FormValue("timeout"))
    if err == nil {
        // The request has a timeout, so create a context that is
        // canceled automatically when the timeout expires.
        ctx, cancel = context.WithTimeout(context.Background(), timeout)
    } else {
        ctx, cancel = context.WithCancel(context.Background())
    }
    defer cancel() // Cancel ctx as soon as handleSearch returns.

    // Check the search query.
    query := req.FormValue("q")
    if query == "" {
        http.Error(w, "no query", http.StatusBadRequest)
        return
    }

    // Store the user IP in ctx for use by code in other packages.
    userIP, err := userip.FromRequest(req)
    if err != nil {
        http.Error(w, err.Error(), http.StatusBadRequest)
        return
    }
    ctx = userip.NewContext(ctx, userIP)

    // Run the Google search and print the results.
    start := time.Now()
    results, err := google.Search(ctx, query)
    elapsed := time.Since(start)
    if err != nil {
        http.Error(w, err.Error(), http.StatusInternalServerError)
        return
    }
    if err := resultsTemplate.Execute(w, struct {
        Results          google.Results
        Timeout, Elapsed time.Duration
    }{
        Results: results,
        Timeout: timeout,
        Elapsed: elapsed,
    }); err != nil {
        log.Print(err)
        return
    }
}
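The userip calls above follow the standard pattern for carrying a request-scoped value in a Context. A rough sketch of what such a package can look like: FromRequest and NewContext match the calls in the handler, while FromContext and the unexported key type are assumptions added only to make the sketch complete.

package userip

import (
    "context"
    "fmt"
    "net"
    "net/http"
)

// key is an unexported type so no other package can collide with this
// package's context key.
type key int

const userIPKey key = 0

// FromRequest extracts the client IP from the request's RemoteAddr.
func FromRequest(req *http.Request) (net.IP, error) {
    ip, _, err := net.SplitHostPort(req.RemoteAddr)
    if err != nil {
        return nil, fmt.Errorf("userip: %q is not IP:port", req.RemoteAddr)
    }
    userIP := net.ParseIP(ip)
    if userIP == nil {
        return nil, fmt.Errorf("userip: %q is not a valid IP address", ip)
    }
    return userIP, nil
}

// NewContext returns a new Context carrying userIP.
func NewContext(ctx context.Context, userIP net.IP) context.Context {
    return context.WithValue(ctx, userIPKey, userIP)
}

// FromContext extracts the user IP from ctx, if present.
func FromContext(ctx context.Context) (net.IP, bool) {
    userIP, ok := ctx.Value(userIPKey).(net.IP)
    return userIP, ok
}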
func (s *BaseSuite) SetUpTest(c *C, clientAddr string) {
    if s.client == nil {
        log.Println("setting up client at:", clientAddr)
        s.client, _ = Dial(clientAddr)
    }
    s.req = &pb.ReverseRequest{
        NormalString: proto.String("test"),
    }
    s.resp = &pb.ReverseResponse{}

    ctx, _ := context.WithTimeout(context.Background(), time.Second*3)
    s.ctx = NewServiceNameContext(ctx, "reverseservice")
}
func ExampleWithTimeout() {
    // Pass a context with a timeout to tell a blocking function that it
    // should abandon its work after the timeout elapses.
    ctx, _ := context.WithTimeout(context.Background(), 100*time.Millisecond)
    select {
    case <-time.After(200 * time.Millisecond):
        fmt.Println("overslept")
    case <-ctx.Done():
        fmt.Println(ctx.Err()) // prints "context deadline exceeded"
    }
    // Output:
    // context deadline exceeded
}
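The example above discards the CancelFunc for brevity. In most code it is worth keeping it and deferring it, so the context's timer is released as soon as the surrounding function returns rather than only when the deadline fires. A minimal variant of the same example using that idiom:

package main

import (
    "context"
    "fmt"
    "time"
)

func main() {
    // Same 100ms timeout as the example above, but the CancelFunc is kept
    // and deferred so the context's resources are released promptly.
    ctx, cancel := context.WithTimeout(context.Background(), 100*time.Millisecond)
    defer cancel()

    select {
    case <-time.After(200 * time.Millisecond):
        fmt.Println("overslept")
    case <-ctx.Done():
        fmt.Println(ctx.Err()) // context deadline exceeded
    }
}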
// findTargets phase:
// - find one rdonly in the source shard
// - mark it as 'checker' pointing back to us
// - get the aliases of all the targets
func (scw *SplitCloneWorker) findTargets() error {
    scw.setState(stateSCFindTargets)
    var err error

    // find an appropriate endpoint in the source shards
    scw.sourceAliases = make([]topo.TabletAlias, len(scw.sourceShards))
    for i, si := range scw.sourceShards {
        scw.sourceAliases[i], err = findChecker(scw.wr, scw.cleaner, scw.cell, si.Keyspace(), si.ShardName())
        if err != nil {
            return fmt.Errorf("cannot find checker for %v/%v/%v: %v", scw.cell, si.Keyspace(), si.ShardName(), err)
        }
        scw.wr.Logger().Infof("Using tablet %v as source for %v/%v", scw.sourceAliases[i], si.Keyspace(), si.ShardName())
    }

    // get the tablet info for them, and stop their replication
    scw.sourceTablets = make([]*topo.TabletInfo, len(scw.sourceAliases))
    for i, alias := range scw.sourceAliases {
        scw.sourceTablets[i], err = scw.wr.TopoServer().GetTablet(alias)
        if err != nil {
            return fmt.Errorf("cannot read tablet %v: %v", alias, err)
        }

        ctx, cancel := context.WithTimeout(context.TODO(), 30*time.Second)
        if err := scw.wr.TabletManagerClient().StopSlave(ctx, scw.sourceTablets[i]); err != nil {
            return fmt.Errorf("cannot stop replication on tablet %v", alias)
        }
        cancel()

        wrangler.RecordStartSlaveAction(scw.cleaner, scw.sourceTablets[i], 30*time.Second)
        action, err := wrangler.FindChangeSlaveTypeActionByTarget(scw.cleaner, alias)
        if err != nil {
            return fmt.Errorf("cannot find ChangeSlaveType action for %v: %v", alias, err)
        }
        action.TabletType = topo.TYPE_SPARE
    }

    return scw.findMasterTargets()
}
// executeFetchLoop loops over the provided insertChannel
// and sends the commands to the provided tablet.
func executeFetchLoop(wr *wrangler.Wrangler, ti *topo.TabletInfo, insertChannel chan string, abort chan struct{}, disableBinLogs bool) error {
    for {
        select {
        case cmd, ok := <-insertChannel:
            if !ok {
                // no more to read, we're done
                return nil
            }
            cmd = "INSERT INTO `" + ti.DbName() + "`." + cmd
            ctx, cancel := context.WithTimeout(context.TODO(), 30*time.Second)
            _, err := wr.TabletManagerClient().ExecuteFetch(ctx, ti, cmd, 0, false, disableBinLogs)
            if err != nil {
                return fmt.Errorf("ExecuteFetch failed: %v", err)
            }
            cancel()
        case <-abort:
            // FIXME(alainjobart): note this select case
            // could be starved here, and we might miss
            // the abort in some corner cases.
            return nil
        }
    }
}
// copy phase:
// - copy the data from source tablets to destination masters (with replication on)
// Assumes that the schema has already been created on each destination tablet
// (probably from vtctl's CopySchemaShard)
func (vscw *VerticalSplitCloneWorker) copy() error {
    vscw.setState(stateVSCCopy)

    // get source schema
    sourceSchemaDefinition, err := vscw.wr.GetSchema(vscw.sourceAlias, vscw.tables, nil, true)
    if err != nil {
        return fmt.Errorf("cannot get schema from source %v: %v", vscw.sourceAlias, err)
    }
    if len(sourceSchemaDefinition.TableDefinitions) == 0 {
        return fmt.Errorf("no tables matching the table filter")
    }
    vscw.wr.Logger().Infof("Source tablet has %v tables to copy", len(sourceSchemaDefinition.TableDefinitions))
    vscw.mu.Lock()
    vscw.tableStatus = make([]*tableStatus, len(sourceSchemaDefinition.TableDefinitions))
    for i, td := range sourceSchemaDefinition.TableDefinitions {
        vscw.tableStatus[i] = &tableStatus{
            name:     td.Name,
            rowCount: td.RowCount,
        }
    }
    vscw.startTime = time.Now()
    vscw.mu.Unlock()

    // Count rows
    for i, td := range sourceSchemaDefinition.TableDefinitions {
        vscw.tableStatus[i].mu.Lock()
        if td.Type == myproto.TABLE_BASE_TABLE {
            vscw.tableStatus[i].rowCount = td.RowCount
        } else {
            vscw.tableStatus[i].isView = true
        }
        vscw.tableStatus[i].mu.Unlock()
    }

    // In parallel, setup the channels to send SQL data chunks to for each destination tablet.
    //
    // mu protects the abort channel for closing, and firstError
    mu := sync.Mutex{}
    abort := make(chan struct{})
    var firstError error

    processError := func(format string, args ...interface{}) {
        vscw.wr.Logger().Errorf(format, args...)
        mu.Lock()
        if abort != nil {
            close(abort)
            abort = nil
            firstError = fmt.Errorf(format, args...)
        }
        mu.Unlock()
    }

    // since we're writing only to masters, we need to enable bin logs so that replication happens
    disableBinLogs := false

    insertChannels := make([]chan string, len(vscw.destinationAliases))
    destinationWaitGroup := sync.WaitGroup{}
    for i, tabletAlias := range vscw.destinationAliases {
        // we create one channel per destination tablet. It
        // is sized to have a buffer of a maximum of
        // destinationWriterCount * 2 items, to hopefully
        // always have data. We then have
        // destinationWriterCount go routines reading from it.
        insertChannels[i] = make(chan string, vscw.destinationWriterCount*2)

        go func(ti *topo.TabletInfo, insertChannel chan string) {
            for j := 0; j < vscw.destinationWriterCount; j++ {
                destinationWaitGroup.Add(1)
                go func() {
                    defer destinationWaitGroup.Done()
                    if err := executeFetchLoop(vscw.wr, ti, insertChannel, abort, disableBinLogs); err != nil {
                        processError("executeFetchLoop failed: %v", err)
                    }
                }()
            }
        }(vscw.destinationTablets[tabletAlias], insertChannels[i])
    }

    // Now for each table, read data chunks and send them to all
    // insertChannels
    sourceWaitGroup := sync.WaitGroup{}
    sema := sync2.NewSemaphore(vscw.sourceReaderCount, 0)
    for tableIndex, td := range sourceSchemaDefinition.TableDefinitions {
        if td.Type == myproto.TABLE_VIEW {
            continue
        }

        chunks, err := findChunks(vscw.wr, vscw.sourceTablet, td, vscw.minTableSizeForSplit, vscw.sourceReaderCount)
        if err != nil {
            return err
        }

        vscw.tableStatus[tableIndex].setThreadCount(len(chunks) - 1)

        for chunkIndex := 0; chunkIndex < len(chunks)-1; chunkIndex++ {
            sourceWaitGroup.Add(1)
            go func(td *myproto.TableDefinition, tableIndex, chunkIndex int) {
                defer sourceWaitGroup.Done()

                sema.Acquire()
                defer sema.Release()

                vscw.tableStatus[tableIndex].threadStarted()

                // build the query, and start the streaming
                selectSQL := buildSQLFromChunks(vscw.wr, td, chunks, chunkIndex, vscw.sourceAlias.String())
                qrr, err := NewQueryResultReaderForTablet(vscw.wr.TopoServer(), vscw.sourceAlias, selectSQL)
                if err != nil {
                    processError("NewQueryResultReaderForTablet failed: %v", err)
                    return
                }
                defer qrr.Close()

                // process the data
                if err := vscw.processData(td, tableIndex, qrr, insertChannels, vscw.destinationPackCount, abort); err != nil {
                    processError("QueryResultReader failed: %v", err)
                }
                vscw.tableStatus[tableIndex].threadDone()
            }(td, tableIndex, chunkIndex)
        }
    }
    sourceWaitGroup.Wait()

    for _, c := range insertChannels {
        close(c)
    }
    destinationWaitGroup.Wait()
    if firstError != nil {
        return firstError
    }

    // then create and populate the blp_checkpoint table
    if vscw.strategy.PopulateBlpCheckpoint {
        // get the current position from the source
        ctx, cancel := context.WithTimeout(context.TODO(), 30*time.Second)
        status, err := vscw.wr.TabletManagerClient().SlaveStatus(ctx, vscw.sourceTablet)
        if err != nil {
            return err
        }
        cancel()

        queries := make([]string, 0, 4)
        queries = append(queries, binlogplayer.CreateBlpCheckpoint()...)
        flags := ""
        if vscw.strategy.DontStartBinlogPlayer {
            flags = binlogplayer.BLP_FLAG_DONT_START
        }
        queries = append(queries, binlogplayer.PopulateBlpCheckpoint(0, status.Position, time.Now().Unix(), flags))
        for _, tabletAlias := range vscw.destinationAliases {
            destinationWaitGroup.Add(1)
            go func(ti *topo.TabletInfo) {
                defer destinationWaitGroup.Done()
                vscw.wr.Logger().Infof("Making and populating blp_checkpoint table on tablet %v", ti.Alias)
                if err := runSqlCommands(vscw.wr, ti, queries, abort, disableBinLogs); err != nil {
                    processError("blp_checkpoint queries failed on tablet %v: %v", ti.Alias, err)
                }
            }(vscw.destinationTablets[tabletAlias])
        }
        destinationWaitGroup.Wait()
        if firstError != nil {
            return firstError
        }
    }

    // Now we're done with data copy, update the shard's source info.
    if vscw.strategy.SkipSetSourceShards {
        vscw.wr.Logger().Infof("Skipping setting SourceShard on destination shard.")
    } else {
        vscw.wr.Logger().Infof("Setting SourceShard on shard %v/%v", vscw.destinationKeyspace, vscw.destinationShard)
        if err := vscw.wr.SetSourceShards(vscw.destinationKeyspace, vscw.destinationShard, []topo.TabletAlias{vscw.sourceAlias}, vscw.tables); err != nil {
            return fmt.Errorf("Failed to set source shards: %v", err)
        }
    }

    // And force a schema reload on all destination tablets.
    // The master tablet will end up starting filtered replication
    // at this point.
    for _, tabletAlias := range vscw.reloadAliases {
        destinationWaitGroup.Add(1)
        go func(ti *topo.TabletInfo) {
            defer destinationWaitGroup.Done()
            vscw.wr.Logger().Infof("Reloading schema on tablet %v", ti.Alias)
            ctx, cancel := context.WithTimeout(context.TODO(), 30*time.Second)
            if err := vscw.wr.TabletManagerClient().ReloadSchema(ctx, ti); err != nil {
                processError("ReloadSchema failed on tablet %v: %v", ti.Alias, err)
            }
            cancel()
        }(vscw.reloadTablets[tabletAlias])
    }
    destinationWaitGroup.Wait()
    return firstError
}
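The mu/abort/firstError trio in the copy function above (and again in the SplitCloneWorker version below) is a small fan-out cancellation pattern: the first goroutine to fail records its error and closes the abort channel exactly once, so every worker can observe the failure. A self-contained sketch of just that pattern, with hypothetical names:

package main

import (
    "fmt"
    "sync"
)

// firstErrorAborter keeps only the first reported error and closes its abort
// channel exactly once, mirroring the mu/abort/firstError variables above.
type firstErrorAborter struct {
    mu         sync.Mutex
    abort      chan struct{}
    firstError error
}

func newFirstErrorAborter() *firstErrorAborter {
    return &firstErrorAborter{abort: make(chan struct{})}
}

// processError records the first error and signals all workers via abort.
func (a *firstErrorAborter) processError(format string, args ...interface{}) {
    a.mu.Lock()
    defer a.mu.Unlock()
    if a.firstError == nil {
        a.firstError = fmt.Errorf(format, args...)
        close(a.abort)
    }
}

func main() {
    a := newFirstErrorAborter()
    var wg sync.WaitGroup
    for i := 0; i < 3; i++ {
        wg.Add(1)
        go func(i int) {
            defer wg.Done()
            select {
            case <-a.abort:
                return // another worker already failed
            default:
            }
            if i == 1 {
                a.processError("worker %d failed", i)
            }
        }(i)
    }
    wg.Wait()
    fmt.Println(a.firstError) // worker 1 failed
}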
// copy phase:
// - copy the data from source tablets to destination masters (with replication on)
// Assumes that the schema has already been created on each destination tablet
// (probably from vtctl's CopySchemaShard)
func (scw *SplitCloneWorker) copy() error {
    scw.setState(stateSCCopy)

    // get source schema from the first shard
    // TODO(alainjobart): for now, we assume the schema is compatible
    // on all source shards. Furthermore, we estimate the number of rows
    // in each source shard for each table to be about the same
    // (rowCount is used to estimate an ETA)
    sourceSchemaDefinition, err := scw.wr.GetSchema(scw.sourceAliases[0], nil, scw.excludeTables, true)
    if err != nil {
        return fmt.Errorf("cannot get schema from source %v: %v", scw.sourceAliases[0], err)
    }
    if len(sourceSchemaDefinition.TableDefinitions) == 0 {
        return fmt.Errorf("no tables matching the table filter in tablet %v", scw.sourceAliases[0])
    }
    scw.wr.Logger().Infof("Source tablet 0 has %v tables to copy", len(sourceSchemaDefinition.TableDefinitions))
    scw.mu.Lock()
    scw.tableStatus = make([]*tableStatus, len(sourceSchemaDefinition.TableDefinitions))
    for i, td := range sourceSchemaDefinition.TableDefinitions {
        scw.tableStatus[i] = &tableStatus{
            name:     td.Name,
            rowCount: td.RowCount * uint64(len(scw.sourceAliases)),
        }
    }
    scw.startTime = time.Now()
    scw.mu.Unlock()

    // Find the column index for the sharding columns in all the databases, and count rows
    columnIndexes := make([]int, len(sourceSchemaDefinition.TableDefinitions))
    for tableIndex, td := range sourceSchemaDefinition.TableDefinitions {
        if td.Type == myproto.TABLE_BASE_TABLE {
            // find the column to split on
            columnIndexes[tableIndex] = -1
            for i, name := range td.Columns {
                if name == scw.keyspaceInfo.ShardingColumnName {
                    columnIndexes[tableIndex] = i
                    break
                }
            }
            if columnIndexes[tableIndex] == -1 {
                return fmt.Errorf("table %v doesn't have a column named '%v'", td.Name, scw.keyspaceInfo.ShardingColumnName)
            }

            scw.tableStatus[tableIndex].mu.Lock()
            scw.tableStatus[tableIndex].rowCount = td.RowCount
            scw.tableStatus[tableIndex].mu.Unlock()
        } else {
            scw.tableStatus[tableIndex].mu.Lock()
            scw.tableStatus[tableIndex].isView = true
            scw.tableStatus[tableIndex].mu.Unlock()
        }
    }

    // In parallel, setup the channels to send SQL data chunks to for each destination tablet:
    //
    // mu protects the abort channel for closing, and firstError
    mu := sync.Mutex{}
    abort := make(chan struct{})
    var firstError error

    processError := func(format string, args ...interface{}) {
        scw.wr.Logger().Errorf(format, args...)
        mu.Lock()
        if abort != nil {
            close(abort)
            abort = nil
            firstError = fmt.Errorf(format, args...)
        }
        mu.Unlock()
    }

    // since we're writing only to masters, we need to enable bin logs so that replication happens
    disableBinLogs := false

    insertChannels := make([][]chan string, len(scw.destinationShards))
    destinationWaitGroup := sync.WaitGroup{}
    for shardIndex, _ := range scw.destinationShards {
        insertChannels[shardIndex] = make([]chan string, len(scw.destinationAliases[shardIndex]))
        for i, tabletAlias := range scw.destinationAliases[shardIndex] {
            // we create one channel per destination tablet. It
            // is sized to have a buffer of a maximum of
            // destinationWriterCount * 2 items, to hopefully
            // always have data. We then have
            // destinationWriterCount go routines reading from it.
            insertChannels[shardIndex][i] = make(chan string, scw.destinationWriterCount*2)

            go func(ti *topo.TabletInfo, insertChannel chan string) {
                for j := 0; j < scw.destinationWriterCount; j++ {
                    destinationWaitGroup.Add(1)
                    go func() {
                        defer destinationWaitGroup.Done()
                        if err := executeFetchLoop(scw.wr, ti, insertChannel, abort, disableBinLogs); err != nil {
                            processError("executeFetchLoop failed: %v", err)
                        }
                    }()
                }
            }(scw.destinationTablets[shardIndex][tabletAlias], insertChannels[shardIndex][i])
        }
    }

    // Now for each table, read data chunks and send them to all
    // insertChannels
    sourceWaitGroup := sync.WaitGroup{}
    for shardIndex, _ := range scw.sourceShards {
        sema := sync2.NewSemaphore(scw.sourceReaderCount, 0)
        for tableIndex, td := range sourceSchemaDefinition.TableDefinitions {
            if td.Type == myproto.TABLE_VIEW {
                continue
            }

            rowSplitter := NewRowSplitter(scw.destinationShards, scw.keyspaceInfo.ShardingColumnType, columnIndexes[tableIndex])

            chunks, err := findChunks(scw.wr, scw.sourceTablets[shardIndex], td, scw.minTableSizeForSplit, scw.sourceReaderCount)
            if err != nil {
                return err
            }

            scw.tableStatus[tableIndex].setThreadCount(len(chunks) - 1)

            for chunkIndex := 0; chunkIndex < len(chunks)-1; chunkIndex++ {
                sourceWaitGroup.Add(1)
                go func(td *myproto.TableDefinition, tableIndex, chunkIndex int) {
                    defer sourceWaitGroup.Done()

                    sema.Acquire()
                    defer sema.Release()

                    scw.tableStatus[tableIndex].threadStarted()

                    // build the query, and start the streaming
                    selectSQL := buildSQLFromChunks(scw.wr, td, chunks, chunkIndex, scw.sourceAliases[shardIndex].String())
                    qrr, err := NewQueryResultReaderForTablet(scw.wr.TopoServer(), scw.sourceAliases[shardIndex], selectSQL)
                    if err != nil {
                        processError("NewQueryResultReaderForTablet failed: %v", err)
                        return
                    }
                    defer qrr.Close()

                    // process the data
                    if err := scw.processData(td, tableIndex, qrr, rowSplitter, insertChannels, scw.destinationPackCount, abort); err != nil {
                        processError("processData failed: %v", err)
                    }
                    scw.tableStatus[tableIndex].threadDone()
                }(td, tableIndex, chunkIndex)
            }
        }
    }
    sourceWaitGroup.Wait()

    for shardIndex, _ := range scw.destinationShards {
        for _, c := range insertChannels[shardIndex] {
            close(c)
        }
    }
    destinationWaitGroup.Wait()
    if firstError != nil {
        return firstError
    }

    // then create and populate the blp_checkpoint table
    if scw.strategy.PopulateBlpCheckpoint {
        queries := make([]string, 0, 4)
        queries = append(queries, binlogplayer.CreateBlpCheckpoint()...)
        flags := ""
        if scw.strategy.DontStartBinlogPlayer {
            flags = binlogplayer.BLP_FLAG_DONT_START
        }

        // get the current position from the sources
        for shardIndex, _ := range scw.sourceShards {
            ctx, cancel := context.WithTimeout(context.TODO(), 30*time.Second)
            status, err := scw.wr.TabletManagerClient().SlaveStatus(ctx, scw.sourceTablets[shardIndex])
            if err != nil {
                return err
            }
            cancel()

            queries = append(queries, binlogplayer.PopulateBlpCheckpoint(0, status.Position, time.Now().Unix(), flags))
        }

        for shardIndex, _ := range scw.destinationShards {
            for _, tabletAlias := range scw.destinationAliases[shardIndex] {
                destinationWaitGroup.Add(1)
                go func(ti *topo.TabletInfo) {
                    defer destinationWaitGroup.Done()
                    scw.wr.Logger().Infof("Making and populating blp_checkpoint table on tablet %v", ti.Alias)
                    if err := runSqlCommands(scw.wr, ti, queries, abort, disableBinLogs); err != nil {
                        processError("blp_checkpoint queries failed on tablet %v: %v", ti.Alias, err)
                    }
                }(scw.destinationTablets[shardIndex][tabletAlias])
            }
        }
        destinationWaitGroup.Wait()
        if firstError != nil {
            return firstError
        }
    }

    // Now we're done with data copy, update the shard's source info.
    // TODO(alainjobart) this is a superset, some shards may not
    // overlap, have to deal with this better (for N -> M splits
    // where both N>1 and M>1)
    if scw.strategy.SkipSetSourceShards {
        scw.wr.Logger().Infof("Skipping setting SourceShard on destination shards.")
    } else {
        for _, si := range scw.destinationShards {
            scw.wr.Logger().Infof("Setting SourceShard on shard %v/%v", si.Keyspace(), si.ShardName())
            if err := scw.wr.SetSourceShards(si.Keyspace(), si.ShardName(), scw.sourceAliases, nil); err != nil {
                return fmt.Errorf("Failed to set source shards: %v", err)
            }
        }
    }

    // And force a schema reload on all destination tablets.
    // The master tablet will end up starting filtered replication
    // at this point.
    for shardIndex, _ := range scw.destinationShards {
        for _, tabletAlias := range scw.reloadAliases[shardIndex] {
            destinationWaitGroup.Add(1)
            go func(ti *topo.TabletInfo) {
                defer destinationWaitGroup.Done()
                scw.wr.Logger().Infof("Reloading schema on tablet %v", ti.Alias)
                ctx, cancel := context.WithTimeout(context.TODO(), 30*time.Second)
                if err := scw.wr.TabletManagerClient().ReloadSchema(ctx, ti); err != nil {
                    processError("ReloadSchema failed on tablet %v: %v", ti.Alias, err)
                }
                cancel()
            }(scw.reloadTablets[shardIndex][tabletAlias])
        }
    }
    destinationWaitGroup.Wait()
    return firstError
}
// ResetActionTimeout should be used before every action on a wrangler
// object that is going to be re-used:
// - vtctl will not call this, as it does one action.
// - vtctld will not call this, as it creates a new Wrangler every time.
// However, some actions may need to do a cleanup phase where the
// original Context may have expired or been cancelled, but still do
// the action. The Wrangler cleaner module is one of these, as is the vt
// worker in some corner cases.
func (wr *Wrangler) ResetActionTimeout(actionTimeout time.Duration) {
    wr.ctx, wr.cancel = context.WithTimeout(context.Background(), actionTimeout)
    wr.deadline = time.Now().Add(actionTimeout)
}
// ForkWithTimeout returns a ForkContext which adds a timeout of d to the
// supplied context.
func ForkWithTimeout(d time.Duration) ForkContext {
    return func(c context.Context) (context.Context, context.CancelFunc) {
        return context.WithTimeout(c, d)
    }
}
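A short usage sketch of the helper above. The ForkContext type is assumed to be func(context.Context) (context.Context, context.CancelFunc), matching the closure returned by ForkWithTimeout; the function is restated here only so the sketch is self-contained and runnable.

package main

import (
    "context"
    "fmt"
    "time"
)

// ForkContext is assumed to have this shape, matching the closure returned
// by ForkWithTimeout above.
type ForkContext func(context.Context) (context.Context, context.CancelFunc)

// ForkWithTimeout is restated from the snippet above for self-containment.
func ForkWithTimeout(d time.Duration) ForkContext {
    return func(c context.Context) (context.Context, context.CancelFunc) {
        return context.WithTimeout(c, d)
    }
}

func main() {
    fork := ForkWithTimeout(50 * time.Millisecond)
    ctx, cancel := fork(context.Background())
    defer cancel()

    select {
    case <-time.After(time.Second):
        fmt.Println("work finished")
    case <-ctx.Done():
        fmt.Println(ctx.Err()) // context deadline exceeded
    }
}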
// findChunks returns an array of chunks to use for splitting up a table
// into multiple data chunks. It only works for tables with a primary key
// (and the primary key first column is an integer type).
// The array will always look like:
// "", "value1", "value2", ""
// A non-split table will just return:
// "", ""
func findChunks(wr *wrangler.Wrangler, ti *topo.TabletInfo, td *myproto.TableDefinition, minTableSizeForSplit uint64, sourceReaderCount int) ([]string, error) {
    result := []string{"", ""}

    // eliminate a few cases we don't split tables for
    if len(td.PrimaryKeyColumns) == 0 {
        // no primary key, what can we do?
        return result, nil
    }
    if td.DataLength < minTableSizeForSplit {
        // table is too small to split up
        return result, nil
    }

    // get the min and max of the leading column of the primary key
    query := fmt.Sprintf("SELECT MIN(%v), MAX(%v) FROM %v.%v", td.PrimaryKeyColumns[0], td.PrimaryKeyColumns[0], ti.DbName(), td.Name)
    ctx, cancel := context.WithTimeout(context.TODO(), 30*time.Second)
    qr, err := wr.TabletManagerClient().ExecuteFetch(ctx, ti, query, 1, true, false)
    if err != nil {
        wr.Logger().Infof("Not splitting table %v into multiple chunks: %v", td.Name, err)
        return result, nil
    }
    cancel()
    if len(qr.Rows) != 1 {
        wr.Logger().Infof("Not splitting table %v into multiple chunks, cannot get min and max", td.Name)
        return result, nil
    }
    if qr.Rows[0][0].IsNull() || qr.Rows[0][1].IsNull() {
        wr.Logger().Infof("Not splitting table %v into multiple chunks, min or max is NULL: %v %v", td.Name, qr.Rows[0][0], qr.Rows[0][1])
        return result, nil
    }
    switch qr.Fields[0].Type {
    case mproto.VT_TINY, mproto.VT_SHORT, mproto.VT_LONG, mproto.VT_LONGLONG, mproto.VT_INT24:
        minNumeric := sqltypes.MakeNumeric(qr.Rows[0][0].Raw())
        maxNumeric := sqltypes.MakeNumeric(qr.Rows[0][1].Raw())
        if qr.Rows[0][0].Raw()[0] == '-' {
            // signed values, use int64
            min, err := minNumeric.ParseInt64()
            if err != nil {
                wr.Logger().Infof("Not splitting table %v into multiple chunks, cannot convert min: %v %v", td.Name, minNumeric, err)
                return result, nil
            }
            max, err := maxNumeric.ParseInt64()
            if err != nil {
                wr.Logger().Infof("Not splitting table %v into multiple chunks, cannot convert max: %v %v", td.Name, maxNumeric, err)
                return result, nil
            }
            interval := (max - min) / int64(sourceReaderCount)
            if interval == 0 {
                wr.Logger().Infof("Not splitting table %v into multiple chunks, interval=0: %v %v", td.Name, max, min)
                return result, nil
            }

            result = make([]string, sourceReaderCount+1)
            result[0] = ""
            result[sourceReaderCount] = ""
            for i := int64(1); i < int64(sourceReaderCount); i++ {
                result[i] = fmt.Sprintf("%v", min+interval*i)
            }
            return result, nil
        }

        // unsigned values, use uint64
        min, err := minNumeric.ParseUint64()
        if err != nil {
            wr.Logger().Infof("Not splitting table %v into multiple chunks, cannot convert min: %v %v", td.Name, minNumeric, err)
            return result, nil
        }
        max, err := maxNumeric.ParseUint64()
        if err != nil {
            wr.Logger().Infof("Not splitting table %v into multiple chunks, cannot convert max: %v %v", td.Name, maxNumeric, err)
            return result, nil
        }
        interval := (max - min) / uint64(sourceReaderCount)
        if interval == 0 {
            wr.Logger().Infof("Not splitting table %v into multiple chunks, interval=0: %v %v", td.Name, max, min)
            return result, nil
        }

        result = make([]string, sourceReaderCount+1)
        result[0] = ""
        result[sourceReaderCount] = ""
        for i := uint64(1); i < uint64(sourceReaderCount); i++ {
            result[i] = fmt.Sprintf("%v", min+interval*i)
        }
        return result, nil

    case mproto.VT_FLOAT, mproto.VT_DOUBLE:
        min, err := strconv.ParseFloat(qr.Rows[0][0].String(), 64)
        if err != nil {
            wr.Logger().Infof("Not splitting table %v into multiple chunks, cannot convert min: %v %v", td.Name, qr.Rows[0][0], err)
            return result, nil
        }
        max, err := strconv.ParseFloat(qr.Rows[0][1].String(), 64)
        if err != nil {
            wr.Logger().Infof("Not splitting table %v into multiple chunks, cannot convert max: %v %v", td.Name, qr.Rows[0][1].String(), err)
            return result, nil
        }
        interval := (max - min) / float64(sourceReaderCount)
        if interval == 0 {
            wr.Logger().Infof("Not splitting table %v into multiple chunks, interval=0: %v %v", td.Name, max, min)
            return result, nil
        }

        result = make([]string, sourceReaderCount+1)
        result[0] = ""
        result[sourceReaderCount] = ""
        for i := 1; i < sourceReaderCount; i++ {
            result[i] = fmt.Sprintf("%v", min+interval*float64(i))
        }
        return result, nil
    }

    wr.Logger().Infof("Not splitting table %v into multiple chunks, primary key not numeric", td.Name)
    return result, nil
}
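The boundary arithmetic in findChunks is easy to check in isolation. A small self-contained sketch of the signed-integer interval computation, with hypothetical values and no Vitess dependencies:

package main

import "fmt"

// chunkBoundaries mirrors the signed-integer branch of findChunks above:
// it returns sourceReaderCount+1 boundary strings, with "" as the open
// first and last boundaries.
func chunkBoundaries(min, max int64, sourceReaderCount int) []string {
    result := []string{"", ""}
    interval := (max - min) / int64(sourceReaderCount)
    if interval == 0 {
        return result
    }
    result = make([]string, sourceReaderCount+1)
    result[0] = ""
    result[sourceReaderCount] = ""
    for i := int64(1); i < int64(sourceReaderCount); i++ {
        result[i] = fmt.Sprintf("%v", min+interval*i)
    }
    return result
}

func main() {
    // With min=0, max=1000 and 4 readers, the inner boundaries are 250, 500, 750.
    fmt.Println(chunkBoundaries(0, 1000, 4)) // [ 250 500 750 ]
}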
func (vsdw *VerticalSplitDiffWorker) synchronizeReplication() error {
    vsdw.setState(stateVSDSynchronizeReplication)

    masterInfo, err := vsdw.wr.TopoServer().GetTablet(vsdw.shardInfo.MasterAlias)
    if err != nil {
        return fmt.Errorf("synchronizeReplication: cannot get Tablet record for master %v: %v", vsdw.shardInfo.MasterAlias, err)
    }

    // 1 - stop the master binlog replication, get its current position
    vsdw.wr.Logger().Infof("Stopping master binlog replication on %v", vsdw.shardInfo.MasterAlias)
    ctx, cancel := context.WithTimeout(context.TODO(), 30*time.Second)
    blpPositionList, err := vsdw.wr.TabletManagerClient().StopBlp(ctx, masterInfo)
    if err != nil {
        return fmt.Errorf("StopBlp on master %v failed: %v", vsdw.shardInfo.MasterAlias, err)
    }
    cancel()
    wrangler.RecordStartBlpAction(vsdw.cleaner, masterInfo, 30*time.Second)

    // 2 - stop the source 'checker' at a binlog position
    //     higher than the destination master
    stopPositionList := blproto.BlpPositionList{
        Entries: make([]blproto.BlpPosition, 1),
    }
    ss := vsdw.shardInfo.SourceShards[0]
    // find where we should be stopping
    pos, err := blpPositionList.FindBlpPositionById(ss.Uid)
    if err != nil {
        return fmt.Errorf("no binlog position on the master for Uid %v", ss.Uid)
    }

    // stop replication
    vsdw.wr.Logger().Infof("Stopping slave %v at a minimum of %v", vsdw.sourceAlias, pos.Position)
    sourceTablet, err := vsdw.wr.TopoServer().GetTablet(vsdw.sourceAlias)
    if err != nil {
        return err
    }
    stoppedAt, err := vsdw.wr.TabletManagerClient().StopSlaveMinimum(context.TODO(), sourceTablet, pos.Position, 30*time.Second)
    if err != nil {
        return fmt.Errorf("cannot stop slave %v at right binlog position %v: %v", vsdw.sourceAlias, pos.Position, err)
    }
    stopPositionList.Entries[0].Uid = ss.Uid
    stopPositionList.Entries[0].Position = stoppedAt.Position

    // change the cleaner actions from ChangeSlaveType(rdonly)
    // to StartSlave() + ChangeSlaveType(spare)
    wrangler.RecordStartSlaveAction(vsdw.cleaner, sourceTablet, 30*time.Second)
    action, err := wrangler.FindChangeSlaveTypeActionByTarget(vsdw.cleaner, vsdw.sourceAlias)
    if err != nil {
        return fmt.Errorf("cannot find ChangeSlaveType action for %v: %v", vsdw.sourceAlias, err)
    }
    action.TabletType = topo.TYPE_SPARE

    // 3 - ask the master of the destination shard to resume filtered
    //     replication up to the new list of positions
    vsdw.wr.Logger().Infof("Restarting master %v until it catches up to %v", vsdw.shardInfo.MasterAlias, stopPositionList)
    masterPos, err := vsdw.wr.TabletManagerClient().RunBlpUntil(context.TODO(), masterInfo, &stopPositionList, 30*time.Second)
    if err != nil {
        return fmt.Errorf("RunBlpUntil on %v until %v failed: %v", vsdw.shardInfo.MasterAlias, stopPositionList, err)
    }

    // 4 - wait until the destination checker has reached or passed
    //     that master binlog position, and stop its replication.
    vsdw.wr.Logger().Infof("Waiting for destination checker %v to catch up to %v", vsdw.destinationAlias, masterPos)
    destinationTablet, err := vsdw.wr.TopoServer().GetTablet(vsdw.destinationAlias)
    if err != nil {
        return err
    }
    _, err = vsdw.wr.TabletManagerClient().StopSlaveMinimum(context.TODO(), destinationTablet, masterPos, 30*time.Second)
    if err != nil {
        return fmt.Errorf("StopSlaveMinimum on %v at %v failed: %v", vsdw.destinationAlias, masterPos, err)
    }
    wrangler.RecordStartSlaveAction(vsdw.cleaner, destinationTablet, 30*time.Second)
    action, err = wrangler.FindChangeSlaveTypeActionByTarget(vsdw.cleaner, vsdw.destinationAlias)
    if err != nil {
        return fmt.Errorf("cannot find ChangeSlaveType action for %v: %v", vsdw.destinationAlias, err)
    }
    action.TabletType = topo.TYPE_SPARE

    // 5 - restart filtered replication on destination master
    vsdw.wr.Logger().Infof("Restarting filtered replication on master %v", vsdw.shardInfo.MasterAlias)
    ctx, cancel = context.WithTimeout(context.TODO(), 30*time.Second)
    err = vsdw.wr.TabletManagerClient().StartBlp(ctx, masterInfo)
    if err := vsdw.cleaner.RemoveActionByName(wrangler.StartBlpActionName, vsdw.shardInfo.MasterAlias.String()); err != nil {
        vsdw.wr.Logger().Warningf("Cannot find cleaning action %v/%v: %v", wrangler.StartBlpActionName, vsdw.shardInfo.MasterAlias.String(), err)
    }
    cancel()
    if err != nil {
        return fmt.Errorf("StartBlp on %v failed: %v", vsdw.shardInfo.MasterAlias, err)
    }

    return nil
}
// CleanUp is part of CleanerAction interface.
func (sba StartBlpAction) CleanUp(wr *Wrangler) error {
    ctx, _ := context.WithTimeout(context.TODO(), sba.WaitTime)
    return wr.TabletManagerClient().StartBlp(ctx, sba.TabletInfo)
}