// WaitForFullReplication waits until all stores in the cluster
// have no ranges with replication pending.
func (tc *TestCluster) WaitForFullReplication() error {
	opts := retry.Options{
		InitialBackoff: time.Millisecond * 10,
		MaxBackoff:     time.Millisecond * 100,
		Multiplier:     2,
	}

	notReplicated := true
	for r := retry.Start(opts); r.Next() && notReplicated; {
		notReplicated = false
		for _, s := range tc.Servers {
			err := s.Stores().VisitStores(func(s *storage.Store) error {
				if err := s.ComputeMetrics(); err != nil {
					return err
				}
				if s.Registry().GetGauge("ranges.replication-pending").Value() > 0 {
					notReplicated = true
				}
				return nil
			})
			if err != nil {
				return err
			}
			if notReplicated {
				break
			}
		}
	}
	return nil
}
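// Every snippet in this collection follows the same shape: build a
// retry.Options value, then drive the loop with retry.Start(...).Next(),
// optionally calling Reset() to restart the backoff. Below is a minimal
// sketch of that pattern, assuming the util/retry and time packages imported
// by the surrounding code; doWork is a hypothetical operation standing in for
// the real work.
func retryLoopSketch() error {
	opts := retry.Options{
		InitialBackoff: 10 * time.Millisecond,
		MaxBackoff:     100 * time.Millisecond,
		Multiplier:     2,
	}
	var err error
	for r := retry.Start(opts); r.Next(); {
		if err = doWork(); err == nil {
			return nil
		}
	}
	// With neither a Closer/Stopper nor an attempt limit set, Next() keeps
	// returning true, so this point is reached only when the options bound
	// the loop (compare the "infinite retries" comments in other snippets).
	return err
}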
// request returns the result of performing an HTTP GET request.
func request(url string, httpClient *http.Client) ([]byte, bool) {
	for r := retry.Start(retryOptions); r.Next(); {
		req, err := http.NewRequest("GET", url, nil)
		if err != nil {
			log.Fatal(err)
			return nil, false
		}
		req.Header.Set(util.AcceptHeader, util.JSONContentType)
		resp, err := httpClient.Do(req)
		if err != nil {
			log.Infof("could not GET %s - %s", url, err)
			continue
		}
		defer resp.Body.Close()
		body, err := ioutil.ReadAll(resp.Body)
		if err != nil {
			log.Infof("could not read body for %s - %s", url, err)
			continue
		}
		if resp.StatusCode != http.StatusOK {
			log.Infof("could not GET %s - statuscode: %d - body: %s", url, resp.StatusCode, body)
			continue
		}
		returnedContentType := resp.Header.Get(util.ContentTypeHeader)
		if returnedContentType != util.JSONContentType {
			log.Infof("unexpected content type: %v", returnedContentType)
			continue
		}
		log.Infof("OK response from %s", url)
		return body, true
	}
	log.Warningf("There was an error retrieving %s", url)
	return nil, false
}
// GetSnapshot wraps Snapshot() but does not require the replica lock
// to be held and it will block instead of returning
// ErrSnapshotTemporarilyUnavailable.
func (r *Replica) GetSnapshot() (raftpb.Snapshot, error) {
	retryOptions := retry.Options{
		InitialBackoff: 1 * time.Millisecond,
		MaxBackoff:     50 * time.Millisecond,
		Multiplier:     2,
	}
	for retry := retry.Start(retryOptions); retry.Next(); {
		r.mu.Lock()
		snap, err := r.Snapshot()
		snapshotChan := r.mu.snapshotChan
		r.mu.Unlock()
		if err == raft.ErrSnapshotTemporarilyUnavailable {
			if snapshotChan == nil {
				// The call to Snapshot() didn't start an async process due to
				// rate limiting. Try again later.
				continue
			}
			var ok bool
			snap, ok = <-snapshotChan
			if ok {
				return snap, nil
			}
			// Each snapshot worker's output can only be consumed once.
			// We could be racing with raft itself, so if we get a closed
			// channel loop back and try again.
		} else {
			return snap, err
		}
	}
	panic("unreachable") // due to infinite retries
}
func (txn *Txn) exec(retryable func(txn *Txn) error) error {
	// Run retryable in a retry loop until we encounter a success or
	// error condition this loop isn't capable of handling.
	var err error
	for r := retry.Start(txn.db.txnRetryOptions); r.Next(); {
		err = retryable(txn)
		if err == nil && txn.Proto.Status == roachpb.PENDING {
			// retryable succeeded, but didn't commit.
			err = txn.commit(nil)
		}
		if restartErr, ok := err.(roachpb.TransactionRestartError); ok {
			if log.V(2) {
				log.Warning(err)
			}
			switch restartErr.CanRestartTransaction() {
			case roachpb.TransactionRestart_IMMEDIATE:
				r.Reset()
				continue
			case roachpb.TransactionRestart_BACKOFF:
				continue
			}
			// By default, fall through and break.
		}
		break
	}
	txn.Cleanup(err)
	return err
}
// recordJoinEvent begins an asynchronous task which attempts to log a "node
// join" or "node restart" event. This query will retry until it succeeds or the
// server stops.
func (n *Node) recordJoinEvent() {
	if !n.ctx.LogRangeEvents {
		return
	}

	logEventType := sql.EventLogNodeRestart
	if n.initialBoot {
		logEventType = sql.EventLogNodeJoin
	}

	n.stopper.RunWorker(func() {
		for r := retry.Start(retry.Options{Closer: n.stopper.ShouldStop()}); r.Next(); {
			if err := n.ctx.DB.Txn(func(txn *client.Txn) error {
				return n.eventLogger.InsertEventRecord(txn,
					logEventType,
					int32(n.Descriptor.NodeID),
					int32(n.Descriptor.NodeID),
					struct {
						Descriptor roachpb.NodeDescriptor
						ClusterID  uuid.UUID
						StartedAt  int64
					}{n.Descriptor, n.ClusterID, n.startedAt},
				)
			}); err != nil {
				log.Warningc(n.context(context.TODO()), "unable to log %s event for node %d: %s",
					logEventType, n.Descriptor.NodeID, err)
			} else {
				return
			}
		}
	})
}
// recordJoinEvent begins an asynchronous task which attempts to log a "node
// join" or "node restart" event. This query will retry until it succeeds or the
// server stops.
func (n *Node) recordJoinEvent() {
	if !n.ctx.LogRangeEvents {
		return
	}

	logEventType := sql.EventLogNodeRestart
	if n.initialBoot {
		logEventType = sql.EventLogNodeJoin
	}

	n.stopper.RunWorker(func() {
		retryOpts := base.DefaultRetryOptions()
		retryOpts.Closer = n.stopper.ShouldStop()
		for r := retry.Start(retryOpts); r.Next(); {
			if err := n.ctx.DB.Txn(n.Ctx(), func(txn *client.Txn) error {
				return n.eventLogger.InsertEventRecord(txn,
					logEventType,
					int32(n.Descriptor.NodeID),
					int32(n.Descriptor.NodeID),
					struct {
						Descriptor roachpb.NodeDescriptor
						ClusterID  uuid.UUID
						StartedAt  int64
					}{n.Descriptor, n.ClusterID, n.startedAt},
				)
			}); err != nil {
				log.Warningf(n.Ctx(), "%s: unable to log %s event: %s", n, logEventType, err)
			} else {
				return
			}
		}
	})
}
// maybeWarnAboutInit looks for signs indicating a cluster which
// hasn't been initialized and warns. There's no absolutely sure way
// to determine whether the current node is simply waiting to be
// bootstrapped to an existing cluster vs. the operator having failed
// to initialize the cluster via the "cockroach init" command, so
// we can only warn.
//
// This method checks whether all gossip bootstrap hosts are
// connected, and whether the node itself is a bootstrap host, but
// there is still no sentinel gossip.
func (g *Gossip) maybeWarnAboutInit(stopper *stop.Stopper) {
	stopper.RunWorker(func() {
		// Wait 5s before first check.
		select {
		case <-stopper.ShouldStop():
			return
		case <-time.After(5 * time.Second):
		}
		retryOptions := retry.Options{
			InitialBackoff: 5 * time.Second,  // first backoff at 5s
			MaxBackoff:     60 * time.Second, // max backoff is 60s
			Multiplier:     2,                // doubles
			Stopper:        stopper,          // stop no matter what on stopper
		}
		// This will never error because of infinite retries.
		for r := retry.Start(retryOptions); r.Next(); {
			g.mu.Lock()
			hasSentinel := g.is.getInfo(KeySentinel) != nil
			triedAll := g.triedAll
			g.mu.Unlock()
			// If we have the sentinel, exit the retry loop.
			if hasSentinel {
				break
			}
			// Otherwise, if all bootstrap hosts are connected, warn.
			if triedAll {
				log.Warningf("connected to gossip but missing sentinel. Has the cluster been initialized? " +
					"Use \"cockroach init\" to initialize.")
			}
		}
	})
}
// waitForOneVersion returns once there are no unexpired leases on the
// previous version of the table descriptor. It returns the current version.
// After returning there can only be versions of the descriptor >= to the
// returned version. Lease acquisition (see acquire()) maintains the
// invariant that no new leases for desc.Version-1 will be granted once
// desc.Version exists.
func (s LeaseStore) waitForOneVersion(tableID sqlbase.ID, retryOpts retry.Options) (
	sqlbase.DescriptorVersion, error,
) {
	desc := &sqlbase.Descriptor{}
	descKey := sqlbase.MakeDescMetadataKey(tableID)
	var tableDesc *sqlbase.TableDescriptor
	for r := retry.Start(retryOpts); r.Next(); {
		// Get the current version of the table descriptor non-transactionally.
		//
		// TODO(pmattis): Do an inconsistent read here?
		if err := s.db.GetProto(descKey, desc); err != nil {
			return 0, err
		}
		tableDesc = desc.GetTable()
		if tableDesc == nil {
			return 0, errors.Errorf("ID %d is not a table", tableID)
		}
		// Check to see if there are any leases that still exist on the previous
		// version of the descriptor.
		now := s.clock.Now()
		count, err := s.countLeases(tableDesc.ID, tableDesc.Version-1, now.GoTime())
		if err != nil {
			return 0, err
		}
		if count == 0 {
			break
		}
		log.Infof(context.TODO(), "publish (count leases): descID=%d name=%s version=%d count=%d",
			tableDesc.ID, tableDesc.Name, tableDesc.Version-1, count)
	}
	return tableDesc.Version, nil
}
// Send sends call to Cockroach via an RPC request. Errors which are retryable
// are retried with backoff in a loop using the default retry options. Other
// errors sending the request are retried indefinitely using the same client
// command ID to avoid reporting failure when in fact the command may have gone
// through and been executed successfully. We retry here to eventually get
// through with the same client command ID and be given the cached response.
func (s *rpcSender) Send(_ context.Context, call proto.Call) {
	method := fmt.Sprintf("Server.%s", call.Args.Method())

	var err error
	for r := retry.Start(s.retryOpts); r.Next(); {
		select {
		case <-s.client.Healthy():
		default:
			err = fmt.Errorf("failed to send RPC request %s: client is unhealthy", method)
			log.Warning(err)
			continue
		}

		if err = s.client.Call(method, call.Args, call.Reply); err != nil {
			// Assume all errors sending request are retryable. The actual
			// number of things that could go wrong is vast, but we don't
			// want to miss any which should in theory be retried with the
			// same client command ID. We log the error here as a warning so
			// there's visibility that this is happening. Some of the errors
			// we'll sweep up in this net shouldn't be retried, but we can't
			// really know for sure which.
			log.Warningf("failed to send RPC request %s: %s", method, err)
			continue
		}

		// On successful post, we're done with retry loop.
		break
	}
	if err != nil {
		call.Reply.Header().SetGoError(err)
	}
}
// Tests a batch of queries very similar to those that PGBench runs
// in its TPC-B(ish) mode.
func runPgbenchQueryParallel(b *testing.B, db *sql.DB) {
	if err := pgbench.SetupBenchDB(db, 20000, true /*quiet*/); err != nil {
		b.Fatal(err)
	}

	retryOpts := retry.Options{
		InitialBackoff: 1 * time.Millisecond,
		MaxBackoff:     200 * time.Millisecond,
		Multiplier:     2,
	}

	b.ResetTimer()
	b.RunParallel(func(pb *testing.PB) {
		src := rand.New(rand.NewSource(5432))
		r := retry.Start(retryOpts)
		var err error
		for pb.Next() {
			r.Reset()
			for r.Next() {
				err = pgbench.RunOne(db, src, 20000)
				if err == nil {
					break
				}
			}
			if err != nil {
				b.Fatal(err)
			}
		}
	})
	b.StopTimer()
}
// get performs an HTTPS GET to the specified path for a specific node.
func get(t *testing.T, client *http.Client, node *localcluster.Container, path string) []byte {
	url := fmt.Sprintf("https://%s%s", node.Addr(""), path)
	// TODO(bram) #2059: Remove retry logic.
	for r := retry.Start(retryOptions); r.Next(); {
		resp, err := client.Get(url)
		if err != nil {
			t.Logf("could not GET %s - %s", url, err)
			continue
		}
		defer resp.Body.Close()
		body, err := ioutil.ReadAll(resp.Body)
		if err != nil {
			t.Logf("could not read body for %s - %s", url, err)
			continue
		}
		if resp.StatusCode != http.StatusOK {
			t.Logf("could not GET %s - statuscode: %d - body: %s", url, resp.StatusCode, body)
			continue
		}
		t.Logf("OK response from %s", url)
		return body
	}
	t.Fatalf("There was an error retrieving %s", url)
	return []byte("")
}
func (txn *Txn) exec(retryable func(txn *Txn) error) (err error) {
	// Run retryable in a retry loop until we encounter a success or
	// error condition this loop isn't capable of handling.
	for r := retry.Start(txn.db.txnRetryOptions); r.Next(); {
		txn.haveTxnWrite, txn.haveEndTxn = false, false // always reset before [re]starting txn
		if err = retryable(txn); err == nil {
			if !txn.haveEndTxn && txn.haveTxnWrite {
				// If there were no errors running retryable, commit the txn. This
				// may block waiting for outstanding writes to complete in case
				// retryable didn't -- we need the most recent of all response
				// timestamps in order to commit.
				err = txn.Commit()
			}
		}
		if restartErr, ok := err.(proto.TransactionRestartError); ok {
			if log.V(2) {
				log.Warning(err)
			}
			if restartErr.CanRestartTransaction() == proto.TransactionRestart_IMMEDIATE {
				r.Reset()
				continue
			} else if restartErr.CanRestartTransaction() == proto.TransactionRestart_BACKOFF {
				continue
			}
			// By default, fall through and break.
		}
		break
	}
	if err != nil && txn.haveTxnWrite {
		if replyErr := txn.Rollback(); replyErr != nil {
			log.Errorf("failure aborting transaction: %s; abort caused by: %s", replyErr, err)
		}
	}
	return
}
// GetSnapshot wraps Snapshot() but does not require the replica lock
// to be held and it will block instead of returning
// ErrSnapshotTemporarilyUnavailable.
func (r *Replica) GetSnapshot(ctx context.Context) (raftpb.Snapshot, error) {
	retryOptions := retry.Options{
		InitialBackoff: 1 * time.Millisecond,
		MaxBackoff:     50 * time.Millisecond,
		Multiplier:     2,
		Closer:         r.store.Stopper().ShouldQuiesce(),
	}
	for retry := retry.Start(retryOptions); retry.Next(); {
		log.Tracef(ctx, "snapshot retry loop pass %d", retry.CurrentAttempt())

		r.mu.Lock()
		snap, err := r.SnapshotWithContext(ctx)
		snapshotChan := r.mu.snapshotChan
		r.mu.Unlock()
		if err == raft.ErrSnapshotTemporarilyUnavailable {
			if snapshotChan == nil {
				// The call to Snapshot() didn't start an async process due to
				// rate limiting. Try again later.
				continue
			}
			var ok bool
			snap, ok = <-snapshotChan
			if ok {
				return snap, nil
			}
			// Each snapshot worker's output can only be consumed once.
			// We could be racing with raft itself, so if we get a closed
			// channel loop back and try again.
		} else {
			return snap, err
		}
	}
	return raftpb.Snapshot{}, &roachpb.NodeUnavailableError{}
}
func (txn *Txn) exec(retryable func(txn *Txn) *roachpb.Error) *roachpb.Error {
	// Run retryable in a retry loop until we encounter a success or
	// error condition this loop isn't capable of handling.
	var pErr *roachpb.Error
	for r := retry.Start(txn.db.txnRetryOptions); r.Next(); {
		pErr = retryable(txn)
		if pErr == nil && txn.Proto.Status == roachpb.PENDING {
			// retryable succeeded, but didn't commit.
			pErr = txn.commit(nil)
		}

		if pErr != nil {
			switch pErr.TransactionRestart {
			case roachpb.TransactionRestart_IMMEDIATE:
				if log.V(2) {
					log.Warning(pErr)
				}
				r.Reset()
				continue
			case roachpb.TransactionRestart_BACKOFF:
				if log.V(2) {
					log.Warning(pErr)
				}
				continue
			}
			// By default, fall through and break.
		}
		break
	}
	txn.Cleanup(pErr)
	return pErr
}
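// The exec variants above are driven by DB.Txn, which constructs the Txn and
// passes the caller's closure in as retryable. A minimal caller-side sketch,
// assuming the client package API used in the recordJoinEvent snippets
// (db.Txn taking a func(*client.Txn) error); insertRows is a hypothetical
// helper standing in for the real work.
func runInTxn(db *client.DB) error {
	return db.Txn(func(txn *client.Txn) error {
		// On a retryable transaction error, exec re-runs this closure with
		// backoff (TransactionRestart_BACKOFF) or immediately after a reset
		// (TransactionRestart_IMMEDIATE), as shown in the loops above.
		return insertRows(txn)
	})
}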
// get performs an HTTPS GET to the specified path for a specific node.
func get(t *testing.T, base, rel string) []byte {
	// TODO(bram) #2059: Remove retry logic.
	url := fmt.Sprintf("%s/%s", base, rel)
	for r := retry.Start(retryOptions); r.Next(); {
		resp, err := cluster.HTTPClient.Get(url)
		if err != nil {
			log.Infof("could not GET %s - %s", url, err)
			continue
		}
		defer resp.Body.Close()
		body, err := ioutil.ReadAll(resp.Body)
		if err != nil {
			log.Infof("could not read body for %s - %s", url, err)
			continue
		}
		if resp.StatusCode != http.StatusOK {
			log.Infof("could not GET %s - statuscode: %d - body: %s", url, resp.StatusCode, body)
			continue
		}
		if log.V(1) {
			log.Infof("OK response from %s", url)
		}
		return body
	}
	t.Fatalf("There was an error retrieving %s", url)
	return []byte("")
}
// Send sends a Batch request to Cockroach via RPC. Errors which are retryable
// are retried with backoff in a loop using the default retry options. Other
// errors sending the request are retried indefinitely using the same client
// command ID to avoid reporting failure when in fact the command may have gone
// through and been executed successfully. We retry here to eventually get
// through with the same client command ID and be given the cached response.
func (s *rpcSender) Send(ctx context.Context, ba proto.BatchRequest) (*proto.BatchResponse, *proto.Error) {
	var err error
	var br proto.BatchResponse
	for r := retry.Start(s.retryOpts); r.Next(); {
		select {
		case <-s.client.Healthy():
		default:
			// Note: method (the RPC method name) is defined outside this excerpt.
			err = fmt.Errorf("failed to send RPC request %s: client is unhealthy", method)
			log.Warning(err)
			continue
		}

		if err = s.client.Call(method, &ba, &br); err != nil {
			br.Reset() // don't trust anyone.
			// Assume all errors sending request are retryable. The actual
			// number of things that could go wrong is vast, but we don't
			// want to miss any which should in theory be retried with the
			// same client command ID. We log the error here as a warning so
			// there's visibility that this is happening. Some of the errors
			// we'll sweep up in this net shouldn't be retried, but we can't
			// really know for sure which.
			log.Warningf("failed to send RPC request %s: %s", method, err)
			continue
		}

		// On successful post, we're done with retry loop.
		break
	}
	if err != nil {
		return nil, proto.NewError(err)
	}
	pErr := br.Error
	br.Error = nil
	return &br, pErr
}
// Send sends call to Cockroach via an RPC request. Errors which are retryable
// are retried with backoff in a loop using the default retry options. Other
// errors sending the request are retried indefinitely using the same client
// command ID to avoid reporting failure when in fact the command may have gone
// through and been executed successfully. We retry here to eventually get
// through with the same client command ID and be given the cached response.
func (s *Sender) Send(_ context.Context, call proto.Call) {
	var err error
	for r := retry.Start(s.retryOpts); r.Next(); {
		if !s.client.IsHealthy() {
			log.Warningf("client %s is unhealthy; retrying", s.client)
			continue
		}

		method := call.Args.Method().String()
		c := s.client.Go("Server."+method, call.Args, call.Reply, nil)
		<-c.Done
		err = c.Error
		if err != nil {
			// Assume all errors sending request are retryable. The actual
			// number of things that could go wrong is vast, but we don't
			// want to miss any which should in theory be retried with the
			// same client command ID. We log the error here as a warning so
			// there's visibility that this is happening. Some of the errors
			// we'll sweep up in this net shouldn't be retried, but we can't
			// really know for sure which.
			log.Warningf("failed to send RPC request %s: %v", method, err)
			continue
		}

		// On successful post, we're done with retry loop.
		break
	}
	if err != nil {
		call.Reply.Header().SetGoError(err)
	}
}
// WaitReady waits until the infrastructure is in a state that *should* allow
// for a healthy cluster. Currently, this means waiting for the load balancer
// to resolve from all nodes.
func (f *Farmer) WaitReady(d time.Duration) error {
	var rOpts = retry.Options{
		InitialBackoff: time.Second,
		MaxBackoff:     time.Minute,
		Multiplier:     1.5,
	}
	var err error
	for r := retry.Start(rOpts); r.Next(); {
		var elb string
		elb, _, err = net.SplitHostPort(f.LoadBalancer())
		if err != nil || elb == "" {
			err = fmt.Errorf("ELB not found: %v", err)
			continue
		}
		for i := range f.Nodes() {
			if err = f.Exec(i, "nslookup "+elb); err != nil {
				break
			}
		}
		if err == nil {
			return nil
		}
	}
	return err
}
// Send sends call to Cockroach via an RPC.
func (s *rpcSender) Send(args Request) (Response, error) {
	if args.GetUser() == "" {
		args.User = s.user
	}

	var err error
	var reply Response
	for r := retry.Start(s.retryOpts); r.Next(); {
		if err = s.client.Call(RPCMethod, &args, &reply); err != nil {
			reply.Reset() // don't trust anyone.
			// Assume all errors sending request are retryable. The actual
			// number of things that could go wrong is vast, but we don't
			// want to miss any which should in theory be retried with the
			// same client command ID. We log the error here as a warning so
			// there's visibility that this is happening. Some of the errors
			// we'll sweep up in this net shouldn't be retried, but we can't
			// really know for sure which.
			continue
		}
		// On successful post, we're done with retry loop.
		break
	}
	return reply, err
}
// maybeWarnAboutInit looks for signs indicating a cluster which
// hasn't been initialized and warns. There's no absolutely sure way
// to determine whether the current node is simply waiting to be
// bootstrapped to an existing cluster vs. the operator having failed
// to initialize the cluster via the "cockroach init" command, so
// we can only warn.
//
// This method checks whether all gossip bootstrap hosts are
// connected, and whether the node itself is a bootstrap host, but
// there is still no sentinel gossip.
func (g *Gossip) maybeWarnAboutInit(stopper *stop.Stopper) {
	stopper.RunWorker(func() {
		// Wait 5s before first check.
		select {
		case <-stopper.ShouldStop():
			return
		case <-time.After(5 * time.Second):
		}
		retryOptions := retry.Options{
			InitialBackoff: 5 * time.Second,      // first backoff at 5s
			MaxBackoff:     60 * time.Second,     // max backoff is 60s
			Multiplier:     2,                    // doubles
			Closer:         stopper.ShouldStop(), // stop no matter what on stopper
		}
		// This will never error because of infinite retries.
		for r := retry.Start(retryOptions); r.Next(); {
			g.mu.Lock()
			hasConnections := g.outgoing.len()+g.incoming.len() > 0
			hasSentinel := g.is.getInfo(KeySentinel) != nil
			triedAll := g.triedAll
			g.mu.Unlock()
			// If we have the sentinel, exit the retry loop.
			if hasSentinel {
				break
			}
			if !hasConnections {
				log.Warningf("not connected to gossip; check that gossip flag is set appropriately")
			} else if triedAll {
				log.Warningf("missing gossip sentinel; first range unavailable or cluster not initialized")
			}
		}
	})
}
// get performs an HTTPS GET to the specified path for a specific node.
func get(t *testing.T, client *http.Client, node *localcluster.Container, path string) []byte {
	url := fmt.Sprintf("https://%s%s", node.Addr(""), path)
	// There seem to be some issues while trying to connect to the status
	// server, so retry (up to 5 times) with a 1 second delay each time.
	// TODO(Bram): Clean this up once we get to the bottom of the issue.
	for r := retry.Start(retryOptions); r.Next(); {
		resp, err := client.Get(url)
		if err != nil {
			t.Logf("could not GET %s - %s", url, err)
			continue
		}
		body, err := ioutil.ReadAll(resp.Body)
		if err != nil {
			t.Logf("could not read body for %s - %s", url, err)
			continue
		}
		defer resp.Body.Close()
		if resp.StatusCode != http.StatusOK {
			t.Logf("could not GET %s - statuscode: %d - body: %s", url, resp.StatusCode, body)
			continue
		}
		t.Logf("OK response from %s", url)
		return body
	}
	t.Fatalf("There was an error retrieving %s", url)
	return []byte("")
}
// Send implements the client.Sender interface.
func (rls *retryableLocalSender) Send(_ context.Context, call proto.Call) {
	// Instant retry to handle the case of a range split, which is
	// exposed here as a RangeKeyMismatchError.
	retryOpts := retry.Options{}
	// In local tests, the RPCs are not actually sent over the wire. We
	// need to clone the Txn in order to avoid unexpected sharing
	// between TxnCoordSender and client.Txn.
	if header := call.Args.Header(); header.Txn != nil {
		header.Txn = gogoproto.Clone(header.Txn).(*proto.Transaction)
	}
	var err error
	for r := retry.Start(retryOpts); r.Next(); {
		call.Reply.Header().Error = nil
		rls.LocalSender.Send(context.TODO(), call)
		// Check for range key mismatch error (this could happen if
		// range was split between lookup and execution). In this case,
		// reset header.Replica and engage retry loop.
		if err = call.Reply.Header().GoError(); err != nil {
			if _, ok := err.(*proto.RangeKeyMismatchError); ok {
				// Clear request replica.
				call.Args.Header().Replica = proto.Replica{}
				log.Warning(err)
				continue
			}
		}
		return
	}
	panic(fmt.Sprintf("local sender did not succeed: %s", err))
}
// Exec executes fn in the context of a distributed transaction.
// Execution is controlled by opt (see comments in TxnExecOptions).
//
// opt is passed to fn, and it's valid for fn to modify opt as it sees
// fit during each execution attempt.
//
// It's valid for txn to be nil (meaning the txn has already aborted) if fn
// can handle that. This is useful for continuing transactions that have been
// aborted because of an error in a previous batch of statements in the hope
// that a ROLLBACK will reset the state. Neither opt.AutoRetry nor opt.AutoCommit
// can be set in this case.
//
// If an error is returned, the txn has been aborted.
func (txn *Txn) Exec(
	opt TxnExecOptions,
	fn func(txn *Txn, opt *TxnExecOptions) *roachpb.Error) *roachpb.Error {
	// Run fn in a retry loop until we encounter a success or
	// error condition this loop isn't capable of handling.
	var pErr *roachpb.Error
	var retryOptions retry.Options
	if txn == nil && (opt.AutoRetry || opt.AutoCommit) {
		panic("asked to retry or commit a txn that is already aborted")
	}
	if opt.AutoRetry {
		retryOptions = txn.db.txnRetryOptions
	}
RetryLoop:
	for r := retry.Start(retryOptions); r.Next(); {
		pErr = fn(txn, &opt)
		if (pErr == nil) && opt.AutoCommit && (txn.Proto.Status == roachpb.PENDING) {
			// fn succeeded, but didn't commit.
			pErr = txn.commit(nil)
		}
		if pErr == nil {
			break
		}
		// Make sure the txn record that pErr carries is for this txn.
		// We check only when txn.Proto.ID has been initialized after an initial successful send.
		if pErr.GetTxn() != nil && txn.Proto.ID != nil {
			if errTxn := pErr.GetTxn(); !errTxn.Equal(&txn.Proto) {
				return roachpb.NewErrorf("mismatching transaction record in the error:\n%s\nvs.\n%s",
					errTxn, txn.Proto)
			}
		}
		if !opt.AutoRetry {
			break RetryLoop
		}
		switch pErr.TransactionRestart {
		case roachpb.TransactionRestart_IMMEDIATE:
			r.Reset()
		case roachpb.TransactionRestart_BACKOFF:
		default:
			break RetryLoop
		}
		if log.V(2) {
			log.Infof("automatically retrying transaction: %s because of error: %s",
				txn.DebugName(), pErr)
		}
	}
	if txn != nil {
		// TODO(andrei): don't do Cleanup() on retriable errors here.
		// Let the sql executor do it.
		txn.Cleanup(pErr)
	}
	if pErr != nil {
		pErr.StripErrorTransaction()
	}
	return pErr
}
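// A caller-side sketch of Exec, assuming only the TxnExecOptions fields
// referenced above (AutoRetry, AutoCommit); runHypotheticalStatement is a
// placeholder for the statement(s) the caller actually wants to run.
func execWithAutoRetry(txn *Txn) *roachpb.Error {
	opt := TxnExecOptions{AutoRetry: true, AutoCommit: true}
	return txn.Exec(opt, func(txn *Txn, opt *TxnExecOptions) *roachpb.Error {
		// Exec re-invokes this closure with backoff on restartable errors and,
		// because AutoCommit is set, commits once it returns nil.
		return runHypotheticalStatement(txn)
	})
}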
// execStmts parses and executes the given SQL statements, running any
// scheduled schema changes once the transaction is complete, and returns a
// Response whose per-statement results record any errors encountered.
func (e *Executor) execStmts(sql string, planMaker *planner) Response {
	var resp Response
	stmts, err := planMaker.parser.Parse(sql, parser.Syntax(planMaker.session.Syntax))
	if err != nil {
		// A parse error occurred: we can't determine if there were multiple
		// statements or only one, so just pretend there was one.
		resp.Results = append(resp.Results, makeResultFromError(planMaker, roachpb.NewError(err)))
		return resp
	}
	for _, stmt := range stmts {
		result, err := e.execStmt(stmt, planMaker)
		if err != nil {
			result = makeResultFromError(planMaker, err)
		}
		// Release the leases once a transaction is complete.
		if planMaker.txn == nil {
			planMaker.releaseLeases(e.db)
			// Execute any schema changes that were scheduled.
			if len(planMaker.schemaChangers) > 0 &&
				// Disable execution in some tests.
				!disableSyncSchemaChangeExec {
				retryOpts := retry.Options{
					InitialBackoff: 20 * time.Millisecond,
					MaxBackoff:     200 * time.Millisecond,
					Multiplier:     2,
				}
				for _, sc := range planMaker.schemaChangers {
					sc.db = e.db
					for r := retry.Start(retryOpts); r.Next(); {
						if done, err := sc.IsDone(); err != nil {
							log.Warning(err)
							break
						} else if done {
							break
						}
						if pErr := sc.exec(); pErr != nil {
							if _, ok := pErr.GoError().(*roachpb.ExistingSchemaChangeLeaseError); ok {
								// Try again.
								continue
							}
							// All other errors can be reported.
							result = makeResultFromError(planMaker, pErr)
						}
						break
					}
				}
			}
		}
		resp.Results = append(resp.Results, result)
	}
	return resp
}
// runHeartbeat sends periodic heartbeats to client, marking the client healthy
// or unhealthy and reconnecting appropriately until either the Client or the
// supplied channel is closed.
func (c *Client) runHeartbeat(retryOpts retry.Options, closer <-chan struct{}) {
	isHealthy := false
	setHealthy := func() {
		if isHealthy {
			return
		}
		isHealthy = true
		close(c.healthy.Load().(chan struct{}))
	}
	setUnhealthy := func() {
		if isHealthy {
			isHealthy = false
			c.healthy.Store(make(chan struct{}))
		}
	}

	var err = errUnstarted // initial condition
	for {
		for r := retry.Start(retryOpts); r.Next(); {
			// Reconnect on failure.
			if err != nil {
				if err = c.connect(); err != nil {
					setUnhealthy()
					log.Warning(err)
					continue
				}
			}

			// Heartbeat regardless of failure.
			if err = c.heartbeat(); err != nil {
				setUnhealthy()
				log.Warning(err)
				continue
			}

			setHealthy()
			break
		}

		// Wait after the heartbeat so that the first iteration gets a wait-free
		// heartbeat attempt.
		select {
		case <-closer:
			c.Close()
			return
		case <-c.Closed:
			return
		case <-time.After(heartbeatInterval):
			// TODO(tamird): Perhaps retry more aggressively when the client is unhealthy.
		}
	}
}
// runHeartbeat sends periodic heartbeats to client, marking the client healthy
// or unhealthy and reconnecting appropriately until either the Client or the
// supplied channel is closed.
func (c *Client) runHeartbeat(retryOpts retry.Options, closer <-chan struct{}) {
	isHealthy := false
	setHealthy := func() {
		if isHealthy {
			return
		}
		isHealthy = true
		close(c.healthy.Load().(chan struct{}))
	}
	setUnhealthy := func() {
		if isHealthy {
			isHealthy = false
			c.healthy.Store(make(chan struct{}))
		}
	}

	connErr := errUnstarted // initial condition
	var beatErr error
	for {
		for r := retry.Start(retryOpts); r.Next(); {
			// Reconnect if connection failed or heartbeat error is not
			// definitely temporary.
			if netErr, ok := beatErr.(net.Error); connErr != nil || beatErr != nil && !(ok && netErr.Temporary()) {
				if connErr = c.connect(); connErr != nil {
					log.Warning(connErr)
					setUnhealthy()
					continue
				}
			}

			if beatErr = c.heartbeat(); beatErr == nil {
				setHealthy()
				break
			} else {
				log.Warning(beatErr)
				setUnhealthy()
			}
		}

		// Wait after the heartbeat so that the first iteration gets a wait-free
		// heartbeat attempt.
		select {
		case <-closer:
			c.Close()
			return
		case <-c.Closed:
			return
		case <-time.After(heartbeatInterval):
			// TODO(tamird): Perhaps retry more aggressively when the client is unhealthy.
		}
	}
}
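// runHeartbeat publishes health by swapping the channel stored in c.healthy:
// the channel is closed while the client is healthy and replaced with an open
// one when it is not. Callers such as the rpcSender.Send snippets above poll
// it non-blockingly. A sketch of that consumer side, assuming Healthy()
// returns the current channel (as used elsewhere in these snippets); doCall
// is a hypothetical RPC invocation.
func callIfHealthy(c *Client) error {
	select {
	case <-c.Healthy():
		// The channel is closed, so the receive succeeds immediately: the
		// client is currently marked healthy and we can attempt the call.
		return doCall(c)
	default:
		// The channel is open: the client is unhealthy, so report an error
		// and let the caller's retry loop back off and try again.
		return fmt.Errorf("client is unhealthy")
	}
}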
// TestRetryableError verifies that Send returns a retryable error
// when it hits an RPC error.
func TestRetryableError(t *testing.T) {
	defer leaktest.AfterTest(t)()

	clientStopper := stop.NewStopper()
	defer clientStopper.Stop()
	clientContext := newNodeTestContext(nil, clientStopper)
	clientContext.HeartbeatTimeout = 10 * clientContext.HeartbeatInterval

	serverStopper := stop.NewStopper()
	serverContext := newNodeTestContext(nil, serverStopper)

	s, ln := newTestServer(t, serverContext)
	registerBatch(t, s, 0)

	c := rpc.NewClient(ln.Addr(), clientContext)
	// Wait until the client becomes healthy and shut down the server.
	<-c.Healthy()
	serverStopper.Stop()
	// Wait until the client becomes unhealthy.
	func() {
		for r := retry.Start(retry.Options{}); r.Next(); {
			select {
			case <-c.Healthy():
			case <-time.After(1 * time.Nanosecond):
				return
			}
		}
	}()

	sp := tracing.NewTracer().StartSpan("node test")
	defer sp.Finish()

	opts := SendOptions{
		Ordering:        orderStable,
		SendNextTimeout: 100 * time.Millisecond,
		Timeout:         100 * time.Millisecond,
		Trace:           sp,
	}
	if _, err := sendBatch(opts, []net.Addr{ln.Addr()}, clientContext); err != nil {
		retryErr, ok := err.(retry.Retryable)
		if !ok {
			t.Fatalf("Unexpected error type: %v", err)
		}
		if !retryErr.CanRetry() {
			t.Errorf("Expected retryable error: %v", retryErr)
		}
	} else {
		t.Fatalf("Unexpected success")
	}
}
func (ia *idAllocator) start() {
	ia.stopper.RunWorker(func() {
		defer close(ia.ids)

		for {
			var newValue int64
			for newValue <= int64(ia.minID) {
				var (
					err error
					res client.KeyValue
				)
				for r := retry.Start(idAllocationRetryOpts); r.Next(); {
					if ia.stopper.StartTask() {
						idKey := ia.idKey.Load().(proto.Key)
						res, err = ia.db.Inc(idKey, int64(ia.blockSize))
						ia.stopper.FinishTask()
						if err == nil {
							newValue = res.ValueInt()
							break
						}

						log.Warningf("unable to allocate %d ids from %s: %s", ia.blockSize, idKey, err)
					} else {
						return
					}
				}
				if err != nil {
					panic(fmt.Sprintf("unexpectedly exited id allocation retry loop: %s", err))
				}
			}

			end := newValue + 1
			start := end - int64(ia.blockSize)

			if start < int64(ia.minID) {
				start = int64(ia.minID)
			}

			// Add all new ids to the channel for consumption.
			for i := start; i < end; i++ {
				select {
				case ia.ids <- uint32(i):
				case <-ia.stopper.ShouldStop():
					return
				}
			}
		}
	})
}
// execSchemaChanges releases schema leases and runs the queued
// schema changers. This needs to be run after the transaction
// scheduling the schema change has finished.
//
// The list of closures is cleared after (attempting) execution.
//
// Args:
//  results: The results from all statements in the group that scheduled the
//    schema changes we're about to execute. Results corresponding to the
//    schema change statements will be changed in case an error occurs.
func (scc *schemaChangerCollection) execSchemaChanges(
	e *Executor, planMaker *planner, results ResultList,
) {
	if planMaker.txn != nil {
		panic("trying to execute schema changes while still in a transaction")
	}
	// Release the leases once a transaction is complete.
	planMaker.releaseLeases()
	if e.ctx.TestingKnobs.SyncSchemaChangersFilter != nil {
		e.ctx.TestingKnobs.SyncSchemaChangersFilter(TestingSchemaChangerCollection{scc})
	}
	// Execute any schema changes that were scheduled, in the order of the
	// statements that scheduled them.
	for _, scEntry := range scc.schemaChangers {
		sc := &scEntry.sc
		sc.db = *e.ctx.DB
		for r := retry.Start(base.DefaultRetryOptions()); r.Next(); {
			if done, err := sc.IsDone(); err != nil {
				log.Warning(e.ctx.Context, err)
				break
			} else if done {
				break
			}
			if err := sc.exec(
				e.ctx.TestingKnobs.SchemaChangersStartBackfillNotification,
				e.ctx.TestingKnobs.SyncSchemaChangersRenameOldNameNotInUseNotification,
			); err != nil {
				if isSchemaChangeRetryError(err) {
					// Try again.
					continue
				}
				// All other errors can be reported; we report it as the result
				// corresponding to the statement that enqueued this changer.
				// There's some sketchiness here: we assume there's a single result
				// per statement and we clobber the result/error of the corresponding
				// statement.
				// There's also another subtlety: we can only report results for
				// statements in the current batch; we can't modify the results of older
				// statements.
				if scEntry.epoch == scc.curGroupNum {
					results[scEntry.idx] = Result{Err: err}
				}
				log.Warningf(e.ctx.Context, "Error executing schema change: %s", err)
			}
			break
		}
	}
	scc.schemaChangers = scc.schemaChangers[:0]
}
// connect dials the connection in a backoff/retry loop.
func (c *Client) connect(opts *retry.Options, context *Context) error {
	// Attempt to dial connection.
	retryOpts := clientRetryOptions
	if opts != nil {
		retryOpts = *opts
	}
	retryOpts.Stopper = context.Stopper

	for r := retry.Start(retryOpts); r.Next(); {
		tlsConfig, err := context.GetClientTLSConfig()
		if err != nil {
			// Problem loading the TLS config. Retrying will not help.
			return err
		}

		conn, err := tlsDialHTTP(c.addr.Network(), c.addr.String(), tlsConfig)
		if err != nil {
			// Retry if the error is temporary, otherwise fail fast.
			if t, ok := err.(net.Error); ok && t.Temporary() {
				if log.V(1) {
					log.Warning(err)
				}
				continue
			}
			return err
		}

		c.mu.Lock()
		c.Client = rpc.NewClientWithCodec(codec.NewClientCodec(conn))
		c.lAddr = conn.LocalAddr()
		c.mu.Unlock()
		if c.lAddr == nil {
			return errClosed
		}

		// Ensure at least one heartbeat succeeds before exiting the
		// retry loop. If it fails, don't retry: The node is probably
		// dead.
		if err = c.heartbeat(); err != nil {
			return err
		}

		return nil
	}

	return util.Errorf("system is stopping")
}
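// Several snippets above bound their retries with the stopper: older versions
// set a Stopper field on retry.Options (as in connect and maybeWarnAboutInit
// above), newer ones set Closer to a channel such as stopper.ShouldStop(). A
// minimal sketch of the Closer form, assuming the util/retry and stop
// packages used throughout; pingOnce is a hypothetical operation.
func retryUntilStopped(stopper *stop.Stopper) error {
	opts := retry.Options{
		InitialBackoff: 10 * time.Millisecond,
		MaxBackoff:     time.Second,
		Multiplier:     2,
		Closer:         stopper.ShouldStop(),
	}
	var err error
	for r := retry.Start(opts); r.Next(); {
		if err = pingOnce(); err == nil {
			return nil
		}
	}
	// Next() returned false because the Closer channel fired; report the last
	// error (or a shutdown error, as connect does above).
	return err
}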