Example #1
// WaitForFullReplication waits until all stores in the cluster
// have no ranges with replication pending.
func (tc *TestCluster) WaitForFullReplication() error {
	opts := retry.Options{
		InitialBackoff: time.Millisecond * 10,
		MaxBackoff:     time.Millisecond * 100,
		Multiplier:     2,
	}

	notReplicated := true
	for r := retry.Start(opts); r.Next() && notReplicated; {
		notReplicated = false
		for _, s := range tc.Servers {
			err := s.Stores().VisitStores(func(s *storage.Store) error {
				if err := s.ComputeMetrics(); err != nil {
					return err
				}
				if s.Registry().GetGauge("ranges.replication-pending").Value() > 0 {
					notReplicated = true
				}
				return nil
			})
			if err != nil {
				return err
			}
			if notReplicated {
				break
			}
		}
	}
	return nil
}
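Every snippet in this listing follows the same basic shape: fill in a retry.Options with exponential-backoff parameters, call retry.Start, and loop on Next until the work succeeds or the loop is told to stop. Below is a minimal sketch of that shape, not taken from CockroachDB itself; the check callback is a hypothetical probe, and the import path is assumed to match the era of these examples.
package example

import (
	"errors"

	// Import path assumed for the retry package used throughout these examples.
	"github.com/cockroachdb/cockroach/util/retry"
)

// waitFor polls check with exponential backoff until it reports done.
// Any error returned by check aborts the wait. As in Example #1, the
// loop retries indefinitely unless the caller's Options limit it (for
// instance via a Closer channel).
func waitFor(opts retry.Options, check func() (done bool, err error)) error {
	for r := retry.Start(opts); r.Next(); {
		done, err := check()
		if err != nil {
			return err
		}
		if done {
			return nil
		}
	}
	// Reached only if opts cause Next to stop (e.g. a Closer channel closed).
	return errors.New("gave up waiting for condition")
}
Example #1 above is exactly this shape, with the condition being that no store reports ranges with pending replication.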
Example #2
// request returns the result of performing an HTTP GET request.
func request(url string, httpClient *http.Client) ([]byte, bool) {
	for r := retry.Start(retryOptions); r.Next(); {
		req, err := http.NewRequest("GET", url, nil)
		if err != nil {
			log.Fatal(err)
			return nil, false
		}
		req.Header.Set(util.AcceptHeader, util.JSONContentType)
		resp, err := httpClient.Do(req)
		if err != nil {
			log.Infof("could not GET %s - %s", url, err)
			continue
		}
		defer resp.Body.Close()
		body, err := ioutil.ReadAll(resp.Body)
		if err != nil {
			log.Infof("could not ready body for %s - %s", url, err)
			continue
		}
		if resp.StatusCode != http.StatusOK {
			log.Infof("could not GET %s - statuscode: %d - body: %s", url, resp.StatusCode, body)
			continue
		}
		returnedContentType := resp.Header.Get(util.ContentTypeHeader)
		if returnedContentType != util.JSONContentType {
			log.Infof("unexpected content type: %v", returnedContentType)
			continue
		}
		log.Infof("OK response from %s", url)
		return body, true
	}
	log.Warningf("There was an error retrieving %s", url)
	return nil, false
}
Example #3
// GetSnapshot wraps Snapshot() but does not require the replica lock
// to be held and it will block instead of returning
// ErrSnapshotTemporarilyUnavailable.
func (r *Replica) GetSnapshot() (raftpb.Snapshot, error) {
	retryOptions := retry.Options{
		InitialBackoff: 1 * time.Millisecond,
		MaxBackoff:     50 * time.Millisecond,
		Multiplier:     2,
	}
	for retry := retry.Start(retryOptions); retry.Next(); {
		r.mu.Lock()
		snap, err := r.Snapshot()
		snapshotChan := r.mu.snapshotChan
		r.mu.Unlock()
		if err == raft.ErrSnapshotTemporarilyUnavailable {
			if snapshotChan == nil {
				// The call to Snapshot() didn't start an async process due to
				// rate limiting. Try again later.
				continue
			}
			var ok bool
			snap, ok = <-snapshotChan
			if ok {
				return snap, nil
			}
			// Each snapshot worker's output can only be consumed once.
			// We could be racing with raft itself, so if we get a closed
			// channel, loop back and try again.
		} else {
			return snap, err
		}
	}
	panic("unreachable") // due to infinite retries
}
Example #4
func (txn *Txn) exec(retryable func(txn *Txn) error) error {
	// Run retryable in a retry loop until we encounter a success or
	// error condition this loop isn't capable of handling.
	var err error
	for r := retry.Start(txn.db.txnRetryOptions); r.Next(); {
		err = retryable(txn)
		if err == nil && txn.Proto.Status == roachpb.PENDING {
			// retryable succeeded, but didn't commit.
			err = txn.commit(nil)
		}
		if restartErr, ok := err.(roachpb.TransactionRestartError); ok {
			if log.V(2) {
				log.Warning(err)
			}
			switch restartErr.CanRestartTransaction() {
			case roachpb.TransactionRestart_IMMEDIATE:
				r.Reset()
				continue
			case roachpb.TransactionRestart_BACKOFF:
				continue
			}
			// By default, fall through and break.
		}
		break
	}
	txn.Cleanup(err)
	return err
}
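Example #4 (and the later variants in Examples #12, #14, and #23) maps a transaction-restart classification onto the retry loop: an IMMEDIATE restart resets the backoff and retries at once, a BACKOFF restart retries after the next backoff interval, and anything else breaks out. That control flow is sketched below with hypothetical stand-ins (restartKind, tryOnce) in place of the real roachpb types.
package example

import (
	// Import path assumed for the retry package used throughout these examples.
	"github.com/cockroachdb/cockroach/util/retry"
)

// restartKind is a hypothetical stand-in for the roachpb restart
// classification used in the Txn.exec examples.
type restartKind int

const (
	restartNone restartKind = iota
	restartImmediate
	restartBackoff
)

// execWithRestarts retries tryOnce according to the restart kind it
// reports: an IMMEDIATE-style restart resets the backoff and retries at
// once, a BACKOFF-style restart retries after the next backoff interval,
// and anything else ends the loop.
func execWithRestarts(opts retry.Options, tryOnce func() (restartKind, error)) error {
	var err error
	for r := retry.Start(opts); r.Next(); {
		var kind restartKind
		kind, err = tryOnce()
		switch kind {
		case restartImmediate:
			r.Reset() // retry immediately with fresh backoff
			continue
		case restartBackoff:
			continue // retry after the next backoff interval
		}
		break // success, or an error this loop cannot handle
	}
	return err
}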
Example #5
// recordJoinEvent begins an asynchronous task which attempts to log a "node
// join" or "node restart" event. This query will retry until it succeeds or the
// server stops.
func (n *Node) recordJoinEvent() {
	if !n.ctx.LogRangeEvents {
		return
	}

	logEventType := sql.EventLogNodeRestart
	if n.initialBoot {
		logEventType = sql.EventLogNodeJoin
	}

	n.stopper.RunWorker(func() {
		for r := retry.Start(retry.Options{Closer: n.stopper.ShouldStop()}); r.Next(); {
			if err := n.ctx.DB.Txn(func(txn *client.Txn) error {
				return n.eventLogger.InsertEventRecord(txn,
					logEventType,
					int32(n.Descriptor.NodeID),
					int32(n.Descriptor.NodeID),
					struct {
						Descriptor roachpb.NodeDescriptor
						ClusterID  uuid.UUID
						StartedAt  int64
					}{n.Descriptor, n.ClusterID, n.startedAt},
				)
			}); err != nil {
				log.Warningc(n.context(context.TODO()), "unable to log %s event for node %d: %s", logEventType, n.Descriptor.NodeID, err)
			} else {
				return
			}
		}
	})
}
Example #6
// recordJoinEvent begins an asynchronous task which attempts to log a "node
// join" or "node restart" event. This query will retry until it succeeds or the
// server stops.
func (n *Node) recordJoinEvent() {
	if !n.ctx.LogRangeEvents {
		return
	}

	logEventType := sql.EventLogNodeRestart
	if n.initialBoot {
		logEventType = sql.EventLogNodeJoin
	}

	n.stopper.RunWorker(func() {
		retryOpts := base.DefaultRetryOptions()
		retryOpts.Closer = n.stopper.ShouldStop()
		for r := retry.Start(retryOpts); r.Next(); {
			if err := n.ctx.DB.Txn(n.Ctx(), func(txn *client.Txn) error {
				return n.eventLogger.InsertEventRecord(txn,
					logEventType,
					int32(n.Descriptor.NodeID),
					int32(n.Descriptor.NodeID),
					struct {
						Descriptor roachpb.NodeDescriptor
						ClusterID  uuid.UUID
						StartedAt  int64
					}{n.Descriptor, n.ClusterID, n.startedAt},
				)
			}); err != nil {
				log.Warningf(n.Ctx(), "%s: unable to log %s event: %s", n, logEventType, err)
			} else {
				return
			}
		}
	})
}
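Examples #5 and #6 stop retrying when the server shuts down by setting Options.Closer to the stopper's channel; once that channel closes, Next returns false and the worker exits. Here is a hedged sketch of the same idea with a plain channel standing in for the stopper; doWork is a hypothetical unit of work, and the standard library log package is used so the sketch stays self-contained.
package example

import (
	"log" // standard library log, not CockroachDB's log package
	"time"

	// Import path assumed for the retry package used throughout these examples.
	"github.com/cockroachdb/cockroach/util/retry"
)

// retryUntilDoneOrQuit runs doWork until it succeeds or quit is closed.
// Setting Options.Closer to quit is what the stopper-based examples above
// rely on: once the channel closes, Next returns false and the loop ends.
func retryUntilDoneOrQuit(quit <-chan struct{}, doWork func() error) {
	opts := retry.Options{
		InitialBackoff: 50 * time.Millisecond,
		MaxBackoff:     time.Second,
		Multiplier:     2,
		Closer:         quit,
	}
	for r := retry.Start(opts); r.Next(); {
		if err := doWork(); err != nil {
			log.Printf("transient failure, will retry: %v", err)
			continue
		}
		return // success
	}
	// quit closed before doWork succeeded; give up quietly.
}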
Example #7
// maybeWarnAboutInit looks for signs indicating a cluster which
// hasn't been initialized and warns. There's no absolutely sure way
// to determine whether the current node is simply waiting to be
// bootstrapped to an existing cluster vs. the operator having failed
// to initialize the cluster via the "cockroach init" command, so
// we can only warn.
//
// This method checks whether all gossip bootstrap hosts are
// connected, and whether the node itself is a bootstrap host, but
// there is still no sentinel gossip.
func (g *Gossip) maybeWarnAboutInit(stopper *stop.Stopper) {
	stopper.RunWorker(func() {
		// Wait 5s before first check.
		select {
		case <-stopper.ShouldStop():
			return
		case <-time.After(5 * time.Second):
		}
		retryOptions := retry.Options{
			InitialBackoff: 5 * time.Second,  // first backoff at 5s
			MaxBackoff:     60 * time.Second, // max backoff is 60s
			Multiplier:     2,                // doubles
			Stopper:        stopper,          // stop no matter what on stopper
		}
		// This will never error because of infinite retries.
		for r := retry.Start(retryOptions); r.Next(); {
			g.mu.Lock()
			hasSentinel := g.is.getInfo(KeySentinel) != nil
			triedAll := g.triedAll
			g.mu.Unlock()
			// If we have the sentinel, exit the retry loop.
			if hasSentinel {
				break
			}
			// Otherwise, if all bootstrap hosts are connected, warn.
			if triedAll {
				log.Warningf("connected to gossip but missing sentinel. Has the cluster been initialized? " +
					"Use \"cockroach init\" to initialize.")
			}
		}
	})
}
Example #8
// waitForOneVersion returns once there are no unexpired leases on the
// previous version of the table descriptor. It returns the current version.
// After returning there can only be versions of the descriptor >= to the
// returned version. Lease acquisition (see acquire()) maintains the
// invariant that no new leases for desc.Version-1 will be granted once
// desc.Version exists.
func (s LeaseStore) waitForOneVersion(tableID sqlbase.ID, retryOpts retry.Options) (
	sqlbase.DescriptorVersion, error,
) {
	desc := &sqlbase.Descriptor{}
	descKey := sqlbase.MakeDescMetadataKey(tableID)
	var tableDesc *sqlbase.TableDescriptor
	for r := retry.Start(retryOpts); r.Next(); {
		// Get the current version of the table descriptor non-transactionally.
		//
		// TODO(pmattis): Do an inconsistent read here?
		if err := s.db.GetProto(descKey, desc); err != nil {
			return 0, err
		}
		tableDesc = desc.GetTable()
		if tableDesc == nil {
			return 0, errors.Errorf("ID %d is not a table", tableID)
		}
		// Check to see if there are any leases that still exist on the previous
		// version of the descriptor.
		now := s.clock.Now()
		count, err := s.countLeases(tableDesc.ID, tableDesc.Version-1, now.GoTime())
		if err != nil {
			return 0, err
		}
		if count == 0 {
			break
		}
		log.Infof(context.TODO(), "publish (count leases): descID=%d name=%s version=%d count=%d",
			tableDesc.ID, tableDesc.Name, tableDesc.Version-1, count)
	}
	return tableDesc.Version, nil
}
Example #9
// Send sends call to Cockroach via an RPC request. Errors which are retryable
// are retried with backoff in a loop using the default retry options. Other
// errors sending the request are retried indefinitely using the same client
// command ID to avoid reporting failure when in fact the command may have gone
// through and been executed successfully. We retry here to eventually get
// through with the same client command ID and be given the cached response.
func (s *rpcSender) Send(_ context.Context, call proto.Call) {
	method := fmt.Sprintf("Server.%s", call.Args.Method())

	var err error
	for r := retry.Start(s.retryOpts); r.Next(); {
		select {
		case <-s.client.Healthy():
		default:
			err = fmt.Errorf("failed to send RPC request %s: client is unhealthy", method)
			log.Warning(err)
			continue
		}

		if err = s.client.Call(method, call.Args, call.Reply); err != nil {
			// Assume all errors sending request are retryable. The actual
			// number of things that could go wrong is vast, but we don't
			// want to miss any which should in theory be retried with the
			// same client command ID. We log the error here as a warning so
			// there's visibility that this is happening. Some of the errors
			// we'll sweep up in this net shouldn't be retried, but we can't
			// really know for sure which.
			log.Warningf("failed to send RPC request %s: %s", method, err)
			continue
		}

		// On successful post, we're done with retry loop.
		break
	}
	if err != nil {
		call.Reply.Header().SetGoError(err)
	}
}
Example #10
// Tests a batch of queries very similar to those that PGBench runs
// in its TPC-B(ish) mode.
func runPgbenchQueryParallel(b *testing.B, db *sql.DB) {
	if err := pgbench.SetupBenchDB(db, 20000, true /*quiet*/); err != nil {
		b.Fatal(err)
	}

	retryOpts := retry.Options{
		InitialBackoff: 1 * time.Millisecond,
		MaxBackoff:     200 * time.Millisecond,
		Multiplier:     2,
	}

	b.ResetTimer()
	b.RunParallel(func(pb *testing.PB) {
		src := rand.New(rand.NewSource(5432))
		r := retry.Start(retryOpts)
		var err error
		for pb.Next() {
			r.Reset()
			for r.Next() {
				err = pgbench.RunOne(db, src, 20000)
				if err == nil {
					break
				}
			}
			if err != nil {
				b.Fatal(err)
			}
		}
	})
	b.StopTimer()
}
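Example #10 creates one Retry value outside the benchmark loop and calls r.Reset() before each unit of work, so backoff accumulated while retrying one operation does not slow down the next. A small sketch of that reuse pattern follows; ops is a hypothetical slice of operations, each retried until it succeeds, as in the benchmark above.
package example

import (
	"time"

	// Import path assumed for the retry package used throughout these examples.
	"github.com/cockroachdb/cockroach/util/retry"
)

// runAllWithRetry reuses a single Retry value for a sequence of
// independent operations, calling Reset before each one so backoff
// accumulated by a previous operation does not carry over. As in
// Example #10, each operation is retried until it succeeds.
func runAllWithRetry(ops []func() error) {
	opts := retry.Options{
		InitialBackoff: time.Millisecond,
		MaxBackoff:     200 * time.Millisecond,
		Multiplier:     2,
	}
	r := retry.Start(opts)
	for _, op := range ops {
		r.Reset() // fresh backoff state for this operation
		for r.Next() {
			if err := op(); err == nil {
				break
			}
		}
	}
}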
Example #11
// get performs an HTTPS GET to the specified path for a specific node.
func get(t *testing.T, client *http.Client, node *localcluster.Container, path string) []byte {
	url := fmt.Sprintf("https://%s%s", node.Addr(""), path)
	// TODO(bram) #2059: Remove retry logic.
	for r := retry.Start(retryOptions); r.Next(); {
		resp, err := client.Get(url)
		if err != nil {
			t.Logf("could not GET %s - %s", url, err)
			continue
		}
		defer resp.Body.Close()
		body, err := ioutil.ReadAll(resp.Body)
		if err != nil {
			t.Logf("could not read body for %s - %s", url, err)
			continue
		}
		if resp.StatusCode != http.StatusOK {
			t.Logf("could not GET %s - statuscode: %d - body: %s", url, resp.StatusCode, body)
			continue
		}
		t.Logf("OK response from %s", url)
		return body
	}
	t.Fatalf("There was an error retrieving %s", url)
	return []byte("")
}
Example #12
func (txn *Txn) exec(retryable func(txn *Txn) error) (err error) {
	// Run retryable in a retry loop until we encounter a success or
	// error condition this loop isn't capable of handling.
	for r := retry.Start(txn.db.txnRetryOptions); r.Next(); {
		txn.haveTxnWrite, txn.haveEndTxn = false, false // always reset before [re]starting txn
		if err = retryable(txn); err == nil {
			if !txn.haveEndTxn && txn.haveTxnWrite {
				// If there were no errors running retryable, commit the txn. This
				// may block waiting for outstanding writes to complete in case
				// retryable didn't -- we need the most recent of all response
				// timestamps in order to commit.
				err = txn.Commit()
			}
		}
		if restartErr, ok := err.(proto.TransactionRestartError); ok {
			if log.V(2) {
				log.Warning(err)
			}
			if restartErr.CanRestartTransaction() == proto.TransactionRestart_IMMEDIATE {
				r.Reset()
				continue
			} else if restartErr.CanRestartTransaction() == proto.TransactionRestart_BACKOFF {
				continue
			}
			// By default, fall through and break.
		}
		break
	}
	if err != nil && txn.haveTxnWrite {
		if replyErr := txn.Rollback(); replyErr != nil {
			log.Errorf("failure aborting transaction: %s; abort caused by: %s", replyErr, err)
		}
	}
	return
}
Example #13
// GetSnapshot wraps Snapshot() but does not require the replica lock
// to be held and it will block instead of returning
// ErrSnapshotTemporarilyUnavailable.
func (r *Replica) GetSnapshot(ctx context.Context) (raftpb.Snapshot, error) {
	retryOptions := retry.Options{
		InitialBackoff: 1 * time.Millisecond,
		MaxBackoff:     50 * time.Millisecond,
		Multiplier:     2,
		Closer:         r.store.Stopper().ShouldQuiesce(),
	}
	for retry := retry.Start(retryOptions); retry.Next(); {
		log.Tracef(ctx, "snapshot retry loop pass %d", retry.CurrentAttempt())
		r.mu.Lock()
		snap, err := r.SnapshotWithContext(ctx)
		snapshotChan := r.mu.snapshotChan
		r.mu.Unlock()
		if err == raft.ErrSnapshotTemporarilyUnavailable {
			if snapshotChan == nil {
				// The call to Snapshot() didn't start an async process due to
				// rate limiting. Try again later.
				continue
			}
			var ok bool
			snap, ok = <-snapshotChan
			if ok {
				return snap, nil
			}
			// Each snapshot worker's output can only be consumed once.
			// We could be racing with raft itself, so if we get a closed
			// channel, loop back and try again.
		} else {
			return snap, err
		}
	}
	return raftpb.Snapshot{}, &roachpb.NodeUnavailableError{}
}
Example #14
File: txn.go Project: l2x/cockroach
func (txn *Txn) exec(retryable func(txn *Txn) *roachpb.Error) *roachpb.Error {
	// Run retryable in a retry loop until we encounter a success or
	// error condition this loop isn't capable of handling.
	var pErr *roachpb.Error
	for r := retry.Start(txn.db.txnRetryOptions); r.Next(); {
		pErr = retryable(txn)
		if pErr == nil && txn.Proto.Status == roachpb.PENDING {
			// retryable succeeded, but didn't commit.
			pErr = txn.commit(nil)
		}

		if pErr != nil {
			switch pErr.TransactionRestart {
			case roachpb.TransactionRestart_IMMEDIATE:
				if log.V(2) {
					log.Warning(pErr)
				}
				r.Reset()
				continue
			case roachpb.TransactionRestart_BACKOFF:
				if log.V(2) {
					log.Warning(pErr)
				}
				continue
			}
			// By default, fall through and break.
		}
		break
	}
	txn.Cleanup(pErr)
	return pErr
}
Example #15
// get performs an HTTPS GET to the specified path for a specific node.
func get(t *testing.T, base, rel string) []byte {
	// TODO(bram) #2059: Remove retry logic.
	url := fmt.Sprintf("%s/%s", base, rel)
	for r := retry.Start(retryOptions); r.Next(); {
		resp, err := cluster.HTTPClient.Get(url)
		if err != nil {
			log.Infof("could not GET %s - %s", url, err)
			continue
		}
		defer resp.Body.Close()
		body, err := ioutil.ReadAll(resp.Body)
		if err != nil {
			log.Infof("could not read body for %s - %s", url, err)
			continue
		}
		if resp.StatusCode != http.StatusOK {
			log.Infof("could not GET %s - statuscode: %d - body: %s", url, resp.StatusCode, body)
			continue
		}
		if log.V(1) {
			log.Infof("OK response from %s", url)
		}
		return body
	}
	t.Fatalf("There was an error retrieving %s", url)
	return []byte("")
}
Example #16
// Batch sends a request to Cockroach via RPC. Errors which are retryable are
// retried with backoff in a loop using the default retry options. Other errors
// sending the request are retried indefinitely using the same client command
// ID to avoid reporting failure when in fact the command may have gone through
// and been executed successfully. We retry here to eventually get through with
// the same client command ID and be given the cached response.
func (s *rpcSender) Send(ctx context.Context, ba proto.BatchRequest) (*proto.BatchResponse, *proto.Error) {
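	// NB: method is a package-level identifier in the original source (the
	// name of the RPC method being invoked); it is not shown in this excerpt.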
	var err error
	var br proto.BatchResponse
	for r := retry.Start(s.retryOpts); r.Next(); {
		select {
		case <-s.client.Healthy():
		default:
			err = fmt.Errorf("failed to send RPC request %s: client is unhealthy", method)
			log.Warning(err)
			continue
		}

		if err = s.client.Call(method, &ba, &br); err != nil {
			br.Reset() // don't trust anyone.
			// Assume all errors sending request are retryable. The actual
			// number of things that could go wrong is vast, but we don't
			// want to miss any which should in theory be retried with the
			// same client command ID. We log the error here as a warning so
			// there's visibility that this is happening. Some of the errors
			// we'll sweep up in this net shouldn't be retried, but we can't
			// really know for sure which.
			log.Warningf("failed to send RPC request %s: %s", method, err)
			continue
		}

		// On successful post, we're done with retry loop.
		break
	}
	if err != nil {
		return nil, proto.NewError(err)
	}
	pErr := br.Error
	br.Error = nil
	return &br, pErr
}
Example #17
// Send sends call to Cockroach via an RPC request. Errors which are retryable
// are retried with backoff in a loop using the default retry options. Other
// errors sending the request are retried indefinitely using the same client
// command ID to avoid reporting failure when in fact the command may have gone
// through and been executed successfully. We retry here to eventually get
// through with the same client command ID and be given the cached response.
func (s *Sender) Send(_ context.Context, call proto.Call) {
	var err error
	for r := retry.Start(s.retryOpts); r.Next(); {
		if !s.client.IsHealthy() {
			log.Warningf("client %s is unhealthy; retrying", s.client)
			continue
		}

		method := call.Args.Method().String()
		c := s.client.Go("Server."+method, call.Args, call.Reply, nil)
		<-c.Done
		err = c.Error
		if err != nil {
			// Assume all errors sending request are retryable. The actual
			// number of things that could go wrong is vast, but we don't
			// want to miss any which should in theory be retried with the
			// same client command ID. We log the error here as a warning so
			// there's visibility that this is happening. Some of the errors
			// we'll sweep up in this net shouldn't be retried, but we can't
			// really know for sure which.
			log.Warningf("failed to send RPC request %s: %v", method, err)
			continue
		}

		// On successful post, we're done with retry loop.
		break
	}
	if err != nil {
		call.Reply.Header().SetGoError(err)
	}
}
Example #18
// WaitReady waits until the infrastructure is in a state that *should* allow
// for a healthy cluster. Currently, this means waiting for the load balancer
// to resolve from all nodes.
func (f *Farmer) WaitReady(d time.Duration) error {
	var rOpts = retry.Options{
		InitialBackoff: time.Second,
		MaxBackoff:     time.Minute,
		Multiplier:     1.5,
	}
	var err error
	for r := retry.Start(rOpts); r.Next(); {
		var elb string
		elb, _, err = net.SplitHostPort(f.LoadBalancer())
		if err != nil || elb == "" {
			err = fmt.Errorf("ELB not found: %v", err)
			continue
		}
		for i := range f.Nodes() {
			if err = f.Exec(i, "nslookup "+elb); err != nil {
				break
			}
		}
		if err == nil {
			return nil
		}
	}
	return err
}
Example #19
// Send sends call to Cockroach via an RPC.
func (s *rpcSender) Send(args Request) (Response, error) {
	if args.GetUser() == "" {
		args.User = s.user
	}

	var err error
	var reply Response
	for r := retry.Start(s.retryOpts); r.Next(); {
		if err = s.client.Call(RPCMethod, &args, &reply); err != nil {
			reply.Reset() // don't trust anyone.
			// Assume all errors sending request are retryable. The actual
			// number of things that could go wrong is vast, but we don't
			// want to miss any which should in theory be retried with the
			// same client command ID. We log the error here as a warning so
			// there's visibility that this is happening. Some of the errors
			// we'll sweep up in this net shouldn't be retried, but we can't
			// really know for sure which.
			continue
		}

		// On successful post, we're done with retry loop.
		break
	}
	return reply, err
}
Example #20
// maybeWarnAboutInit looks for signs indicating a cluster which
// hasn't been initialized and warns. There's no absolutely sure way
// to determine whether the current node is simply waiting to be
// bootstrapped to an existing cluster vs. the operator having failed
// to initialize the cluster via the "cockroach init" command, so
// we can only warn.
//
// This method checks whether all gossip bootstrap hosts are
// connected, and whether the node itself is a bootstrap host, but
// there is still no sentinel gossip.
func (g *Gossip) maybeWarnAboutInit(stopper *stop.Stopper) {
	stopper.RunWorker(func() {
		// Wait 5s before first check.
		select {
		case <-stopper.ShouldStop():
			return
		case <-time.After(5 * time.Second):
		}
		retryOptions := retry.Options{
			InitialBackoff: 5 * time.Second,      // first backoff at 5s
			MaxBackoff:     60 * time.Second,     // max backoff is 60s
			Multiplier:     2,                    // doubles
			Closer:         stopper.ShouldStop(), // stop no matter what on stopper
		}
		// This will never error because of infinite retries.
		for r := retry.Start(retryOptions); r.Next(); {
			g.mu.Lock()
			hasConnections := g.outgoing.len()+g.incoming.len() > 0
			hasSentinel := g.is.getInfo(KeySentinel) != nil
			triedAll := g.triedAll
			g.mu.Unlock()
			// If we have the sentinel, exit the retry loop.
			if hasSentinel {
				break
			}
			if !hasConnections {
				log.Warningf("not connected to gossip; check that gossip flag is set appropriately")
			} else if triedAll {
				log.Warningf("missing gossip sentinel; first range unavailable or cluster not initialized")
			}
		}
	})
}
Example #21
// get performs an HTTPS GET to the specified path for a specific node.
func get(t *testing.T, client *http.Client, node *localcluster.Container, path string) []byte {
	url := fmt.Sprintf("https://%s%s", node.Addr(""), path)
	// There seem to be some issues while trying to connect to the status
	// server, so retry (up to 5 times) with a 1 second delay each time.
	// TODO(Bram): Clean this up once we get to the bottom of the issue.
	for r := retry.Start(retryOptions); r.Next(); {
		resp, err := client.Get(url)
		if err != nil {
			t.Logf("could not GET %s - %s", url, err)
			continue
		}
		body, err := ioutil.ReadAll(resp.Body)
		if err != nil {
			t.Logf("could not open body for %s - %s", url, err)
			continue
		}
		defer resp.Body.Close()
		if resp.StatusCode != http.StatusOK {
			t.Logf("could not GET %s - statuscode: %d - body: %s", url, resp.StatusCode, body)
			continue
		}
		t.Logf("OK response from %s", url)
		return body
	}
	t.Fatalf("There was an error retrieving %s", url)
	return []byte("")
}
Example #22
// Send implements the client.Sender interface.
func (rls *retryableLocalSender) Send(_ context.Context, call proto.Call) {
	// Instant retry to handle the case of a range split, which is
	// exposed here as a RangeKeyMismatchError.
	retryOpts := retry.Options{}
	// In local tests, the RPCs are not actually sent over the wire. We
	// need to clone the Txn in order to avoid unexpected sharing
	// between TxnCoordSender and client.Txn.
	if header := call.Args.Header(); header.Txn != nil {
		header.Txn = gogoproto.Clone(header.Txn).(*proto.Transaction)
	}

	var err error
	for r := retry.Start(retryOpts); r.Next(); {
		call.Reply.Header().Error = nil
		rls.LocalSender.Send(context.TODO(), call)
		// Check for range key mismatch error (this could happen if
		// range was split between lookup and execution). In this case,
		// reset header.Replica and engage retry loop.
		if err = call.Reply.Header().GoError(); err != nil {
			if _, ok := err.(*proto.RangeKeyMismatchError); ok {
				// Clear request replica.
				call.Args.Header().Replica = proto.Replica{}
				log.Warning(err)
				continue
			}
		}
		return
	}
	panic(fmt.Sprintf("local sender did not succeed: %s", err))
}
Example #23
// Exec executes fn in the context of a distributed transaction.
// Execution is controlled by opt (see comments in TxnExecOptions).
//
// opt is passed to fn, and it's valid for fn to modify opt as it sees
// fit during each execution attempt.
//
// It's valid for txn to be nil (meaning the txn has already aborted) if fn
// can handle that. This is useful for continuing transactions that have been
// aborted because of an error in a previous batch of statements in the hope
// that a ROLLBACK will reset the state. Neither opt.AutoRetry nor opt.AutoCommit
// can be set in this case.
//
// If an error is returned, the txn has been aborted.
func (txn *Txn) Exec(
	opt TxnExecOptions,
	fn func(txn *Txn, opt *TxnExecOptions) *roachpb.Error) *roachpb.Error {
	// Run fn in a retry loop until we encounter a success or
	// error condition this loop isn't capable of handling.
	var pErr *roachpb.Error
	var retryOptions retry.Options
	if txn == nil && (opt.AutoRetry || opt.AutoCommit) {
		panic("asked to retry  or commit a txn that is already aborted")
	}
	if opt.AutoRetry {
		retryOptions = txn.db.txnRetryOptions
	}
RetryLoop:
	for r := retry.Start(retryOptions); r.Next(); {
		pErr = fn(txn, &opt)
		if (pErr == nil) && opt.AutoCommit && (txn.Proto.Status == roachpb.PENDING) {
			// fn succeeded, but didn't commit.
			pErr = txn.commit(nil)
		}

		if pErr == nil {
			break
		}

		// Make sure the txn record that pErr carries is for this txn.
		// We check only when txn.Proto.ID has been initialized after an initial successful send.
		if pErr.GetTxn() != nil && txn.Proto.ID != nil {
			if errTxn := pErr.GetTxn(); !errTxn.Equal(&txn.Proto) {
				return roachpb.NewErrorf("mismatching transaction record in the error:\n%s\nv.s.\n%s",
					errTxn, txn.Proto)
			}
		}

		if !opt.AutoRetry {
			break RetryLoop
		}
		switch pErr.TransactionRestart {
		case roachpb.TransactionRestart_IMMEDIATE:
			r.Reset()
		case roachpb.TransactionRestart_BACKOFF:
		default:
			break RetryLoop
		}
		if log.V(2) {
			log.Infof("automatically retrying transaction: %s because of error: %s",
				txn.DebugName(), pErr)
		}
	}
	if txn != nil {
		// TODO(andrei): don't do Cleanup() on retriable errors here.
		// Let the sql executor do it.
		txn.Cleanup(pErr)
	}
	if pErr != nil {
		pErr.StripErrorTransaction()
	}
	return pErr
}
Example #24
// execStmts parses and executes the given SQL statements, recording each
// statement's result (or the error it produced) in the returned Response.
func (e *Executor) execStmts(sql string, planMaker *planner) Response {
	var resp Response
	stmts, err := planMaker.parser.Parse(sql, parser.Syntax(planMaker.session.Syntax))
	if err != nil {
		// A parse error occurred: we can't determine if there were multiple
		// statements or only one, so just pretend there was one.
		resp.Results = append(resp.Results, makeResultFromError(planMaker, roachpb.NewError(err)))
		return resp
	}
	for _, stmt := range stmts {
		result, err := e.execStmt(stmt, planMaker)
		if err != nil {
			result = makeResultFromError(planMaker, err)
		}
		// Release the leases once a transaction is complete.
		if planMaker.txn == nil {
			planMaker.releaseLeases(e.db)
			// Execute any schema changes that were scheduled.
			if len(planMaker.schemaChangers) > 0 &&
				// Disable execution in some tests.
				!disableSyncSchemaChangeExec {
				retryOpts := retry.Options{
					InitialBackoff: 20 * time.Millisecond,
					MaxBackoff:     200 * time.Millisecond,
					Multiplier:     2,
				}
				for _, sc := range planMaker.schemaChangers {
					sc.db = e.db
					for r := retry.Start(retryOpts); r.Next(); {
						if done, err := sc.IsDone(); err != nil {
							log.Warning(err)
							break
						} else if done {
							break
						}
						if pErr := sc.exec(); pErr != nil {
							if _, ok := pErr.GoError().(*roachpb.ExistingSchemaChangeLeaseError); ok {
								// Try again.
								continue
							}
							// All other errors can be reported.
							result = makeResultFromError(planMaker, pErr)
						}
						break
					}
				}
			}
		}
		resp.Results = append(resp.Results, result)
	}
	return resp
}
Example #25
// runHeartbeat sends periodic heartbeats to client, marking the client healthy
// or unhealthy and reconnecting appropriately until either the Client or the
// supplied channel is closed.
func (c *Client) runHeartbeat(retryOpts retry.Options, closer <-chan struct{}) {
	isHealthy := false
	setHealthy := func() {
		if isHealthy {
			return
		}
		isHealthy = true
		close(c.healthy.Load().(chan struct{}))
	}
	setUnhealthy := func() {
		if isHealthy {
			isHealthy = false
			c.healthy.Store(make(chan struct{}))
		}
	}

	var err = errUnstarted // initial condition
	for {
		for r := retry.Start(retryOpts); r.Next(); {
			// Reconnect on failure.
			if err != nil {
				if err = c.connect(); err != nil {
					setUnhealthy()
					log.Warning(err)
					continue
				}
			}

			// Heartbeat regardless of failure.
			if err = c.heartbeat(); err != nil {
				setUnhealthy()
				log.Warning(err)
				continue
			}

			setHealthy()
			break
		}

		// Wait after the heartbeat so that the first iteration gets a wait-free
		// heartbeat attempt.
		select {
		case <-closer:
			c.Close()
			return
		case <-c.Closed:
			return
		case <-time.After(heartbeatInterval):
			// TODO(tamird): Perhaps retry more aggressively when the client is unhealthy.
		}
	}
}
Example #26
// runHeartbeat sends periodic heartbeats to client, marking the client healthy
// or unhealthy and reconnecting appropriately until either the Client or the
// supplied channel is closed.
func (c *Client) runHeartbeat(retryOpts retry.Options, closer <-chan struct{}) {
	isHealthy := false
	setHealthy := func() {
		if isHealthy {
			return
		}
		isHealthy = true
		close(c.healthy.Load().(chan struct{}))
	}
	setUnhealthy := func() {
		if isHealthy {
			isHealthy = false
			c.healthy.Store(make(chan struct{}))
		}
	}

	connErr := errUnstarted // initial condition
	var beatErr error
	for {
		for r := retry.Start(retryOpts); r.Next(); {
			// Reconnect if connection failed or heartbeat error is not
			// definitely temporary.
			if netErr, ok := beatErr.(net.Error); connErr != nil || beatErr != nil && !(ok && netErr.Temporary()) {
				if connErr = c.connect(); connErr != nil {
					log.Warning(connErr)
					setUnhealthy()
					continue
				}
			}

			if beatErr = c.heartbeat(); beatErr == nil {
				setHealthy()
				break
			} else {
				log.Warning(beatErr)
				setUnhealthy()
			}
		}
		// Wait after the heartbeat so that the first iteration gets a wait-free
		// heartbeat attempt.
		select {
		case <-closer:
			c.Close()
			return
		case <-c.Closed:
			return
		case <-time.After(heartbeatInterval):
			// TODO(tamird): Perhaps retry more aggressively when the client is unhealthy.
		}
	}
}
Example #27
// TestRetryableError verifies that Send returns a retryable error
// when it hits an RPC error.
func TestRetryableError(t *testing.T) {
	defer leaktest.AfterTest(t)()

	clientStopper := stop.NewStopper()
	defer clientStopper.Stop()
	clientContext := newNodeTestContext(nil, clientStopper)
	clientContext.HeartbeatTimeout = 10 * clientContext.HeartbeatInterval

	serverStopper := stop.NewStopper()
	serverContext := newNodeTestContext(nil, serverStopper)

	s, ln := newTestServer(t, serverContext)
	registerBatch(t, s, 0)

	c := rpc.NewClient(ln.Addr(), clientContext)
	// Wait until the client becomes healthy and shut down the server.
	<-c.Healthy()
	serverStopper.Stop()
	// Wait until the client becomes unhealthy.
	func() {
		for r := retry.Start(retry.Options{}); r.Next(); {
			select {
			case <-c.Healthy():
			case <-time.After(1 * time.Nanosecond):
				return
			}
		}
	}()

	sp := tracing.NewTracer().StartSpan("node test")
	defer sp.Finish()

	opts := SendOptions{
		Ordering:        orderStable,
		SendNextTimeout: 100 * time.Millisecond,
		Timeout:         100 * time.Millisecond,
		Trace:           sp,
	}
	if _, err := sendBatch(opts, []net.Addr{ln.Addr()}, clientContext); err != nil {
		retryErr, ok := err.(retry.Retryable)
		if !ok {
			t.Fatalf("Unexpected error type: %v", err)
		}
		if !retryErr.CanRetry() {
			t.Errorf("Expected retryable error: %v", retryErr)
		}
	} else {
		t.Fatalf("Unexpected success")
	}
}
Example #28
func (ia *idAllocator) start() {
	ia.stopper.RunWorker(func() {
		defer close(ia.ids)

		for {
			var newValue int64
			for newValue <= int64(ia.minID) {
				var (
					err error
					res client.KeyValue
				)
				for r := retry.Start(idAllocationRetryOpts); r.Next(); {
					if ia.stopper.StartTask() {
						idKey := ia.idKey.Load().(proto.Key)
						res, err = ia.db.Inc(idKey, int64(ia.blockSize))
						ia.stopper.FinishTask()

						if err == nil {
							newValue = res.ValueInt()
							break
						}

						log.Warningf("unable to allocate %d ids from %s: %s", ia.blockSize, idKey, err)
					} else {
						return
					}
				}
				if err != nil {
					panic(fmt.Sprintf("unexpectedly exited id allocation retry loop: %s", err))
				}
			}

			end := newValue + 1
			start := end - int64(ia.blockSize)

			if start < int64(ia.minID) {
				start = int64(ia.minID)
			}

			// Add all new ids to the channel for consumption.
			for i := start; i < end; i++ {
				select {
				case ia.ids <- uint32(i):
				case <-ia.stopper.ShouldStop():
					return
				}
			}
		}
	})
}
Example #29
// execSchemaChanges releases schema leases and runs the queued
// schema changers. This needs to be run after the transaction
// scheduling the schema change has finished.
//
// The list of closures is cleared after (attempting) execution.
//
// Args:
//  results: The results from all statements in the group that scheduled the
//    schema changes we're about to execute. Results corresponding to the
//    schema change statements will be changed in case an error occurs.
func (scc *schemaChangerCollection) execSchemaChanges(
	e *Executor, planMaker *planner, results ResultList,
) {
	if planMaker.txn != nil {
		panic("trying to execute schema changes while still in a transaction")
	}
	// Release the leases once a transaction is complete.
	planMaker.releaseLeases()
	if e.ctx.TestingKnobs.SyncSchemaChangersFilter != nil {
		e.ctx.TestingKnobs.SyncSchemaChangersFilter(TestingSchemaChangerCollection{scc})
	}
	// Execute any schema changes that were scheduled, in the order of the
	// statements that scheduled them.
	for _, scEntry := range scc.schemaChangers {
		sc := &scEntry.sc
		sc.db = *e.ctx.DB
		for r := retry.Start(base.DefaultRetryOptions()); r.Next(); {
			if done, err := sc.IsDone(); err != nil {
				log.Warning(e.ctx.Context, err)
				break
			} else if done {
				break
			}
			if err := sc.exec(
				e.ctx.TestingKnobs.SchemaChangersStartBackfillNotification,
				e.ctx.TestingKnobs.SyncSchemaChangersRenameOldNameNotInUseNotification,
			); err != nil {
				if isSchemaChangeRetryError(err) {
					// Try again
					continue
				}
				// All other errors can be reported; we report it as the result
				// corresponding to the statement that enqueued this changer.
				// There's some sketchiness here: we assume there's a single result
				// per statement and we clobber the result/error of the corresponding
				// statement.
				// There's also another subtlety: we can only report results for
				// statements in the current batch; we can't modify the results of older
				// statements.
				if scEntry.epoch == scc.curGroupNum {
					results[scEntry.idx] = Result{Err: err}
				}
				log.Warningf(e.ctx.Context, "Error executing schema change: %s", err)
			}
			break
		}
	}
	scc.schemaChangers = scc.schemaChangers[:0]
}
Example #30
// connect dials the connection in a backoff/retry loop.
func (c *Client) connect(opts *retry.Options, context *Context) error {
	// Attempt to dial connection.
	retryOpts := clientRetryOptions
	if opts != nil {
		retryOpts = *opts
	}
	retryOpts.Stopper = context.Stopper

	for r := retry.Start(retryOpts); r.Next(); {
		tlsConfig, err := context.GetClientTLSConfig()
		if err != nil {
			// Problem loading the TLS config. Retrying will not help.
			return err
		}

		conn, err := tlsDialHTTP(c.addr.Network(), c.addr.String(), tlsConfig)
		if err != nil {
			// Retry if the error is temporary, otherwise fail fast.
			if t, ok := err.(net.Error); ok && t.Temporary() {
				if log.V(1) {
					log.Warning(err)
				}
				continue
			}
			return err
		}

		c.mu.Lock()
		c.Client = rpc.NewClientWithCodec(codec.NewClientCodec(conn))
		c.lAddr = conn.LocalAddr()
		c.mu.Unlock()
		if c.lAddr == nil {
			return errClosed
		}

		// Ensure at least one heartbeat succeeds before exiting the
		// retry loop. If it fails, don't retry: The node is probably
		// dead.
		if err = c.heartbeat(); err != nil {
			return err
		}

		return nil
	}

	return util.Errorf("system is stopping")
}
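Example #30 retries only errors that the net package reports as temporary and fails fast on everything else, which keeps the loop from spinning on a dead peer. A compact sketch of that classification inside a retry loop follows; dial is a hypothetical connection factory, and termination is, as above, up to the caller's Options.
package example

import (
	"net"

	// Import path assumed for the retry package used throughout these examples.
	"github.com/cockroachdb/cockroach/util/retry"
)

// dialWithRetry retries dial only when the error is a net.Error marked
// temporary and fails fast on anything else, mirroring Example #30.
// dial is a hypothetical connection factory supplied by the caller.
func dialWithRetry(opts retry.Options, dial func() (net.Conn, error)) (net.Conn, error) {
	var lastErr error
	for r := retry.Start(opts); r.Next(); {
		conn, err := dial()
		if err == nil {
			return conn, nil
		}
		if t, ok := err.(net.Error); ok && t.Temporary() {
			lastErr = err // transient: back off and try again
			continue
		}
		return nil, err // permanent failure: retrying will not help
	}
	// Reached only if opts stop the retries (e.g. a Closer channel fires).
	return nil, lastErr
}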