Beispiel #1
0
// maybeWarnAboutInit repeatedly checks for symptoms of a cluster that
// was never initialized and logs a warning while they persist. A node
// cannot tell for certain whether it is merely waiting to be
// bootstrapped into an existing cluster or whether the operator
// forgot to run "cockroach init", so a warning is the most it can do.
//
// The warning is emitted when every gossip bootstrap host is
// connected and this node is itself a bootstrap host, yet no sentinel
// gossip is present. The loop terminates as soon as the sentinel is
// seen.
func (g *Gossip) maybeWarnAboutInit() {
	// Initial delay before the first check.
	time.Sleep(5 * time.Second)

	opts := util.RetryOptions{
		Tag:         "check cluster initialization",
		Backoff:     5 * time.Second,  // initial backoff
		MaxBackoff:  60 * time.Second, // backoff ceiling
		Constant:    2,                // backoff doubles each round
		MaxAttempts: 0,                // retry forever
	}

	check := func() (bool, error) {
		// Read both conditions while holding the gossip mutex.
		g.mu.Lock()
		sentinelSeen := g.is.getInfo(KeySentinel) != nil
		bootstrapsConnected := g.filterExtant(g.bootstraps).len() == 0
		g.mu.Unlock()

		switch {
		case sentinelSeen:
			// The sentinel arrived; nothing left to warn about.
			return true, nil
		case bootstrapsConnected && g.isBootstrap:
			// Fully connected bootstrap host without a sentinel.
			glog.Warningf("connected to gossip but missing sentinel. Has the cluster been initialized? " +
				"Use \"cockroach init\" to initialize.")
		}
		return false, nil
	}
	util.RetryWithBackoff(opts, check)
}
Beispiel #2
0
// NewClient returns a client RPC stub for the specified address
// (usually a TCP host:port, but for testing may be a unix domain
// socket). The process-wide client RPC cache is consulted first; if
// the requested client is not present, it is created and the cache is
// updated. Specify opts to fine tune client connection behavior, or
// pass nil to use clientRetryOptions.
//
// The returned client is not necessarily connected yet: dialing and
// the first heartbeat run in a background goroutine. The Client.Ready
// channel is closed after the client has connected and completed one
// successful heartbeat. The Closed channel is closed if the client
// fails to connect or if the client's Close() method is invoked.
func NewClient(addr net.Addr, opts *util.RetryOptions) *Client {
	clientMu.Lock()
	if c, ok := clients[addr.String()]; ok {
		clientMu.Unlock()
		return c
	}
	c := &Client{
		addr:   addr,
		Ready:  make(chan struct{}),
		Closed: make(chan struct{}),
	}
	clients[c.Addr().String()] = c
	clientMu.Unlock()

	// Copy the retry options so that setting the tag below does not
	// modify the caller's struct or the package defaults.
	retryOpts := clientRetryOptions
	if opts != nil {
		retryOpts = *opts
	}
	retryOpts.Tag = fmt.Sprintf("client %s connection", addr)

	// Attempt to dial the connection in the background.
	go func() {
		err := util.RetryWithBackoff(retryOpts, func() (bool, error) {
			// TODO(spencer): use crypto.tls.
			conn, err := net.Dial(addr.Network(), addr.String())
			if err != nil {
				// Dial failures are logged and retried.
				log.Info(err)
				return false, nil
			}
			c.mu.Lock()
			c.Client = rpc.NewClient(conn)
			c.lAddr = conn.LocalAddr()
			c.mu.Unlock()

			// Ensure at least one heartbeat succeeds before exiting the
			// retry loop. A failed heartbeat closes the client and
			// reports the error to the retry loop.
			if err = c.heartbeat(); err != nil {
				c.Close()
				return false, err
			}

			// Signal client is ready by closing Ready channel.
			log.Infof("client %s connected", addr)
			close(c.Ready)

			// Launch periodic heartbeat.
			go c.startHeartbeat()

			return true, nil
		})
		if err != nil {
			// Include the cause so the failure can be diagnosed from the log.
			log.Errorf("client %s failed to connect: %v", addr, err)
			c.Close()
		}
	}()

	return c
}
Beispiel #3
0
// routeRPC checks permissions, resolves the range responsible for
// header.Key and dispatches the RPC to that range's replicas. The
// lookup and send run in a background goroutine; the caller gets back
// a channel (buffer size 1) whose element type is the type of
// "reply". If permissions fail, or the retry loop ends with an error,
// the error is stored in reply's "Error" field and reply is sent on
// the channel.
func (db *DistDB) routeRPC(method string, header *storage.RequestHeader, args, reply interface{}) interface{} {
	replyChan := reflect.MakeChan(reflect.ChanOf(reflect.BothDir, reflect.TypeOf(reply)), 1)

	// fail records err in reply's Error field and delivers reply.
	fail := func(err error) {
		rv := reflect.ValueOf(reply)
		reflect.Indirect(rv).FieldByName("Error").Set(reflect.ValueOf(err))
		replyChan.Send(rv)
	}

	// Permission failures are reported immediately.
	if err := db.verifyPermissions(method, header); err != nil {
		fail(err)
		return replyChan.Interface()
	}

	// Look up the range by key and send to its replicas, retrying
	// with backoff.
	go func() {
		opts := util.RetryOptions{
			Tag:         fmt.Sprintf("routing %s rpc", method),
			Backoff:     retryBackoff,
			MaxBackoff:  maxRetryBackoff,
			Constant:    2,
			MaxAttempts: 0, // retry indefinitely
		}
		attempt := func() (bool, error) {
			rangeMeta, err := db.rangeCache.LookupRangeMetadata(header.Key)
			if err == nil {
				err = db.sendRPC(rangeMeta.Replicas, method, args, replyChan.Interface())
			}
			if err == nil {
				return true, nil
			}

			// The cached range metadata may be stale - drop it.
			db.rangeCache.EvictCachedRangeMetadata(header.Key)

			// Retryable errors go around the loop again.
			if retryErr, ok := err.(util.Retryable); ok && retryErr.CanRetry() {
				glog.Warningf("failed to invoke %s: %v", method, err)
				return false, nil
			}
			// TODO(mtracy): Make sure that errors that clearly result from
			// a stale metadata cache are retryable.
			return true, err
		}
		if err := util.RetryWithBackoff(opts, attempt); err != nil {
			fail(err)
		}
	}()

	return replyChan.Interface()
}
Beispiel #4
0
// routeRPC checks permissions, resolves the range responsible for
// header.Key and dispatches the RPC to that range's replicas. The
// lookup and send run in a background goroutine; the caller gets back
// a channel (buffer size 1) whose element type is the type of
// "reply". If permissions fail, or the retry loop ends with an error,
// the error is stored in reply's "Error" field and reply is sent on
// the channel.
func (db *DistDB) routeRPC(method string, header *storage.RequestHeader, args, reply interface{}) interface{} {
	replyChan := reflect.MakeChan(reflect.ChanOf(reflect.BothDir, reflect.TypeOf(reply)), 1)

	// fail records err in reply's Error field and delivers reply.
	fail := func(err error) {
		rv := reflect.ValueOf(reply)
		reflect.Indirect(rv).FieldByName("Error").Set(reflect.ValueOf(err))
		replyChan.Send(rv)
	}

	// Permission failures are reported immediately.
	if err := db.verifyPermissions(method, header); err != nil {
		fail(err)
		return replyChan.Interface()
	}

	// Look up the range by key and send to its replicas, retrying
	// with backoff.
	go func() {
		opts := util.RetryOptions{
			Tag:         fmt.Sprintf("routing %s rpc", method),
			Backoff:     retryBackoff,
			MaxBackoff:  maxRetryBackoff,
			Constant:    2,
			MaxAttempts: 0, // retry indefinitely
		}
		attempt := func() (bool, error) {
			rangeMeta, err := db.lookupRangeMetadata(header.Key)
			if err == nil {
				err = db.sendRPC(rangeMeta.Replicas, method, args, replyChan.Interface())
			}
			if err == nil {
				return true, nil
			}

			// Retryable errors go around the loop again.
			if retryErr, ok := err.(util.Retryable); ok && retryErr.CanRetry() {
				glog.Warningf("failed to invoke %s: %v", method, err)
				return false, nil
			}
			// TODO(spencer): check error here; we need to clear this
			// segment of range cache and retry if the range wasn't found.
			return true, err
		}
		if err := util.RetryWithBackoff(opts, attempt); err != nil {
			fail(err)
		}
	}()

	return replyChan.Interface()
}
Beispiel #5
0
// Send delivers call to Cockroach through an HTTP post, retrying with
// backoff according to HTTPRetryOptions. Responses carrying a
// retryable HTTP status code are retried. Failures to send the
// request or read its response are also retried, reusing the same
// client command ID: the command may in fact have executed, and
// retrying with the same ID lets us eventually receive the cached
// response instead of reporting a spurious failure. Any error that
// ends the retry loop is stored in the call's reply header.
func (s *HTTPSender) Send(call *Call) {
	opts := HTTPRetryOptions
	opts.Tag = fmt.Sprintf("http %s", call.Method)

	attempt := func() (util.RetryStatus, error) {
		resp, err := s.post(call)
		if err == nil {
			// A successful post ends the retry loop.
			return util.RetryBreak, nil
		}

		if resp != nil {
			log.Warningf("failed to send HTTP request with status code %d", resp.StatusCode)
			// Decide on a retry from the HTTP status code.
			switch resp.StatusCode {
			case http.StatusServiceUnavailable, http.StatusGatewayTimeout, StatusTooManyRequests:
				// Service unavailable, gateway timeout and too many
				// requests are worth another attempt.
				// TODO(spencer): consider respecting the Retry-After header for
				// backoff / retry duration.
				return util.RetryContinue, nil
			}
			// Every other status code is unrecoverable.
			return util.RetryBreak, err
		}

		if sendErr, ok := err.(*httpSendError); ok {
			// Treat every send error as retryable. Far too many things
			// can go wrong to enumerate, and we must not miss one that
			// ought to be retried with the same client command ID. The
			// warning gives visibility that this is happening; some of
			// the errors caught here should not be retried, but there is
			// no reliable way to tell which.
			log.Warningf("failed to send HTTP request or read its response: %s", sendErr)
			return util.RetryContinue, nil
		}

		// Retrying cannot recover from this error. Propagate.
		return util.RetryBreak, err
	}

	if err := util.RetryWithBackoff(opts, attempt); err != nil {
		call.Reply.Header().SetGoError(err)
	}
}
Beispiel #6
0
// ExecuteCmd prefixes method with "Node.", verifies permissions,
// looks up the range metadata for the request key and sends the RPC,
// retrying with backoff while the failure is retryable. If the
// permission check fails, or the retry loop ends with an error, that
// error is passed to sendErrorReply together with replyChan.
func (kv *DistKV) ExecuteCmd(method string, args proto.Request, replyChan interface{}) {
	// Augment method with "Node." prefix.
	method = "Node." + method

	// Reject the command up front if permissions do not allow it.
	if err := kv.verifyPermissions(method, args.Header()); err != nil {
		sendErrorReply(err, replyChan)
		return
	}

	opts := util.RetryOptions{
		Tag:         fmt.Sprintf("routing %s rpc", method),
		Backoff:     retryBackoff,
		MaxBackoff:  maxRetryBackoff,
		Constant:    2,
		MaxAttempts: 0, // retry indefinitely
	}

	// attempt performs one lookup-and-send round.
	attempt := func() (bool, error) {
		desc, err := kv.rangeCache.LookupRangeMetadata(args.Header().Key)
		if err == nil {
			err = kv.sendRPC(desc, method, args, replyChan)
		}
		if err == nil {
			return true, nil
		}

		// The cached range metadata may be stale - drop it.
		kv.rangeCache.EvictCachedRangeMetadata(args.Header().Key)

		// Retryable errors go around the loop again.
		if retryErr, ok := err.(util.Retryable); ok && retryErr.CanRetry() {
			log.Warningf("failed to invoke %s: %v", method, err)
			return false, nil
		}
		return true, err
	}

	if err := util.RetryWithBackoff(opts, attempt); err != nil {
		sendErrorReply(err, replyChan)
	}
}
Beispiel #7
0
// routeRPCInternal verifies permissions for method and then, in a
// background goroutine, looks up the range metadata for the request
// key and sends the RPC to that range's replicas, retrying with
// backoff while the failure is retryable. If the permission check
// fails, or the retry loop ends with an error, that error is passed
// to sendErrorReply together with replyChan.
func (db *DistDB) routeRPCInternal(method string, args storage.Request, replyChan interface{}) {
	// Reject the call up front if permissions do not allow it.
	if err := db.verifyPermissions(method, args.Header()); err != nil {
		sendErrorReply(err, replyChan)
		return
	}

	go func() {
		opts := util.RetryOptions{
			Tag:         fmt.Sprintf("routing %s rpc", method),
			Backoff:     retryBackoff,
			MaxBackoff:  maxRetryBackoff,
			Constant:    2,
			MaxAttempts: 0, // retry indefinitely
		}

		// attempt performs one lookup-and-send round.
		attempt := func() (bool, error) {
			rangeMeta, err := db.rangeCache.LookupRangeMetadata(args.Header().Key)
			if err == nil {
				err = db.sendRPC(rangeMeta.Replicas, method, args, replyChan)
			}
			if err == nil {
				return true, nil
			}

			// The cached range metadata may be stale - drop it.
			db.rangeCache.EvictCachedRangeMetadata(args.Header().Key)

			// Retryable errors go around the loop again.
			if retryErr, ok := err.(util.Retryable); ok && retryErr.CanRetry() {
				log.Warningf("failed to invoke %s: %v", method, err)
				return false, nil
			}
			return true, err
		}

		if err := util.RetryWithBackoff(opts, attempt); err != nil {
			sendErrorReply(err, replyChan)
		}
	}()
}
Beispiel #8
0
// RunTransaction executes retryable in the context of a distributed
// transaction. The transaction is automatically aborted if retryable
// returns any error aside from recoverable internal errors, and is
// automatically committed otherwise. retryable should have no side
// effects which could cause problems in the event it must be run more
// than once. The opts struct contains transaction settings; opts.Name
// is used as the retry tag, so opts must not be nil.
//
// Calling RunTransaction on the transactional KV client which is
// supplied to the retryable function is an error.
//
// RunTransaction returns nil on success. Otherwise it returns the
// error that ended the retry loop, whether or not the transaction had
// already ended at that point. An abort is sent only when the
// transaction has not already ended.
func (kv *KV) RunTransaction(opts *TransactionOptions, retryable func(txn *KV) error) error {
	if _, ok := kv.sender.(*txnSender); ok {
		return util.Errorf("cannot invoke RunTransaction on an already-transactional client")
	}

	// Create a new KV for the transaction using a transactional KV sender.
	sender := newTxnSender(kv.Sender(), opts)
	txnKV := NewKV(sender, kv.clock)
	txnKV.User = kv.User
	txnKV.UserPriority = kv.UserPriority
	defer txnKV.Close()

	// Run retryable in a retry loop until we encounter a success or
	// error condition this loop isn't capable of handling.
	retryOpts := TxnRetryOptions
	retryOpts.Tag = opts.Name
	err := util.RetryWithBackoff(retryOpts, func() (util.RetryStatus, error) {
		sender.txnEnd = false // always reset before [re]starting txn
		err := retryable(txnKV)
		if err == nil && !sender.txnEnd {
			// If there were no errors running retryable, commit the txn. This
			// may block waiting for outstanding writes to complete in case
			// retryable didn't -- we need the most recent of all response
			// timestamps in order to commit.
			etArgs := &proto.EndTransactionRequest{Commit: true}
			etReply := &proto.EndTransactionResponse{}
			// Prepare and flush for end txn in order to execute entire txn in
			// a single round trip if possible.
			txnKV.Prepare(proto.EndTransaction, etArgs, etReply)
			err = txnKV.Flush()
		}
		switch err.(type) {
		case *proto.ReadWithinUncertaintyIntervalError:
			// Retry immediately on read within uncertainty interval.
			return util.RetryReset, nil
		case *proto.TransactionAbortedError:
			// If the transaction was aborted, the txnSender will have created
			// a new txn. We allow backoff/retry in this case.
			return util.RetryContinue, nil
		case *proto.TransactionPushError:
			// Backoff and retry on failure to push a conflicting transaction.
			return util.RetryContinue, nil
		case *proto.TransactionRetryError:
			// Return RetryReset for an immediate retry (as in the case of
			// an SSI txn whose timestamp was pushed).
			return util.RetryReset, nil
		default:
			// For all other cases, finish retry loop, returning possible error.
			return util.RetryBreak, err
		}
	})

	// Abort the transaction on failure, unless it has already ended.
	if err != nil && !sender.txnEnd {
		etArgs := &proto.EndTransactionRequest{Commit: false}
		etReply := &proto.EndTransactionResponse{}
		txnKV.Call(proto.EndTransaction, etArgs, etReply)
		if etReply.Header().GoError() != nil {
			log.Errorf("failure aborting transaction: %s; abort caused by: %s", etReply.Header().GoError(), err)
		}
	}
	// Always surface the loop's error. Previously it was dropped (nil was
	// returned) when the transaction had already ended.
	return err
}