// maybeWarnAboutInit looks for signs indicating a cluster which // hasn't been initialized and warns. There's no absolutely sure way // to determine whether the current node is simply waiting to be // bootstrapped to an existing cluster vs. the operator having failed // to initialize the cluster via the "cockroach init" command, so // we can only warn. // // This method checks whether all gossip bootstrap hosts are // connected, and whether the node itself is a bootstrap host, but // there is still no sentinel gossip. func (g *Gossip) maybeWarnAboutInit() { time.Sleep(5 * time.Second) retryOptions := util.RetryOptions{ Tag: "check cluster initialization", Backoff: 5 * time.Second, // first backoff at 5s MaxBackoff: 60 * time.Second, // max backoff is 60s Constant: 2, // doubles MaxAttempts: 0, // indefinite retries } util.RetryWithBackoff(retryOptions, func() (bool, error) { g.mu.Lock() hasSentinel := g.is.getInfo(KeySentinel) != nil allConnected := g.filterExtant(g.bootstraps).len() == 0 g.mu.Unlock() // If we have the sentinel, exit the retry loop. if hasSentinel { return true, nil } // Otherwise, if all bootstrap hosts are connected and this // node is a bootstrap host, warn. if allConnected && g.isBootstrap { glog.Warningf("connected to gossip but missing sentinel. Has the cluster been initialized? " + "Use \"cockroach init\" to initialize.") } return false, nil }) }
// NewClient returns a client RPC stub for the specified address // (usually a TCP host:port, but for testing may be a unix domain // socket). The process-wide client RPC cache is consulted first; if // the requested client is not present, it's created and the cache is // updated. Specify opts to fine tune client connection behavior or // nil to use defaults (i.e. indefinite retries with exponential // backoff). // // The Client.Ready channel is closed after the client has connected // and completed one successful heartbeat. The Closed channel is // closed if the client fails to connect or if the client's Close() // method is invoked. func NewClient(addr net.Addr, opts *util.RetryOptions) *Client { clientMu.Lock() if c, ok := clients[addr.String()]; ok { clientMu.Unlock() return c } c := &Client{ addr: addr, Ready: make(chan struct{}), Closed: make(chan struct{}), } clients[c.Addr().String()] = c clientMu.Unlock() // Attempt to dial connection. retryOpts := clientRetryOptions if opts != nil { retryOpts = *opts } retryOpts.Tag = fmt.Sprintf("client %s connection", addr) go func() { err := util.RetryWithBackoff(retryOpts, func() (bool, error) { // TODO(spencer): use crypto.tls. conn, err := net.Dial(addr.Network(), addr.String()) if err != nil { log.Info(err) return false, nil } c.mu.Lock() c.Client = rpc.NewClient(conn) c.lAddr = conn.LocalAddr() c.mu.Unlock() // Ensure at least one heartbeat succeeds before exiting the // retry loop. if err = c.heartbeat(); err != nil { c.Close() return false, err } // Signal client is ready by closing Ready channel. log.Infof("client %s connected", addr) close(c.Ready) // Launch periodic heartbeat. go c.startHeartbeat() return true, nil }) if err != nil { log.Errorf("client %s failed to connect", addr) c.Close() } }() return c }
// routeRPC verifies permissions and looks up the appropriate range // based on the supplied key and sends the RPC according to the // specified options. routeRPC sends asynchronously and returns a // channel which receives the reply struct when the call is // complete. Returns a channel of the same type as "reply". func (db *DistDB) routeRPC(method string, header *storage.RequestHeader, args, reply interface{}) interface{} { chanVal := reflect.MakeChan(reflect.ChanOf(reflect.BothDir, reflect.TypeOf(reply)), 1) // Verify permissions. if err := db.verifyPermissions(method, header); err != nil { replyVal := reflect.ValueOf(reply) reflect.Indirect(replyVal).FieldByName("Error").Set(reflect.ValueOf(err)) chanVal.Send(replyVal) return chanVal.Interface() } // Retry logic for lookup of range by key and RPCs to range replicas. go func() { retryOpts := util.RetryOptions{ Tag: fmt.Sprintf("routing %s rpc", method), Backoff: retryBackoff, MaxBackoff: maxRetryBackoff, Constant: 2, MaxAttempts: 0, // retry indefinitely } err := util.RetryWithBackoff(retryOpts, func() (bool, error) { rangeMeta, err := db.rangeCache.LookupRangeMetadata(header.Key) if err == nil { err = db.sendRPC(rangeMeta.Replicas, method, args, chanVal.Interface()) } if err != nil { // Range metadata might be out of date - evict it. db.rangeCache.EvictCachedRangeMetadata(header.Key) // If retryable, allow outer loop to retry. if retryErr, ok := err.(util.Retryable); ok && retryErr.CanRetry() { glog.Warningf("failed to invoke %s: %v", method, err) return false, nil } // TODO(mtracy): Make sure that errors that clearly result from // a stale metadata cache are retryable. } return true, err }) if err != nil { replyVal := reflect.ValueOf(reply) reflect.Indirect(replyVal).FieldByName("Error").Set(reflect.ValueOf(err)) chanVal.Send(replyVal) } }() return chanVal.Interface() }
// routeRPC verifies permissions and looks up the appropriate range // based on the supplied key and sends the RPC according to the // specified options. routeRPC sends asynchronously and returns a // channel which receives the reply struct when the call is // complete. Returns a channel of the same type as "reply". func (db *DistDB) routeRPC(method string, header *storage.RequestHeader, args, reply interface{}) interface{} { chanVal := reflect.MakeChan(reflect.ChanOf(reflect.BothDir, reflect.TypeOf(reply)), 1) // Verify permissions. if err := db.verifyPermissions(method, header); err != nil { replyVal := reflect.ValueOf(reply) reflect.Indirect(replyVal).FieldByName("Error").Set(reflect.ValueOf(err)) chanVal.Send(replyVal) return chanVal.Interface() } // Retry logic for lookup of range by key and RPCs to range replicas. go func() { retryOpts := util.RetryOptions{ Tag: fmt.Sprintf("routing %s rpc", method), Backoff: retryBackoff, MaxBackoff: maxRetryBackoff, Constant: 2, MaxAttempts: 0, // retry indefinitely } err := util.RetryWithBackoff(retryOpts, func() (bool, error) { rangeMeta, err := db.lookupRangeMetadata(header.Key) if err == nil { err = db.sendRPC(rangeMeta.Replicas, method, args, chanVal.Interface()) } if err != nil { // If retryable, allow outer loop to retry. if retryErr, ok := err.(util.Retryable); ok && retryErr.CanRetry() { glog.Warningf("failed to invoke %s: %v", method, err) return false, nil } // TODO(spencer): check error here; we need to clear this // segment of range cache and retry if the range wasn't found. } return true, err }) if err != nil { replyVal := reflect.ValueOf(reply) reflect.Indirect(replyVal).FieldByName("Error").Set(reflect.ValueOf(err)) chanVal.Send(replyVal) } }() return chanVal.Interface() }
// Send sends call to Cockroach via an HTTP post. HTTP response codes // which are retryable are retried with backoff in a loop using the // default retry options. Other errors sending HTTP request are // retried indefinitely using the same client command ID to avoid // reporting failure when in fact the command may have gone through // and been executed successfully. We retry here to eventually get // through with the same client command ID and be given the cached // response. func (s *HTTPSender) Send(call *Call) { retryOpts := HTTPRetryOptions retryOpts.Tag = fmt.Sprintf("http %s", call.Method) if err := util.RetryWithBackoff(retryOpts, func() (util.RetryStatus, error) { resp, err := s.post(call) if err != nil { if resp != nil { log.Warningf("failed to send HTTP request with status code %d", resp.StatusCode) // See if we can retry based on HTTP response code. switch resp.StatusCode { case http.StatusServiceUnavailable, http.StatusGatewayTimeout, StatusTooManyRequests: // Retry on service unavailable and request timeout. // TODO(spencer): consider respecting the Retry-After header for // backoff / retry duration. return util.RetryContinue, nil default: // Can't recover from all other errors. return util.RetryBreak, err } } switch t := err.(type) { case *httpSendError: // Assume all errors sending request are retryable. The actual // number of things that could go wrong is vast, but we don't // want to miss any which should in theory be retried with // the same client command ID. We log the error here as a // warning so there's visiblity that this is happening. Some of // the errors we'll sweep up in this net shouldn't be retried, // but we can't really know for sure which. log.Warningf("failed to send HTTP request or read its response: %s", t) return util.RetryContinue, nil default: // Can't retry in order to recover from this error. Propagate. return util.RetryBreak, err } } // On successful post, we're done with retry loop. return util.RetryBreak, nil }); err != nil { call.Reply.Header().SetGoError(err) } }
// ExecuteCmd verifies permissions and looks up the appropriate range // based on the supplied key and sends the RPC according to the // specified options. executeRPC sends asynchronously and returns a // response value on the replyChan channel when the call is complete. func (kv *DistKV) ExecuteCmd(method string, args proto.Request, replyChan interface{}) { // Augment method with "Node." prefix. method = "Node." + method // Verify permissions. if err := kv.verifyPermissions(method, args.Header()); err != nil { sendErrorReply(err, replyChan) return } // Retry logic for lookup of range by key and RPCs to range replicas. retryOpts := util.RetryOptions{ Tag: fmt.Sprintf("routing %s rpc", method), Backoff: retryBackoff, MaxBackoff: maxRetryBackoff, Constant: 2, MaxAttempts: 0, // retry indefinitely } err := util.RetryWithBackoff(retryOpts, func() (bool, error) { desc, err := kv.rangeCache.LookupRangeMetadata(args.Header().Key) if err == nil { err = kv.sendRPC(desc, method, args, replyChan) } if err != nil { // Range metadata might be out of date - evict it. kv.rangeCache.EvictCachedRangeMetadata(args.Header().Key) // If retryable, allow outer loop to retry. if retryErr, ok := err.(util.Retryable); ok && retryErr.CanRetry() { log.Warningf("failed to invoke %s: %v", method, err) return false, nil } } return true, err }) if err != nil { sendErrorReply(err, replyChan) } }
func (db *DistDB) routeRPCInternal(method string, args storage.Request, replyChan interface{}) { // Verify permissions. if err := db.verifyPermissions(method, args.Header()); err != nil { sendErrorReply(err, replyChan) return } // Retry logic for lookup of range by key and RPCs to range replicas. go func() { retryOpts := util.RetryOptions{ Tag: fmt.Sprintf("routing %s rpc", method), Backoff: retryBackoff, MaxBackoff: maxRetryBackoff, Constant: 2, MaxAttempts: 0, // retry indefinitely } err := util.RetryWithBackoff(retryOpts, func() (bool, error) { rangeMeta, err := db.rangeCache.LookupRangeMetadata(args.Header().Key) if err == nil { err = db.sendRPC(rangeMeta.Replicas, method, args, replyChan) } if err != nil { // Range metadata might be out of date - evict it. db.rangeCache.EvictCachedRangeMetadata(args.Header().Key) // If retryable, allow outer loop to retry. if retryErr, ok := err.(util.Retryable); ok && retryErr.CanRetry() { log.Warningf("failed to invoke %s: %v", method, err) return false, nil } } return true, err }) if err != nil { sendErrorReply(err, replyChan) } }() }
// RunTransaction executes retryable in the context of a distributed // transaction. The transaction is automatically aborted if retryable // returns any error aside from recoverable internal errors, and is // automatically committed otherwise. retryable should have no side // effects which could cause problems in the event it must be run more // than once. The opts struct contains transaction settings. // // Calling RunTransaction on the transactional KV client which is // supplied to the retryable function is an error. func (kv *KV) RunTransaction(opts *TransactionOptions, retryable func(txn *KV) error) error { if _, ok := kv.sender.(*txnSender); ok { return util.Errorf("cannot invoke RunTransaction on an already-transactional client") } // Create a new KV for the transaction using a transactional KV sender. txnSender := newTxnSender(kv.Sender(), opts) txnKV := NewKV(txnSender, kv.clock) txnKV.User = kv.User txnKV.UserPriority = kv.UserPriority defer txnKV.Close() // Run retryable in a retry loop until we encounter a success or // error condition this loop isn't capable of handling. retryOpts := TxnRetryOptions retryOpts.Tag = opts.Name if err := util.RetryWithBackoff(retryOpts, func() (util.RetryStatus, error) { txnSender.txnEnd = false // always reset before [re]starting txn err := retryable(txnKV) if err == nil && !txnSender.txnEnd { // If there were no errors running retryable, commit the txn. This // may block waiting for outstanding writes to complete in case // retryable didn't -- we need the most recent of all response // timestamps in order to commit. etArgs := &proto.EndTransactionRequest{Commit: true} etReply := &proto.EndTransactionResponse{} // Prepare and flush for end txn in order to execute entire txn in // a single round trip if possible. txnKV.Prepare(proto.EndTransaction, etArgs, etReply) err = txnKV.Flush() } switch t := err.(type) { case *proto.ReadWithinUncertaintyIntervalError: // Retry immediately on read within uncertainty interval. return util.RetryReset, nil case *proto.TransactionAbortedError: // If the transaction was aborted, the txnSender will have created // a new txn. We allow backoff/retry in this case. return util.RetryContinue, nil case *proto.TransactionPushError: // Backoff and retry on failure to push a conflicting transaction. return util.RetryContinue, nil case *proto.TransactionRetryError: // Return RetryReset for an immediate retry (as in the case of // an SSI txn whose timestamp was pushed). return util.RetryReset, nil default: // For all other cases, finish retry loop, returning possible error. return util.RetryBreak, t } }); err != nil && !txnSender.txnEnd { etArgs := &proto.EndTransactionRequest{Commit: false} etReply := &proto.EndTransactionResponse{} txnKV.Call(proto.EndTransaction, etArgs, etReply) if etReply.Header().GoError() != nil { log.Errorf("failure aborting transaction: %s; abort caused by: %s", etReply.Header().GoError(), err) } return err } return nil }