// Node handles GET requests for a single node's status.
func (s *statusServer) Node(
    ctx context.Context, req *serverpb.NodeRequest,
) (*status.NodeStatus, error) {
    ctx = s.AnnotateCtx(ctx)
    nodeID, _, err := s.parseNodeID(req.NodeId)
    if err != nil {
        return nil, grpc.Errorf(codes.InvalidArgument, err.Error())
    }

    key := keys.NodeStatusKey(nodeID)
    b := &client.Batch{}
    b.Get(key)
    if err := s.db.Run(ctx, b); err != nil {
        log.Error(ctx, err)
        return nil, grpc.Errorf(codes.Internal, err.Error())
    }

    var nodeStatus status.NodeStatus
    if err := b.Results[0].Rows[0].ValueProto(&nodeStatus); err != nil {
        err = errors.Errorf("could not unmarshal NodeStatus from %s: %s", key, err)
        log.Error(ctx, err)
        return nil, grpc.Errorf(codes.Internal, err.Error())
    }
    return &nodeStatus, nil
}
// Nodes returns all node statuses.
func (s *statusServer) Nodes(
    ctx context.Context, req *serverpb.NodesRequest,
) (*serverpb.NodesResponse, error) {
    ctx = s.AnnotateCtx(ctx)
    startKey := keys.StatusNodePrefix
    endKey := startKey.PrefixEnd()

    b := &client.Batch{}
    b.Scan(startKey, endKey)
    if err := s.db.Run(ctx, b); err != nil {
        log.Error(ctx, err)
        return nil, grpc.Errorf(codes.Internal, err.Error())
    }
    rows := b.Results[0].Rows

    resp := serverpb.NodesResponse{
        Nodes: make([]status.NodeStatus, len(rows)),
    }
    for i, row := range rows {
        if err := row.ValueProto(&resp.Nodes[i]); err != nil {
            log.Error(ctx, err)
            return nil, grpc.Errorf(codes.Internal, err.Error())
        }
    }
    return &resp, nil
}
func (ds *ServerImpl) setupFlow(
    ctx context.Context, req *SetupFlowRequest, simpleFlowConsumer RowReceiver,
) (*Flow, error) {
    sp, err := tracing.JoinOrNew(ds.AmbientContext.Tracer, req.TraceContext, "flow")
    if err != nil {
        return nil, err
    }
    ctx = opentracing.ContextWithSpan(ctx, sp)

    txn := ds.setupTxn(ctx, &req.Txn)

    flowCtx := FlowCtx{
        Context: ctx,
        id:      req.Flow.FlowID,
        evalCtx: &ds.evalCtx,
        rpcCtx:  ds.RPCContext,
        txn:     txn,
    }

    f := newFlow(flowCtx, ds.flowRegistry, simpleFlowConsumer)
    if err := f.setupFlow(&req.Flow); err != nil {
        log.Error(ctx, err)
        sp.Finish()
        return nil, err
    }
    return f, nil
}
func (c *v3Conn) finish(ctx context.Context) {
    // This is better than always flushing on error.
    if err := c.wr.Flush(); err != nil {
        log.Error(ctx, err)
    }
    _ = c.conn.Close()
}
func (ds *ServerImpl) setupFlow(
    ctx context.Context, req *SetupFlowRequest, syncFlowConsumer RowReceiver,
) (*Flow, error) {
    sp, err := tracing.JoinOrNew(ds.AmbientContext.Tracer, req.TraceContext, "flow")
    if err != nil {
        return nil, err
    }
    ctx = opentracing.ContextWithSpan(ctx, sp)

    // TODO(radu): we should sanity check some of these fields (especially
    // txnProto).
    flowCtx := FlowCtx{
        Context:  ctx,
        id:       req.Flow.FlowID,
        evalCtx:  &ds.evalCtx,
        rpcCtx:   ds.RPCContext,
        txnProto: &req.Txn,
        clientDB: ds.DB,
    }

    f := newFlow(flowCtx, ds.flowRegistry, syncFlowConsumer)
    if err := f.setupFlow(&req.Flow); err != nil {
        log.Error(ctx, err)
        sp.Finish()
        return nil, err
    }
    return f, nil
}
func (a *allocSim) maybeLogError(err error) {
    if localcluster.IsUnavailableError(err) {
        return
    }
    log.Error(context.Background(), err)
    atomic.AddUint64(&a.stats.errors, 1)
}
func (z *zeroSum) maybeLogError(err error) {
    if localcluster.IsUnavailableError(err) || strings.Contains(err.Error(), "range is frozen") {
        return
    }
    log.Error(context.Background(), err)
    atomic.AddUint64(&z.stats.errors, 1)
}
func (s *statusServer) handleVars(w http.ResponseWriter, r *http.Request) {
    w.Header().Set(httputil.ContentTypeHeader, httputil.PlaintextContentType)
    err := s.metricSource.PrintAsText(w)
    if err != nil {
        log.Error(r.Context(), err)
        http.Error(w, err.Error(), http.StatusInternalServerError)
    }
}
// FlowStream is part of the DistSQLServer interface.
func (ds *ServerImpl) FlowStream(stream DistSQL_FlowStreamServer) error {
    ctx := ds.AnnotateCtx(context.TODO())
    err := ds.flowStreamInt(stream)
    if err != nil {
        log.Error(ctx, err)
    }
    return err
}
// SetStorage provides an instance of the Storage interface
// for reading and writing gossip bootstrap data from persistent
// storage. This should be invoked as early in the lifecycle of a
// gossip instance as possible, but can be called at any time.
func (g *Gossip) SetStorage(storage Storage) error {
    ctx := g.AnnotateCtx(context.TODO())
    // Maintain lock ordering.
    var storedBI BootstrapInfo
    if err := storage.ReadBootstrapInfo(&storedBI); err != nil {
        log.Warningf(ctx, "failed to read gossip bootstrap info: %s", err)
    }

    g.mu.Lock()
    defer g.mu.Unlock()
    g.storage = storage

    // Merge the stored bootstrap info addresses with any we've become
    // aware of through gossip.
    existing := map[string]struct{}{}
    makeKey := func(a util.UnresolvedAddr) string { return fmt.Sprintf("%s,%s", a.Network(), a.String()) }
    for _, addr := range g.bootstrapInfo.Addresses {
        existing[makeKey(addr)] = struct{}{}
    }
    for _, addr := range storedBI.Addresses {
        // If the address is new, and isn't our own address, add it.
        if _, ok := existing[makeKey(addr)]; !ok && addr != g.mu.is.NodeAddr {
            g.maybeAddBootstrapAddress(addr)
        }
    }
    // Persist merged addresses.
    if numAddrs := len(g.bootstrapInfo.Addresses); numAddrs > len(storedBI.Addresses) {
        if err := g.storage.WriteBootstrapInfo(&g.bootstrapInfo); err != nil {
            log.Error(ctx, err)
        }
    }

    // Cycle through all persisted bootstrap hosts and add resolvers for
    // any which haven't already been added.
    newResolverFound := false
    for _, addr := range g.bootstrapInfo.Addresses {
        if !g.maybeAddResolver(addr) {
            continue
        }
        // If we find a new resolver, reset the resolver index so that the
        // next resolver we try is the first of the new resolvers.
        if !newResolverFound {
            newResolverFound = true
            g.resolverIdx = len(g.resolvers) - 1
        }
    }

    // If a new resolver was found, immediately signal bootstrap.
    if newResolverFound {
        if log.V(1) {
            log.Infof(ctx, "found new resolvers from storage; signalling bootstrap")
        }
        g.signalStalledLocked()
    }
    return nil
}
// DrainQueue locks the queue and processes the remaining queued replicas. It
// processes the replicas in the order they're queued in, one at a time.
// Exposed for testing only.
//
// TODO(bdarnell): this method may race with the call to bq.pop() in
// the main loop, in which case it does not guarantee that all
// replicas have been processed by the time it returns. This is most
// noticeable with ForceReplicaGCScanAndProcess, since the replica GC
// queue has many event-driven triggers. This should synchronize
// somehow with processLoop so we wait for anything being handled
// there to finish too. When that's done, the SucceedsSoon at the end
// of TestRemoveRangeWithoutGC (and perhaps others) can be replaced
// with a one-time check.
func (bq *baseQueue) DrainQueue(clock *hlc.Clock) {
    ctx := bq.AnnotateCtx(context.TODO())
    for repl := bq.pop(); repl != nil; repl = bq.pop() {
        annotatedCtx := repl.AnnotateCtx(ctx)
        if err := bq.processReplica(annotatedCtx, repl, clock); err != nil {
            bq.failures.Inc(1)
            log.Error(annotatedCtx, err)
        }
    }
}
// updateNodeAddress is a gossip callback which fires with each
// update to the node address. This allows us to compute the
// total size of the gossip network (for determining max peers
// each gossip node is allowed to have), as well as to create
// new resolvers for each encountered host and to write the
// set of gossip node addresses to persistent storage when it
// changes.
func (g *Gossip) updateNodeAddress(_ string, content roachpb.Value) {
    ctx := g.AnnotateCtx(context.TODO())
    var desc roachpb.NodeDescriptor
    if err := content.GetProto(&desc); err != nil {
        log.Error(ctx, err)
        return
    }

    g.mu.Lock()
    defer g.mu.Unlock()

    // Skip if the node has already been seen.
    if _, ok := g.nodeDescs[desc.NodeID]; ok {
        return
    }
    g.nodeDescs[desc.NodeID] = &desc

    // Recompute max peers based on size of network and set the max
    // sizes for incoming and outgoing node sets.
    maxPeers := g.maxPeers(len(g.nodeDescs))
    g.mu.incoming.setMaxSize(maxPeers)
    g.outgoing.setMaxSize(maxPeers)

    // Skip if it's our own address.
    if desc.Address == g.mu.is.NodeAddr {
        return
    }

    // Add this new node address (if it's not already there) to our list
    // of resolvers so we can keep connecting to gossip if the original
    // resolvers go offline.
    g.maybeAddResolver(desc.Address)

    // Add new address (if it's not already there) to bootstrap info and
    // persist if possible.
    if g.storage != nil && g.maybeAddBootstrapAddress(desc.Address) {
        if err := g.storage.WriteBootstrapInfo(&g.bootstrapInfo); err != nil {
            log.Error(ctx, err)
        }
    }
}
// process() is called on every range for which this node is a lease holder.
func (q *consistencyQueue) process(ctx context.Context, repl *Replica, _ config.SystemConfig) error {
    req := roachpb.CheckConsistencyRequest{}
    if _, pErr := repl.CheckConsistency(ctx, req); pErr != nil {
        log.Error(ctx, pErr.GoError())
    }
    // Update the last processed time for this queue.
    if err := repl.setQueueLastProcessed(ctx, q.name, repl.store.Clock().Now()); err != nil {
        log.ErrEventf(ctx, "failed to update last processed time: %v", err)
    }
    return nil
}
// Addr returns the TCP address to connect to.
func (c *Container) Addr(port nat.Port) *net.TCPAddr {
    containerInfo, err := c.Inspect()
    if err != nil {
        log.Error(context.TODO(), err)
        return nil
    }
    bindings, ok := containerInfo.NetworkSettings.Ports[port]
    if !ok || len(bindings) == 0 {
        return nil
    }
    portNum, err := strconv.Atoi(bindings[0].HostPort)
    if err != nil {
        log.Error(context.TODO(), err)
        return nil
    }
    return &net.TCPAddr{
        IP:   dockerIP(),
        Port: portNum,
    }
}
// GRPCDial calls grpc.Dial with the options appropriate for the context.
func (ctx *Context) GRPCDial(target string, opts ...grpc.DialOption) (*grpc.ClientConn, error) {
    ctx.conns.Lock()
    meta, ok := ctx.conns.cache[target]
    if !ok {
        meta = &connMeta{}
        ctx.conns.cache[target] = meta
    }
    ctx.conns.Unlock()

    meta.Do(func() {
        var dialOpt grpc.DialOption
        if ctx.Insecure {
            dialOpt = grpc.WithInsecure()
        } else {
            tlsConfig, err := ctx.GetClientTLSConfig()
            if err != nil {
                meta.err = err
                return
            }
            dialOpt = grpc.WithTransportCredentials(credentials.NewTLS(tlsConfig))
        }

        dialOpts := make([]grpc.DialOption, 0, 2+len(opts))
        dialOpts = append(dialOpts, dialOpt)
        dialOpts = append(dialOpts, grpc.WithBackoffMaxDelay(maxBackoff))
        dialOpts = append(dialOpts, opts...)

        if log.V(1) {
            log.Infof(ctx.masterCtx, "dialing %s", target)
        }
        meta.conn, meta.err = grpc.DialContext(ctx.masterCtx, target, dialOpts...)
        if meta.err == nil {
            if err := ctx.Stopper.RunTask(func() {
                ctx.Stopper.RunWorker(func() {
                    err := ctx.runHeartbeat(meta.conn, target)
                    if err != nil && !grpcutil.IsClosedConnection(err) {
                        log.Error(ctx.masterCtx, err)
                    }
                    ctx.removeConn(target, meta)
                })
            }); err != nil {
                meta.err = err
                // removeConn and ctx's cleanup worker both lock ctx.conns. However,
                // to avoid racing with meta's initialization, the cleanup worker
                // blocks on meta.Do while holding ctx.conns. Invoke removeConn
                // asynchronously to avoid deadlock.
                go ctx.removeConn(target, meta)
            }
        }
    })

    return meta.conn, meta.err
}
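The caching in GRPCDial hinges on each cache entry embedding a sync.Once (meta.Do), so concurrent callers for the same target share a single dial attempt and its result. Below is a minimal, self-contained sketch of that pattern, assuming a hypothetical dialFn callback in place of the real grpc.DialContext call and a string placeholder in place of *grpc.ClientConn.

package main

import (
    "fmt"
    "sync"
)

// connEntry caches the result of one dial attempt per target. The embedded
// sync.Once guarantees the dial function runs at most once even when many
// goroutines ask for the same target concurrently.
type connEntry struct {
    sync.Once
    conn string // placeholder for a real *grpc.ClientConn
    err  error
}

type connCache struct {
    mu    sync.Mutex
    cache map[string]*connEntry
}

func (c *connCache) dial(target string, dialFn func(string) (string, error)) (string, error) {
    c.mu.Lock()
    entry, ok := c.cache[target]
    if !ok {
        entry = &connEntry{}
        c.cache[target] = entry
    }
    c.mu.Unlock()

    // Only the first caller for this target performs the dial; later callers
    // block until it finishes and then reuse the cached result.
    entry.Do(func() {
        entry.conn, entry.err = dialFn(target)
    })
    return entry.conn, entry.err
}

func main() {
    c := &connCache{cache: map[string]*connEntry{}}
    conn, err := c.dial("node1:26257", func(t string) (string, error) {
        return "conn->" + t, nil
    })
    fmt.Println(conn, err)
}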
func (rq *replicateQueue) shouldQueue(
    ctx context.Context, now hlc.Timestamp, repl *Replica, sysCfg config.SystemConfig,
) (shouldQ bool, priority float64) {
    if !repl.store.splitQueue.Disabled() && repl.needsSplitBySize() {
        // If the range exceeds the split threshold, let that finish first.
        // Ranges must fit in memory on both sender and receiver nodes while
        // being replicated. This supplements the check provided by
        // acceptsUnsplitRanges, which looks at zone config boundaries rather
        // than data size.
        //
        // This check is ignored if the split queue is disabled, since in that
        // case, the split will never come.
        return
    }

    // Find the zone config for this range.
    desc := repl.Desc()
    zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey)
    if err != nil {
        log.Error(ctx, err)
        return
    }

    action, priority := rq.allocator.ComputeAction(zone, desc)
    if action != AllocatorNoop {
        if log.V(2) {
            log.Infof(ctx, "%s repair needed (%s), enqueuing", repl, action)
        }
        return true, priority
    }

    // See if there is a rebalancing opportunity present.
    leaseStoreID := repl.store.StoreID()
    if lease, _ := repl.getLease(); lease != nil {
        leaseStoreID = lease.Replica.StoreID
    }
    target := rq.allocator.RebalanceTarget(
        zone.Constraints,
        desc.Replicas,
        leaseStoreID,
        desc.RangeID,
    )
    if log.V(2) {
        if target != nil {
            log.Infof(ctx, "%s rebalance target found, enqueuing", repl)
        } else {
            log.Infof(ctx, "%s no rebalance target found, not enqueuing", repl)
        }
    }
    return target != nil, 0
}
// SendNext invokes the specified RPC on the supplied client when the
// client is ready. On success, the reply is sent on the channel;
// otherwise an error is sent.
func (gt *grpcTransport) SendNext(done chan<- BatchCall) {
    client := gt.orderedClients[gt.clientIndex]
    gt.clientIndex++
    gt.setPending(client.args.Replica, true)

    addr := client.remoteAddr
    if log.V(2) {
        log.Infof(gt.opts.ctx, "sending request to %s: %+v", addr, client.args)
    }

    if localServer := gt.rpcContext.GetLocalInternalServerForAddr(addr); enableLocalCalls && localServer != nil {
        // Clone the request. At the time of writing, Replica may mutate it
        // during command execution which can lead to data races.
        //
        // TODO(tamird): we should clone all of client.args.Header, but the
        // assertions in protoutil.Clone fire and there seems to be no
        // reasonable workaround.
        origTxn := client.args.Txn
        if origTxn != nil {
            clonedTxn := origTxn.Clone()
            client.args.Txn = &clonedTxn
        }

        reply, err := localServer.Batch(gt.opts.ctx, &client.args)
        gt.setPending(client.args.Replica, false)
        done <- BatchCall{Reply: reply, Err: err}
        return
    }

    go func() {
        // HACK: GRPC leaks if client calls are made with a context which
        // is cancelable but doesn't actually get canceled. Insulate this
        // call from our outer context, which may last for the lifetime of
        // a client session.
        // TODO(bdarnell): remove after https://github.com/grpc/grpc-go/issues/888
        // is fixed.
        ctx, cancel := context.WithCancel(gt.opts.ctx)
        defer cancel()
        reply, err := client.client.Batch(ctx, &client.args)
        if reply != nil {
            for i := range reply.Responses {
                if err := reply.Responses[i].GetInner().Verify(client.args.Requests[i].GetInner()); err != nil {
                    log.Error(gt.opts.ctx, err)
                }
            }
        }
        gt.setPending(client.args.Replica, false)
        done <- BatchCall{Reply: reply, Err: err}
    }()
}
// maybeCleanupBootstrapAddresses cleans up the stored bootstrap addresses to
// include only those currently available via gossip. The gossip mutex must
// be held by the caller.
func (g *Gossip) maybeCleanupBootstrapAddressesLocked() {
    if g.storage == nil || g.hasCleanedBS {
        return
    }
    defer func() { g.hasCleanedBS = true }()
    ctx := g.AnnotateCtx(context.TODO())
    log.Event(ctx, "cleaning up bootstrap addresses")

    g.resolvers = g.resolvers[:0]
    g.resolverIdx = 0
    g.bootstrapInfo.Addresses = g.bootstrapInfo.Addresses[:0]
    g.bootstrapAddrs = map[util.UnresolvedAddr]struct{}{}
    g.resolverAddrs = map[util.UnresolvedAddr]resolver.Resolver{}
    g.resolversTried = map[int]struct{}{}

    var desc roachpb.NodeDescriptor
    if err := g.mu.is.visitInfos(func(key string, i *Info) error {
        if strings.HasPrefix(key, KeyNodeIDPrefix) {
            if err := i.Value.GetProto(&desc); err != nil {
                return err
            }
            if desc.Address == g.mu.is.NodeAddr {
                return nil
            }
            g.maybeAddResolver(desc.Address)
            g.maybeAddBootstrapAddress(desc.Address)
        }
        return nil
    }); err != nil {
        log.Error(ctx, err)
        return
    }

    if err := g.storage.WriteBootstrapInfo(&g.bootstrapInfo); err != nil {
        log.Error(ctx, err)
    }
}
// storeGossipUpdate is the gossip callback used to keep the StorePool up to date.
func (sp *StorePool) storeGossipUpdate(_ string, content roachpb.Value) {
    var storeDesc roachpb.StoreDescriptor
    if err := content.GetProto(&storeDesc); err != nil {
        ctx := sp.AnnotateCtx(context.TODO())
        log.Error(ctx, err)
        return
    }

    sp.mu.Lock()
    defer sp.mu.Unlock()
    // Does this storeDetail exist yet?
    detail := sp.getStoreDetailLocked(storeDesc.StoreID)
    detail.markAlive(sp.clock.Now(), &storeDesc)
    sp.mu.queue.enqueue(detail)
}
// storeGossipUpdate is the gossip callback used to keep the StorePool up to date.
func (sp *StorePool) storeGossipUpdate(_ string, content roachpb.Value) {
    var storeDesc roachpb.StoreDescriptor
    if err := content.GetProto(&storeDesc); err != nil {
        ctx := sp.AnnotateCtx(context.TODO())
        log.Error(ctx, err)
        return
    }

    sp.mu.Lock()
    defer sp.mu.Unlock()
    detail := sp.getStoreDetailLocked(storeDesc.StoreID)
    detail.desc = &storeDesc
    detail.lastUpdatedTime = sp.clock.PhysicalTime()
    sp.mu.nodeLocalities[storeDesc.Node.NodeID] = storeDesc.Node.Locality
}
func (r *Replica) computeChecksumPostApply(
    ctx context.Context, args roachpb.ComputeChecksumRequest,
) {
    stopper := r.store.Stopper()
    id := args.ChecksumID
    now := timeutil.Now()
    r.mu.Lock()
    var notify chan struct{}
    if c, ok := r.mu.checksums[id]; !ok {
        // There is no record of this ID. Make a new notification.
        notify = make(chan struct{})
    } else if !c.started {
        // A CollectChecksumRequest is waiting on the existing notification.
        notify = c.notify
    } else {
        // A previous attempt was made to compute the checksum.
        r.mu.Unlock()
        return
    }

    r.gcOldChecksumEntriesLocked(now)

    // Create an entry with checksum == nil and gcTimestamp unset.
    r.mu.checksums[id] = replicaChecksum{started: true, notify: notify}
    desc := *r.mu.state.Desc
    r.mu.Unlock()
    snap := r.store.NewSnapshot()

    // Compute SHA asynchronously and store it in a map by UUID.
    if err := stopper.RunAsyncTask(ctx, func(ctx context.Context) {
        defer snap.Close()
        var snapshot *roachpb.RaftSnapshotData
        if args.Snapshot {
            snapshot = &roachpb.RaftSnapshotData{}
        }
        sha, err := r.sha512(desc, snap, snapshot)
        if err != nil {
            log.Errorf(ctx, "%v", err)
            sha = nil
        }
        r.computeChecksumDone(ctx, id, sha, snapshot)
    }); err != nil {
        defer snap.Close()
        log.Error(ctx, errors.Wrapf(err, "could not run async checksum computation (ID = %s)", id))
        // Set checksum to nil.
        r.computeChecksumDone(ctx, id, nil, nil)
    }
}
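computeChecksumPostApply coordinates the asynchronous checksum computation with any waiter through a per-ID notification channel: a waiter blocks on the channel and the computation closes it once the result is stored. Here is a rough, self-contained sketch of that handshake, with invented names (resultStore, start, wait) standing in for the Replica internals; it is an illustration of the pattern, not the actual CockroachDB code.

package main

import (
    "fmt"
    "sync"
)

// entry pairs a result with a notify channel that is closed once the result
// is available, waking every goroutine waiting on it.
type entry struct {
    result []byte
    notify chan struct{}
}

type resultStore struct {
    mu      sync.Mutex
    entries map[string]*entry
}

// start registers the ID (reusing an entry a waiter may already have created)
// and kicks off the computation asynchronously.
func (s *resultStore) start(id string, compute func() []byte) {
    s.mu.Lock()
    e, ok := s.entries[id]
    if !ok {
        e = &entry{notify: make(chan struct{})}
        s.entries[id] = e
    }
    s.mu.Unlock()

    go func() {
        res := compute()
        s.mu.Lock()
        e.result = res
        s.mu.Unlock()
        close(e.notify) // wake up every waiter for this ID
    }()
}

// wait blocks until the result for id is ready, registering the entry if the
// computation has not started yet.
func (s *resultStore) wait(id string) []byte {
    s.mu.Lock()
    e, ok := s.entries[id]
    if !ok {
        e = &entry{notify: make(chan struct{})}
        s.entries[id] = e
    }
    s.mu.Unlock()
    <-e.notify
    s.mu.Lock()
    defer s.mu.Unlock()
    return e.result
}

func main() {
    s := &resultStore{entries: map[string]*entry{}}
    s.start("checksum-1", func() []byte { return []byte{0xde, 0xad} })
    fmt.Printf("%x\n", s.wait("checksum-1"))
}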
// livenessGossipUpdate is the gossip callback used to keep the
// in-memory liveness info up to date.
func (nl *NodeLiveness) livenessGossipUpdate(key string, content roachpb.Value) {
    var liveness Liveness
    if err := content.GetProto(&liveness); err != nil {
        log.Error(context.TODO(), err)
        return
    }

    // If there's an existing liveness record, only update the received
    // timestamp if this is our first receipt of this node's liveness
    // or if the expiration or epoch was advanced.
    nl.mu.Lock()
    defer nl.mu.Unlock()
    exLiveness, ok := nl.mu.nodes[liveness.NodeID]
    if !ok || exLiveness.Expiration.Less(liveness.Expiration) || exLiveness.Epoch < liveness.Epoch {
        nl.mu.nodes[liveness.NodeID] = liveness
    }
}
// SendNext invokes the specified RPC on the supplied client when the
// client is ready. On success, the reply is sent on the channel;
// otherwise an error is sent.
func (gt *grpcTransport) SendNext(ctx context.Context, done chan<- BatchCall) {
    client := gt.orderedClients[gt.clientIndex]
    gt.clientIndex++
    gt.setPending(client.args.Replica, true)

    batchFn := func(ctx context.Context, args *roachpb.BatchRequest) (*roachpb.BatchResponse, error) {
        reply, err := client.client.Batch(ctx, args)
        if reply != nil {
            for i := range reply.Responses {
                if err := reply.Responses[i].GetInner().Verify(client.args.Requests[i].GetInner()); err != nil {
                    log.Error(ctx, err)
                }
            }
        }
        return reply, err
    }
    addr := client.remoteAddr
    if localServer := gt.rpcContext.GetLocalInternalServerForAddr(addr); enableLocalCalls && localServer != nil {
        batchFn = func(ctx context.Context, args *roachpb.BatchRequest) (*roachpb.BatchResponse, error) {
            gt.opts.metrics.LocalSentCount.Inc(1)
            return localServer.Batch(ctx, args)
        }
        // Clone the request. At the time of writing, Replica may mutate it
        // during command execution which can lead to data races.
        //
        // TODO(tamird): we should clone all of client.args.Header, but the
        // assertions in protoutil.Clone fire and there seems to be no
        // reasonable workaround.
        origTxn := client.args.Txn
        if origTxn != nil {
            clonedTxn := origTxn.Clone()
            client.args.Txn = &clonedTxn
        }
    }

    go func() {
        gt.opts.metrics.SentCount.Inc(1)
        if log.V(2) {
            log.Infof(ctx, "sending request to %s: %+v", addr, client.args)
        }
        reply, err := batchFn(ctx, &client.args)
        gt.setPending(client.args.Replica, false)
        done <- BatchCall{Reply: reply, Err: err}
    }()
}
// SendNext invokes the specified RPC on the supplied client when the
// client is ready. On success, the reply is sent on the channel;
// otherwise an error is sent.
func (gt *grpcTransport) SendNext(ctx context.Context, done chan<- BatchCall) {
    client := gt.orderedClients[gt.clientIndex]
    gt.clientIndex++
    gt.setPending(client.args.Replica, true)

    // Fork the original context as this async send may outlast the
    // caller's context.
    ctx, sp := tracing.ForkCtxSpan(ctx, "grpcTransport SendNext")

    go func() {
        defer tracing.FinishSpan(sp)
        gt.opts.metrics.SentCount.Inc(1)
        reply, err := func() (*roachpb.BatchResponse, error) {
            if enableLocalCalls {
                if localServer := gt.rpcContext.GetLocalInternalServerForAddr(client.remoteAddr); localServer != nil {
                    // Clone the request. At the time of writing, Replica may mutate it
                    // during command execution which can lead to data races.
                    //
                    // TODO(tamird): we should clone all of client.args.Header, but the
                    // assertions in protoutil.Clone fire and there seems to be no
                    // reasonable workaround.
                    origTxn := client.args.Txn
                    if origTxn != nil {
                        clonedTxn := origTxn.Clone()
                        client.args.Txn = &clonedTxn
                    }
                    gt.opts.metrics.LocalSentCount.Inc(1)
                    log.VEvent(ctx, 2, "sending request to local server")
                    return localServer.Batch(ctx, &client.args)
                }
            }
            log.VEventf(ctx, 2, "sending request to %s", client.remoteAddr)
            reply, err := client.client.Batch(ctx, &client.args)
            if reply != nil {
                for i := range reply.Responses {
                    if err := reply.Responses[i].GetInner().Verify(client.args.Requests[i].GetInner()); err != nil {
                        log.Error(ctx, err)
                    }
                }
            }
            return reply, err
        }()
        gt.setPending(client.args.Replica, false)
        done <- BatchCall{Reply: reply, Err: err}
    }()
}
// deadReplicasGossipUpdate is the gossip callback used to keep the StorePool up to date.
func (sp *StorePool) deadReplicasGossipUpdate(_ string, content roachpb.Value) {
    var replicas roachpb.StoreDeadReplicas
    if err := content.GetProto(&replicas); err != nil {
        ctx := sp.AnnotateCtx(context.TODO())
        log.Error(ctx, err)
        return
    }

    sp.mu.Lock()
    defer sp.mu.Unlock()
    detail := sp.getStoreDetailLocked(replicas.StoreID)
    deadReplicas := make(map[roachpb.RangeID][]roachpb.ReplicaDescriptor)
    for _, r := range replicas.Replicas {
        deadReplicas[r.RangeID] = append(deadReplicas[r.RangeID], r.Replica)
    }
    detail.deadReplicas = deadReplicas
}
// RunSyncFlow is part of the DistSQLServer interface.
func (ds *ServerImpl) RunSyncFlow(req *SetupFlowRequest, stream DistSQL_RunSyncFlowServer) error {
    // Set up the outgoing mailbox for the stream.
    mbox := newOutboxSyncFlowStream(stream)
    ctx := ds.AnnotateCtx(stream.Context())
    f, err := ds.SetupSyncFlow(ctx, req, mbox)
    if err != nil {
        log.Error(ctx, err)
        return err
    }
    mbox.setFlowCtx(&f.FlowCtx)

    if err := ds.Stopper.RunTask(func() {
        f.waitGroup.Add(1)
        mbox.start(&f.waitGroup)
        f.Start(func() {})
        f.Wait()
        f.Cleanup()
    }); err != nil {
        return err
    }
    return mbox.err
}
// shouldQueue determines whether a range should be queued for
// splitting. This is true if the range is intersected by a zone config
// prefix or if the range's size in bytes exceeds the limit for the zone.
func (sq *splitQueue) shouldQueue(
    ctx context.Context, now hlc.Timestamp, repl *Replica, sysCfg config.SystemConfig,
) (shouldQ bool, priority float64) {
    desc := repl.Desc()
    if len(sysCfg.ComputeSplitKeys(desc.StartKey, desc.EndKey)) > 0 {
        // Set priority to 1 in the event the range is split by zone configs.
        priority = 1
        shouldQ = true
    }

    // Add priority based on the size of range compared to the max
    // size for the zone it's in.
    zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey)
    if err != nil {
        log.Error(ctx, err)
        return
    }

    if ratio := float64(repl.GetMVCCStats().Total()) / float64(zone.RangeMaxBytes); ratio > 1 {
        priority += ratio
        shouldQ = true
    }
    return
}
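To make the priority arithmetic in splitQueue.shouldQueue concrete, here is a tiny standalone sketch: a range split by a zone-config boundary starts at priority 1, and a range whose size exceeds the zone maximum adds the size ratio on top. The 96 MiB / 64 MiB figures are invented for illustration.

package main

import "fmt"

// splitPriority mirrors the scoring in splitQueue.shouldQueue: a zone-config
// boundary contributes a flat 1, and an oversized range adds size/maxSize.
func splitPriority(hasZoneSplitKey bool, sizeBytes, maxBytes int64) (bool, float64) {
    var shouldQ bool
    var priority float64
    if hasZoneSplitKey {
        priority = 1
        shouldQ = true
    }
    if ratio := float64(sizeBytes) / float64(maxBytes); ratio > 1 {
        priority += ratio
        shouldQ = true
    }
    return shouldQ, priority
}

func main() {
    // A 96 MiB range in a zone with a 64 MiB maximum: ratio 1.5, priority 1.5.
    shouldQ, priority := splitPriority(false, 96<<20, 64<<20)
    fmt.Println(shouldQ, priority) // true 1.5
}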
// updateNodeAddress is a gossip callback which fires with each
// update to the node address. This allows us to compute the
// total size of the gossip network (for determining max peers
// each gossip node is allowed to have), as well as to create
// new resolvers for each encountered host and to write the
// set of gossip node addresses to persistent storage when it
// changes.
func (g *Gossip) updateNodeAddress(key string, content roachpb.Value) {
    ctx := g.AnnotateCtx(context.TODO())
    var desc roachpb.NodeDescriptor
    if err := content.GetProto(&desc); err != nil {
        log.Error(ctx, err)
        return
    }

    g.mu.Lock()
    defer g.mu.Unlock()

    // If desc is the empty descriptor, that indicates that the node has been
    // removed from the cluster. If that's the case, remove it from our map of
    // nodes to prevent other parts of the system from trying to talk to it.
    // We can't directly compare the node against the empty descriptor because
    // the proto has a repeated field and thus isn't comparable.
    if desc.NodeID == 0 && desc.Address.IsEmpty() {
        nodeID, err := NodeIDFromKey(key)
        if err != nil {
            log.Errorf(ctx, "unable to update node address for removed node: %s", err)
            return
        }
        log.Infof(ctx, "removed node %d from gossip", nodeID)
        delete(g.nodeDescs, nodeID)
        return
    }

    // Skip if the node has already been seen.
    if _, ok := g.nodeDescs[desc.NodeID]; ok {
        return
    }
    g.nodeDescs[desc.NodeID] = &desc

    // Recompute max peers based on size of network and set the max
    // sizes for incoming and outgoing node sets.
    maxPeers := g.maxPeers(len(g.nodeDescs))
    g.mu.incoming.setMaxSize(maxPeers)
    g.outgoing.setMaxSize(maxPeers)

    // Skip if it's our own address.
    if desc.Address == g.mu.is.NodeAddr {
        return
    }

    // Add this new node address (if it's not already there) to our list
    // of resolvers so we can keep connecting to gossip if the original
    // resolvers go offline.
    g.maybeAddResolver(desc.Address)

    // We ignore empty addresses for the sake of not breaking the many tests
    // that don't bother specifying addresses.
    if desc.Address.IsEmpty() {
        return
    }

    // If the new node's address conflicts with another node's address, then it
    // must be the case that the new node has replaced the previous one. Remove
    // it from our set of tracked descriptors to ensure we don't attempt to
    // connect to its previous identity (as came up in issue #10266).
    oldNodeID, ok := g.bootstrapAddrs[desc.Address]
    if ok && oldNodeID != unknownNodeID && oldNodeID != desc.NodeID {
        log.Infof(ctx, "removing node %d which was at same address (%s) as new node %v",
            oldNodeID, desc.Address, desc)
        delete(g.nodeDescs, oldNodeID)

        // Deleting the local copy isn't enough to remove the node from the gossip
        // network. We also have to clear it out in the infoStore by overwriting
        // it with an empty descriptor, which can be represented as just an empty
        // byte array due to how protocol buffers are serialized.
        // Calling addInfoLocked here is somewhat recursive since
        // updateNodeAddress is typically called in response to the infoStore
        // being updated but won't lead to deadlock because it's called
        // asynchronously.
        key := MakeNodeIDKey(oldNodeID)
        var emptyProto []byte
        if err := g.addInfoLocked(key, emptyProto, ttlNodeDescriptorGossip); err != nil {
            log.Errorf(ctx, "failed to empty node descriptor for node %d: %s", oldNodeID, err)
        }
    }

    // Add new address (if it's not already there) to bootstrap info and
    // persist if possible.
    added := g.maybeAddBootstrapAddress(desc.Address, desc.NodeID)
    if added && g.storage != nil {
        if err := g.storage.WriteBootstrapInfo(&g.bootstrapInfo); err != nil {
            log.Error(ctx, err)
        }
    }
}
// Start starts the server on the specified port, starts gossip and initializes
// the node using the engines from the server's context.
//
// The passed context can be used to trace the server startup. The context
// should represent the general startup operation.
func (s *Server) Start(ctx context.Context) error {
    ctx = s.AnnotateCtx(ctx)

    startTime := timeutil.Now()

    tlsConfig, err := s.cfg.GetServerTLSConfig()
    if err != nil {
        return err
    }

    httpServer := netutil.MakeServer(s.stopper, tlsConfig, s)
    plainRedirectServer := netutil.MakeServer(s.stopper, tlsConfig, http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
        http.Redirect(w, r, "https://"+r.Host+r.RequestURI, http.StatusPermanentRedirect)
    }))

    // The following code is a specialization of util/net.go's ListenAndServe
    // which adds pgwire support. A single port is used to serve all protocols
    // (pg, http, h2) via the following construction:
    //
    // non-TLS case:
    // net.Listen -> cmux.New
    //               |
    //               -  -> pgwire.Match -> pgwire.Server.ServeConn
    //               -  -> cmux.Any -> grpc.(*Server).Serve
    //
    // TLS case:
    // net.Listen -> cmux.New
    //               |
    //               -  -> pgwire.Match -> pgwire.Server.ServeConn
    //               -  -> cmux.Any -> grpc.(*Server).Serve
    //
    // Note that the difference between the TLS and non-TLS cases exists due to
    // Go's lack of an h2c (HTTP2 Clear Text) implementation. See inline comments
    // in util.ListenAndServe for an explanation of how h2c is implemented there
    // and here.

    ln, err := net.Listen("tcp", s.cfg.Addr)
    if err != nil {
        return err
    }
    log.Eventf(ctx, "listening on port %s", s.cfg.Addr)
    unresolvedListenAddr, err := officialAddr(s.cfg.Addr, ln.Addr())
    if err != nil {
        return err
    }
    s.cfg.Addr = unresolvedListenAddr.String()
    unresolvedAdvertAddr, err := officialAddr(s.cfg.AdvertiseAddr, ln.Addr())
    if err != nil {
        return err
    }
    s.cfg.AdvertiseAddr = unresolvedAdvertAddr.String()

    s.rpcContext.SetLocalInternalServer(s.node)

    m := cmux.New(ln)
    pgL := m.Match(pgwire.Match)
    anyL := m.Match(cmux.Any())

    httpLn, err := net.Listen("tcp", s.cfg.HTTPAddr)
    if err != nil {
        return err
    }
    unresolvedHTTPAddr, err := officialAddr(s.cfg.HTTPAddr, httpLn.Addr())
    if err != nil {
        return err
    }
    s.cfg.HTTPAddr = unresolvedHTTPAddr.String()

    workersCtx := s.AnnotateCtx(context.Background())

    s.stopper.RunWorker(func() {
        <-s.stopper.ShouldQuiesce()
        if err := httpLn.Close(); err != nil {
            log.Fatal(workersCtx, err)
        }
    })

    if tlsConfig != nil {
        httpMux := cmux.New(httpLn)
        clearL := httpMux.Match(cmux.HTTP1())
        tlsL := httpMux.Match(cmux.Any())

        s.stopper.RunWorker(func() {
            netutil.FatalIfUnexpected(httpMux.Serve())
        })

        s.stopper.RunWorker(func() {
            netutil.FatalIfUnexpected(plainRedirectServer.Serve(clearL))
        })

        httpLn = tls.NewListener(tlsL, tlsConfig)
    }

    s.stopper.RunWorker(func() {
        netutil.FatalIfUnexpected(httpServer.Serve(httpLn))
    })

    s.stopper.RunWorker(func() {
        <-s.stopper.ShouldQuiesce()
        netutil.FatalIfUnexpected(anyL.Close())
        <-s.stopper.ShouldStop()
        s.grpc.Stop()
    })

    s.stopper.RunWorker(func() {
        netutil.FatalIfUnexpected(s.grpc.Serve(anyL))
    })

    s.stopper.RunWorker(func() {
        pgCtx := s.pgServer.AmbientCtx.AnnotateCtx(context.Background())
        netutil.FatalIfUnexpected(httpServer.ServeWith(s.stopper, pgL, func(conn net.Conn) {
            connCtx := log.WithLogTagStr(pgCtx, "client", conn.RemoteAddr().String())
            if err := s.pgServer.ServeConn(connCtx, conn); err != nil && !netutil.IsClosedConnection(err) {
                // Report the error on this connection's context, so that we
                // know which remote client caused the error when looking at
                // the logs.
                log.Error(connCtx, err)
            }
        }))
    })

    if len(s.cfg.SocketFile) != 0 {
        // Unix socket enabled: postgres protocol only.
        unixLn, err := net.Listen("unix", s.cfg.SocketFile)
        if err != nil {
            return err
        }

        s.stopper.RunWorker(func() {
            <-s.stopper.ShouldQuiesce()
            if err := unixLn.Close(); err != nil {
                log.Fatal(workersCtx, err)
            }
        })

        s.stopper.RunWorker(func() {
            pgCtx := s.pgServer.AmbientCtx.AnnotateCtx(context.Background())
            netutil.FatalIfUnexpected(httpServer.ServeWith(s.stopper, unixLn, func(conn net.Conn) {
                connCtx := log.WithLogTagStr(pgCtx, "client", conn.RemoteAddr().String())
                if err := s.pgServer.ServeConn(connCtx, conn); err != nil && !netutil.IsClosedConnection(err) {
                    // Report the error on this connection's context, so that we
                    // know which remote client caused the error when looking at
                    // the logs.
                    log.Error(connCtx, err)
                }
            }))
        })
    }

    // Enable the debug endpoints first to provide an earlier window
    // into what's going on with the node in advance of exporting node
    // functionality.
    // TODO(marc): when cookie-based authentication exists,
    // apply it for all web endpoints.
    s.mux.HandleFunc(debugEndpoint, http.HandlerFunc(handleDebug))

    s.gossip.Start(unresolvedAdvertAddr)
    log.Event(ctx, "started gossip")

    s.engines, err = s.cfg.CreateEngines()
    if err != nil {
        return errors.Wrap(err, "failed to create engines")
    }
    s.stopper.AddCloser(&s.engines)

    // We might have to sleep a bit to protect against this node producing non-
    // monotonic timestamps. Before restarting, its clock might have been driven
    // by other nodes' fast clocks, but when we restarted, we lost all this
    // information. For example, a client might have written a value at a
    // timestamp that's in the future of the restarted node's clock, and if we
    // don't do something, the same client's read would not return the written
    // value. So, we wait up to MaxOffset; we couldn't have served timestamps more
    // than MaxOffset in the future (assuming that MaxOffset was not changed, see
    // #9733).
    //
    // As an optimization for tests, we don't sleep if all the stores are brand
    // new. In this case, the node will not serve anything anyway until it
    // synchronizes with other nodes.
    {
        anyStoreBootstrapped := false
        for _, e := range s.engines {
            if _, err := storage.ReadStoreIdent(ctx, e); err != nil {
                // NotBootstrappedError is expected.
                if _, ok := err.(*storage.NotBootstrappedError); !ok {
                    return err
                }
            } else {
                anyStoreBootstrapped = true
                break
            }
        }
        if anyStoreBootstrapped {
            sleepDuration := s.clock.MaxOffset() - timeutil.Since(startTime)
            if sleepDuration > 0 {
                log.Infof(ctx, "sleeping for %s to guarantee HLC monotonicity", sleepDuration)
                time.Sleep(sleepDuration)
            }
        }
    }

    // Now that we have a monotonic HLC wrt previous incarnations of the process,
    // init all the replicas.
    err = s.node.start(
        ctx,
        unresolvedAdvertAddr,
        s.engines,
        s.cfg.NodeAttributes,
        s.cfg.Locality,
    )
    if err != nil {
        return err
    }
    log.Event(ctx, "started node")

    s.nodeLiveness.StartHeartbeat(ctx, s.stopper)

    // We can now add the node registry.
    s.recorder.AddNode(s.registry, s.node.Descriptor, s.node.startedAt)

    // Begin recording runtime statistics.
    s.startSampleEnvironment(s.cfg.MetricsSampleInterval)

    // Begin recording time series data collected by the status monitor.
    s.tsDB.PollSource(
        s.cfg.AmbientCtx, s.recorder, s.cfg.MetricsSampleInterval, ts.Resolution10s, s.stopper,
    )

    // Begin recording status summaries.
    s.node.startWriteSummaries(s.cfg.MetricsSampleInterval)

    // Create and start the schema change manager only after a NodeID
    // has been assigned.
    testingKnobs := &sql.SchemaChangerTestingKnobs{}
    if s.cfg.TestingKnobs.SQLSchemaChanger != nil {
        testingKnobs = s.cfg.TestingKnobs.SQLSchemaChanger.(*sql.SchemaChangerTestingKnobs)
    }
    sql.NewSchemaChangeManager(testingKnobs, *s.db, s.gossip, s.leaseMgr).Start(s.stopper)

    s.distSQLServer.Start()

    log.Infof(ctx, "starting %s server at %s", s.cfg.HTTPRequestScheme(), unresolvedHTTPAddr)
    log.Infof(ctx, "starting grpc/postgres server at %s", unresolvedListenAddr)
    log.Infof(ctx, "advertising CockroachDB node at %s", unresolvedAdvertAddr)
    if len(s.cfg.SocketFile) != 0 {
        log.Infof(ctx, "starting postgres server at unix:%s", s.cfg.SocketFile)
    }

    s.stopper.RunWorker(func() {
        netutil.FatalIfUnexpected(m.Serve())
    })

    log.Event(ctx, "accepting connections")

    // Initialize grpc-gateway mux and context.
    jsonpb := &protoutil.JSONPb{
        EnumsAsInts:  true,
        EmitDefaults: true,
        Indent:       "  ",
    }
    protopb := new(protoutil.ProtoPb)
    gwMux := gwruntime.NewServeMux(
        gwruntime.WithMarshalerOption(gwruntime.MIMEWildcard, jsonpb),
        gwruntime.WithMarshalerOption(httputil.JSONContentType, jsonpb),
        gwruntime.WithMarshalerOption(httputil.AltJSONContentType, jsonpb),
        gwruntime.WithMarshalerOption(httputil.ProtoContentType, protopb),
        gwruntime.WithMarshalerOption(httputil.AltProtoContentType, protopb),
    )
    gwCtx, gwCancel := context.WithCancel(s.AnnotateCtx(context.Background()))
    s.stopper.AddCloser(stop.CloserFn(gwCancel))

    // Setup HTTP<->gRPC handlers.
    conn, err := s.rpcContext.GRPCDial(s.cfg.Addr)
    if err != nil {
        return errors.Errorf("error constructing grpc-gateway: %s; are your certificates valid?", err)
    }

    for _, gw := range []grpcGatewayServer{s.admin, s.status, &s.tsServer} {
        if err := gw.RegisterGateway(gwCtx, gwMux, conn); err != nil {
            return err
        }
    }

    var uiFileSystem http.FileSystem
    uiDebug := envutil.EnvOrDefaultBool("COCKROACH_DEBUG_UI", false)
    if uiDebug {
        uiFileSystem = http.Dir("pkg/ui")
    } else {
        uiFileSystem = &assetfs.AssetFS{
            Asset:     ui.Asset,
            AssetDir:  ui.AssetDir,
            AssetInfo: ui.AssetInfo,
        }
    }
    uiFileServer := http.FileServer(uiFileSystem)

    s.mux.HandleFunc("/", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
        if r.URL.Path == "/" {
            if uiDebug {
                r.URL.Path = "debug.html"
            } else {
                r.URL.Path = "release.html"
            }
        }
        uiFileServer.ServeHTTP(w, r)
    }))

    // TODO(marc): when cookie-based authentication exists,
    // apply it for all web endpoints.
    s.mux.Handle(adminPrefix, gwMux)
    s.mux.Handle(ts.URLPrefix, gwMux)
    s.mux.Handle(statusPrefix, gwMux)
    s.mux.Handle("/health", gwMux)
    s.mux.Handle(statusVars, http.HandlerFunc(s.status.handleVars))
    log.Event(ctx, "added http endpoints")

    if err := sdnotify.Ready(); err != nil {
        log.Errorf(ctx, "failed to signal readiness using systemd protocol: %s", err)
    }
    log.Event(ctx, "server ready")

    return nil
}
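The ASCII diagram in Start's comments describes multiplexing Postgres, gRPC, and HTTP traffic over a single listener with cmux. Below is a stripped-down sketch of that construction, assuming the github.com/soheilhy/cmux package and using a trivial first-byte matcher in place of CockroachDB's pgwire.Match; it shows the routing mechanics only, not the real handshake.

package main

import (
    "io"
    "log"
    "net"
    "net/http"

    "github.com/soheilhy/cmux"
)

func main() {
    ln, err := net.Listen("tcp", "127.0.0.1:0")
    if err != nil {
        log.Fatal(err)
    }

    m := cmux.New(ln)

    // A custom matcher peeks at the first bytes of each connection; the real
    // server uses pgwire.Match to recognize the Postgres startup packet.
    pgL := m.Match(func(r io.Reader) bool {
        buf := make([]byte, 1)
        if _, err := r.Read(buf); err != nil {
            return false
        }
        return buf[0] == 0x00 // placeholder check, not the real pgwire handshake
    })
    // Everything else (HTTP, gRPC over h2, ...) falls through to cmux.Any.
    anyL := m.Match(cmux.Any())

    go func() {
        // Accept Postgres-looking connections from pgL.
        for {
            conn, err := pgL.Accept()
            if err != nil {
                return
            }
            conn.Close() // a real server would hand this off to the pgwire server
        }
    }()
    go http.Serve(anyL, http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
        io.WriteString(w, "hello\n")
    }))

    // cmux reads just enough of each new connection to route it, then replays
    // those bytes to the matched listener.
    log.Fatal(m.Serve())
}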
// maybeAddToPurgatory possibly adds the specified replica to the
// purgatory queue, which holds replicas which have failed
// processing. To be added, the failing error must implement
// purgatoryError and the queue implementation must have its own
// mechanism for signaling re-processing of replicas held in
// purgatory.
func (bq *baseQueue) maybeAddToPurgatory(
    ctx context.Context, repl *Replica, triggeringErr error, clock *hlc.Clock, stopper *stop.Stopper,
) {
    // Increment failures metric here to capture all error returns from
    // process().
    bq.failures.Inc(1)

    // Check whether the failure is a purgatory error and whether the queue supports it.
    if _, ok := triggeringErr.(purgatoryError); !ok || bq.impl.purgatoryChan() == nil {
        log.Error(ctx, triggeringErr)
        return
    }
    bq.mu.Lock()
    defer bq.mu.Unlock()

    // First, check whether the replica has already been re-added to queue.
    if _, ok := bq.mu.replicas[repl.RangeID]; ok {
        return
    }

    log.Error(ctx, errors.Wrap(triggeringErr, "purgatory"))

    item := &replicaItem{value: repl.RangeID}
    bq.mu.replicas[repl.RangeID] = item

    defer func() {
        bq.purgatory.Update(int64(len(bq.mu.purgatory)))
    }()

    // If purgatory already exists, just add to the map and we're done.
    if bq.mu.purgatory != nil {
        bq.mu.purgatory[repl.RangeID] = triggeringErr
        return
    }

    // Otherwise, create purgatory and start processing.
    bq.mu.purgatory = map[roachpb.RangeID]error{
        repl.RangeID: triggeringErr,
    }

    stopper.RunWorker(func() {
        ctx := bq.AnnotateCtx(context.Background())
        ticker := time.NewTicker(purgatoryReportInterval)
        for {
            select {
            case <-bq.impl.purgatoryChan():
                // Remove all items from purgatory into a copied slice.
                bq.mu.Lock()
                ranges := make([]roachpb.RangeID, 0, len(bq.mu.purgatory))
                for rangeID := range bq.mu.purgatory {
                    item := bq.mu.replicas[rangeID]
                    ranges = append(ranges, item.value)
                    bq.remove(item)
                }
                bq.mu.Unlock()
                for _, id := range ranges {
                    repl, err := bq.store.GetReplica(id)
                    if err != nil {
                        log.Errorf(ctx, "range %s no longer exists on store: %s", id, err)
                        return
                    }
                    if stopper.RunTask(func() {
                        annotatedCtx := repl.AnnotateCtx(ctx)
                        if err := bq.processReplica(annotatedCtx, repl, clock); err != nil {
                            bq.maybeAddToPurgatory(annotatedCtx, repl, err, clock, stopper)
                        }
                    }) != nil {
                        return
                    }
                }
                bq.mu.Lock()
                if len(bq.mu.purgatory) == 0 {
                    log.Infof(ctx, "purgatory is now empty")
                    bq.mu.purgatory = nil
                    bq.mu.Unlock()
                    return
                }
                bq.mu.Unlock()
            case <-ticker.C:
                // Report purgatory status.
                bq.mu.Lock()
                errMap := map[string]int{}
                for _, err := range bq.mu.purgatory {
                    errMap[err.Error()]++
                }
                bq.mu.Unlock()
                for errStr, count := range errMap {
                    log.Errorf(ctx, "%d replicas failing with %q", count, errStr)
                }
            case <-stopper.ShouldStop():
                return
            }
        }
    })
}
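The purgatory mechanism above boils down to: keep failed replicas in a map, retry them whenever the queue implementation signals its purgatory channel, and periodically report what is still failing. Here is a compressed, generic sketch of that loop with invented names (purgatory, process, retryCh) rather than the baseQueue types; it illustrates the control flow under those assumptions, not the actual implementation.

package main

import (
    "fmt"
    "sync"
    "time"
)

// purgatory holds items that failed processing until a retry signal arrives.
type purgatory struct {
    mu    sync.Mutex
    items map[int]error
}

// run retries every held item each time retryCh fires, dropping items that
// now succeed, and periodically reports what is still failing.
func (p *purgatory) run(retryCh <-chan struct{}, process func(int) error, stop <-chan struct{}) {
    ticker := time.NewTicker(time.Minute)
    defer ticker.Stop()
    for {
        select {
        case <-retryCh:
            p.mu.Lock()
            ids := make([]int, 0, len(p.items))
            for id := range p.items {
                ids = append(ids, id)
            }
            p.mu.Unlock()
            for _, id := range ids {
                err := process(id)
                p.mu.Lock()
                if err != nil {
                    p.items[id] = err // still failing; keep it in purgatory
                } else {
                    delete(p.items, id)
                }
                p.mu.Unlock()
            }
        case <-ticker.C:
            p.mu.Lock()
            for id, err := range p.items {
                fmt.Printf("item %d still failing: %v\n", id, err)
            }
            p.mu.Unlock()
        case <-stop:
            return
        }
    }
}

func main() {
    p := &purgatory{items: map[int]error{1: fmt.Errorf("boom")}}
    retry := make(chan struct{}, 1)
    stop := make(chan struct{})
    retry <- struct{}{}
    go func() { time.Sleep(100 * time.Millisecond); close(stop) }()
    p.run(retry, func(id int) error { return nil }, stop)
}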