Example #1
func startBankTransfers(t testing.TB, stopper *stop.Stopper, sqlDB *gosql.DB, numAccounts int) {
	const maxTransfer = 999
	for {
		select {
		case <-stopper.ShouldQuiesce():
			return // All done.
		default:
			// Keep going.
		}

		from := rand.Intn(numAccounts)
		to := rand.Intn(numAccounts - 1)
		if from == to {
			to = numAccounts - 1
		}

		amount := rand.Intn(maxTransfer)

		const update = `UPDATE bench.bank
				SET balance = CASE id WHEN $1 THEN balance-$3 WHEN $2 THEN balance+$3 END
				WHERE id IN ($1, $2)`
		util.SucceedsSoon(t, func() error {
			select {
			case <-stopper.ShouldQuiesce():
				return nil // All done.
			default:
				// Keep going.
			}
			_, err := sqlDB.Exec(update, from, to, amount)
			return err
		})
	}
}
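A minimal sketch of how a caller might drive this loop; stop.NewStopper and stopper.Stop are assumed to exist in the stop package (they are not shown above), and bench.bank is assumed to be populated already.
// Hypothetical driver for startBankTransfers.
func runTransfersBriefly(t testing.TB, sqlDB *gosql.DB, numAccounts int) {
	stopper := stop.NewStopper()
	go startBankTransfers(t, stopper, sqlDB, numAccounts)
	time.Sleep(time.Second) // let some transfers run
	stopper.Stop()          // ShouldQuiesce fires and the transfer loop returns
}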
Example #2
// waitAndProcess waits for the pace interval and processes the replica
// if repl is not nil. The method returns true when the scanner needs
// to be stopped. The method also removes a replica from queues when it
// is signaled via the removed channel.
func (rs *replicaScanner) waitAndProcess(
	ctx context.Context, start time.Time, clock *hlc.Clock, stopper *stop.Stopper, repl *Replica,
) bool {
	waitInterval := rs.paceInterval(start, timeutil.Now())
	rs.waitTimer.Reset(waitInterval)
	if log.V(6) {
		log.Infof(ctx, "wait timer interval set to %s", waitInterval)
	}
	for {
		select {
		case <-rs.waitTimer.C:
			if log.V(6) {
				log.Infof(ctx, "wait timer fired")
			}
			rs.waitTimer.Read = true
			if repl == nil {
				return false
			}

			if log.V(2) {
				log.Infof(ctx, "replica scanner processing %s", repl)
			}
			for _, q := range rs.queues {
				q.MaybeAdd(repl, clock.Now())
			}
			return false

		case repl := <-rs.removed:
			rs.removeReplica(repl)

		case <-stopper.ShouldStop():
			return true
		}
	}
}
Example #3
// TestingSetupZoneConfigHook initializes the zone config hook
// to 'testingZoneConfigHook' which uses 'testingZoneConfig'.
// Settings go back to their previous values when the stopper runs our closer.
func TestingSetupZoneConfigHook(stopper *stop.Stopper) {
	stopper.AddCloser(stop.CloserFn(testingResetZoneConfigHook))

	testingLock.Lock()
	defer testingLock.Unlock()
	if testingHasHook {
		panic("TestingSetupZoneConfigHook called without restoring state")
	}
	testingHasHook = true
	testingZoneConfig = make(zoneConfigMap)
	testingPreviousHook = ZoneConfigHook
	ZoneConfigHook = testingZoneConfigHook
	testingLargestIDHook = func(maxID uint32) (max uint32) {
		testingLock.Lock()
		defer testingLock.Unlock()
		for id := range testingZoneConfig {
			if maxID > 0 && id > maxID {
				continue
			}
			if id > max {
				max = id
			}
		}
		return
	}
}
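A sketch of typical test usage; stop.NewStopper and stopper.Stop are assumed APIs. Stopping the stopper runs the registered closer, which restores the previous hook.
// Hypothetical test body using the zone config hook.
func TestWithZoneConfigHook(t *testing.T) {
	stopper := stop.NewStopper()
	defer stopper.Stop() // runs testingResetZoneConfigHook via the closer
	TestingSetupZoneConfigHook(stopper)
	// ... exercise code that consults ZoneConfigHook ...
}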
Example #4
// ServeWith accepts connections on l and serves them using serveConn.
func (s *Server) ServeWith(stopper *stop.Stopper, l net.Listener, serveConn func(net.Conn)) error {
	// Inspired by net/http.(*Server).Serve
	var tempDelay time.Duration // how long to sleep on accept failure
	for {
		rw, e := l.Accept()
		if e != nil {
			if ne, ok := e.(net.Error); ok && ne.Temporary() {
				if tempDelay == 0 {
					tempDelay = 5 * time.Millisecond
				} else {
					tempDelay *= 2
				}
				if max := 1 * time.Second; tempDelay > max {
					tempDelay = max
				}
				httpLogger.Printf("http: Accept error: %v; retrying in %v", e, tempDelay)
				time.Sleep(tempDelay)
				continue
			}
			return e
		}
		tempDelay = 0
		go func() {
			defer stopper.Recover()
			s.Server.ConnState(rw, http.StateNew) // before Serve can return
			serveConn(rw)
			s.Server.ConnState(rw, http.StateClosed)
		}()
	}
}
Example #5
// createTestAbortCache creates an in-memory engine and
// returns an abort cache using the supplied Range ID.
func createTestAbortCache(
	t *testing.T, rangeID roachpb.RangeID, stopper *stop.Stopper,
) (*AbortCache, engine.Engine) {
	eng := engine.NewInMem(roachpb.Attributes{}, 1<<20)
	stopper.AddCloser(eng)
	return NewAbortCache(rangeID), eng
}
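A sketch of a caller; stop.NewStopper and stopper.Stop are assumed, and the deferred Stop closes the in-memory engine that was registered with AddCloser.
// Hypothetical caller of createTestAbortCache.
func TestAbortCacheSketch(t *testing.T) {
	stopper := stop.NewStopper()
	defer stopper.Stop() // closes the in-memory engine via the AddCloser registration
	cache, eng := createTestAbortCache(t, roachpb.RangeID(1), stopper)
	_, _ = cache, eng
	// ... exercise the abort cache against eng ...
}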
Example #6
// startUser simulates a stream of user events until the stopper
// indicates it's time to exit.
func startUser(ctx Context, stopper *stop.Stopper) {
	for {
		userID := 1 + int(rand.ExpFloat64()/rate)
		op := randomOp()

		if err := stopper.RunTask(func() {
			err := runUserOp(ctx, userID, op.typ)
			stats.Lock()
			_ = stats.hist.RecordValue(int64(userID))
			stats.totalOps++
			stats.opCounts[op.typ]++
			switch {
			case err == errNoUser:
				stats.noUserOps++
			case err == errNoPhoto:
				stats.noPhotoOps++
			case err != nil:
				stats.failedOps++
				log.Printf("failed to run %s op for %d: %s", op.name, userID, err)
			}
			stats.Unlock()
		}); err != nil {
			return
		}
	}
}
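The loop exits because RunTask returns an error once the stopper has begun draining. A hedged sketch of a driver that fans out several simulated users over one stopper (startUsers is hypothetical):
// Hypothetical driver: several simulated users share one stopper.
func startUsers(ctx Context, stopper *stop.Stopper, numUsers int) {
	for i := 0; i < numUsers; i++ {
		go startUser(ctx, stopper)
	}
	// A later stopper.Stop() makes RunTask return an error and each
	// startUser loop return.
}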
Example #7
// gossip loops, sending deltas of the infostore and receiving deltas
// in turn. If an alternate is proposed on response, the client addr
// is modified and the method returns so the caller can forward to it.
func (c *client) gossip(
	ctx context.Context,
	g *Gossip,
	stream Gossip_GossipClient,
	stopper *stop.Stopper,
	wg *sync.WaitGroup,
) error {
	sendGossipChan := make(chan struct{}, 1)

	// Register a callback for gossip updates.
	updateCallback := func(_ string, _ roachpb.Value) {
		select {
		case sendGossipChan <- struct{}{}:
		default:
		}
	}
	// Defer calling "undoer" callback returned from registration.
	defer g.RegisterCallback(".*", updateCallback)()

	errCh := make(chan error, 1)
	// This wait group is used to allow the caller to wait until gossip
	// processing is terminated.
	wg.Add(1)
	stopper.RunWorker(func() {
		defer wg.Done()

		errCh <- func() error {
			for {
				reply, err := stream.Recv()
				if err != nil {
					return err
				}
				if err := c.handleResponse(ctx, g, reply); err != nil {
					return err
				}
			}
		}()
	})

	for {
		select {
		case <-c.closer:
			return nil
		case <-stopper.ShouldStop():
			return nil
		case err := <-errCh:
			return err
		case <-sendGossipChan:
			if err := c.sendGossip(g, stream); err != nil {
				return err
			}
		}
	}
}
Example #8
// New creates an instance of a gossip node.
// The higher level manages the NodeIDContainer instance (which can be shared by
// various server components). The ambient context is expected to already
// contain the node ID.
func New(
	ambient log.AmbientContext,
	nodeID *base.NodeIDContainer,
	rpcContext *rpc.Context,
	grpcServer *grpc.Server,
	resolvers []resolver.Resolver,
	stopper *stop.Stopper,
	registry *metric.Registry,
) *Gossip {
	ambient.SetEventLog("gossip", "gossip")
	g := &Gossip{
		server:            newServer(ambient, nodeID, stopper, registry),
		Connected:         make(chan struct{}),
		rpcContext:        rpcContext,
		outgoing:          makeNodeSet(minPeers, metric.NewGauge(MetaConnectionsOutgoingGauge)),
		bootstrapping:     map[string]struct{}{},
		disconnected:      make(chan *client, 10),
		stalledCh:         make(chan struct{}, 1),
		stallInterval:     defaultStallInterval,
		bootstrapInterval: defaultBootstrapInterval,
		cullInterval:      defaultCullInterval,
		nodeDescs:         map[roachpb.NodeID]*roachpb.NodeDescriptor{},
		resolverAddrs:     map[util.UnresolvedAddr]resolver.Resolver{},
		bootstrapAddrs:    map[util.UnresolvedAddr]roachpb.NodeID{},
	}
	stopper.AddCloser(stop.CloserFn(g.server.AmbientContext.FinishEventLog))

	registry.AddMetric(g.outgoing.gauge)
	g.clientsMu.breakers = map[string]*circuit.Breaker{}
	resolverAddrs := make([]string, len(resolvers))
	for i, resolver := range resolvers {
		resolverAddrs[i] = resolver.Addr()
	}
	ctx := g.AnnotateCtx(context.Background())
	if log.V(1) {
		log.Infof(ctx, "initial resolvers: %v", resolverAddrs)
	}
	g.SetResolvers(resolvers)

	g.mu.Lock()
	// Add ourselves as a SystemConfig watcher.
	g.mu.is.registerCallback(KeySystemConfig, g.updateSystemConfig)
	// Add ourselves as a node descriptor watcher.
	g.mu.is.registerCallback(MakePrefixPattern(KeyNodeIDPrefix), g.updateNodeAddress)
	g.mu.Unlock()

	RegisterGossipServer(grpcServer, g.server)
	return g
}
Example #9
// scanLoop loops endlessly, scanning through replicas available via
// the replica set, or until the scanner is stopped. The iteration
// is paced to complete a full scan in approximately the scan interval.
func (rs *replicaScanner) scanLoop(clock *hlc.Clock, stopper *stop.Stopper) {
	stopper.RunWorker(func() {
		ctx := rs.AnnotateCtx(context.Background())
		start := timeutil.Now()

		// waitTimer is reset in each call to waitAndProcess.
		defer rs.waitTimer.Stop()

		for {
			if rs.GetDisabled() {
				if done := rs.waitEnabled(stopper); done {
					return
				}
				continue
			}
			var shouldStop bool
			count := 0
			rs.replicas.Visit(func(repl *Replica) bool {
				count++
				shouldStop = rs.waitAndProcess(ctx, start, clock, stopper, repl)
				return !shouldStop
			})
			if count == 0 {
				// No replicas processed, just wait.
				shouldStop = rs.waitAndProcess(ctx, start, clock, stopper, nil)
			}

			shouldStop = shouldStop || nil != stopper.RunTask(func() {
				// Increment iteration count.
				rs.mu.Lock()
				defer rs.mu.Unlock()
				rs.mu.scanCount++
				rs.mu.total += timeutil.Since(start)
				if log.V(6) {
					log.Infof(ctx, "reset replica scan iteration")
				}

				// Reset iteration and start time.
				start = timeutil.Now()
			})
			if shouldStop {
				return
			}
		}
	})
}
Example #10
// NewExecutor creates an Executor and registers a callback on the
// system config.
func NewExecutor(
	cfg ExecutorConfig, stopper *stop.Stopper, startupMemMetrics *MemoryMetrics,
) *Executor {
	exec := &Executor{
		cfg:     cfg,
		reCache: parser.NewRegexpCache(512),

		Latency:          metric.NewLatency(MetaLatency, cfg.MetricsSampleInterval),
		TxnBeginCount:    metric.NewCounter(MetaTxnBegin),
		TxnCommitCount:   metric.NewCounter(MetaTxnCommit),
		TxnAbortCount:    metric.NewCounter(MetaTxnAbort),
		TxnRollbackCount: metric.NewCounter(MetaTxnRollback),
		SelectCount:      metric.NewCounter(MetaSelect),
		UpdateCount:      metric.NewCounter(MetaUpdate),
		InsertCount:      metric.NewCounter(MetaInsert),
		DeleteCount:      metric.NewCounter(MetaDelete),
		DdlCount:         metric.NewCounter(MetaDdl),
		MiscCount:        metric.NewCounter(MetaMisc),
		QueryCount:       metric.NewCounter(MetaQuery),
	}

	exec.systemConfigCond = sync.NewCond(exec.systemConfigMu.RLocker())

	gossipUpdateC := cfg.Gossip.RegisterSystemConfigChannel()
	stopper.RunWorker(func() {
		for {
			select {
			case <-gossipUpdateC:
				sysCfg, _ := cfg.Gossip.GetSystemConfig()
				exec.updateSystemConfig(sysCfg)
			case <-stopper.ShouldStop():
				return
			}
		}
	})

	ctx := log.WithLogTag(context.Background(), "startup", nil)
	startupSession := NewSession(ctx, SessionArgs{}, exec, nil, startupMemMetrics)
	if err := exec.virtualSchemas.init(&startupSession.planner); err != nil {
		log.Fatal(ctx, err)
	}
	startupSession.Finish(exec)

	return exec
}
Example #11
// startComputePeriodicMetrics starts a loop which periodically instructs each
// store to compute the value of metrics which cannot be incrementally
// maintained.
func (n *Node) startComputePeriodicMetrics(stopper *stop.Stopper, interval time.Duration) {
	stopper.RunWorker(func() {
		ctx := n.AnnotateCtx(context.Background())
		// Compute periodic stats at the same frequency as metrics are sampled.
		ticker := time.NewTicker(interval)
		defer ticker.Stop()
		for tick := 0; ; tick++ {
			select {
			case <-ticker.C:
				if err := n.computePeriodicMetrics(tick); err != nil {
					log.Errorf(ctx, "failed computing periodic metrics: %s", err)
				}
			case <-stopper.ShouldStop():
				return
			}
		}
	})
}
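This RunWorker-plus-ticker shape recurs in most of the examples here. A stripped-down template of just the pattern; doWork is a placeholder, not part of the source.
// Generic periodic worker distilled from the examples above.
func startPeriodicWorker(stopper *stop.Stopper, interval time.Duration, doWork func()) {
	stopper.RunWorker(func() {
		ticker := time.NewTicker(interval)
		defer ticker.Stop()
		for {
			select {
			case <-ticker.C:
				doWork()
			case <-stopper.ShouldStop():
				return
			}
		}
	})
}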
Example #12
// NewContext creates an rpc Context with the supplied values.
func NewContext(
	ambient log.AmbientContext, baseCtx *base.Config, hlcClock *hlc.Clock, stopper *stop.Stopper,
) *Context {
	ctx := &Context{
		Config: baseCtx,
	}
	if hlcClock != nil {
		ctx.localClock = hlcClock
	} else {
		ctx.localClock = hlc.NewClock(hlc.UnixNano)
	}
	ctx.breakerClock = breakerClock{
		clock: ctx.localClock,
	}
	var cancel context.CancelFunc
	ctx.masterCtx, cancel = context.WithCancel(ambient.AnnotateCtx(context.Background()))
	ctx.Stopper = stopper
	ctx.RemoteClocks = newRemoteClockMonitor(
		ctx.masterCtx, ctx.localClock, 10*defaultHeartbeatInterval)
	ctx.HeartbeatInterval = defaultHeartbeatInterval
	ctx.HeartbeatTimeout = 2 * defaultHeartbeatInterval
	ctx.conns.cache = make(map[string]*connMeta)

	stopper.RunWorker(func() {
		<-stopper.ShouldQuiesce()

		cancel()
		ctx.conns.Lock()
		for key, meta := range ctx.conns.cache {
			meta.Do(func() {
				// Make sure initialization is not in progress when we're removing the
				// conn. We need to set the error in case we win the race against the
				// real initialization code.
				if meta.err == nil {
					meta.err = &roachpb.NodeUnavailableError{}
				}
			})
			ctx.removeConnLocked(key, meta)
		}
		ctx.conns.Unlock()
	})

	return ctx
}
Example #13
// CreateLocal creates a new local cockroach cluster. The stopper is used to
// gracefully shut down the cluster (e.g. when a signal arrives). The cluster
// must be started before being used and keeps logs in the specified logDir, if
// supplied.
func CreateLocal(
	ctx context.Context, cfg TestConfig, logDir string, privileged bool, stopper *stop.Stopper,
) *LocalCluster {
	select {
	case <-stopper.ShouldStop():
		// The stopper was already closed, exit early.
		os.Exit(1)
	default:
	}

	if *cockroachImage == builderImageFull && !exists(*cockroachBinary) {
		log.Fatalf(ctx, "\"%s\": does not exist", *cockroachBinary)
	}

	cli, err := client.NewEnvClient()
	maybePanic(err)

	retryingClient := retryingDockerClient{
		resilientDockerClient: resilientDockerClient{APIClient: cli},
		attempts:              10,
		timeout:               10 * time.Second,
	}

	clusterID := uuid.MakeV4()
	clusterIDS := clusterID.Short()
	// Only pass a nonzero logDir down to LocalCluster when instructed to keep
	// logs.
	var uniqueLogDir string
	if logDir != "" {
		uniqueLogDir = fmt.Sprintf("%s-%s", logDir, clusterIDS)
	}
	return &LocalCluster{
		clusterID: clusterIDS,
		client:    retryingClient,
		config:    cfg,
		stopper:   stopper,
		// TODO(tschottdorf): deadlocks will occur if these channels fill up.
		events:         make(chan Event, 1000),
		expectedEvents: make(chan Event, 1000),
		logDir:         uniqueLogDir,
		privileged:     privileged,
	}
}
Example #14
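// Start launches a worker that, once per millisecond, processes a single
// queued range (unless the queue is disabled) until the stopper signals
// shutdown, at which point the queue is marked done.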
func (tq *testQueue) Start(clock *hlc.Clock, stopper *stop.Stopper) {
	stopper.RunWorker(func() {
		for {
			select {
			case <-time.After(1 * time.Millisecond):
				tq.Lock()
				if !tq.disabled && len(tq.ranges) > 0 {
					tq.ranges = tq.ranges[1:]
					tq.processed++
				}
				tq.Unlock()
			case <-stopper.ShouldStop():
				tq.Lock()
				tq.done = true
				tq.Unlock()
				return
			}
		}
	})
}
Example #15
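// openStore opens the RocksDB instance in dir with a 512 MB cache and
// registers it with the stopper so it is closed on shutdown.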
func openStore(cmd *cobra.Command, dir string, stopper *stop.Stopper) (*engine.RocksDB, error) {
	cache := engine.NewRocksDBCache(512 << 20)
	defer cache.Release()
	maxOpenFiles, err := server.SetOpenFileLimitForOneStore()
	if err != nil {
		return nil, err
	}
	db, err := engine.NewRocksDB(
		roachpb.Attributes{},
		dir,
		cache,
		0,
		maxOpenFiles,
	)
	if err != nil {
		return nil, err
	}
	stopper.AddCloser(db)
	return db, nil
}
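A sketch of how a debug command might use openStore; stop.NewStopper and stopper.Stop are assumed, and the deferred Stop closes the store that openStore registered with AddCloser.
// Hypothetical debug-command body using openStore.
func runDebugKeys(cmd *cobra.Command, dir string) error {
	stopper := stop.NewStopper()
	defer stopper.Stop() // closes the RocksDB store via the AddCloser registration
	db, err := openStore(cmd, dir, stopper)
	if err != nil {
		return err
	}
	_ = db
	// ... read whatever the command needs from db ...
	return nil
}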
Example #16
// waitEnabled loops, removing replicas from the scanner's queues,
// until scanning is enabled or the stopper signals shutdown.
func (rs *replicaScanner) waitEnabled(stopper *stop.Stopper) bool {
	rs.mu.Lock()
	rs.mu.waitEnabledCount++
	rs.mu.Unlock()
	for {
		if !rs.GetDisabled() {
			return false
		}
		select {
		case <-rs.setDisabledCh:
			continue

		case repl := <-rs.removed:
			rs.removeReplica(repl)

		case <-stopper.ShouldStop():
			return true
		}
	}
}
Example #17
// start will run continuously and expire old reservations.
func (b *bookie) start(stopper *stop.Stopper) {
	stopper.RunWorker(func() {
		var timeoutTimer timeutil.Timer
		defer timeoutTimer.Stop()
		ctx := context.TODO()
		for {
			var timeout time.Duration
			b.mu.Lock()
			nextExpiration := b.mu.queue.peek()
			if nextExpiration == nil {
				// No reservations to expire.
				timeout = b.reservationTimeout
			} else {
				now := b.clock.Now()
				if now.GoTime().After(nextExpiration.expireAt.GoTime()) {
					// We have a reservation expiration, remove it.
					expiredReservation := b.mu.queue.dequeue()
					// Is it an active reservation?
					if b.mu.reservationsByRangeID[expiredReservation.RangeID] == expiredReservation {
						b.fillReservationLocked(ctx, expiredReservation)
					} else if log.V(2) {
						log.Infof(ctx, "[r%d] expired reservation has already been filled",
							expiredReservation.RangeID)
					}
					// Set the timeout to 0 to force another peek.
					timeout = 0
				} else {
					timeout = nextExpiration.expireAt.GoTime().Sub(now.GoTime())
				}
			}
			b.mu.Unlock()
			timeoutTimer.Reset(timeout)
			select {
			case <-timeoutTimer.C:
				timeoutTimer.Read = true
			case <-stopper.ShouldStop():
				return
			}
		}
	})
}
Example #18
// InitSenderForLocalTestCluster initializes a TxnCoordSender that can be used
// with LocalTestCluster.
func InitSenderForLocalTestCluster(
	nodeDesc *roachpb.NodeDescriptor,
	tracer opentracing.Tracer,
	clock *hlc.Clock,
	latency time.Duration,
	stores client.Sender,
	stopper *stop.Stopper,
	gossip *gossip.Gossip,
) client.Sender {
	retryOpts := base.DefaultRetryOptions()
	retryOpts.Closer = stopper.ShouldQuiesce()
	senderTransportFactory := SenderTransportFactory(tracer, stores)
	distSender := NewDistSender(DistSenderConfig{
		Clock:           clock,
		RPCRetryOptions: &retryOpts,
		nodeDescriptor:  nodeDesc,
		TransportFactory: func(
			opts SendOptions,
			rpcContext *rpc.Context,
			replicas ReplicaSlice,
			args roachpb.BatchRequest,
		) (Transport, error) {
			transport, err := senderTransportFactory(opts, rpcContext, replicas, args)
			if err != nil {
				return nil, err
			}
			return &localTestClusterTransport{transport, latency}, nil
		},
	}, gossip)

	ambient := log.AmbientContext{Tracer: tracer}
	return NewTxnCoordSender(
		ambient,
		distSender,
		clock,
		false, /* !linearizable */
		stopper,
		MakeTxnMetrics(metric.TestSampleInterval),
	)
}
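The retryOpts.Closer = stopper.ShouldQuiesce() assignment ties retry loops built from these options to the stopper's lifetime: once the channel closes, the retry loop gives up. A small sketch of that pattern in isolation; retryUntilQuiesce and attempt are placeholders, not part of the source.
// Sketch of a retry loop bound to the stopper via retryOpts.Closer.
func retryUntilQuiesce(ctx context.Context, stopper *stop.Stopper, attempt func() error) error {
	retryOpts := base.DefaultRetryOptions()
	retryOpts.Closer = stopper.ShouldQuiesce()
	var err error
	for r := retry.StartWithCtx(ctx, retryOpts); r.Next(); {
		if err = attempt(); err == nil {
			return nil
		}
	}
	return err // last attempt's error, or nil if the loop never ran
}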
Example #19
// MakeServer constructs a Server that tracks active connections, closing them
// when signalled by the stopper.
func MakeServer(stopper *stop.Stopper, tlsConfig *tls.Config, handler http.Handler) Server {
	var mu syncutil.Mutex
	activeConns := make(map[net.Conn]struct{})
	server := Server{
		Server: &http.Server{
			Handler:   handler,
			TLSConfig: tlsConfig,
			ConnState: func(conn net.Conn, state http.ConnState) {
				mu.Lock()
				switch state {
				case http.StateNew:
					activeConns[conn] = struct{}{}
				case http.StateClosed:
					delete(activeConns, conn)
				}
				mu.Unlock()
			},
			ErrorLog: httpLogger,
		},
	}

	// net/http.(*Server).Serve/http2.ConfigureServer are not thread safe with
	// respect to net/http.(*Server).TLSConfig, so we call it synchronously here.
	if err := http2.ConfigureServer(server.Server, nil); err != nil {
		log.Fatal(context.TODO(), err)
	}

	stopper.RunWorker(func() {
		<-stopper.ShouldStop()

		mu.Lock()
		for conn := range activeConns {
			conn.Close()
		}
		mu.Unlock()
	})

	return server
}
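MakeServer pairs naturally with ServeWith from Example #4: the same stopper guards the serve goroutines and closes any connections still active at shutdown. A hedged sketch of the wiring, with the serveConn body left as a placeholder:
// Hypothetical wiring of MakeServer with ServeWith (Example #4).
func serveHTTPSketch(
	stopper *stop.Stopper, ln net.Listener, tlsConfig *tls.Config, handler http.Handler,
) error {
	srv := MakeServer(stopper, tlsConfig, handler)
	return srv.ServeWith(stopper, ln, func(conn net.Conn) {
		// Placeholder: hand conn to whatever protocol handler applies;
		// the ConnState calls in ServeWith register it in activeConns.
	})
}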
Example #20
// startGossip loops on a periodic ticker to gossip node-related
// information. Starts a goroutine to loop until the node is closed.
func (n *Node) startGossip(stopper *stop.Stopper) {
	stopper.RunWorker(func() {
		ctx := n.AnnotateCtx(context.Background())
		// This should always return immediately and acts as a sanity check that we
		// don't try to gossip before we're connected.
		select {
		case <-n.storeCfg.Gossip.Connected:
		default:
			panic(fmt.Sprintf("%s: not connected to gossip", n))
		}
		// Verify we've already gossiped our node descriptor.
		if _, err := n.storeCfg.Gossip.GetNodeDescriptor(n.Descriptor.NodeID); err != nil {
			panic(err)
		}

		gossipStoresInterval := envutil.EnvOrDefaultDuration("COCKROACH_GOSSIP_STORES_INTERVAL",
			gossip.DefaultGossipStoresInterval)
		statusTicker := time.NewTicker(gossipStatusInterval)
		storesTicker := time.NewTicker(gossipStoresInterval)
		nodeTicker := time.NewTicker(gossipNodeDescriptorInterval)
		defer statusTicker.Stop()
		defer storesTicker.Stop()
		defer nodeTicker.Stop()
		n.gossipStores(ctx) // one-off run before going to sleep
		for {
			select {
			case <-statusTicker.C:
				n.storeCfg.Gossip.LogStatus()
			case <-storesTicker.C:
				n.gossipStores(ctx)
			case <-nodeTicker.C:
				if err := n.storeCfg.Gossip.SetNodeDescriptor(&n.Descriptor); err != nil {
					log.Warningf(ctx, "couldn't gossip descriptor for node %d: %s", n.Descriptor.NodeID, err)
				}
			case <-stopper.ShouldStop():
				return
			}
		}
	})
}
Example #21
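// startStats logs operation throughput once per statsInterval until the
// stopper signals shutdown.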
func startStats(stopper *stop.Stopper) {
	var lastOps int
	ticker := time.NewTicker(statsInterval)
	for {
		select {
		case <-ticker.C:
			stats.Lock()
			opsPerSec := float64(stats.totalOps-lastOps) / float64(statsInterval/1E9)
			log.Printf("%d ops, %d no-user, %d no-photo, %d errs (%.2f/s)", stats.totalOps, stats.noUserOps, stats.noPhotoOps, stats.failedOps, opsPerSec)
			lastOps = stats.totalOps
			stats.Unlock()
		case <-stopper.ShouldStop():
			stats.Lock()
			if !stats.computing {
				stats.computing = true
				//showHistogram()
			}
			stats.Unlock()
			return
		}
	}
}
Example #22
// start will run continuously and mark stores as offline if they haven't been
// heard from in longer than timeUntilStoreDead.
func (sp *StorePool) start(stopper *stop.Stopper) {
	stopper.RunWorker(func() {
		var timeoutTimer timeutil.Timer
		defer timeoutTimer.Stop()
		for {
			var timeout time.Duration
			sp.mu.Lock()
			detail := sp.mu.queue.peek()
			if detail == nil {
				// No stores yet, wait the full timeout.
				timeout = sp.timeUntilStoreDead
			} else {
				// Check to see if the store should be marked as dead.
				deadAsOf := detail.lastUpdatedTime.GoTime().Add(sp.timeUntilStoreDead)
				now := sp.clock.Now()
				if now.GoTime().After(deadAsOf) {
					deadDetail := sp.mu.queue.dequeue()
					deadDetail.markDead(now)
					// The next store might be dead as well, set the timeout to
					// 0 to process it immediately.
					timeout = 0
				} else {
					// Store is still alive, schedule the next check for when
					// it should timeout.
					timeout = deadAsOf.Sub(now.GoTime())
				}
			}
			sp.mu.Unlock()
			timeoutTimer.Reset(timeout)
			select {
			case <-timeoutTimer.C:
				timeoutTimer.Read = true
			case <-stopper.ShouldStop():
				return
			}
		}
	})
}
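Examples #17 and #22 share one shape: peek the next deadline under lock, sleep until it (or a default timeout), and re-check on wakeup; every receive from the timer channel is followed by Read = true, as timeutil.Timer requires. A distilled template with nextDeadline and expireOne as placeholders:
// Distilled expiration-loop shape shared by the bookie and StorePool workers.
func startExpirationLoop(
	stopper *stop.Stopper, defaultTimeout time.Duration,
	nextDeadline func() (time.Time, bool), expireOne func(),
) {
	stopper.RunWorker(func() {
		var timer timeutil.Timer
		defer timer.Stop()
		for {
			timeout := defaultTimeout
			if deadline, ok := nextDeadline(); ok {
				if now := timeutil.Now(); now.After(deadline) {
					expireOne()
					timeout = 0 // force an immediate re-peek
				} else {
					timeout = deadline.Sub(now)
				}
			}
			timer.Reset(timeout)
			select {
			case <-timer.C:
				timer.Read = true // required after reading from the timer channel
			case <-stopper.ShouldStop():
				return
			}
		}
	})
}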
Example #23
// processLoop processes the entries in the queue until the provided
// stopper signals exit.
//
// TODO(spencer): current load should factor into replica processing timer.
func (bq *baseQueue) processLoop(clock *hlc.Clock, stopper *stop.Stopper) {
	stopper.RunWorker(func() {
		ctx := bq.AnnotateCtx(context.Background())
		defer func() {
			bq.mu.Lock()
			bq.mu.stopped = true
			bq.mu.Unlock()
			bq.AmbientContext.FinishEventLog()
		}()

		// nextTime is initially nil; we don't start any timers until the queue
		// becomes non-empty.
		var nextTime <-chan time.Time

		immediately := make(chan time.Time)
		close(immediately)

		for {
			select {
			// Exit on stopper.
			case <-stopper.ShouldStop():
				return

			// Incoming signal sets the next time to process if there were previously
			// no replicas in the queue.
			case <-bq.incoming:
				if nextTime == nil {
					// When a replica is added, wake up immediately. This is mainly
					// to facilitate testing without unnecessary sleeps.
					nextTime = immediately

					// In case we're in a test, still block on the impl.
					bq.impl.timer()
				}
			// Process replicas as the timer expires.
			case <-nextTime:
				repl := bq.pop()
				if repl != nil {
					if stopper.RunTask(func() {
						annotatedCtx := repl.AnnotateCtx(ctx)
						if err := bq.processReplica(annotatedCtx, repl, clock); err != nil {
							// Maybe add failing replica to purgatory if the queue supports it.
							bq.maybeAddToPurgatory(annotatedCtx, repl, err, clock, stopper)
						}
					}) != nil {
						return
					}
				}
				if bq.Length() == 0 {
					nextTime = nil
				} else {
					nextTime = time.After(bq.impl.timer())
				}
			}
		}
	})
}
Example #24
// BidirectionalPartitionNemesis is a nemesis which randomly severs the network
// symmetrically between two random groups of nodes. Partitioned and connected
// modes take alternating turns, with random durations of up to 15s.
func BidirectionalPartitionNemesis(
	ctx context.Context, t *testing.T, c cluster.Cluster, stopper *stop.Stopper,
) {
	randSec := func() time.Duration { return time.Duration(rand.Int63n(15 * int64(time.Second))) }
	log.Infof(ctx, "cleaning up any previous rules")
	_ = restoreNetwork(ctx, t, c) // clean up any potential leftovers
	log.Infof(ctx, "starting partition nemesis")
	for {
		ch := make(chan struct{})
		go func() {
			select {
			case <-time.After(randSec()):
			case <-stopper.ShouldStop():
			}
			close(ch)
		}()
		cutNetwork(ctx, t, c, ch, randomBidirectionalPartition(c.NumNodes())...)
		select {
		case <-stopper.ShouldStop():
			return
		case <-time.After(randSec()):
		}
	}
}
Example #25
// StartHeartbeat starts a periodic heartbeat to refresh this node's
// last heartbeat in the node liveness table.
func (nl *NodeLiveness) StartHeartbeat(ctx context.Context, stopper *stop.Stopper) {
	log.VEventf(ctx, 1, "starting liveness heartbeat")

	stopper.RunWorker(func() {
		ambient := nl.ambientCtx
		ambient.AddLogTag("hb", nil)
		ticker := time.NewTicker(nl.heartbeatInterval)
		defer ticker.Stop()
		for {
			ctx, sp := ambient.AnnotateCtxWithSpan(context.Background(), "heartbeat")
			if err := nl.heartbeat(ctx); err != nil {
				log.Errorf(ctx, "failed liveness heartbeat: %s", err)
			}
			sp.Finish()
			select {
			case <-ticker.C:
			case <-nl.stopHeartbeat:
				return
			case <-stopper.ShouldStop():
				return
			}
		}
	})
}
Example #26
// ListenAndServeGRPC creates a listener and serves the specified grpc Server
// on it, closing the listener when signalled by the stopper.
func ListenAndServeGRPC(
	stopper *stop.Stopper, server *grpc.Server, addr net.Addr,
) (net.Listener, error) {
	ln, err := net.Listen(addr.Network(), addr.String())
	if err != nil {
		return ln, err
	}

	stopper.RunWorker(func() {
		<-stopper.ShouldQuiesce()
		FatalIfUnexpected(ln.Close())
		<-stopper.ShouldStop()
		server.Stop()
	})

	stopper.RunWorker(func() {
		FatalIfUnexpected(server.Serve(ln))
	})
	return ln, nil
}
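A sketch of a caller; grpc.NewServer and the loopback address are assumptions, and service registration is elided.
// Hypothetical caller of ListenAndServeGRPC.
func startGRPCSketch(stopper *stop.Stopper) (net.Listener, error) {
	grpcServer := grpc.NewServer()
	// ... register services on grpcServer ...
	addr, err := net.ResolveTCPAddr("tcp", "127.0.0.1:0")
	if err != nil {
		return nil, err
	}
	return ListenAndServeGRPC(stopper, grpcServer, addr)
}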
Example #27
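// Start launches the scheduler's worker goroutines along with a watcher that
// marks the scheduler stopped and wakes all workers once the stopper signals
// shutdown.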
func (s *raftScheduler) Start(stopper *stop.Stopper) {
	stopper.RunWorker(func() {
		<-stopper.ShouldStop()
		s.mu.Lock()
		s.mu.stopped = true
		s.mu.Unlock()
		s.mu.cond.Broadcast()
	})

	s.done.Add(s.numWorkers)
	for i := 0; i < s.numWorkers; i++ {
		stopper.RunWorker(func() {
			s.worker(stopper)
		})
	}
}
Example #28
// StartHeartbeat starts a periodic heartbeat to refresh this node's
// last heartbeat in the node liveness table.
func (nl *NodeLiveness) StartHeartbeat(ctx context.Context, stopper *stop.Stopper) {
	log.VEventf(ctx, 1, "starting liveness heartbeat")
	retryOpts := base.DefaultRetryOptions()
	retryOpts.Closer = stopper.ShouldQuiesce()

	stopper.RunWorker(func() {
		ambient := nl.ambientCtx
		ambient.AddLogTag("hb", nil)
		ticker := time.NewTicker(nl.heartbeatInterval)
		defer ticker.Stop()
		for {
			if !nl.pauseHeartbeat.Load().(bool) {
				ctx, sp := ambient.AnnotateCtxWithSpan(context.Background(), "heartbeat")
				ctx, cancel := context.WithTimeout(ctx, nl.heartbeatInterval)
				// Retry heartbeat in the event the conditional put fails.
				for r := retry.StartWithCtx(ctx, retryOpts); r.Next(); {
					liveness, err := nl.Self()
					if err != nil && err != ErrNoLivenessRecord {
						log.Errorf(ctx, "unexpected error getting liveness: %v", err)
					}
					if err := nl.Heartbeat(ctx, liveness); err != nil {
						if err == errSkippedHeartbeat {
							continue
						}
						log.Errorf(ctx, "failed liveness heartbeat: %v", err)
					}
					break
				}
				cancel()
				sp.Finish()
			}
			select {
			case <-ticker.C:
			case <-stopper.ShouldStop():
				return
			}
		}
	})
}
Example #29
// maybeAddToPurgatory possibly adds the specified replica to the
// purgatory queue, which holds replicas which have failed
// processing. To be added, the failing error must implement
// purgatoryError and the queue implementation must have its own
// mechanism for signaling re-processing of replicas held in
// purgatory.
func (bq *baseQueue) maybeAddToPurgatory(
	ctx context.Context, repl *Replica, triggeringErr error, clock *hlc.Clock, stopper *stop.Stopper,
) {
	// Increment failures metric here to capture all error returns from
	// process().
	bq.failures.Inc(1)

	// Check whether the failure is a purgatory error and whether the queue supports it.
	if _, ok := triggeringErr.(purgatoryError); !ok || bq.impl.purgatoryChan() == nil {
		log.Error(ctx, triggeringErr)
		return
	}
	bq.mu.Lock()
	defer bq.mu.Unlock()

	// First, check whether the replica has already been re-added to queue.
	if _, ok := bq.mu.replicas[repl.RangeID]; ok {
		return
	}

	log.Error(ctx, errors.Wrap(triggeringErr, "purgatory"))

	item := &replicaItem{value: repl.RangeID}
	bq.mu.replicas[repl.RangeID] = item

	defer func() {
		bq.purgatory.Update(int64(len(bq.mu.purgatory)))
	}()

	// If purgatory already exists, just add to the map and we're done.
	if bq.mu.purgatory != nil {
		bq.mu.purgatory[repl.RangeID] = triggeringErr
		return
	}

	// Otherwise, create purgatory and start processing.
	bq.mu.purgatory = map[roachpb.RangeID]error{
		repl.RangeID: triggeringErr,
	}

	stopper.RunWorker(func() {
		ctx := bq.AnnotateCtx(context.Background())
		ticker := time.NewTicker(purgatoryReportInterval)
		defer ticker.Stop()
		for {
			select {
			case <-bq.impl.purgatoryChan():
				// Remove all items from purgatory into a copied slice.
				bq.mu.Lock()
				ranges := make([]roachpb.RangeID, 0, len(bq.mu.purgatory))
				for rangeID := range bq.mu.purgatory {
					item := bq.mu.replicas[rangeID]
					ranges = append(ranges, item.value)
					bq.remove(item)
				}
				bq.mu.Unlock()
				for _, id := range ranges {
					repl, err := bq.store.GetReplica(id)
					if err != nil {
						log.Errorf(ctx, "range %s no longer exists on store: %s", id, err)
						return
					}
					if stopper.RunTask(func() {
						annotatedCtx := repl.AnnotateCtx(ctx)
						if err := bq.processReplica(annotatedCtx, repl, clock); err != nil {
							bq.maybeAddToPurgatory(annotatedCtx, repl, err, clock, stopper)
						}
					}) != nil {
						return
					}
				}
				bq.mu.Lock()
				if len(bq.mu.purgatory) == 0 {
					log.Infof(ctx, "purgatory is now empty")
					bq.mu.purgatory = nil
					bq.mu.Unlock()
					return
				}
				bq.mu.Unlock()
			case <-ticker.C:
				// Report purgatory status.
				bq.mu.Lock()
				errMap := map[string]int{}
				for _, err := range bq.mu.purgatory {
					errMap[err.Error()]++
				}
				bq.mu.Unlock()
				for errStr, count := range errMap {
					log.Errorf(ctx, "%d replicas failing with %q", count, errStr)
				}
			case <-stopper.ShouldStop():
				return
			}
		}
	})
}
Example #30
// initStores initializes the Stores map from ID to Store. Stores are
// added to the local sender if already bootstrapped. A bootstrapped
// Store has a valid ident with cluster, node and Store IDs set. If
// the Store doesn't yet have a valid ident, it's added to the
// bootstraps list for initialization once the cluster and node IDs
// have been determined.
func (n *Node) initStores(
	ctx context.Context, engines []engine.Engine, stopper *stop.Stopper, bootstrapped bool,
) error {
	var bootstraps []*storage.Store

	if len(engines) == 0 {
		return errors.Errorf("no engines")
	}
	for _, e := range engines {
		s := storage.NewStore(n.storeCfg, e, &n.Descriptor)
		log.Eventf(ctx, "created store for engine: %s", e)
		if bootstrapped {
			s.NotifyBootstrapped()
		}
		// Initialize each store in turn, handling un-bootstrapped errors by
		// adding the store to the bootstraps list.
		if err := s.Start(ctx, stopper); err != nil {
			if _, ok := err.(*storage.NotBootstrappedError); ok {
				log.Infof(ctx, "store %s not bootstrapped", s)
				bootstraps = append(bootstraps, s)
				continue
			}
			return errors.Errorf("failed to start store: %s", err)
		}
		if s.Ident.ClusterID == *uuid.EmptyUUID || s.Ident.NodeID == 0 {
			return errors.Errorf("unidentified store: %s", s)
		}
		capacity, err := s.Capacity()
		if err != nil {
			return errors.Errorf("could not query store capacity: %s", err)
		}
		log.Infof(ctx, "initialized store %s: %+v", s, capacity)
		n.addStore(s)
	}

	// If there are no initialized stores and no gossip resolvers,
	// bootstrap this node as the seed of a new cluster.
	if n.stores.GetStoreCount() == 0 {
		resolvers := n.storeCfg.Gossip.GetResolvers()
		// Check for the case of an uninitialized node having only itself specified as the join host.
		switch len(resolvers) {
		case 0:
			return errNeedsBootstrap
		case 1:
			if resolvers[0].Addr() == n.Descriptor.Address.String() {
				return errCannotJoinSelf
			}
		}
	}

	// Verify all initialized stores agree on cluster and node IDs.
	if err := n.validateStores(); err != nil {
		return err
	}
	log.Event(ctx, "validated stores")

	// Set the stores map as the gossip persistent storage, so that
	// gossip can bootstrap using the most recently persisted set of
	// node addresses.
	if err := n.storeCfg.Gossip.SetStorage(n.stores); err != nil {
		return fmt.Errorf("failed to initialize the gossip interface: %s", err)
	}

	// Connect gossip before starting bootstrap. For new nodes, connecting
	// to the gossip network is necessary to get the cluster ID.
	n.connectGossip(ctx)
	log.Event(ctx, "connected to gossip")

	// If no NodeID has been assigned yet, allocate a new node ID by
	// supplying 0 to initNodeID.
	if n.Descriptor.NodeID == 0 {
		n.initNodeID(0)
		n.initialBoot = true
		log.Eventf(ctx, "allocated node ID %d", n.Descriptor.NodeID)
	}

	// Bootstrap any uninitialized stores asynchronously.
	if len(bootstraps) > 0 {
		if err := stopper.RunAsyncTask(ctx, func(ctx context.Context) {
			n.bootstrapStores(ctx, bootstraps, stopper)
		}); err != nil {
			return err
		}
	}

	return nil
}