// startUser simulates a stream of user events until the stopper
// indicates it's time to exit.
func startUser(ctx Context, stopper *stop.Stopper) {
	for {
		userID := 1 + int(rand.ExpFloat64()/rate)
		op := randomOp()

		if err := stopper.RunTask(func() {
			err := runUserOp(ctx, userID, op.typ)
			stats.Lock()
			_ = stats.hist.RecordValue(int64(userID))
			stats.totalOps++
			stats.opCounts[op.typ]++
			switch {
			case err == errNoUser:
				stats.noUserOps++
			case err == errNoPhoto:
				stats.noPhotoOps++
			case err != nil:
				stats.failedOps++
				log.Printf("failed to run %s op for %d: %s", op.name, userID, err)
			}
			stats.Unlock()
		}); err != nil {
			return
		}
	}
}
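startUser leans on one property of the stopper: once shutdown begins, RunTask refuses to run the closure and returns a non-nil error, which is what breaks the infinite loop. The toy Stopper below illustrates just that contract; it mirrors the real stop.Stopper's method names but is a from-scratch sketch, not the CockroachDB implementation.

// Toy Stopper sketch (not the real stop package): RunTask runs work
// synchronously until Stop begins draining, after which it returns an
// error so callers like startUser can exit their loops.
package main

import (
	"errors"
	"fmt"
	"sync"
	"time"
)

type Stopper struct {
	mu       sync.Mutex
	draining bool
	tasks    sync.WaitGroup // tracks in-flight RunTask calls
}

var errDraining = errors.New("stopper is draining")

// RunTask runs f synchronously unless draining has begun.
func (s *Stopper) RunTask(f func()) error {
	s.mu.Lock()
	if s.draining {
		s.mu.Unlock()
		return errDraining
	}
	s.tasks.Add(1)
	s.mu.Unlock()
	defer s.tasks.Done()
	f()
	return nil
}

// Stop flips the draining flag, then waits for in-flight tasks.
func (s *Stopper) Stop() {
	s.mu.Lock()
	s.draining = true
	s.mu.Unlock()
	s.tasks.Wait()
}

func main() {
	s := &Stopper{}
	done := make(chan struct{})
	go func() {
		defer close(done)
		for {
			if err := s.RunTask(func() { time.Sleep(time.Millisecond) }); err != nil {
				fmt.Println("exiting:", err) // mirrors startUser's return on error
				return
			}
		}
	}()
	time.Sleep(10 * time.Millisecond)
	s.Stop()
	<-done
}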
// processLoop processes the entries in the queue until the provided
// stopper signals exit.
//
// TODO(spencer): current load should factor into replica processing timer.
func (bq *baseQueue) processLoop(clock *hlc.Clock, stopper *stop.Stopper) {
	stopper.RunWorker(func() {
		ctx := bq.AnnotateCtx(context.Background())
		defer func() {
			bq.mu.Lock()
			bq.mu.stopped = true
			bq.mu.Unlock()
			bq.AmbientContext.FinishEventLog()
		}()

		// nextTime is initially nil; we don't start any timers until the queue
		// becomes non-empty.
		var nextTime <-chan time.Time

		immediately := make(chan time.Time)
		close(immediately)

		for {
			select {
			// Exit on stopper.
			case <-stopper.ShouldStop():
				return

			// Incoming signal sets the next time to process if there were previously
			// no replicas in the queue.
			case <-bq.incoming:
				if nextTime == nil {
					// When a replica is added, wake up immediately. This is mainly
					// to facilitate testing without unnecessary sleeps.
					nextTime = immediately

					// In case we're in a test, still block on the impl.
					bq.impl.timer()
				}

			// Process replicas as the timer expires.
			case <-nextTime:
				repl := bq.pop()
				if repl != nil {
					if stopper.RunTask(func() {
						annotatedCtx := repl.AnnotateCtx(ctx)
						if err := bq.processReplica(annotatedCtx, repl, clock); err != nil {
							// Maybe add failing replica to purgatory if the queue supports it.
							bq.maybeAddToPurgatory(annotatedCtx, repl, err, clock, stopper)
						}
					}) != nil {
						return
					}
				}
				if bq.Length() == 0 {
					nextTime = nil
				} else {
					nextTime = time.After(bq.impl.timer())
				}
			}
		}
	})
}
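Two select idioms do the heavy lifting in processLoop: a receive from a nil channel blocks forever, so nextTime = nil disables the processing case while the queue is empty, and a receive from a closed channel succeeds immediately, so the pre-closed immediately channel behaves like a timer that has already fired. A standalone sketch of both:

// Sketch of the nil-channel and closed-channel tricks from processLoop;
// illustration only, independent of the queue code above.
package main

import (
	"fmt"
	"time"
)

func main() {
	var nextTime <-chan time.Time // nil: its case can never be chosen

	immediately := make(chan time.Time)
	close(immediately)

	// Phase 1: nextTime is nil, so only the timeout case can fire.
	select {
	case <-nextTime:
		fmt.Println("unreachable while nextTime is nil")
	case <-time.After(10 * time.Millisecond):
		fmt.Println("nil channel case was skipped")
	}

	// Phase 2: a closed channel is always ready; the receive yields a
	// zero time.Time without blocking.
	nextTime = immediately
	t := <-nextTime
	fmt.Println("fired immediately, zero value:", t.IsZero())
}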
// scanLoop loops endlessly, scanning through replicas available via
// the replica set, or until the scanner is stopped. The iteration
// is paced to complete a full scan in approximately the scan interval.
func (rs *replicaScanner) scanLoop(clock *hlc.Clock, stopper *stop.Stopper) {
	stopper.RunWorker(func() {
		ctx := rs.AnnotateCtx(context.Background())
		start := timeutil.Now()

		// waitTimer is reset in each call to waitAndProcess.
		defer rs.waitTimer.Stop()

		for {
			if rs.GetDisabled() {
				if done := rs.waitEnabled(stopper); done {
					return
				}
				continue
			}
			var shouldStop bool
			count := 0
			rs.replicas.Visit(func(repl *Replica) bool {
				count++
				shouldStop = rs.waitAndProcess(ctx, start, clock, stopper, repl)
				return !shouldStop
			})
			if count == 0 {
				// No replicas processed, just wait.
				shouldStop = rs.waitAndProcess(ctx, start, clock, stopper, nil)
			}

			shouldStop = shouldStop || nil != stopper.RunTask(func() {
				// Increment iteration count.
				rs.mu.Lock()
				defer rs.mu.Unlock()
				rs.mu.scanCount++
				rs.mu.total += timeutil.Since(start)
				if log.V(6) {
					log.Infof(ctx, "reset replica scan iteration")
				}

				// Reset iteration and start time.
				start = timeutil.Now()
			})
			if shouldStop {
				return
			}
		}
	})
}
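waitAndProcess isn't reproduced here, but the pacing it implements can be sketched: divide the scan interval evenly across the replica count and compute when the next replica is due, relative to the start of the pass. The paceInterval helper below is a hypothetical reconstruction, not the scanner's actual code.

// Hypothetical pacing sketch: spread count replicas evenly over the scan
// interval, measured from the start of the current pass.
package main

import (
	"fmt"
	"time"
)

// paceInterval returns how long to wait before processing the next
// replica so that count replicas complete in roughly interval. A
// non-positive result means we are behind schedule and should process
// immediately.
func paceInterval(start, now time.Time, interval time.Duration, count, processed int) time.Duration {
	if count == 0 {
		return interval
	}
	// Target completion time for the replica we are about to process.
	target := start.Add(time.Duration(processed+1) * interval / time.Duration(count))
	return target.Sub(now)
}

func main() {
	start := time.Now()
	// With 5 replicas and a 10s interval, each replica is due 2s apart.
	for processed := 0; processed < 5; processed++ {
		wait := paceInterval(start, time.Now(), 10*time.Second, 5, processed)
		fmt.Printf("replica %d due in ~%v\n", processed, wait.Round(time.Second))
	}
}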
// maybeAddToPurgatory possibly adds the specified replica to the
// purgatory queue, which holds replicas that have failed
// processing. To be added, the failing error must implement
// purgatoryError and the queue implementation must have its own
// mechanism for signaling re-processing of replicas held in
// purgatory.
func (bq *baseQueue) maybeAddToPurgatory(
	ctx context.Context,
	repl *Replica,
	triggeringErr error,
	clock *hlc.Clock,
	stopper *stop.Stopper,
) {
	// Increment failures metric here to capture all error returns from
	// process().
	bq.failures.Inc(1)

	// Check whether the failure is a purgatory error and whether the queue supports it.
	if _, ok := triggeringErr.(purgatoryError); !ok || bq.impl.purgatoryChan() == nil {
		log.Error(ctx, triggeringErr)
		return
	}
	bq.mu.Lock()
	defer bq.mu.Unlock()

	// First, check whether the replica has already been re-added to queue.
	if _, ok := bq.mu.replicas[repl.RangeID]; ok {
		return
	}

	log.Error(ctx, errors.Wrap(triggeringErr, "purgatory"))

	item := &replicaItem{value: repl.RangeID}
	bq.mu.replicas[repl.RangeID] = item

	defer func() {
		bq.purgatory.Update(int64(len(bq.mu.purgatory)))
	}()

	// If purgatory already exists, just add to the map and we're done.
	if bq.mu.purgatory != nil {
		bq.mu.purgatory[repl.RangeID] = triggeringErr
		return
	}

	// Otherwise, create purgatory and start processing.
	bq.mu.purgatory = map[roachpb.RangeID]error{
		repl.RangeID: triggeringErr,
	}

	stopper.RunWorker(func() {
		ctx := bq.AnnotateCtx(context.Background())
		ticker := time.NewTicker(purgatoryReportInterval)
		// Stop the ticker when this worker exits so it doesn't leak.
		defer ticker.Stop()
		for {
			select {
			case <-bq.impl.purgatoryChan():
				// Remove all items from purgatory into a copied slice.
				bq.mu.Lock()
				ranges := make([]roachpb.RangeID, 0, len(bq.mu.purgatory))
				for rangeID := range bq.mu.purgatory {
					item := bq.mu.replicas[rangeID]
					ranges = append(ranges, item.value)
					bq.remove(item)
				}
				bq.mu.Unlock()
				for _, id := range ranges {
					repl, err := bq.store.GetReplica(id)
					if err != nil {
						log.Errorf(ctx, "range %s no longer exists on store: %s", id, err)
						return
					}
					if stopper.RunTask(func() {
						annotatedCtx := repl.AnnotateCtx(ctx)
						if err := bq.processReplica(annotatedCtx, repl, clock); err != nil {
							bq.maybeAddToPurgatory(annotatedCtx, repl, err, clock, stopper)
						}
					}) != nil {
						return
					}
				}
				bq.mu.Lock()
				if len(bq.mu.purgatory) == 0 {
					log.Infof(ctx, "purgatory is now empty")
					bq.mu.purgatory = nil
					bq.mu.Unlock()
					return
				}
				bq.mu.Unlock()
			case <-ticker.C:
				// Report purgatory status.
				bq.mu.Lock()
				errMap := map[string]int{}
				for _, err := range bq.mu.purgatory {
					errMap[err.Error()]++
				}
				bq.mu.Unlock()
				for errStr, count := range errMap {
					log.Errorf(ctx, "%d replicas failing with %q", count, errStr)
				}
			case <-stopper.ShouldStop():
				return
			}
		}
	})
}
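The triggeringErr.(purgatoryError) assertion works because purgatoryError is a marker interface: error types opt into purgatory by implementing it, and everything else fails hard. A sketch of the convention follows; the marker method name here is hypothetical, since the interface body isn't shown in the excerpt.

// Sketch of the purgatoryError convention. Illustration only; the real
// interface lives in the storage package and may differ in detail.
package main

import (
	"errors"
	"fmt"
)

// purgatoryError marks errors that indicate a replica should wait in
// purgatory rather than count as a hard failure. The method is a no-op;
// only membership in the method set matters. (Hypothetical name.)
type purgatoryError interface {
	error
	purgatoryErrorMarker()
}

type rangeNotReadyError struct{ msg string }

func (e *rangeNotReadyError) Error() string         { return e.msg }
func (e *rangeNotReadyError) purgatoryErrorMarker() {}

func main() {
	errs := []error{
		&rangeNotReadyError{"waiting for split"},
		errors.New("disk on fire"),
	}
	for _, err := range errs {
		if _, ok := err.(purgatoryError); ok {
			fmt.Printf("purgatory: %v\n", err)
		} else {
			fmt.Printf("hard failure: %v\n", err)
		}
	}
}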