Example #1
// processLoop processes the entries in the queue until the provided
// stopper signals exit.
//
// TODO(spencer): current load should factor into replica processing timer.
func (bq *baseQueue) processLoop(clock *hlc.Clock, stopper *stop.Stopper) {
	stopper.RunWorker(func() {
		ctx := bq.AnnotateCtx(context.Background())
		defer func() {
			bq.mu.Lock()
			bq.mu.stopped = true
			bq.mu.Unlock()
			bq.AmbientContext.FinishEventLog()
		}()

		// nextTime is initially nil; we don't start any timers until the queue
		// becomes non-empty.
		var nextTime <-chan time.Time

		// immediately is a closed channel: receiving from it never blocks, so
		// assigning it to nextTime below makes the first tick fire right away.
		immediately := make(chan time.Time)
		close(immediately)

		for {
			select {
			// Exit on stopper.
			case <-stopper.ShouldStop():
				return

			// Incoming signal sets the next time to process if there were previously
			// no replicas in the queue.
			case <-bq.incoming:
				if nextTime == nil {
					// When a replica is added, wake up immediately. This is mainly
					// to facilitate testing without unnecessary sleeps.
					nextTime = immediately

					// In case we're in a test, still block on the impl.
					bq.impl.timer()
				}
			// Process replicas as the timer expires.
			case <-nextTime:
				repl := bq.pop()
				if repl != nil {
					if stopper.RunTask(func() {
						annotatedCtx := repl.AnnotateCtx(ctx)
						if err := bq.processReplica(annotatedCtx, repl, clock); err != nil {
							// Maybe add failing replica to purgatory if the queue supports it.
							bq.maybeAddToPurgatory(annotatedCtx, repl, err, clock, stopper)
						}
					}) != nil {
						return
					}
				}
				if bq.Length() == 0 {
					nextTime = nil
				} else {
					nextTime = time.After(bq.impl.timer())
				}
			}
		}
	})
}
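Two details of this loop are worth calling out: a nil timer channel blocks forever, so the queue sleeps while it is empty, and a pre-closed channel assigned to nextTime makes the first tick fire without any delay. Below is a minimal, self-contained sketch of that shape, with plain channels standing in for stop.Stopper and the queue types; all names here are hypothetical.

package main

import (
	"fmt"
	"time"
)

func main() {
	incoming := make(chan string, 8) // enqueue requests for the worker
	done := make(chan struct{})
	finished := make(chan struct{})

	// immediately is closed up front: receiving from a closed channel never
	// blocks, so the first tick after an enqueue fires right away.
	immediately := make(chan time.Time)
	close(immediately)

	go func() {
		defer close(finished)
		var queue []string
		var nextTime <-chan time.Time // nil channel: blocks forever, i.e. sleep while empty
		for {
			select {
			case <-done:
				return
			case item := <-incoming:
				queue = append(queue, item)
				if nextTime == nil {
					nextTime = immediately // wake up right away on the first enqueue
				}
			case <-nextTime:
				if len(queue) > 0 {
					fmt.Println("processed", queue[0])
					queue = queue[1:]
				}
				if len(queue) == 0 {
					nextTime = nil // back to sleep until the next enqueue
				} else {
					nextTime = time.After(10 * time.Millisecond)
				}
			}
		}
	}()

	incoming <- "a"
	incoming <- "b"
	time.Sleep(100 * time.Millisecond) // crude, but enough for a sketch
	close(done)
	<-finished
}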
Example #2
// gossip loops, sending deltas of the infostore and receiving deltas
// in turn. If an alternate is proposed in a response, the client addr
// is modified and the method returns so the caller can forward to it.
func (c *client) gossip(
	ctx context.Context,
	g *Gossip,
	stream Gossip_GossipClient,
	stopper *stop.Stopper,
	wg *sync.WaitGroup,
) error {
	sendGossipChan := make(chan struct{}, 1)

	// Register a callback for gossip updates.
	updateCallback := func(_ string, _ roachpb.Value) {
		select {
		case sendGossipChan <- struct{}{}:
		default:
		}
	}
	// Defer calling "undoer" callback returned from registration.
	defer g.RegisterCallback(".*", updateCallback)()

	errCh := make(chan error, 1)
	// This wait group is used to allow the caller to wait until gossip
	// processing is terminated.
	wg.Add(1)
	stopper.RunWorker(func() {
		defer wg.Done()

		errCh <- func() error {
			for {
				reply, err := stream.Recv()
				if err != nil {
					return err
				}
				if err := c.handleResponse(ctx, g, reply); err != nil {
					return err
				}
			}
		}()
	})

	for {
		select {
		case <-c.closer:
			return nil
		case <-stopper.ShouldStop():
			return nil
		case err := <-errCh:
			return err
		case <-sendGossipChan:
			if err := c.sendGossip(g, stream); err != nil {
				return err
			}
		}
	}
}
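The sendGossipChan idiom above, a buffered channel of capacity one plus a non-blocking send, coalesces any burst of callback invocations into at most one pending wake-up, so the callback never blocks and the send loop never falls behind. A tiny standalone sketch of just that idiom (all names made up):

package main

import "fmt"

func main() {
	// Capacity 1: a send succeeds only when no notification is already
	// pending, so bursts of updates collapse into a single wake-up.
	notify := make(chan struct{}, 1)
	poke := func() {
		select {
		case notify <- struct{}{}:
		default: // a wake-up is already queued; drop this one
		}
	}

	for i := 0; i < 5; i++ {
		poke() // five pokes, but only the first lands in the buffer
	}

	wakeUps := 0
	for {
		select {
		case <-notify:
			wakeUps++
		default:
			fmt.Println("pending wake-ups after 5 pokes:", wakeUps) // prints 1
			return
		}
	}
}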
Example #3
// Start launches a goroutine that, once the stopper signals, marks the
// scheduler as stopped and broadcasts on the condition variable so blocked
// workers wake up and exit; it also starts the numWorkers worker goroutines.
func (s *raftScheduler) Start(stopper *stop.Stopper) {
	stopper.RunWorker(func() {
		<-stopper.ShouldStop()
		s.mu.Lock()
		s.mu.stopped = true
		s.mu.Unlock()
		s.mu.cond.Broadcast()
	})

	s.done.Add(s.numWorkers)
	for i := 0; i < s.numWorkers; i++ {
		stopper.RunWorker(func() {
			s.worker(stopper)
		})
	}
}
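The shutdown goroutine here relies on the usual sync.Cond handshake: workers block on the condition variable, and stopping means setting a flag under the lock and broadcasting so every worker wakes up, re-checks the flag, and exits. Below is a compact stand-alone sketch of that handshake; the worker body is a guess, since it is not shown above, and all names are hypothetical.

package main

import (
	"fmt"
	"sync"
)

type scheduler struct {
	mu      sync.Mutex
	cond    *sync.Cond
	queue   []int
	stopped bool
}

func newScheduler() *scheduler {
	s := &scheduler{}
	s.cond = sync.NewCond(&s.mu)
	return s
}

// worker sleeps on the condition variable until there is work or the
// scheduler has been stopped; Broadcast wakes it so it re-checks both.
func (s *scheduler) worker(wg *sync.WaitGroup) {
	defer wg.Done()
	s.mu.Lock()
	defer s.mu.Unlock()
	for {
		for len(s.queue) == 0 && !s.stopped {
			s.cond.Wait()
		}
		if len(s.queue) == 0 {
			return // stopped and drained
		}
		item := s.queue[0]
		s.queue = s.queue[1:]
		s.mu.Unlock()
		fmt.Println("worker processed", item) // do the work outside the lock
		s.mu.Lock()
	}
}

func main() {
	s := newScheduler()
	var wg sync.WaitGroup
	for i := 0; i < 2; i++ {
		wg.Add(1)
		go s.worker(&wg)
	}

	s.mu.Lock()
	s.queue = append(s.queue, 1, 2, 3)
	s.mu.Unlock()
	s.cond.Broadcast()

	// Shut down: set the flag under the lock, then wake every worker so it
	// observes the flag (this mirrors the RunWorker body in Start above).
	s.mu.Lock()
	s.stopped = true
	s.mu.Unlock()
	s.cond.Broadcast()
	wg.Wait()
}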
Example #4
// scanLoop loops, scanning through the replicas available via the replica
// set, until the scanner is stopped. The iteration is paced to complete a
// full scan in approximately the scan interval.
func (rs *replicaScanner) scanLoop(clock *hlc.Clock, stopper *stop.Stopper) {
	stopper.RunWorker(func() {
		ctx := rs.AnnotateCtx(context.Background())
		start := timeutil.Now()

		// waitTimer is reset in each call to waitAndProcess.
		defer rs.waitTimer.Stop()

		for {
			if rs.GetDisabled() {
				if done := rs.waitEnabled(stopper); done {
					return
				}
				continue
			}
			var shouldStop bool
			count := 0
			rs.replicas.Visit(func(repl *Replica) bool {
				count++
				shouldStop = rs.waitAndProcess(ctx, start, clock, stopper, repl)
				return !shouldStop
			})
			if count == 0 {
				// No replicas processed, just wait.
				shouldStop = rs.waitAndProcess(ctx, start, clock, stopper, nil)
			}

			shouldStop = shouldStop || nil != stopper.RunTask(func() {
				// Increment iteration count.
				rs.mu.Lock()
				defer rs.mu.Unlock()
				rs.mu.scanCount++
				rs.mu.total += timeutil.Since(start)
				if log.V(6) {
					log.Infof(ctx, "reset replica scan iteration")
				}

				// Reset iteration and start time.
				start = timeutil.Now()
			})
			if shouldStop {
				return
			}
		}
	})
}
Example #5
// NewExecutor creates an Executor and registers a callback on the
// system config.
func NewExecutor(
	cfg ExecutorConfig, stopper *stop.Stopper, startupMemMetrics *MemoryMetrics,
) *Executor {
	exec := &Executor{
		cfg:     cfg,
		reCache: parser.NewRegexpCache(512),

		Latency:          metric.NewLatency(MetaLatency, cfg.MetricsSampleInterval),
		TxnBeginCount:    metric.NewCounter(MetaTxnBegin),
		TxnCommitCount:   metric.NewCounter(MetaTxnCommit),
		TxnAbortCount:    metric.NewCounter(MetaTxnAbort),
		TxnRollbackCount: metric.NewCounter(MetaTxnRollback),
		SelectCount:      metric.NewCounter(MetaSelect),
		UpdateCount:      metric.NewCounter(MetaUpdate),
		InsertCount:      metric.NewCounter(MetaInsert),
		DeleteCount:      metric.NewCounter(MetaDelete),
		DdlCount:         metric.NewCounter(MetaDdl),
		MiscCount:        metric.NewCounter(MetaMisc),
		QueryCount:       metric.NewCounter(MetaQuery),
	}

	exec.systemConfigCond = sync.NewCond(exec.systemConfigMu.RLocker())

	gossipUpdateC := cfg.Gossip.RegisterSystemConfigChannel()
	stopper.RunWorker(func() {
		for {
			select {
			case <-gossipUpdateC:
				sysCfg, _ := cfg.Gossip.GetSystemConfig()
				exec.updateSystemConfig(sysCfg)
			case <-stopper.ShouldStop():
				return
			}
		}
	})

	ctx := log.WithLogTag(context.Background(), "startup", nil)
	startupSession := NewSession(ctx, SessionArgs{}, exec, nil, startupMemMetrics)
	if err := exec.virtualSchemas.init(&startupSession.planner); err != nil {
		log.Fatal(ctx, err)
	}
	startupSession.Finish(exec)

	return exec
}
Example #6
// startComputePeriodicMetrics starts a loop which periodically instructs each
// store to compute the value of metrics which cannot be incrementally
// maintained.
func (n *Node) startComputePeriodicMetrics(stopper *stop.Stopper, interval time.Duration) {
	stopper.RunWorker(func() {
		ctx := n.AnnotateCtx(context.Background())
		// Compute periodic stats at the same frequency as metrics are sampled.
		ticker := time.NewTicker(interval)
		defer ticker.Stop()
		for tick := 0; ; tick++ {
			select {
			case <-ticker.C:
				if err := n.computePeriodicMetrics(tick); err != nil {
					log.Errorf(ctx, "failed computing periodic metrics: %s", err)
				}
			case <-stopper.ShouldStop():
				return
			}
		}
	})
}
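Examples #9, #11, #13, and #15 below follow the same skeleton as this one: a worker goroutine that owns a ticker and selects between the tick and the stopper. A minimal standalone version of that skeleton, with a plain channel standing in for stopper.ShouldStop() (names are hypothetical):

package main

import (
	"fmt"
	"time"
)

func main() {
	stop := make(chan struct{}) // stands in for stopper.ShouldStop()
	done := make(chan struct{})

	go func() {
		defer close(done)
		ticker := time.NewTicker(20 * time.Millisecond)
		defer ticker.Stop()
		for tick := 0; ; tick++ {
			select {
			case <-ticker.C:
				fmt.Println("periodic work, tick", tick)
			case <-stop:
				return
			}
		}
	}()

	time.Sleep(70 * time.Millisecond)
	close(stop)
	<-done
}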
Example #7
// NewContext creates an rpc Context with the supplied values.
func NewContext(
	ambient log.AmbientContext, baseCtx *base.Config, hlcClock *hlc.Clock, stopper *stop.Stopper,
) *Context {
	ctx := &Context{
		Config: baseCtx,
	}
	if hlcClock != nil {
		ctx.localClock = hlcClock
	} else {
		ctx.localClock = hlc.NewClock(hlc.UnixNano)
	}
	ctx.breakerClock = breakerClock{
		clock: ctx.localClock,
	}
	var cancel context.CancelFunc
	ctx.masterCtx, cancel = context.WithCancel(ambient.AnnotateCtx(context.Background()))
	ctx.Stopper = stopper
	ctx.RemoteClocks = newRemoteClockMonitor(
		ctx.masterCtx, ctx.localClock, 10*defaultHeartbeatInterval)
	ctx.HeartbeatInterval = defaultHeartbeatInterval
	ctx.HeartbeatTimeout = 2 * defaultHeartbeatInterval
	ctx.conns.cache = make(map[string]*connMeta)

	stopper.RunWorker(func() {
		<-stopper.ShouldQuiesce()

		cancel()
		ctx.conns.Lock()
		for key, meta := range ctx.conns.cache {
			meta.Do(func() {
				// Make sure initialization is not in progress when we're removing the
				// conn. We need to set the error in case we win the race against the
				// real initialization code.
				if meta.err == nil {
					meta.err = &roachpb.NodeUnavailableError{}
				}
			})
			ctx.removeConnLocked(key, meta)
		}
		ctx.conns.Unlock()
	})

	return ctx
}
Example #8
// ListenAndServeGRPC creates a listener and serves the specified grpc Server
// on it, closing the listener when signalled by the stopper.
func ListenAndServeGRPC(
	stopper *stop.Stopper, server *grpc.Server, addr net.Addr,
) (net.Listener, error) {
	ln, err := net.Listen(addr.Network(), addr.String())
	if err != nil {
		return ln, err
	}

	stopper.RunWorker(func() {
		<-stopper.ShouldQuiesce()
		FatalIfUnexpected(ln.Close())
		<-stopper.ShouldStop()
		server.Stop()
	})

	stopper.RunWorker(func() {
		FatalIfUnexpected(server.Serve(ln))
	})
	return ln, nil
}
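The split between ShouldQuiesce and ShouldStop is the interesting part here: quiescing closes the listener so no new connections arrive while in-flight work drains, and only the later stop signal tears the server down. A rough standalone illustration of that two-phase ordering, using net/http instead of gRPC and plain channels in place of the stopper (names are hypothetical):

package main

import (
	"fmt"
	"net"
	"net/http"
	"time"
)

func main() {
	quiesce := make(chan struct{}) // "stop accepting new work"
	stop := make(chan struct{})    // "tear everything down"

	ln, err := net.Listen("tcp", "127.0.0.1:0")
	if err != nil {
		panic(err)
	}
	srv := &http.Server{Handler: http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		fmt.Fprintln(w, "ok")
	})}

	serveDone := make(chan struct{})
	go func() {
		defer close(serveDone)
		// Serve returns once the listener is closed below.
		_ = srv.Serve(ln)
	}()

	go func() {
		<-quiesce
		_ = ln.Close() // phase 1: refuse new connections, let existing ones finish
		<-stop
		_ = srv.Close() // phase 2: shut the server down
	}()

	// Simulate a clean shutdown.
	close(quiesce)
	time.Sleep(10 * time.Millisecond)
	close(stop)
	<-serveDone
	fmt.Println("server stopped")
}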
Example #9
// Start simulates queue processing: once per millisecond it removes the
// front range (unless the queue is disabled) until the stopper signals,
// at which point it marks the queue as done.
func (tq *testQueue) Start(clock *hlc.Clock, stopper *stop.Stopper) {
	stopper.RunWorker(func() {
		for {
			select {
			case <-time.After(1 * time.Millisecond):
				tq.Lock()
				if !tq.disabled && len(tq.ranges) > 0 {
					tq.ranges = tq.ranges[1:]
					tq.processed++
				}
				tq.Unlock()
			case <-stopper.ShouldStop():
				tq.Lock()
				tq.done = true
				tq.Unlock()
				return
			}
		}
	})
}
Example #10
// start will run continuously and expire old reservations.
func (b *bookie) start(stopper *stop.Stopper) {
	stopper.RunWorker(func() {
		var timeoutTimer timeutil.Timer
		defer timeoutTimer.Stop()
		ctx := context.TODO()
		for {
			var timeout time.Duration
			b.mu.Lock()
			nextExpiration := b.mu.queue.peek()
			if nextExpiration == nil {
				// No reservations to expire.
				timeout = b.reservationTimeout
			} else {
				now := b.clock.Now()
				if now.GoTime().After(nextExpiration.expireAt.GoTime()) {
					// We have a reservation expiration, remove it.
					expiredReservation := b.mu.queue.dequeue()
					// Is it an active reservation?
					if b.mu.reservationsByRangeID[expiredReservation.RangeID] == expiredReservation {
						b.fillReservationLocked(ctx, expiredReservation)
					} else if log.V(2) {
						log.Infof(ctx, "[r%d] expired reservation has already been filled",
							expiredReservation.RangeID)
					}
					// Set the timeout to 0 to force another peek.
					timeout = 0
				} else {
					timeout = nextExpiration.expireAt.GoTime().Sub(now.GoTime())
				}
			}
			b.mu.Unlock()
			timeoutTimer.Reset(timeout)
			select {
			case <-timeoutTimer.C:
				timeoutTimer.Read = true
			case <-stopper.ShouldStop():
				return
			}
		}
	})
}
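This loop and the StorePool loop in Example #14 share one skeleton: peek at the earliest deadline, sleep exactly until it (or a default timeout when nothing is queued), handle whatever has expired, and use a zero timeout to re-peek immediately in case the next entry is already due. A stripped-down sketch of that skeleton with a sorted slice of deadlines and a plain time.Timer in place of timeutil.Timer (all names hypothetical):

package main

import (
	"fmt"
	"time"
)

func main() {
	// deadlines is kept sorted; index 0 is the next expiration. The real code
	// guards its queue with a mutex; here only the worker touches it.
	deadlines := []time.Time{
		time.Now().Add(20 * time.Millisecond),
		time.Now().Add(40 * time.Millisecond),
	}
	const defaultTimeout = 50 * time.Millisecond

	stop := make(chan struct{})
	done := make(chan struct{})

	go func() {
		defer close(done)
		timer := time.NewTimer(0)
		if !timer.Stop() {
			<-timer.C // make sure the channel is drained before the first Reset
		}
		for {
			var timeout time.Duration
			if len(deadlines) == 0 {
				timeout = defaultTimeout // nothing queued; wait the full default
			} else if now := time.Now(); now.After(deadlines[0]) {
				fmt.Println("expired entry scheduled for", deadlines[0].Format("15:04:05.000"))
				deadlines = deadlines[1:]
				timeout = 0 // re-peek immediately; the next entry may be due too
			} else {
				timeout = deadlines[0].Sub(now) // sleep exactly until the next deadline
			}
			timer.Reset(timeout)
			select {
			case <-timer.C:
			case <-stop:
				return
			}
		}
	}()

	time.Sleep(60 * time.Millisecond) // let both entries expire
	close(stop)
	<-done
}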
Example #11
// StartHeartbeat starts a periodic heartbeat to refresh this node's
// last heartbeat in the node liveness table.
func (nl *NodeLiveness) StartHeartbeat(ctx context.Context, stopper *stop.Stopper) {
	log.VEventf(ctx, 1, "starting liveness heartbeat")
	retryOpts := base.DefaultRetryOptions()
	retryOpts.Closer = stopper.ShouldQuiesce()

	stopper.RunWorker(func() {
		ambient := nl.ambientCtx
		ambient.AddLogTag("hb", nil)
		ticker := time.NewTicker(nl.heartbeatInterval)
		defer ticker.Stop()
		for {
			if !nl.pauseHeartbeat.Load().(bool) {
				ctx, sp := ambient.AnnotateCtxWithSpan(context.Background(), "heartbeat")
				ctx, cancel := context.WithTimeout(ctx, nl.heartbeatInterval)
				// Retry heartbeat in the event the conditional put fails.
				for r := retry.StartWithCtx(ctx, retryOpts); r.Next(); {
					liveness, err := nl.Self()
					if err != nil && err != ErrNoLivenessRecord {
						log.Errorf(ctx, "unexpected error getting liveness: %v", err)
					}
					if err := nl.Heartbeat(ctx, liveness); err != nil {
						if err == errSkippedHeartbeat {
							continue
						}
						log.Errorf(ctx, "failed liveness heartbeat: %v", err)
					}
					break
				}
				cancel()
				sp.Finish()
			}
			select {
			case <-ticker.C:
			case <-stopper.ShouldStop():
				return
			}
		}
	})
}
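Each pass through this loop bounds the whole retry sequence with a context deadline equal to the heartbeat interval, retries when the conditional put fails, and gives up for that tick when the deadline expires. A simplified sketch of that per-tick shape using only the standard library; heartbeatOnce and the backoff values are made-up stand-ins for nl.Heartbeat and retryOpts:

package main

import (
	"context"
	"errors"
	"fmt"
	"time"
)

// heartbeatOnce is a stand-in for the real liveness update; it fails twice
// and then succeeds, just to exercise the retry loop.
var attempts int

func heartbeatOnce(ctx context.Context) error {
	attempts++
	if attempts < 3 {
		return errors.New("conditional put failed")
	}
	return nil
}

func main() {
	const interval = 100 * time.Millisecond

	// One "tick" of the heartbeat loop: bound the whole set of retries by the
	// heartbeat interval, back off between attempts, stop on the first success.
	ctx, cancel := context.WithTimeout(context.Background(), interval)
	defer cancel()

	backoff := 5 * time.Millisecond
	for {
		err := heartbeatOnce(ctx)
		if err == nil {
			fmt.Println("heartbeat succeeded after", attempts, "attempts")
			return
		}
		fmt.Println("heartbeat attempt failed:", err)
		select {
		case <-time.After(backoff):
			backoff *= 2 // crude exponential backoff
		case <-ctx.Done():
			fmt.Println("giving up for this tick:", ctx.Err())
			return
		}
	}
}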
Example #12
// MakeServer constructs a Server that tracks active connections, closing
// them when signalled by the stopper.
func MakeServer(stopper *stop.Stopper, tlsConfig *tls.Config, handler http.Handler) Server {
	var mu syncutil.Mutex
	activeConns := make(map[net.Conn]struct{})
	server := Server{
		Server: &http.Server{
			Handler:   handler,
			TLSConfig: tlsConfig,
			ConnState: func(conn net.Conn, state http.ConnState) {
				mu.Lock()
				switch state {
				case http.StateNew:
					activeConns[conn] = struct{}{}
				case http.StateClosed:
					delete(activeConns, conn)
				}
				mu.Unlock()
			},
			ErrorLog: httpLogger,
		},
	}

	// net/http.(*Server).Serve/http2.ConfigureServer are not thread safe with
	// respect to net/http.(*Server).TLSConfig, so we call it synchronously here.
	if err := http2.ConfigureServer(server.Server, nil); err != nil {
		log.Fatal(context.TODO(), err)
	}

	stopper.RunWorker(func() {
		<-stopper.ShouldStop()

		mu.Lock()
		for conn := range activeConns {
			conn.Close()
		}
		mu.Unlock()
	})

	return server
}
Example #13
// startGossip loops on periodic tickers to gossip node-related
// information, starting a goroutine that runs until the stopper signals.
func (n *Node) startGossip(stopper *stop.Stopper) {
	stopper.RunWorker(func() {
		ctx := n.AnnotateCtx(context.Background())
		// This should always return immediately and acts as a sanity check that we
		// don't try to gossip before we're connected.
		select {
		case <-n.storeCfg.Gossip.Connected:
		default:
			panic(fmt.Sprintf("%s: not connected to gossip", n))
		}
		// Verify we've already gossiped our node descriptor.
		if _, err := n.storeCfg.Gossip.GetNodeDescriptor(n.Descriptor.NodeID); err != nil {
			panic(err)
		}

		gossipStoresInterval := envutil.EnvOrDefaultDuration("COCKROACH_GOSSIP_STORES_INTERVAL",
			gossip.DefaultGossipStoresInterval)
		statusTicker := time.NewTicker(gossipStatusInterval)
		storesTicker := time.NewTicker(gossipStoresInterval)
		nodeTicker := time.NewTicker(gossipNodeDescriptorInterval)
		defer statusTicker.Stop()
		defer storesTicker.Stop()
		defer nodeTicker.Stop()
		n.gossipStores(ctx) // one-off run before going to sleep
		for {
			select {
			case <-statusTicker.C:
				n.storeCfg.Gossip.LogStatus()
			case <-storesTicker.C:
				n.gossipStores(ctx)
			case <-nodeTicker.C:
				if err := n.storeCfg.Gossip.SetNodeDescriptor(&n.Descriptor); err != nil {
					log.Warningf(ctx, "couldn't gossip descriptor for node %d: %s", n.Descriptor.NodeID, err)
				}
			case <-stopper.ShouldStop():
				return
			}
		}
	})
}
Example #14
// start will run continuously and mark stores as offline if they haven't been
// heard from in longer than timeUntilStoreDead.
func (sp *StorePool) start(stopper *stop.Stopper) {
	stopper.RunWorker(func() {
		var timeoutTimer timeutil.Timer
		defer timeoutTimer.Stop()
		for {
			var timeout time.Duration
			sp.mu.Lock()
			detail := sp.mu.queue.peek()
			if detail == nil {
				// No stores yet, wait the full timeout.
				timeout = sp.timeUntilStoreDead
			} else {
				// Check to see if the store should be marked as dead.
				deadAsOf := detail.lastUpdatedTime.GoTime().Add(sp.timeUntilStoreDead)
				now := sp.clock.Now()
				if now.GoTime().After(deadAsOf) {
					deadDetail := sp.mu.queue.dequeue()
					deadDetail.markDead(now)
					// The next store might be dead as well, set the timeout to
					// 0 to process it immediately.
					timeout = 0
				} else {
					// Store is still alive, schedule the next check for when
					// it should timeout.
					timeout = deadAsOf.Sub(now.GoTime())
				}
			}
			sp.mu.Unlock()
			timeoutTimer.Reset(timeout)
			select {
			case <-timeoutTimer.C:
				timeoutTimer.Read = true
			case <-stopper.ShouldStop():
				return
			}
		}
	})
}
Example #15
// StartHeartbeat starts a periodic heartbeat to refresh this node's
// last heartbeat in the node liveness table.
func (nl *NodeLiveness) StartHeartbeat(ctx context.Context, stopper *stop.Stopper) {
	log.VEventf(ctx, 1, "starting liveness heartbeat")

	stopper.RunWorker(func() {
		ambient := nl.ambientCtx
		ambient.AddLogTag("hb", nil)
		ticker := time.NewTicker(nl.heartbeatInterval)
		defer ticker.Stop()
		for {
			ctx, sp := ambient.AnnotateCtxWithSpan(context.Background(), "heartbeat")
			if err := nl.heartbeat(ctx); err != nil {
				log.Errorf(ctx, "failed liveness heartbeat: %s", err)
			}
			sp.Finish()
			select {
			case <-ticker.C:
			case <-nl.stopHeartbeat:
				return
			case <-stopper.ShouldStop():
				return
			}
		}
	})
}
Example #16
// maybeAddToPurgatory possibly adds the specified replica to the
// purgatory queue, which holds replicas which have failed
// processing. To be added, the failing error must implement
// purgatoryError and the queue implementation must have its own
// mechanism for signaling re-processing of replicas held in
// purgatory.
func (bq *baseQueue) maybeAddToPurgatory(
	ctx context.Context, repl *Replica, triggeringErr error, clock *hlc.Clock, stopper *stop.Stopper,
) {
	// Increment failures metric here to capture all error returns from
	// process().
	bq.failures.Inc(1)

	// Check whether the failure is a purgatory error and whether the queue supports it.
	if _, ok := triggeringErr.(purgatoryError); !ok || bq.impl.purgatoryChan() == nil {
		log.Error(ctx, triggeringErr)
		return
	}
	bq.mu.Lock()
	defer bq.mu.Unlock()

	// First, check whether the replica has already been re-added to queue.
	if _, ok := bq.mu.replicas[repl.RangeID]; ok {
		return
	}

	log.Error(ctx, errors.Wrap(triggeringErr, "purgatory"))

	item := &replicaItem{value: repl.RangeID}
	bq.mu.replicas[repl.RangeID] = item

	defer func() {
		bq.purgatory.Update(int64(len(bq.mu.purgatory)))
	}()

	// If purgatory already exists, just add to the map and we're done.
	if bq.mu.purgatory != nil {
		bq.mu.purgatory[repl.RangeID] = triggeringErr
		return
	}

	// Otherwise, create purgatory and start processing.
	bq.mu.purgatory = map[roachpb.RangeID]error{
		repl.RangeID: triggeringErr,
	}

	stopper.RunWorker(func() {
		ctx := bq.AnnotateCtx(context.Background())
		ticker := time.NewTicker(purgatoryReportInterval)
		defer ticker.Stop()
		for {
			select {
			case <-bq.impl.purgatoryChan():
				// Remove all items from purgatory into a copied slice.
				bq.mu.Lock()
				ranges := make([]roachpb.RangeID, 0, len(bq.mu.purgatory))
				for rangeID := range bq.mu.purgatory {
					item := bq.mu.replicas[rangeID]
					ranges = append(ranges, item.value)
					bq.remove(item)
				}
				bq.mu.Unlock()
				for _, id := range ranges {
					repl, err := bq.store.GetReplica(id)
					if err != nil {
						log.Errorf(ctx, "range %s no longer exists on store: %s", id, err)
						return
					}
					if stopper.RunTask(func() {
						annotatedCtx := repl.AnnotateCtx(ctx)
						if err := bq.processReplica(annotatedCtx, repl, clock); err != nil {
							bq.maybeAddToPurgatory(annotatedCtx, repl, err, clock, stopper)
						}
					}) != nil {
						return
					}
				}
				bq.mu.Lock()
				if len(bq.mu.purgatory) == 0 {
					log.Infof(ctx, "purgatory is now empty")
					bq.mu.purgatory = nil
					bq.mu.Unlock()
					return
				}
				bq.mu.Unlock()
			case <-ticker.C:
				// Report purgatory status.
				bq.mu.Lock()
				errMap := map[string]int{}
				for _, err := range bq.mu.purgatory {
					errMap[err.Error()]++
				}
				bq.mu.Unlock()
				for errStr, count := range errMap {
					log.Errorf(ctx, "%d replicas failing with %q", count, errStr)
				}
			case <-stopper.ShouldStop():
				return
			}
		}
	})
}
Example #17
// Start starts a goroutine that runs outstanding schema changes
// for tables received in the latest system configuration via gossip.
func (s *SchemaChangeManager) Start(stopper *stop.Stopper) {
	stopper.RunWorker(func() {
		descKeyPrefix := keys.MakeTablePrefix(uint32(sqlbase.DescriptorTable.ID))
		gossipUpdateC := s.gossip.RegisterSystemConfigChannel()
		// A zero-value Timer has a nil C, so the <-timer.C case below blocks
		// until s.newTimer() replaces it.
		timer := &time.Timer{}
		delay := 360 * time.Second
		if s.testingKnobs.AsyncExecQuickly {
			delay = 20 * time.Millisecond
		}
		for {
			select {
			case <-gossipUpdateC:
				cfg, _ := s.gossip.GetSystemConfig()
				// Read all tables and their versions
				if log.V(2) {
					log.Info(context.TODO(), "received a new config")
				}
				schemaChanger := SchemaChanger{
					nodeID:       s.leaseMgr.nodeID.Get(),
					db:           s.db,
					leaseMgr:     s.leaseMgr,
					testingKnobs: s.testingKnobs,
				}
				// Keep track of existing schema changers.
				oldSchemaChangers := make(map[sqlbase.ID]struct{}, len(s.schemaChangers))
				for k := range s.schemaChangers {
					oldSchemaChangers[k] = struct{}{}
				}
				execAfter := timeutil.Now().Add(delay)
				// Loop through the configuration to find all the tables.
				for _, kv := range cfg.Values {
					if !bytes.HasPrefix(kv.Key, descKeyPrefix) {
						continue
					}
					// Attempt to unmarshal config into a table/database descriptor.
					var descriptor sqlbase.Descriptor
					if err := kv.Value.GetProto(&descriptor); err != nil {
						log.Warningf(context.TODO(), "%s: unable to unmarshal descriptor %v", kv.Key, kv.Value)
						continue
					}
					switch union := descriptor.Union.(type) {
					case *sqlbase.Descriptor_Table:
						table := union.Table
						table.MaybeUpgradeFormatVersion()
						if err := table.ValidateTable(); err != nil {
							log.Errorf(context.TODO(), "%s: received invalid table descriptor: %v", kv.Key, table)
							continue
						}

						// Keep track of outstanding schema changes. Even though every
						// schema change command sets UpVersion, we also check for the
						// presence of mutations: a schema change execution might fail
						// soon after unsetting UpVersion, and we still want to process
						// its outstanding mutations. The same applies to a table marked
						// for deletion.
						if table.UpVersion || table.Dropped() || table.Adding() ||
							table.Renamed() || len(table.Mutations) > 0 {
							if log.V(2) {
								log.Infof(context.TODO(), "%s: queue up pending schema change; table: %d, version: %d",
									kv.Key, table.ID, table.Version)
							}

							// Only track the first schema change. We depend on
							// gossip to renotify us when a schema change has been
							// completed.
							schemaChanger.tableID = table.ID
							if len(table.Mutations) == 0 {
								schemaChanger.mutationID = sqlbase.InvalidMutationID
							} else {
								schemaChanger.mutationID = table.Mutations[0].MutationID
							}
							schemaChanger.execAfter = execAfter
							// Keep track of this schema change.
							// Remove from oldSchemaChangers map.
							delete(oldSchemaChangers, table.ID)
							if sc, ok := s.schemaChangers[table.ID]; ok {
								if sc.mutationID == schemaChanger.mutationID {
									// Ignore duplicate.
									continue
								}
							}
							s.schemaChangers[table.ID] = schemaChanger
						}

					case *sqlbase.Descriptor_Database:
						// Ignore.
					}
				}
				// Delete old schema changers.
				for k := range oldSchemaChangers {
					delete(s.schemaChangers, k)
				}
				timer = s.newTimer()

			case <-timer.C:
				if s.testingKnobs.AsyncExecNotification != nil &&
					s.testingKnobs.AsyncExecNotification() != nil {
					timer = s.newTimer()
					continue
				}
				for tableID, sc := range s.schemaChangers {
					if timeutil.Since(sc.execAfter) > 0 {
						err := sc.exec()
						if err != nil {
							if err == errExistingSchemaChangeLease {
								// A schema change lease is already held; do nothing and
								// let a later timer tick retry.
							} else if err == sqlbase.ErrDescriptorNotFound {
								// Someone deleted this table. Don't try to run the schema
								// changer again. Note that there's no gossip update for the
								// deletion which would remove this schemaChanger.
								delete(s.schemaChangers, tableID)
							} else {
								// We don't need to act on integrity
								// constraints violations because exec()
								// purges mutations that violate integrity
								// constraints.
								log.Warningf(context.TODO(), "Error executing schema change: %s", err)
							}
						}
						// Advance the execAfter time so that this schema
						// changer doesn't get called again for a while.
						sc.execAfter = timeutil.Now().Add(delay)
					}
					// Only attempt to run one schema changer.
					break
				}
				timer = s.newTimer()

			case <-stopper.ShouldStop():
				return
			}
		}
	})
}
Example #18
// RefreshLeases starts a goroutine that refreshes the lease manager
// leases for tables received in the latest system configuration via gossip.
func (m *LeaseManager) RefreshLeases(s *stop.Stopper, db *client.DB, gossip *gossip.Gossip) {
	s.RunWorker(func() {
		descKeyPrefix := keys.MakeTablePrefix(uint32(sqlbase.DescriptorTable.ID))
		gossipUpdateC := gossip.RegisterSystemConfigChannel()
		for {
			select {
			case <-gossipUpdateC:
				cfg, _ := gossip.GetSystemConfig()
				if m.testingKnobs.GossipUpdateEvent != nil {
					m.testingKnobs.GossipUpdateEvent(cfg)
				}
				// Read all tables and their versions
				if log.V(2) {
					log.Info(context.TODO(), "received a new config; will refresh leases")
				}

				// Loop through the configuration to find all the tables.
				for _, kv := range cfg.Values {
					if !bytes.HasPrefix(kv.Key, descKeyPrefix) {
						continue
					}
					// Attempt to unmarshal config into a table/database descriptor.
					var descriptor sqlbase.Descriptor
					if err := kv.Value.GetProto(&descriptor); err != nil {
						log.Warningf(context.TODO(), "%s: unable to unmarshal descriptor %v", kv.Key, kv.Value)
						continue
					}
					switch union := descriptor.Union.(type) {
					case *sqlbase.Descriptor_Table:
						table := union.Table
						table.MaybeUpgradeFormatVersion()
						if err := table.ValidateTable(); err != nil {
							log.Errorf(context.TODO(), "%s: received invalid table descriptor: %v", kv.Key, table)
							continue
						}
						if log.V(2) {
							log.Infof(context.TODO(), "%s: refreshing lease table: %d (%s), version: %d, deleted: %t",
								kv.Key, table.ID, table.Name, table.Version, table.Dropped())
						}
						// Try to refresh the table lease to one >= this version.
						if t := m.findTableState(table.ID, false /* create */); t != nil {
							if err := t.purgeOldLeases(
								db, table.Dropped(), table.Version, m.LeaseStore); err != nil {
								log.Warningf(context.TODO(), "error purging leases for table %d(%s): %s",
									table.ID, table.Name, err)
							}
						}
					case *sqlbase.Descriptor_Database:
						// Ignore.
					}
				}
				if m.testingKnobs.TestingLeasesRefreshedEvent != nil {
					m.testingKnobs.TestingLeasesRefreshedEvent(cfg)
				}

			case <-s.ShouldStop():
				return
			}
		}
	})
}
Example #19
// start dials the remote addr and commences gossip once connected. Upon exit,
// the client is sent on the disconnected channel. This method starts client
// processing in a goroutine and returns immediately.
func (c *client) start(
	g *Gossip,
	disconnected chan *client,
	rpcCtx *rpc.Context,
	stopper *stop.Stopper,
	nodeID roachpb.NodeID,
	breaker *circuit.Breaker,
) {
	stopper.RunWorker(func() {
		ctx, cancel := context.WithCancel(c.AnnotateCtx(context.Background()))
		var wg sync.WaitGroup
		defer func() {
			// This closes the outgoing stream, causing any attempt to send or
			// receive to return an error.
			//
			// Note: it is still possible for incoming gossip to be processed after
			// this point.
			cancel()

			// The stream is closed, but there may still be some incoming gossip
			// being processed. Wait until that is complete to avoid racing the
			// client's removal against the discovery of its remote's node ID.
			wg.Wait()
			disconnected <- c
		}()

		consecFailures := breaker.ConsecFailures()
		var stream Gossip_GossipClient
		if err := breaker.Call(func() error {
			// Note: avoid using `grpc.WithBlock` here. This code is already
			// asynchronous from the caller's perspective, so the only effect of
			// `WithBlock` here is blocking shutdown - at the time of this writing,
			// that ends up making `kv` tests take twice as long.
			conn, err := rpcCtx.GRPCDial(c.addr.String())
			if err != nil {
				return err
			}
			if stream, err = NewGossipClient(conn).Gossip(ctx); err != nil {
				return err
			}
			return c.requestGossip(g, stream)
		}, 0); err != nil {
			if consecFailures == 0 {
				log.Warningf(ctx, "node %d: failed to start gossip client: %s", nodeID, err)
			}
			return
		}

		// Start gossiping.
		log.Infof(ctx, "node %d: started gossip client to %s", nodeID, c.addr)
		if err := c.gossip(ctx, g, stream, stopper, &wg); err != nil {
			if !grpcutil.IsClosedConnection(err) {
				g.mu.Lock()
				if c.peerID != 0 {
					log.Infof(ctx, "node %d: closing client to node %d (%s): %s", nodeID, c.peerID, c.addr, err)
				} else {
					log.Infof(ctx, "node %d: closing client to %s: %s", nodeID, c.addr, err)
				}
				g.mu.Unlock()
			}
		}
	})
}