Example #1
0
// checkAllocatorStable returns whether the replica distribution within the
// cluster has been stable for at least `StableInterval`. Only unrecoverable
// errors are returned.
func (at *allocatorTest) checkAllocatorStable(db *gosql.DB) (bool, error) {
	q := `SELECT NOW()-timestamp, rangeID, storeID, eventType FROM rangelog WHERE ` +
		`timestamp=(SELECT MAX(timestamp) FROM rangelog WHERE eventType IN ($1, $2, $3))`
	eventTypes := []interface{}{
		string(storage.RangeEventLogSplit),
		string(storage.RangeEventLogAdd),
		string(storage.RangeEventLogRemove),
	}
	var elapsedStr string
	var rangeID int64
	var storeID int64
	var eventType string

	row := db.QueryRow(q, eventTypes...)
	if err := row.Scan(&elapsedStr, &rangeID, &storeID, &eventType); err != nil {
		// QueryRow never returns nil; the absence of any matching range events
		// surfaces here as gosql.ErrNoRows.
		if err == gosql.ErrNoRows {
			log.Errorf("couldn't find any range events")
			return false, nil
		}
		// Log but don't return other errors, to increase resilience against
		// transient errors.
		log.Errorf("error checking rebalancer: %s", err)
		return false, nil
	}
	elapsedSinceLastRangeEvent, err := time.ParseDuration(elapsedStr)
	if err != nil {
		return false, err
	}

	log.Infof("last range event: %s for range %d/store %d (%s ago)",
		eventType, rangeID, storeID, elapsedSinceLastRangeEvent)
	return elapsedSinceLastRangeEvent >= StableInterval, nil
}
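A test harness would typically poll this check until the cluster settles or a deadline passes. Below is a minimal sketch of such a loop using only the standard `time` and `fmt` packages; the `checkStable` callback and the ten-second interval are assumptions for illustration, not the harness's actual driver.

// waitForStableCluster polls checkStable until it reports stability, the
// deadline passes, or an unrecoverable error is returned.
func waitForStableCluster(checkStable func() (bool, error), deadline time.Time) error {
	ticker := time.NewTicker(10 * time.Second)
	defer ticker.Stop()
	for range ticker.C {
		if time.Now().After(deadline) {
			return fmt.Errorf("cluster failed to stabilize before %s", deadline)
		}
		stable, err := checkStable()
		if err != nil {
			// Per checkAllocatorStable's contract, errors are unrecoverable.
			return err
		}
		if stable {
			return nil
		}
	}
	return nil
}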
Example #2
0
// runGetZone retrieves the zone config for a given object id,
// and if present, outputs its YAML representation.
// TODO(marc): accept db/table names rather than IDs.
func runGetZone(cmd *cobra.Command, args []string) {
	if len(args) != 1 {
		mustUsage(cmd)
		return
	}
	id, err := strconv.Atoi(args[0])
	if err != nil {
		log.Errorf("could not parse object ID %s", args[0])
		return
	}

	db, _ := makeSQLClient()
	defer func() { _ = db.Close() }()
	// TODO(marc): switch to placeholders once they work with pgwire.
	_, rows, err := runQueryWithFormat(db, fmtMap{"config": formatZone},
		fmt.Sprintf(`SELECT * FROM system.zones WHERE id=%d`, id))
	if err != nil {
		log.Error(err)
		return
	}

	if len(rows) == 0 {
		log.Errorf("Object %d: no zone config found", id)
		return
	}
	fmt.Println(rows[0][1])
}
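Once placeholders work over pgwire, the interpolated query above can become a bound parameter, as the later revision of this command in Example #10 does. The following is a small sketch of the same lookup against the standard `database/sql` API; the `config` column name is inferred from the `rows[0][1]` access above and should be treated as an assumption.

// queryZoneConfig fetches the zone config for the given object ID using a
// bound parameter instead of string interpolation.
func queryZoneConfig(db *sql.DB, id int) ([]byte, error) {
	var config []byte
	err := db.QueryRow(`SELECT config FROM system.zones WHERE id = $1`, id).Scan(&config)
	return config, err
}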
Example #3
0
// runStart starts the cockroach node using -stores as the list of
// storage devices ("stores") on this machine and -gossip as the list
// of "well-known" hosts used to join this node to the cockroach
// cluster via the gossip network.
func runStart(cmd *commander.Command, args []string) {
	log.Info("Starting cockroach cluster")
	s, err := newServer()
	if err != nil {
		log.Errorf("Failed to start Cockroach server: %v", err)
		return
	}

	// Init engines from -stores.
	engines, err := initEngines(*stores)
	if err != nil {
		log.Errorf("Failed to initialize engines from -stores=%q: %v", *stores, err)
		return
	}
	if len(engines) == 0 {
		log.Errorf("No valid engines specified after initializing from -stores=%q", *stores)
		return
	}

	err = s.start(engines, false)
	defer s.stop()
	if err != nil {
		log.Errorf("Cockroach server exited with error: %v", err)
		return
	}

	c := make(chan os.Signal, 1)
	// Note that os.Kill (SIGKILL) cannot be trapped; only the interrupt is
	// actually delivered on this channel.
	signal.Notify(c, os.Interrupt, os.Kill)

	// Block until one of the signals above is received.
	<-c
}
Example #4
0
func runInit(cmd *commander.Command, args []string) {
	// Initialize the context, which sets up the engines, and then
	// verify the first engine is not in-memory.

	err := Context.Init()
	if err != nil {
		log.Errorf("Failed to initialize context: %v", err)
		return
	}
	e := Context.Engines[0]
	if _, ok := e.(*engine.InMem); ok {
		log.Errorf("Cannot initialize a cluster using an in-memory store")
		return
	}
	// Generate a new UUID for cluster ID and bootstrap the cluster.
	clusterID := uuid.New()
	localDB, err := server.BootstrapCluster(clusterID, e)
	if err != nil {
		log.Errorf("Failed to bootstrap cluster: %v", err)
		return
	}
	// Close localDB and bootstrap engine.
	localDB.Close()
	e.Stop()

	fmt.Printf("Cockroach cluster %s has been initialized\n", clusterID)
	if Context.BootstrapOnly {
		fmt.Printf("To start the cluster, run \"cockroach start\"\n")
		return
	}
	runStart(cmd, args)
}
Example #5
0
func runInit(cmd *commander.Command, args []string) {
	// Initialize the engines from -stores and then verify the
	// first one is not in-memory.
	engines, err := initEngines(*stores)
	if err != nil {
		log.Errorf("Failed to initialize engines from -stores=%s: %v", *stores, err)
		return
	}
	if len(engines) == 0 {
		log.Errorf("No valid engines specified after initializing from -stores=%s", *stores)
		return
	}
	e := engines[0]
	if _, ok := e.(*engine.InMem); ok {
		log.Errorf("Cannot initialize a cluster using an in-memory store")
		return
	}
	// Generate a new UUID for cluster ID and bootstrap the cluster.
	clusterID := uuid.New()
	localDB, err := BootstrapCluster(clusterID, e)
	if err != nil {
		log.Errorf("Failed to bootstrap cluster: %v", err)
		return
	}
	// Close localDB and bootstrap engine.
	localDB.Close()
	e.Stop()

	fmt.Printf("Cockroach cluster %s has been initialized\n", clusterID)
	if *bootstrapOnly {
		fmt.Printf("To start the cluster, run \"cockroach start\"\n")
		return
	}
	runStart(cmd, args)
}
Example #6
0
// computeSplitKeys returns an array of keys at which the supplied
// range should be split, as computed by intersecting the range with
// accounting and zone config map boundaries.
func computeSplitKeys(g *gossip.Gossip, rng *Range) []proto.Key {
	// Now split the range into pieces by intersecting it with the
	// boundaries of the config map.
	splitKeys := proto.KeySlice{}
	for _, configKey := range []string{gossip.KeyConfigAccounting, gossip.KeyConfigZone} {
		info, err := g.GetInfo(configKey)
		if err != nil {
			log.Errorf("unable to fetch %s config from gossip: %s", configKey, err)
			continue
		}
		configMap := info.(PrefixConfigMap)
		splits, err := configMap.SplitRangeByPrefixes(rng.Desc().StartKey, rng.Desc().EndKey)
		if err != nil {
			log.Errorf("unable to split %s by prefix map %s", rng, configMap)
			continue
		}
		// Gather new splits.
		for _, split := range splits {
			if split.end.Less(rng.Desc().EndKey) {
				splitKeys = append(splitKeys, split.end)
			}
		}
	}

	// Sort and unique the combined split keys from intersections with
	// both the accounting and zone config maps.
	sort.Sort(splitKeys)
	var unique []proto.Key
	for i, key := range splitKeys {
		if i == 0 || !key.Equal(splitKeys[i-1]) {
			unique = append(unique, key)
		}
	}
	return unique
}
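The closing sort-and-dedup step is a generic pattern worth isolating. Here is a standalone version over plain byte slices, using only the standard `bytes` and `sort` packages.

// uniqueSortedKeys sorts keys and drops adjacent duplicates, mirroring the
// dedup loop in computeSplitKeys but over plain [][]byte.
func uniqueSortedKeys(keys [][]byte) [][]byte {
	sort.Slice(keys, func(i, j int) bool {
		return bytes.Compare(keys[i], keys[j]) < 0
	})
	var unique [][]byte
	for i, key := range keys {
		if i == 0 || !bytes.Equal(key, keys[i-1]) {
			unique = append(unique, key)
		}
	}
	return unique
}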
Example #7
0
// runStart starts the cockroach node using -stores as the list of
// storage devices ("stores") on this machine and -gossip as the list
// of "well-known" hosts used to join this node to the cockroach
// cluster via the gossip network.
func runStart(cmd *commander.Command, args []string) {
	info := util.GetBuildInfo()
	log.Infof("Build Vers: %s", info.Vers)
	log.Infof("Build Tag:  %s", info.Tag)
	log.Infof("Build Time: %s", info.Time)
	log.Infof("Build Deps: %s", info.Deps)

	log.Info("Starting cockroach cluster")
	s, err := server.NewServer(Context)
	if err != nil {
		log.Errorf("Failed to start Cockroach server: %v", err)
		return
	}

	err = Context.Init()
	if err != nil {
		log.Errorf("Failed to initialize context: %v", err)
		return
	}

	err = s.Start(false)
	defer s.Stop()
	if err != nil {
		log.Errorf("Cockroach server exited with error: %v", err)
		return
	}

	c := make(chan os.Signal, 1)
	signal.Notify(c, os.Interrupt, os.Kill)

	// Block until one of the signals above is received.
	<-c
}
Example #8
0
func (r *Replica) verifyChecksumTrigger(
	ctx context.Context, args roachpb.VerifyChecksumRequest,
) {
	id := args.ChecksumID
	c, ok := r.getChecksum(ctx, id)
	if !ok {
		log.Errorf(ctx, "consistency check skipped: checksum for id = %v doesn't exist", id)
		// Return success because a checksum might be missing only on
		// this replica. A checksum might be missing for a number of
		// number of reasons: GC-ed, server restart, and ComputeChecksum
		// version incompatibility.
		return
	}
	if c.checksum != nil && !bytes.Equal(c.checksum, args.Checksum) {
		// Replication consistency problem!
		logFunc := log.Errorf

		// Collect some more debug information.
		if args.Snapshot == nil {
			// No debug information; run another consistency check to deliver
			// more debug information.
			if err := r.store.stopper.RunAsyncTask(func() {
				log.Errorf(ctx, "%s: consistency check failed; fetching details", r)
				desc := r.Desc()
				startKey := desc.StartKey.AsRawKey()
				// Can't use a start key less than LocalMax.
				if bytes.Compare(startKey, keys.LocalMax) < 0 {
					startKey = keys.LocalMax
				}
				if err := r.store.db.CheckConsistency(startKey, desc.EndKey.AsRawKey(), true /* withDiff */); err != nil {
					log.Errorf(ctx, "couldn't rerun consistency check: %s", err)
				}
			}); err != nil {
				log.Error(ctx, errors.Wrap(err, "could not rerun consistency check"))
			}
		} else {
			// Compute diff.
			diff := diffRange(args.Snapshot, c.snapshot)
			if diff != nil {
				for _, d := range diff {
					l := "leader"
					if d.LeaseHolder {
						l = "replica"
					}
					log.Errorf(ctx, "consistency check failed: k:v = (%s (%x), %s, %x) not present on %s",
						d.Key, d.Key, d.Timestamp, d.Value, l)
				}
			}
			if r.store.ctx.ConsistencyCheckPanicOnFailure {
				if p := r.store.ctx.TestingKnobs.BadChecksumPanic; p != nil {
					p(diff)
				} else {
					logFunc = log.Fatalf
				}
			}
		}

		logFunc(ctx, "consistency check failed on replica: %s, checksum mismatch: e = %x, v = %x", r, args.Checksum, c.checksum)
	}
}
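diffRange (not shown here) reports the keys whose values differ between the two snapshots. Below is a toy version of that comparison, walking two sorted key/value slices with a two-pointer merge; the `kv` type and the output strings are simplified assumptions, not the real RaftSnapshotData diff.

// kv is a toy key/value pair; real snapshot entries also carry timestamps.
type kv struct {
	key, value string
}

// diffSorted walks two sorted slices in lockstep and reports entries that
// are missing from, or differ between, the two sides.
func diffSorted(lease, replica []kv) []string {
	var diffs []string
	i, j := 0, 0
	for i < len(lease) || j < len(replica) {
		switch {
		case j >= len(replica) || (i < len(lease) && lease[i].key < replica[j].key):
			diffs = append(diffs, fmt.Sprintf("only on leaseholder: %s", lease[i].key))
			i++
		case i >= len(lease) || lease[i].key > replica[j].key:
			diffs = append(diffs, fmt.Sprintf("only on replica: %s", replica[j].key))
			j++
		default:
			if lease[i].value != replica[j].value {
				diffs = append(diffs, fmt.Sprintf("value mismatch at %s", lease[i].key))
			}
			i++
			j++
		}
	}
	return diffs
}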
Example #9
0
// runStart starts the cockroach node using --stores as the list of
// storage devices ("stores") on this machine and --gossip as the list
// of "well-known" hosts used to join this node to the cockroach
// cluster via the gossip network.
func runStart(cmd *cobra.Command, args []string) {
	info := util.GetBuildInfo()
	log.Infof("build Vers: %s", info.Vers)
	log.Infof("build Tag:  %s", info.Tag)
	log.Infof("build Time: %s", info.Time)
	log.Infof("build Deps: %s", info.Deps)

	// Default user for servers.
	Context.User = security.NodeUser
	// First initialize the Context as it is used in other places.
	err := Context.Init("start")
	if err != nil {
		log.Errorf("failed to initialize context: %s", err)
		return
	}

	log.Info("starting cockroach cluster")
	stopper := util.NewStopper()
	stopper.AddWorker()
	s, err := server.NewServer(Context, stopper)
	if err != nil {
		log.Errorf("failed to start Cockroach server: %s", err)
		return
	}

	err = s.Start(false)
	if err != nil {
		log.Errorf("cockroach server exited with error: %s", err)
		return
	}

	signalCh := make(chan os.Signal, 1)
	signal.Notify(signalCh, os.Interrupt, os.Kill)
	// TODO(spencer): move this behind a build tag.
	signal.Notify(signalCh, syscall.SIGTERM)

	// Block until one of the signals above is received or the stopper
	// is stopped externally (for example, via the quit endpoint).
	select {
	case <-stopper.ShouldStop():
		stopper.SetStopped()
	case <-signalCh:
		log.Infof("initiating graceful shutdown of server")
		stopper.SetStopped()
		go func() {
			s.Stop()
		}()
	}

	select {
	case <-signalCh:
		log.Warningf("second signal received, initiating hard shutdown")
	case <-time.After(time.Minute):
		log.Warningf("time limit reached, initiating hard shutdown")
		return
	case <-stopper.IsStopped():
		log.Infof("server drained and shutdown completed")
	}
	log.Flush()
}
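The two-stage shutdown above (drain on the first signal, hard-exit on a second signal or after a timeout) is a reusable pattern. Here is a stripped-down sketch using only the standard `os`, `os/signal`, `syscall`, `log`, and `time` packages, where the `drain` callback stands in for the server's graceful stop.

// waitForShutdown blocks until an interrupt or SIGTERM arrives, kicks off
// the graceful drain, and then waits for the drain to finish, a second
// signal, or a hard deadline, whichever comes first.
func waitForShutdown(drain func(done chan<- struct{})) {
	signalCh := make(chan os.Signal, 2)
	signal.Notify(signalCh, os.Interrupt, syscall.SIGTERM)

	<-signalCh
	log.Printf("initiating graceful shutdown")
	done := make(chan struct{})
	go drain(done)

	select {
	case <-signalCh:
		log.Printf("second signal received, initiating hard shutdown")
	case <-time.After(time.Minute):
		log.Printf("time limit reached, initiating hard shutdown")
	case <-done:
		log.Printf("server drained and shutdown completed")
	}
}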
Example #10
0
// runGetZone retrieves the zone config for a given object id,
// and if present, outputs its YAML representation.
// TODO(marc): accept db/table names rather than IDs.
func runGetZone(cmd *cobra.Command, args []string) {
	if len(args) != 1 {
		mustUsage(cmd)
		return
	}
	id, err := strconv.Atoi(args[0])
	if err != nil {
		log.Errorf("could not parse object ID %s", args[0])
		return
	}

	db := makeSQLClient()
	_, rows, err := runQueryWithFormat(db, fmtMap{"config": formatZone},
		`SELECT * FROM system.zones WHERE id=$1`, id)
	if err != nil {
		log.Error(err)
		return
	}

	if len(rows) == 0 {
		log.Errorf("Object %d: no zone config found", id)
		return
	}
	fmt.Fprintln(osStdout, rows[0][1])
}
Example #11
0
// TestStoreRangeMergeDistantRanges attempts to merge two ranges
// that are not next to each other.
func TestStoreRangeMergeDistantRanges(t *testing.T) {
	store := createTestStore(t)
	defer store.Stop()

	// Split into 3 ranges
	argsSplit, replySplit := adminSplitArgs(engine.KeyMin, []byte("d"), 1, store.StoreID())
	if err := store.ExecuteCmd(proto.AdminSplit, argsSplit, replySplit); err != nil {
		t.Fatalf("Can't split range %s", err)
	}
	argsSplit, replySplit = adminSplitArgs(engine.KeyMin, []byte("b"), 1, store.StoreID())
	if err := store.ExecuteCmd(proto.AdminSplit, argsSplit, replySplit); err != nil {
		t.Fatalf("Can't split range %s", err)
	}

	rangeA := store.LookupRange([]byte("a"), nil)
	rangeB := store.LookupRange([]byte("c"), nil)
	rangeC := store.LookupRange([]byte("e"), nil)

	if bytes.Equal(rangeA.Desc().StartKey, rangeB.Desc().StartKey) {
		t.Errorf("split range start keys should differ, but both are %q", rangeA.Desc().StartKey)
	}
	if bytes.Equal(rangeB.Desc().StartKey, rangeC.Desc().StartKey) {
		t.Errorf("split range start keys should differ, but both are %q", rangeB.Desc().StartKey)
	}
	if bytes.Equal(rangeA.Desc().StartKey, rangeC.Desc().StartKey) {
		t.Errorf("split range start keys should differ, but both are %q", rangeA.Desc().StartKey)
	}

	argsMerge, replyMerge := adminMergeArgs(rangeC.Desc().StartKey, *rangeC.Desc(), 1, store.StoreID())
	rangeA.AdminMerge(argsMerge, replyMerge)
	if replyMerge.Error == nil {
		t.Fatal("Should not be able to merge two ranges that are not adjacent.")
	}
}
Example #12
0
func runAddNodes(cmd *cobra.Command, args []string) {
	if len(args) != 1 {
		cmd.Usage()
		return
	}
	numNodes, err := strconv.Atoi(args[0])
	if err != nil || numNodes < 1 {
		log.Errorf("argument %s must be an integer > 0", args)
		return
	}

	driver, err := NewDriver(Context)
	if err != nil {
		log.Errorf("could not create driver: %v", err)
		return
	}

	for i := 1; i <= numNodes; i++ {
		log.Infof("adding node %d of %d", i, numNodes)
		err := AddOneNode(driver)
		if err != nil {
			log.Errorf("problem adding node: %v", err)
			return
		}
	}
}
Example #13
0
// computeSplitKeys returns an array of keys at which the supplied
// range should be split, as computed by intersecting the range with
// zone config map boundaries.
func computeSplitKeys(g *gossip.Gossip, repl *Replica) []proto.Key {
	// Now split the range into pieces by intersecting it with the
	// boundaries of the config map.
	configMap, err := repl.rm.Gossip().GetZoneConfig()
	if err != nil {
		log.Errorf("unable to fetch zone config from gossip: %s", err)
		return nil
	}
	desc := repl.Desc()
	splits, err := configMap.SplitRangeByPrefixes(desc.StartKey, desc.EndKey)
	if err != nil {
		log.Errorf("unable to split %s by prefix map %s", repl, configMap)
		return nil
	}

	// Gather new splits.
	var splitKeys proto.KeySlice
	for _, split := range splits {
		if split.End.Less(desc.EndKey) {
			splitKeys = append(splitKeys, split.End)
		}
	}

	// Sort and unique the combined split keys from intersections with
	// the zone config maps.
	sort.Sort(splitKeys)
	var unique []proto.Key
	for i, key := range splitKeys {
		if i == 0 || !key.Equal(splitKeys[i-1]) {
			unique = append(unique, key)
		}
	}
	return unique
}
Example #14
0
// writeSummaries retrieves status summaries from the supplied
// NodeStatusRecorder and persists them to the cockroach data store.
func (s *Server) writeSummaries() error {
	nodeStatus, storeStatuses := s.recorder.GetStatusSummaries()
	if nodeStatus != nil {
		key := keys.NodeStatusKey(int32(nodeStatus.Desc.NodeID))
		if err := s.db.Put(key, nodeStatus); err != nil {
			return err
		}
		if log.V(1) {
			statusJSON, err := json.Marshal(nodeStatus)
			if err != nil {
				log.Errorf("error marshaling nodeStatus to json: %s", err)
			} else {
				log.Infof("node %d status: %s", nodeStatus.Desc.NodeID, statusJSON)
			}
		}
	}

	for _, ss := range storeStatuses {
		key := keys.StoreStatusKey(int32(ss.Desc.StoreID))
		if err := s.db.Put(key, &ss); err != nil {
			return err
		}
		if log.V(1) {
			statusJSON, err := json.Marshal(&ss)
			if err != nil {
				log.Errorf("error marshaling storeStatus to json: %s", err)
			} else {
				log.Infof("store %d status: %s", ss.Desc.StoreID, statusJSON)
			}
		}
	}
	return nil
}
Example #15
0
// runExterminate destroys the data held in the specified stores.
func runExterminate(cmd *cobra.Command, args []string) {
	if err := context.InitStores(); err != nil {
		log.Errorf("failed to initialize context: %s", err)
		return
	}

	// First attempt to shut down the server. Note that an error of EOF just
	// means the HTTP server shut down before the request to quit returned.
	admin := client.NewAdminClient(&context.Context, context.Addr, client.Quit)
	body, err := admin.Get()
	if err != nil {
		log.Infof("shutdown node %s: %s", context.Addr, err)
	} else {
		log.Infof("shutdown node in anticipation of data extermination: %s", body)
	}

	// Exterminate all data held in specified stores.
	for _, e := range context.Engines {
		if rocksdb, ok := e.(*engine.RocksDB); ok {
			log.Infof("exterminating data from store %s", e)
			if err := rocksdb.Destroy(); err != nil {
				log.Errorf("unable to destroy store %s: %s", e, err)
				osExit(1)
			}
		}
	}
	log.Infof("exterminated all data from stores %s", context.Engines)
}
Example #16
0
// runInit initializes a new cockroach cluster on the engine named by the
// first command-line argument, which must not be an in-memory store.
func runInit(cmd *commander.Command, args []string) {
	if len(args) != 1 {
		cmd.Usage()
		return
	}
	// Initialize the engine based on the first argument and
	// then verify it's not in-memory.
	engines, err := initEngines(args[0])
	if err != nil {
		log.Errorf("Failed to initialize engine %q: %v", args[0], err)
		return
	}
	e := engines[0]
	if _, ok := e.(*engine.InMem); ok {
		log.Errorf("Cannot initialize a cluster using an in-memory store")
		return
	}
	// Generate a new UUID for cluster ID and bootstrap the cluster.
	clusterID := uuid.New()
	localDB, err := BootstrapCluster(clusterID, e)
	if err != nil {
		log.Errorf("Failed to bootstrap cluster: %v", err)
		return
	}
	defer localDB.Close()
	fmt.Fprintf(os.Stdout, "Cockroach cluster %s has been initialized\n", clusterID)
	fmt.Fprintf(os.Stdout, "To start the cluster, run \"cockroach start\"\n")
}
Example #17
0
// Filter makes decisions about garbage collection based on the
// garbage collection policy for batches of values for the same key.
// The GC policy is determined via the policyFn specified when the
// GarbageCollector was created. Returns a slice of deletions, one
// per incoming key. If an index in the returned slice is set to
// true, then that value will be garbage collected.
func (gc *GarbageCollector) Filter(keys []Key, values [][]byte) []bool {
	if len(keys) <= 1 {
		// Nothing to GC: either no keys at all, or only the MVCC metadata key.
		return nil
	}
	// Look up the policy which applies to this set of MVCC values.
	_, decKey := encoding.DecodeBinary(keys[0])
	policy := gc.policyFn(decKey)
	if policy == nil || policy.TTLSeconds <= 0 {
		return nil
	}
	toDelete := make([]bool, len(keys))
	expiration := gc.now
	expiration.WallTime -= int64(policy.TTLSeconds) * 1E9

	var survivors bool
	for i, key := range keys {
		_, ts, isValue := mvccDecodeKey(key)
		if i == 0 {
			if isValue {
				log.Errorf("unexpected MVCC value encountered: %q", key)
				return make([]bool, len(keys))
			}
			continue
		}
		if !isValue {
			log.Errorf("unexpected MVCC metadata encountered: %q", key)
			return make([]bool, len(keys))
		}
		mvccVal := proto.MVCCValue{}
		if err := gogoproto.Unmarshal(values[i], &mvccVal); err != nil {
			log.Errorf("unable to unmarshal MVCC value %q: %v", key, err)
			return make([]bool, len(keys))
		}
		if i == 1 {
			// If the first value isn't a deletion tombstone, set survivors to true.
			if !mvccVal.Deleted {
				survivors = true
			}
		} else {
			if ts.Less(expiration) {
				// If we encounter a version older than our GC timestamp, mark for deletion.
				toDelete[i] = true
			} else if !mvccVal.Deleted {
				// Otherwise, if not marked for GC and not a tombstone, set survivors true.
				survivors = true
			}
		}
	}
	// If there are no remaining non-deleted, versioned entries, mark
	// all keys for deletion, including the MVCC metadata entry.
	if !survivors {
		for i := range keys {
			toDelete[i] = true
		}
	}
	return toDelete
}
Example #18
0
// Filter makes decisions about garbage collection based on the
// garbage collection policy for batches of values for the same key.
// Returns the timestamp including, and after which, all values should
// be garbage collected. If no values should be GC'd, returns
// roachpb.ZeroTimestamp.
func (gc *GarbageCollector) Filter(keys []MVCCKey, values [][]byte) roachpb.Timestamp {
	if gc.policy.TTLSeconds <= 0 {
		return roachpb.ZeroTimestamp
	}
	if len(keys) == 0 {
		return roachpb.ZeroTimestamp
	}

	// Loop over values. All should be MVCC versions.
	delTS := roachpb.ZeroTimestamp
	survivors := false
	for i, key := range keys {
		_, ts, isValue, err := MVCCDecodeKey(key)
		if err != nil {
			log.Errorf("unable to decode MVCC key: %q: %v", key, err)
			return roachpb.ZeroTimestamp
		}
		if !isValue {
			log.Errorf("unexpected MVCC metadata encountered: %q", key)
			return roachpb.ZeroTimestamp
		}
		mvccVal := MVCCValue{}
		if err := proto.Unmarshal(values[i], &mvccVal); err != nil {
			log.Errorf("unable to unmarshal MVCC value %q: %v", key, err)
			return roachpb.ZeroTimestamp
		}
		if i == 0 {
			// If the first value isn't a deletion tombstone, don't consider
			// it for GC. It should always survive if non-deleted.
			if !mvccVal.Deleted {
				survivors = true
				continue
			}
		}
		// If we encounter a version older than our GC timestamp, mark for deletion.
		if ts.Less(gc.expiration) {
			delTS = ts
			break
		} else if !mvccVal.Deleted {
			survivors = true
		}
	}
	// If there are no non-deleted survivors, return timestamp of first key
	// to delete all entries.
	if !survivors {
		_, ts, _, err := MVCCDecodeKey(keys[0])
		if err != nil {
			// TODO(tschottdorf): Perhaps we should be propagating an error
			// (e.g. ReplicaCorruptionError) up to the caller.
			log.Errorf("unable to decode MVCC key: %q: %v", keys[0], err)
			return roachpb.ZeroTimestamp
		}
		return ts
	}
	return delTS
}
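The version walk can be checked with plain values: versions arrive newest-first, the newest live value always survives, and everything at or below the first version older than the expiration is collectible. The sketch below works under those assumptions, with simplified types and nanosecond timestamps in place of MVCC-encoded keys.

// version is a toy stand-in for an MVCC version: a wall-clock timestamp in
// nanoseconds and whether the value is a deletion tombstone.
type version struct {
	wallNanos int64
	deleted   bool
}

// gcCutoff returns the timestamp at and below which versions may be
// removed, or 0 if nothing should be collected. versions must be ordered
// newest-first, mirroring MVCC key order.
func gcCutoff(versions []version, expirationNanos int64) int64 {
	var cutoff int64
	survivors := false
	for i, v := range versions {
		if i == 0 && !v.deleted {
			// The newest value survives unless it is a tombstone.
			survivors = true
			continue
		}
		if v.wallNanos < expirationNanos {
			// First version older than the expiration: collect from here down.
			cutoff = v.wallNanos
			break
		}
		if !v.deleted {
			survivors = true
		}
	}
	if !survivors && len(versions) > 0 {
		// Nothing live remains: collect everything, starting at the newest.
		return versions[0].wallNanos
	}
	return cutoff
}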
Example #19
0
func runInit(cmd *cobra.Command, args []string) {
	driver, err := NewDriver(Context)
	if err != nil {
		log.Errorf("could not create driver: %v", err)
		return
	}

	nodes, err := docker.ListCockroachNodes()
	if err != nil {
		log.Errorf("failed to get list of existing cockroach nodes: %v", err)
		return
	}
	if len(nodes) != 0 {
		log.Errorf("init called but docker-machine has %d existing cockroach nodes: %v", len(nodes), nodes)
		return
	}

	nodeName := docker.MakeNodeName(0)

	// Create first node.
	err = docker.CreateMachine(driver, nodeName)
	if err != nil {
		log.Errorf("could not create machine %s: %v", nodeName, err)
		return
	}

	// Run driver steps after first-node creation.
	err = driver.AfterFirstNode()
	if err != nil {
		log.Errorf("could not run AfterFirstNode steps for: %v", err)
		return
	}

	// Lookup node info.
	nodeConfig, err := driver.GetNodeConfig(nodeName)
	if err != nil {
		log.Errorf("could not get node config for %s: %v", nodeName, err)
		return
	}

	// Initialize cockroach node.
	err = docker.RunDockerInit(driver, nodeName, nodeConfig)
	if err != nil {
		log.Errorf("could not initialize first cockroach node %s: %v", nodeName, err)
		return
	}

	// Do "start node" logic.
	err = driver.StartNode(nodeName, nodeConfig)
	if err != nil {
		log.Errorf("could not run StartNode steps for %s: %v", nodeName, err)
		return
	}

	// Start the cockroach node.
	err = docker.RunDockerStart(driver, nodeName, nodeConfig)
	if err != nil {
		log.Errorf("could not initialize first cockroach node %s: %v", nodeName, err)
	}
}
Example #20
0
func (l *LocalCluster) processEvent(e dockerclient.EventOrError, monitorStopper chan struct{}) bool {
	l.mu.Lock()
	defer l.mu.Unlock()

	if e.Error != nil {
		log.Errorf("monitoring error: %s", e.Error)
		l.events <- Event{NodeIndex: -1, Status: eventDie}
		return false
	}
	switch e.Status {
	case "pull":
		return false
	}

	for i, n := range l.Nodes {
		if n != nil && n.ID == e.Id {
			if log.V(1) {
				log.Errorf("node=%d status=%s", i, e.Status)
			}
			l.events <- Event{NodeIndex: i, Status: e.Status}
			return true
		}
	}

	// TODO(pmattis): When we add the ability to start/stop/restart nodes we'll
	// need to keep around a map of old node container ids in order to ignore
	// events on those containers.

	// An event on any other container is unexpected. Die.
	select {
	case <-l.stopper:
	case <-monitorStopper:
	default:
		// There is a very tiny race here: the signal handler might be closing the
		// stopper simultaneously.
		log.Errorf("stopping due to unexpected event: %+v", e)
		if r, err := l.client.ContainerLogs(e.Id, &dockerclient.LogOptions{
			Stdout: true,
			Stderr: true,
		}); err == nil {
			if _, err := io.Copy(os.Stderr, r); err != nil {
				log.Infof("error listing logs: %s", err)
			}
			r.Close()
		}
		close(l.stopper)
	}
	return false
}
Example #21
0
// runRmConfig invokes the REST API with DELETE action and key prefix as path.
// The type of config that is removed is based on the passed in prefix.
func runRmConfig(ctx *Context, prefix, keyPrefix string) {
	friendlyName := getFriendlyNameFromPrefix(prefix)
	req, err := http.NewRequest("DELETE", fmt.Sprintf("%s://%s%s/%s", ctx.RequestScheme(), ctx.Addr, prefix, keyPrefix),
		nil)
	if err != nil {
		log.Errorf("unable to create request to admin REST endpoint: %s", err)
		return
	}
	_, err = sendAdminRequest(ctx, req)
	if err != nil {
		log.Errorf("admin REST request failed: %s", err)
		return
	}
	fmt.Fprintf(os.Stdout, "removed %s config for key prefix %q\n", friendlyName, keyPrefix)
}
Example #22
0
// runGetConfig invokes the REST API with GET action and key prefix as path.
func runGetConfig(ctx *Context, prefix, keyPrefix string) {
	friendlyName := getFriendlyNameFromPrefix(prefix)
	req, err := http.NewRequest("GET", fmt.Sprintf("%s://%s%s/%s", ctx.RequestScheme(), ctx.Addr, prefix, keyPrefix), nil)
	if err != nil {
		log.Errorf("unable to create request to admin REST endpoint: %s", err)
		return
	}
	req.Header.Add(util.AcceptHeader, util.YAMLContentType)
	b, err := sendAdminRequest(ctx, req)
	if err != nil {
		log.Errorf("admin REST request failed: %s", err)
		return
	}
	fmt.Fprintf(os.Stdout, "%s config for key prefix %q:\n%s\n", friendlyName, keyPrefix, string(b))
}
Example #23
0
// Cleanup cleans up the transaction as appropriate based on err.
func (txn *Txn) Cleanup(err error) {
	if err != nil {
		if replyErr := txn.Rollback(); replyErr != nil {
			log.Errorf("failure aborting transaction: %s; abort caused by: %s", replyErr, err)
		}
	}
}
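A caller would typically pair this with a deferred call and a named return value, so that a failure anywhere, including a failed commit, triggers the rollback. The following is a hypothetical sketch; `Commit` is assumed to exist on Txn and `runInTxn` is not part of the package.

// runInTxn runs fn inside txn and relies on Cleanup to roll the
// transaction back if fn or the commit fails.
func runInTxn(txn *Txn, fn func(*Txn) error) (err error) {
	defer func() { txn.Cleanup(err) }()
	if err = fn(txn); err != nil {
		return err
	}
	return txn.Commit()
}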
Example #24
0
func (bq *baseQueue) processOne(clock *hlc.Clock) {
	start := time.Now()
	bq.Lock()
	repl := bq.pop()
	bq.Unlock()
	if repl != nil {
		now := clock.Now()
		if log.V(1) {
			log.Infof("processing replica %s from %s queue...", repl, bq.name)
		}
		// If the queue requires a replica to have the range leader lease in
		// order to be processed, check whether this replica has the leader
		// lease, and renew or acquire it if necessary.
		if bq.impl.needsLeaderLease() {
			// Create a "fake" get request in order to invoke redirectOnOrAcquireLease.
			args := &proto.GetRequest{RequestHeader: proto.RequestHeader{Timestamp: now}}
			if err := repl.redirectOnOrAcquireLeaderLease(nil /* Trace */, args.Header().Timestamp); err != nil {
				if log.V(1) {
					log.Infof("this replica of %s could not acquire leader lease; skipping...", repl)
				}
				return
			}
		}
		if err := bq.impl.process(now, repl); err != nil {
			log.Errorf("failure processing replica %s from %s queue: %s", repl, bq.name, err)
		} else if log.V(2) {
			log.Infof("processed replica %s from %s queue in %s", repl, bq.name, time.Now().Sub(start))
		}
	}
}
Example #25
0
// ChangeGroupMembership submits a proposed membership change to the cluster.
// Payload is an opaque blob that will be returned in EventMembershipChangeCommitted.
func (m *MultiRaft) ChangeGroupMembership(groupID proto.RaftID, commandID string,
	changeType raftpb.ConfChangeType, nodeID proto.RaftNodeID, payload []byte) <-chan error {
	if log.V(6) {
		log.Infof("node %v proposing membership change to group %v", m.nodeID, groupID)
	}
	ch := make(chan error, 1)
	m.proposalChan <- &proposal{
		groupID:   groupID,
		commandID: commandID,
		fn: func() {
			if err := m.multiNode.ProposeConfChange(context.Background(), uint64(groupID),
				raftpb.ConfChange{
					Type:    changeType,
					NodeID:  uint64(nodeID),
					Context: encodeCommand(commandID, payload),
				},
			); err != nil {
				log.Errorf("node %v: error proposing membership change to node %v: %s", m.nodeID,
					groupID, err)
			}

		},
		ch: ch,
	}
	return ch
}
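The buffered, size-one error channel is what lets the proposal goroutine deliver its result without blocking even if the caller has stopped listening. The same pattern in isolation, using only the standard library:

// submitAsync runs work in the background and returns a channel that
// receives exactly one result. The buffer of one means the worker never
// blocks on the send, even if the caller abandons the channel.
func submitAsync(work func() error) <-chan error {
	ch := make(chan error, 1)
	go func() {
		ch <- work()
	}()
	return ch
}

A caller can then select on the returned channel alongside a timeout or a stopper without risking a leaked goroutine.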
Example #26
0
// start dials the remote addr and commences gossip once connected. Upon exit,
// the client is sent on the disconnected channel. This method starts client
// processing in a goroutine and returns immediately.
func (c *client) start(g *Gossip, disconnected chan *client, ctx *rpc.Context, stopper *stop.Stopper) {
	stopper.RunWorker(func() {
		defer func() {
			disconnected <- c
		}()

		// Note: avoid using `grpc.WithBlock` here. This code is already
		// asynchronous from the caller's perspective, so the only effect of
		// `WithBlock` here is blocking shutdown - at the time of this writing,
		// that ends up making `kv` tests take twice as long.
		conn, err := ctx.GRPCDial(c.addr.String())
		if err != nil {
			log.Errorf("failed to dial: %v", err)
			return
		}

		// Start gossiping.
		if err := c.gossip(g, NewGossipClient(conn), stopper); err != nil {
			if !grpcutil.IsClosedConnection(err) {
				g.mu.Lock()
				peerID := c.peerID
				g.mu.Unlock()
				if peerID != 0 {
					log.Infof("closing client to node %d (%s): %s", peerID, c.addr, err)
				} else {
					log.Infof("closing client to %s: %s", c.addr, err)
				}
			}
		}
	})
}
Example #27
0
// runExterminate destroys the data held in the specified stores.
func runExterminate(cmd *cobra.Command, args []string) {
	err := Context.Init("exterminate")
	if err != nil {
		log.Errorf("failed to initialize context: %s", err)
		return
	}

	// First attempt to shut down the server. Note that an error of EOF just
	// means the HTTP server shut down before the request to quit returned.
	if err := server.SendQuit(Context); err != nil {
		log.Infof("shutdown node %s: %s", Context.Addr, err)
	} else {
		log.Infof("shutdown node in anticipation of data extermination")
	}

	// Exterminate all data held in specified stores.
	for _, e := range Context.Engines {
		if rocksdb, ok := e.(*engine.RocksDB); ok {
			log.Infof("exterminating data from store %s", e)
			if err := rocksdb.Destroy(); err != nil {
				log.Fatalf("unable to destroy store %s: %s", e, err)
			}
		}
	}
	log.Infof("exterminated all data from stores %s", Context.Engines)
}
Example #28
0
// shouldQueue determines whether a replica should be queued for garbage
// collection, and if so, at what priority. Returns true for shouldQ
// in the event that the cumulative ages of GC'able bytes or extant
// intents exceed thresholds.
func (gcq *gcQueue) shouldQueue(now roachpb.Timestamp, repl *Replica,
	sysCfg *config.SystemConfig) (shouldQ bool, priority float64) {

	desc := repl.Desc()
	zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey)
	if err != nil {
		log.Errorf("could not find GC policy for range %s: %s", repl, err)
		return
	}
	policy := zone.GC

	// GC score is the total GC'able bytes age normalized by 1 MB * the replica's TTL in seconds.
	gcScore := float64(repl.stats.GetGCBytesAge(now.WallTime)) / float64(policy.TTLSeconds) / float64(gcByteCountNormalization)

	// Intent score. This computes the average age of outstanding intents
	// and normalizes.
	intentScore := repl.stats.GetAvgIntentAge(now.WallTime) / float64(intentAgeNormalization.Nanoseconds()/1E9)

	// Compute priority.
	if gcScore > 1 {
		priority += gcScore
	}
	if intentScore > 1 {
		priority += intentScore
	}
	shouldQ = priority > 0
	return
}
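The scoring arithmetic is easy to check with made-up numbers. The sketch below mirrors the computation with explicit inputs, keeping the 1 MB normalization mentioned in the comment; the real constants live elsewhere in the package.

// gcPriority recomputes the score above from explicit inputs: gcBytesAge in
// byte-seconds, the zone TTL in seconds, and the average intent age and its
// normalization, both in seconds.
func gcPriority(gcBytesAge, ttlSeconds, avgIntentAge, intentAgeNorm float64) (shouldQ bool, priority float64) {
	const mb = 1 << 20
	gcScore := gcBytesAge / ttlSeconds / mb
	intentScore := avgIntentAge / intentAgeNorm
	if gcScore > 1 {
		priority += gcScore
	}
	if intentScore > 1 {
		priority += intentScore
	}
	return priority > 0, priority
}

For example, 50 GiB-seconds of GC'able byte-age against a one-day TTL gives 50*2^30 / 86400 / 2^20 ≈ 0.59, which on its own is below the threshold and would not queue the replica.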
Example #29
0
func (g *Gossip) doCheckTimeout(stopper *stop.Stopper) {
	g.mu.Lock()
	defer g.mu.Unlock()
	// Check whether the graph needs to be tightened to
	// accommodate distant infos.
	distant := g.filterExtant(g.is.distant(g.maxToleratedHops()))
	if distant.len() > 0 {
		// If we have space, start a client immediately.
		if g.outgoing.hasSpace() {
			nodeID := distant.selectRandom()
			if nodeAddr, err := g.getNodeIDAddressLocked(nodeID); err != nil {
				log.Errorf("node %d: %s", nodeID, err)
			} else {
				g.startClient(nodeAddr, g.RPCContext, stopper)
			}
		} else {
			// Otherwise, find least useful peer and close it. Make sure
			// here that we only consider outgoing clients which are
			// connected.
			nodeID := g.is.leastUseful(g.outgoing)
			if nodeID != 0 {
				log.Infof("closing least useful client %d to tighten network graph", nodeID)
				g.closeClient(nodeID)
			}
		}
	}
	g.maybeSignalStalledLocked()
}
Example #30
0
// getNextBootstrapAddress returns the next available bootstrap
// address by consulting the first non-exhausted resolver from the
// slice supplied to the constructor or set using setBootstrap().
// The lock is assumed held.
func (g *Gossip) getNextBootstrapAddress() net.Addr {
	if len(g.resolvers) == 0 {
		log.Fatalf("no resolvers specified for gossip network")
	}

	// Run through resolvers round robin starting at last resolved index.
	for i := 0; i < len(g.resolvers); i++ {
		g.resolverIdx = (g.resolverIdx + 1) % len(g.resolvers)
		if g.resolverIdx == len(g.resolvers)-1 {
			g.triedAll = true
		}
		resolver := g.resolvers[g.resolverIdx]
		addr, err := resolver.GetAddress()
		if err != nil {
			log.Errorf("invalid bootstrap address: %+v, %v", resolver, err)
			continue
		} else if addr.String() == g.is.NodeAddr.String() {
			// Skip our own node address.
			continue
		}
		_, addrActive := g.bootstrapping[addr.String()]
		if !resolver.IsExhausted() || !addrActive {
			g.bootstrapping[addr.String()] = struct{}{}
			return addr
		}
	}

	return nil
}
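A distilled version of the round-robin walk follows, with the resolver reduced to a plain function and the own-address and bootstrapping-map checks folded into a single `usable` predicate; this is an illustrative sketch, not the gossip package's API.

// pickAddress advances idx round-robin through resolvers and returns the
// first address the usable predicate accepts, or "" if every resolver is
// skipped or fails.
func pickAddress(resolvers []func() (string, error), idx *int, usable func(string) bool) string {
	for i := 0; i < len(resolvers); i++ {
		*idx = (*idx + 1) % len(resolvers)
		addr, err := resolvers[*idx]()
		if err != nil || !usable(addr) {
			continue
		}
		return addr
	}
	return ""
}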