// testPutInner starts one writer goroutine per node, each putting random
// values under sequential keys until the configured duration elapses, while
// the main loop reports progress and checks cluster consistency.
func testPutInner(ctx context.Context, t *testing.T, c cluster.Cluster, cfg cluster.TestConfig) {
	db, err := c.NewClient(ctx, 0)
	if err != nil {
		t.Fatal(err)
	}

	errs := make(chan error, c.NumNodes())
	start := timeutil.Now()
	deadline := start.Add(cfg.Duration)
	var count int64
	for i := 0; i < c.NumNodes(); i++ {
		go func() {
			r, _ := randutil.NewPseudoRand()
			value := randutil.RandBytes(r, 8192)
			for timeutil.Now().Before(deadline) {
				k := atomic.AddInt64(&count, 1)
				v := value[:r.Intn(len(value))]
				if err := db.Put(ctx, fmt.Sprintf("%08d", k), v); err != nil {
					errs <- err
					return
				}
			}
			errs <- nil
		}()
	}

	for i := 0; i < c.NumNodes(); {
		baseCount := atomic.LoadInt64(&count)
		select {
		case <-stopper.ShouldStop():
			t.Fatalf("interrupted")
		case err := <-errs:
			if err != nil {
				t.Fatal(err)
			}
			i++
		case <-time.After(1 * time.Second):
			// Periodically print out progress so that we know the test is still
			// running.
			loadedCount := atomic.LoadInt64(&count)
			log.Infof(ctx, "%d (%d/s)", loadedCount, loadedCount-baseCount)
			c.Assert(ctx, t)
			if err := cluster.Consistent(ctx, c, 0); err != nil {
				t.Fatal(err)
			}
		}
	}

	elapsed := timeutil.Since(start)
	log.Infof(ctx, "%d %.1f/sec", count, float64(count)/elapsed.Seconds())
}
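
// checkRangeReplication waits (via util.SucceedsSoon) for the first range to
// have min(3, c.NumNodes()) replicas, reconnecting to node 0 on every retry.
// It is a no-op for empty clusters.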
func checkRangeReplication(t *testing.T, c cluster.Cluster, d time.Duration) {
	if c.NumNodes() < 1 {
		// Looks silly, but we actually start zero-node clusters in the
		// reference tests.
		t.Log("replication test is a no-op for empty cluster")
		return
	}

	wantedReplicas := 3
	if c.NumNodes() < 3 {
		wantedReplicas = c.NumNodes()
	}

	log.Infof(context.Background(), "waiting for first range to have %d replicas", wantedReplicas)

	util.SucceedsSoon(t, func() error {
		// Reconnect on every iteration; gRPC will eagerly tank the connection
		// on transport errors. Always talk to node 0 because it's guaranteed
		// to exist.
		client, dbStopper := c.NewClient(t, 0)
		defer dbStopper.Stop()

		select {
		case <-stopper:
			t.Fatalf("interrupted")
			return nil
		case <-time.After(1 * time.Second):
		}

		foundReplicas, err := countRangeReplicas(client)
		if err != nil {
			return err
		}

		if log.V(1) {
			log.Infof(context.Background(), "found %d replicas", foundReplicas)
		}
		if foundReplicas >= wantedReplicas {
			return nil
		}
		return fmt.Errorf("expected %d replicas, only found %d", wantedReplicas, foundReplicas)
	})

	log.Infof(context.Background(), "found %d replicas", wantedReplicas)
}
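
// testFreezeClusterInner exercises the cluster freeze/unfreeze endpoint:
// repeated freezes and thaws must be idempotent, a freeze with a node down
// must time out, and restarting the nodes must thaw the cluster again.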
func testFreezeClusterInner(t *testing.T, c cluster.Cluster, cfg cluster.TestConfig) {
	minAffected := int64(server.ExpectedInitialRangeCount())

	const long = time.Minute
	const short = 10 * time.Second

	mustPost := func(freeze bool) serverpb.ClusterFreezeResponse {
		reply, err := postFreeze(c, freeze, long)
		if err != nil {
			t.Fatal(errors.Errorf("%v", err))
		}
		return reply
	}

	if reply := mustPost(false); reply.RangesAffected != 0 {
		t.Fatalf("expected initial unfreeze to affect no ranges, got %d", reply.RangesAffected)
	}

	if reply := mustPost(true); reply.RangesAffected < minAffected {
		t.Fatalf("expected >=%d frozen ranges, got %d", minAffected, reply.RangesAffected)
	}

	if reply := mustPost(true); reply.RangesAffected != 0 {
		t.Fatalf("expected second freeze to affect no ranges, got %d", reply.RangesAffected)
	}

	if reply := mustPost(false); reply.RangesAffected < minAffected {
		t.Fatalf("expected >=%d thawed ranges, got %d", minAffected, reply.RangesAffected)
	}

	num := c.NumNodes()
	if num < 3 {
		t.Skip("skipping remainder of test; needs at least 3 nodes")
	}

	// Kill the last node.
	if err := c.Kill(num - 1); err != nil {
		t.Fatal(err)
	}

	// Attempt to freeze should get stuck (since it does not get confirmation
	// of the last node receiving the freeze command).
	// Note that this is the freeze trigger stalling on the Replica, not the
	// Store-polling mechanism.
	acceptErrs := strings.Join([]string{
		"timed out waiting for Range",
		"Timeout exceeded while",
		"connection is closing",
		"deadline",
		// error returned via JSON when the server-side gRPC stream times out (due to
		// lack of new input). Unmarshaling that JSON fails with a message referencing
		// unknown fields, unfortunately in map order.
		"unknown field .*",
	}, "|")
	if reply, err := postFreeze(c, true, short); !testutils.IsError(err, acceptErrs) {
		t.Fatalf("expected timeout, got %v: %v", err, reply)
	}

	// Shut down the remaining nodes and restart them.
	for i := 0; i < num-1; i++ {
		if err := c.Kill(i); err != nil {
			t.Fatal(err)
		}
	}
	for i := 0; i < num; i++ {
		if err := c.Restart(i); err != nil {
			t.Fatal(err)
		}
	}

	// The cluster should now be fully operational (at least after waiting
	// a little bit) since each node tries to unfreeze everything when it
	// starts.
	if err := util.RetryForDuration(time.Minute, func() error {
		if _, err := postFreeze(c, false, short); err != nil {
			if testutils.IsError(err, "404 Not Found") {
				// It can take a bit until the endpoint is available.
				return err
			}
			t.Fatal(err)
		}

		// TODO(tschottdorf): moving the client creation outside of the retry
		// loop will break the test with the following message:
		//
		//   client/rpc_sender.go:61: roachpb.Batch RPC failed as client
		//   connection was closed
		//
		// Perhaps the cluster updates the address too late after restarting
		// the node.
		db, dbStopper := c.NewClient(t, 0)
		defer dbStopper.Stop()

		if _, err := db.Scan(context.TODO(), keys.LocalMax, roachpb.KeyMax, 0); err != nil {
			t.Fatal(err)
		}
		return nil
	}); err != nil {
		t.Fatal(err)
	}

	// Unfreezing again should be a no-op.
	if reply, err := postFreeze(c, false, long); err != nil {
		t.Fatal(err)
	} else if reply.RangesAffected > 0 {
		t.Fatalf("still %d frozen ranges", reply.RangesAffected)
	}
}
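
// testGossipRestartInner repeatedly kills and restarts every node in the
// cluster, waiting each time for gossip to reconnect (peers, cluster ID, and
// sentinel) before verifying that each node can serve a simple transactional
// increment.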
func testGossipRestartInner(
	ctx context.Context, t *testing.T, c cluster.Cluster, cfg cluster.TestConfig,
) {
	// This already replicates the first range (in the local setup).
	// The replication of the first range is important: as long as the
	// first range only exists on one node, that node can trivially
	// acquire the range lease. Once the range is replicated, however,
	// nodes must be able to discover each other over gossip before the
	// lease can be acquired.
	num := c.NumNodes()

	deadline := timeutil.Now().Add(cfg.Duration)

	waitTime := longWaitTime
	if cfg.Duration < waitTime {
		waitTime = shortWaitTime
	}

	for timeutil.Now().Before(deadline) {
		log.Infof(ctx, "waiting for initial gossip connections")
		CheckGossip(ctx, t, c, waitTime, HasPeers(num))
		CheckGossip(ctx, t, c, waitTime, hasClusterID)
		CheckGossip(ctx, t, c, waitTime, hasSentinel)

		log.Infof(ctx, "killing all nodes")
		for i := 0; i < num; i++ {
			if err := c.Kill(ctx, i); err != nil {
				t.Fatal(err)
			}
		}

		log.Infof(ctx, "restarting all nodes")
		for i := 0; i < num; i++ {
			if err := c.Restart(ctx, i); err != nil {
				t.Fatal(err)
			}
		}

		log.Infof(ctx, "waiting for gossip to be connected")
		CheckGossip(ctx, t, c, waitTime, HasPeers(num))
		CheckGossip(ctx, t, c, waitTime, hasClusterID)
		CheckGossip(ctx, t, c, waitTime, hasSentinel)

		for i := 0; i < num; i++ {
			db, err := c.NewClient(ctx, i)
			if err != nil {
				t.Fatal(err)
			}
			if i == 0 {
				if err := db.Del(ctx, "count"); err != nil {
					t.Fatal(err)
				}
			}
			var kv client.KeyValue
			if err := db.Txn(ctx, func(txn *client.Txn) error {
				var err error
				kv, err = txn.Inc("count", 1)
				return err
			}); err != nil {
				t.Fatal(err)
			} else if v := kv.ValueInt(); v != int64(i+1) {
				t.Fatalf("unexpected value %d for write #%d (expected %d)", v, i, i+1)
			}
		}
	}
}
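
// testSingleKeyInner runs one worker per node, all incrementing the same key
// in separate transactions, and verifies that the final value matches the
// number of successful increments while tracking each worker's max latency.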
func testSingleKeyInner(
	ctx context.Context, t *testing.T, c cluster.Cluster, cfg cluster.TestConfig,
) {
	num := c.NumNodes()

	// Initialize the value for our test key to zero.
	const key = "test-key"
	initDB := c.NewClient(ctx, t, 0)
	if err := initDB.Put(ctx, key, 0); err != nil {
		t.Fatal(err)
	}

	type result struct {
		err        error
		maxLatency time.Duration
	}

	resultCh := make(chan result, num)
	deadline := timeutil.Now().Add(cfg.Duration)
	var expected int64

	// Start up num workers each reading and writing the same
	// key. Each worker is configured to talk to a different node in the
	// cluster.
	for i := 0; i < num; i++ {
		db := c.NewClient(ctx, t, i)
		go func() {
			var r result
			for timeutil.Now().Before(deadline) {
				start := timeutil.Now()
				err := db.Txn(ctx, func(txn *client.Txn) error {
					minExp := atomic.LoadInt64(&expected)
					r, err := txn.Get(key)
					if err != nil {
						return err
					}
					b := txn.NewBatch()
					v := r.ValueInt()
					b.Put(key, v+1)
					err = txn.CommitInBatch(b)
					// Atomic updates after the fact mean that we should read
					// exp or larger (since concurrent writers might have
					// committed but not yet performed their atomic update).
					if err == nil && v < minExp {
						return errors.Errorf("unexpected read: %d, expected >= %d", v, minExp)
					}
					return err
				})
				if err != nil {
					resultCh <- result{err: err}
					return
				}
				atomic.AddInt64(&expected, 1)
				latency := timeutil.Since(start)
				if r.maxLatency < latency {
					r.maxLatency = latency
				}
			}
			resultCh <- r
		}()
	}

	// Verify that none of the workers encountered an error.
	var results []result
	for len(results) < num {
		select {
		case <-stopper.ShouldStop():
			t.Fatalf("interrupted")
		case r := <-resultCh:
			if r.err != nil {
				t.Fatal(r.err)
			}
			results = append(results, r)
		case <-time.After(1 * time.Second):
			// Periodically print out progress so that we know the test is still
			// running.
			log.Infof(ctx, "%d", atomic.LoadInt64(&expected))
		}
	}

	// Verify the resulting value stored at the key is what we expect.
	r, err := initDB.Get(ctx, key)
	if err != nil {
		t.Fatal(err)
	}
	v := r.ValueInt()
	if expected != v {
		t.Fatalf("expected %d, but found %d", expected, v)
	}
	var maxLatency []time.Duration
	for _, r := range results {
		maxLatency = append(maxLatency, r.maxLatency)
	}
	log.Infof(ctx, "%d increments: %s", v, maxLatency)
}