// TestBuildInfo verifies that a node exposes its build information
// via the status details endpoint.
func TestBuildInfo(t *testing.T) {
	l := localcluster.Create(1, stopper)
	l.Start()
	defer l.AssertAndStop(t)

	util.SucceedsWithin(t, 10*time.Second, func() error {
		select {
		case <-stopper:
			t.Fatalf("interrupted")
			return nil
		case <-time.After(200 * time.Millisecond):
		}
		var r struct {
			BuildInfo map[string]string
		}
		if err := l.Nodes[0].GetJSON("", "/_status/details/local", &r); err != nil {
			return err
		}
		for _, key := range []string{"goVersion", "tag", "time", "dependencies"} {
			if val, ok := r.BuildInfo[key]; !ok {
				t.Errorf("build info missing for \"%s\"", key)
			} else if val == "" {
				t.Errorf("build info not set for \"%s\"", key)
			}
		}
		return nil
	})
}
// TestRangeReplication starts up an N node cluster and verifies that
// the first range becomes fully replicated within the expected time.
func TestRangeReplication(t *testing.T) {
	l := localcluster.Create(*numNodes, stopper)
	l.Start()
	defer l.Stop()

	checkRangeReplication(t, l, 20*time.Second)
}
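// checkRangeReplication is defined elsewhere in this package; the sketch
// below only illustrates the polling contract the tests here rely on.
// The countRangeReplicas helper is hypothetical and stands in for however
// the replica count of the first range is actually obtained, and the
// default replication factor of 3 is an assumption.
func checkRangeReplicationSketch(t *testing.T, l *localcluster.Cluster, d time.Duration) {
	util.SucceedsWithin(t, d, func() error {
		n, err := countRangeReplicas(l) // hypothetical helper
		if err != nil {
			return err
		}
		// Assumes the default zone config replication factor of 3.
		if n < 3 {
			return fmt.Errorf("first range has %d replicas, want at least 3", n)
		}
		return nil
	})
}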
// TestPut starts up an N node cluster and runs N workers that write
// to independent keys.
func TestPut(t *testing.T) {
	l := localcluster.Create(*numNodes, stopper)
	l.Start()
	defer l.Stop()

	db, dbStopper := makeDBClient(t, l, 0)
	defer dbStopper.Stop()

	if err := configutil.SetDefaultRangeMaxBytes(db, *rangeMaxBytes); err != nil {
		t.Fatal(err)
	}
	checkRangeReplication(t, l, 20*time.Second)

	errs := make(chan error, *numNodes)
	start := time.Now()
	deadline := start.Add(*duration)
	var count int64
	for i := 0; i < *numNodes; i++ {
		go func() {
			r, _ := randutil.NewPseudoRand()
			value := randutil.RandBytes(r, 8192)
			for time.Now().Before(deadline) {
				k := atomic.AddInt64(&count, 1)
				v := value[:r.Intn(len(value))]
				if err := db.Put(fmt.Sprintf("%08d", k), v); err != nil {
					errs <- err
					return
				}
			}
			errs <- nil
		}()
	}

	for i := 0; i < *numNodes; {
		select {
		case <-stopper:
			t.Fatalf("interrupted")
		case err := <-errs:
			if err != nil {
				t.Fatal(err)
			}
			i++
		case <-time.After(1 * time.Second):
			// Periodically print out progress so that we know the test is still
			// running.
			log.Infof("%d", atomic.LoadInt64(&count))
		}
	}

	elapsed := time.Since(start)
	log.Infof("%d %.1f/sec", count, float64(count)/elapsed.Seconds())
}
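// makeDBClient is defined elsewhere in this package; this sketch only
// illustrates the contract the tests rely on: a KV client bound to node
// i, plus the stopper that owns its resources. The node address accessor
// and the client.Open signature/URL scheme are assumptions, not the
// package's actual implementation.
func makeDBClientSketch(t *testing.T, l *localcluster.Cluster, node int) (*client.DB, *stop.Stopper) {
	stopper := stop.NewStopper()
	// Hypothetical address accessor and connection URL; the real helper
	// derives these from the cluster's certs dir and node addresses.
	addr := l.Nodes[node].Addr() // hypothetical
	db, err := client.Open(stopper, "rpcs://root@"+addr+"?certs="+l.CertsDir)
	if err != nil {
		t.Fatal(err)
	}
	return db, stopper
}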
// TestGossipRestart verifies that the gossip network can be
// re-bootstrapped after a time when all nodes were down
// simultaneously.
func TestGossipRestart(t *testing.T) {
	l := localcluster.Create(*numNodes, stopper)
	l.Start()
	defer l.AssertAndStop(t)

	log.Infof("waiting for initial gossip connections")
	checkGossip(t, l, 20*time.Second, hasPeers(len(l.Nodes)))
	checkGossip(t, l, time.Second, hasClusterID)
	checkGossip(t, l, time.Second, hasSentinel)

	// The replication of the first range is important: as long as the
	// first range only exists on one node, that node can trivially
	// acquire the leader lease. Once the range is replicated, however,
	// nodes must be able to discover each other over gossip before the
	// lease can be acquired.
	log.Infof("waiting for range replication")
	checkRangeReplication(t, l, 10*time.Second)

	log.Infof("killing all nodes")
	for _, node := range l.Nodes {
		node.Kill()
	}

	log.Infof("restarting all nodes")
	for _, node := range l.Nodes {
		node.Restart(5)
	}

	log.Infof("waiting for gossip to be connected")
	checkGossip(t, l, 20*time.Second, hasPeers(len(l.Nodes)))
	checkGossip(t, l, time.Second, hasClusterID)
	checkGossip(t, l, time.Second, hasSentinel)

	for i := range l.Nodes {
		db, dbStopper := makeDBClient(t, l, i)
		if kv, err := db.Inc("count", 1); err != nil {
			t.Fatal(err)
		} else if v := kv.ValueInt(); v != int64(i+1) {
			t.Fatalf("unexpected value %d for write #%d (expected %d)", v, i, i+1)
		}
		dbStopper.Stop()
	}
}
// TestStatusServer starts up an N node cluster and tests the status
// server on each node.
func TestStatusServer(t *testing.T) {
	l := localcluster.Create(*numNodes, stopper)
	l.ForceLogging = true
	l.Start()
	defer l.Stop()

	checkRangeReplication(t, l, 20*time.Second)

	// Get the ids for each node.
	idMap := make(map[string]string)
	for _, node := range l.Nodes {
		body := get(t, node, "/_status/details/local")
		var detail details
		if err := json.Unmarshal(body, &detail); err != nil {
			t.Fatalf("unable to parse details - %s", err)
		}
		idMap[node.ID] = detail.NodeID.String()
	}

	// Check the local response for every node.
	for _, node := range l.Nodes {
		checkNode(t, node, idMap[node.ID], "local", idMap[node.ID])
		get(t, node, "/_status/nodes")
		get(t, node, "/_status/stores")
	}

	// Proxy from the first node to the last node.
	firstNode := l.Nodes[0]
	lastNode := l.Nodes[len(l.Nodes)-1]
	firstID := idMap[firstNode.ID]
	lastID := idMap[lastNode.ID]
	checkNode(t, firstNode, firstID, lastID, lastID)

	// And from the last node to the first node.
	checkNode(t, lastNode, lastID, firstID, firstID)

	// And from the last node to the last node.
	checkNode(t, lastNode, lastID, lastID, lastID)
}
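// The details struct decoded above is defined elsewhere in this package.
// A plausible shape, assuming the /_status/details/local payload carries
// the node ID plus the build info map exercised in TestBuildInfo (the
// JSON field names here are assumptions):
type detailsSketch struct {
	NodeID    roachpb.NodeID    `json:"nodeID"`
	BuildInfo map[string]string `json:"buildInfo"`
}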
// TestGossipPeerings verifies that gossip connections are
// re-established after individual nodes are restarted.
func TestGossipPeerings(t *testing.T) {
	l := localcluster.Create(*numNodes, stopper)
	l.Start()
	defer l.AssertAndStop(t)

	checkGossip(t, l, 20*time.Second, hasPeers(len(l.Nodes)))

	// Restart the first node.
	log.Infof("restarting node 0")
	if err := l.Nodes[0].Restart(5); err != nil {
		t.Fatal(err)
	}
	checkGossip(t, l, 20*time.Second, hasPeers(len(l.Nodes)))

	// Restart another node.
	rand.Seed(randutil.NewPseudoSeed())
	pickedNode := rand.Intn(len(l.Nodes)-1) + 1
	log.Infof("restarting node %d", pickedNode)
	if err := l.Nodes[pickedNode].Restart(5); err != nil {
		t.Fatal(err)
	}
	checkGossip(t, l, 20*time.Second, hasPeers(len(l.Nodes)))
}
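// hasPeers, hasClusterID and hasSentinel are gossip checkers defined
// elsewhere in this package. A minimal sketch of hasPeers, assuming
// checkGossip hands each checker the gossip infos it fetched from a node;
// the infos map type and the "node:" key prefix are assumptions.
func hasPeersSketch(expected int) func(infos map[string]interface{}) error {
	return func(infos map[string]interface{}) error {
		count := 0
		for k := range infos {
			// Assumes node descriptors are gossiped under "node:<id>" keys.
			if strings.HasPrefix(k, "node:") {
				count++
			}
		}
		if count != expected {
			return fmt.Errorf("%d peers, want %d", count, expected)
		}
		return nil
	}
}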
// TestChaos starts up a cluster and, for each node, a worker writing to
// independent keys, while nodes are being killed and restarted continuously.
// The test measures not write performance, but cluster recovery.
func TestChaos(t *testing.T) {
	t.Skip("TODO(tschottdorf): currently unstable")
	l := localcluster.Create(*numNodes, stopper)
	l.Start()
	defer l.AssertAndStop(t)

	checkRangeReplication(t, l, 20*time.Second)

	errs := make(chan error, *numNodes)
	start := time.Now()
	deadline := start.Add(*duration)
	var count int64
	counts := make([]int64, *numNodes)
	clients := make([]struct {
		sync.RWMutex
		db      *client.DB
		stopper *stop.Stopper
	}, *numNodes)

	initClient := func(i int) {
		db, dbStopper := makeDBClient(t, l, i)
		if clients[i].stopper != nil {
			clients[i].stopper.Stop()
		}
		clients[i].db, clients[i].stopper = db, dbStopper
	}

	for i := 0; i < *numNodes; i++ {
		initClient(i)
		go func(i int) {
			r, _ := randutil.NewPseudoRand()
			value := randutil.RandBytes(r, 8192)
			for time.Now().Before(deadline) {
				clients[i].RLock()
				k := atomic.AddInt64(&count, 1)
				atomic.AddInt64(&counts[i], 1)
				v := value[:r.Intn(len(value))]
				if err := clients[i].db.Put(fmt.Sprintf("%08d", k), v); err != nil {
					// These originate from DistSender when, for example, the
					// leader is down. With more realistic retry options, we
					// should probably not see them.
					if _, ok := err.(*roachpb.SendError); ok {
						log.Warning(err)
					} else {
						errs <- err
						clients[i].RUnlock()
						return
					}
				}
				clients[i].RUnlock()
			}
			errs <- nil
		}(i)
	}

	teardown := make(chan struct{})
	defer func() {
		<-teardown
		for i := range clients {
			clients[i].stopper.Stop()
			clients[i].stopper = nil
		}
	}()

	// Chaos monkey.
	go func() {
		defer close(teardown)
		rnd, seed := randutil.NewPseudoRand()
		log.Warningf("monkey starts (seed %d)", seed)
		for round := 1; time.Now().Before(deadline); round++ {
			select {
			case <-stopper:
				return
			default:
			}
			nodes := rnd.Perm(*numNodes)[:rnd.Intn(*numNodes)+1]
			log.Infof("round %d: restarting nodes %v", round, nodes)
			for _, i := range nodes {
				clients[i].Lock()
			}
			for _, i := range nodes {
				log.Infof("restarting %v", i)
				l.Nodes[i].Kill()
				l.Nodes[i].Restart(5)
				initClient(i)
				clients[i].Unlock()
			}
			for cur := atomic.LoadInt64(&count); time.Now().Before(deadline) &&
				atomic.LoadInt64(&count) == cur; time.Sleep(time.Second) {
				l.Assert(t)
				log.Warningf("monkey sleeping while cluster recovers...")
			}
		}
	}()

	for i := 0; i < *numNodes; {
		select {
		case <-teardown:
		case <-stopper:
			t.Fatal("interrupted")
		case err := <-errs:
			if err != nil {
				t.Error(err)
			}
			i++
		case <-time.After(1 * time.Second):
			// Periodically print out progress so that we know the test is still
			// running.
			cur := make([]string, *numNodes)
			for i := range cur {
				cur[i] = fmt.Sprintf("%d", atomic.LoadInt64(&counts[i]))
			}
			log.Infof("%d (%s)", atomic.LoadInt64(&count), strings.Join(cur, ", "))
		}
	}

	elapsed := time.Since(start)
	log.Infof("%d %.1f/sec", count, float64(count)/elapsed.Seconds())
}
// TestMultiuser starts up an N node cluster and performs various ops
// using different users.
func TestMultiuser(t *testing.T) {
	l := localcluster.Create(*numNodes, stopper)
	l.Start()
	defer l.Stop()

	// Create client certificates for "foo" and "other".
	if err := security.RunCreateClientCert(l.CertsDir, 512, "foo"); err != nil {
		t.Fatal(err)
	}
	if err := security.RunCreateClientCert(l.CertsDir, 512, "other"); err != nil {
		t.Fatal(err)
	}

	checkRangeReplication(t, l, 20*time.Second)

	// Make clients.
	rootClient := makeDBClientForUser(t, l, "root", 0)
	fooClient := makeDBClientForUser(t, l, "foo", 0)
	otherClient := makeDBClientForUser(t, l, "other", 0)

	// Set permissions configs.
	configs := []struct {
		prefix  string
		readers []string
		writers []string
	}{
		// Good to know: "root" is always allowed to read and write.
		{"foo", []string{"foo"}, []string{"foo"}},
		{"foo/public", []string{"foo", "other"}, []string{"foo"}},
		{"tmp", []string{"foo", "other"}, []string{"foo", "other"}},
	}
	for i, cfg := range configs {
		protoConfig := &config.PermConfig{Read: cfg.readers, Write: cfg.writers}
		if err := putPermConfig(rootClient, cfg.prefix, protoConfig); err != nil {
			t.Fatalf("#%d: failed to write config %+v for prefix %q: %v",
				i, protoConfig, cfg.prefix, err)
		}
	}

	// Write some data. The value is just the key.
	writes := []struct {
		key     string
		db      *client.DB
		success bool
	}{
		{"some-file", rootClient, true}, {"some-file", fooClient, false}, {"some-file", otherClient, false},
		{"foo/a", rootClient, true}, {"foo/a", fooClient, true}, {"foo/a", otherClient, false},
		{"foo/public/b", rootClient, true}, {"foo/public/b", fooClient, true}, {"foo/public/b", otherClient, false},
		{"tmp/c", rootClient, true}, {"tmp/c", fooClient, true}, {"tmp/c", otherClient, true},
	}
	for i, w := range writes {
		err := w.db.Put(w.key, w.key)
		if (err == nil) != w.success {
			t.Errorf("test case #%d: %+v, got err=%v", i, w, err)
		}
	}

	// Read the previously-written keys. Each key was written successfully
	// at least once (by root), so any failures below are permission errors.
	reads := []struct {
		key     string
		db      *client.DB
		success bool
	}{
		{"some-file", rootClient, true}, {"some-file", fooClient, false}, {"some-file", otherClient, false},
		{"foo/a", rootClient, true}, {"foo/a", fooClient, true}, {"foo/a", otherClient, false},
		{"foo/public/b", rootClient, true}, {"foo/public/b", fooClient, true}, {"foo/public/b", otherClient, true},
		{"tmp/c", rootClient, true}, {"tmp/c", fooClient, true}, {"tmp/c", otherClient, true},
	}
	for i, r := range reads {
		_, err := r.db.Get(r.key)
		if (err == nil) != r.success {
			t.Errorf("test case #%d: %+v, got err=%v", i, r, err)
		}
	}
}
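// putPermConfig is defined elsewhere in this package. A plausible sketch,
// assuming permission configs are stored as protos under a well-known
// config key prefix; the keys.MakeKey/keys.ConfigPermissionPrefix key
// construction below is an assumption, not the actual helper.
func putPermConfigSketch(db *client.DB, prefix string, cfg *config.PermConfig) error {
	// Assumed key layout: <config-permission-prefix><user-supplied prefix>.
	return db.Put(keys.MakeKey(keys.ConfigPermissionPrefix, roachpb.Key(prefix)), cfg)
}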
// TestSingleKey stresses the transaction retry machinery by starting
// up an N node cluster and running N workers that are all
// incrementing the value associated with a single key.
func TestSingleKey(t *testing.T) {
	l := localcluster.Create(*numNodes, stopper)
	l.Start()
	defer l.Stop()

	checkRangeReplication(t, l, 20*time.Second)

	// Initialize the value for our test key to zero.
	const key = "test-key"
	db, dbStopper := makeDBClient(t, l, 0)
	defer dbStopper.Stop()
	if err := db.Put(key, testVal(0)); err != nil {
		t.Fatal(err)
	}

	type result struct {
		err        error
		count      int
		maxLatency time.Duration
	}
	resultCh := make(chan result, *numNodes)
	deadline := time.Now().Add(*duration)
	var expected int64

	// Start up numNodes workers each reading and writing the same
	// key. Each worker is configured to talk to a different node in the
	// cluster.
	for i := 0; i < *numNodes; i++ {
		db, dbStopper := makeDBClient(t, l, i)
		defer dbStopper.Stop()
		go func() {
			var r result
			for time.Now().Before(deadline) {
				start := time.Now()
				err := db.Txn(func(txn *client.Txn) error {
					kv, err := txn.Get(key)
					if err != nil {
						return err
					}
					var v testVal
					if err := v.UnmarshalBinary(kv.ValueBytes()); err != nil {
						return err
					}
					b := &client.Batch{}
					b.Put(key, v+1)
					return txn.CommitInBatch(b)
				})
				if err != nil {
					resultCh <- result{err: err}
					return
				}
				atomic.AddInt64(&expected, 1)
				r.count++
				latency := time.Since(start)
				if r.maxLatency < latency {
					r.maxLatency = latency
				}
			}
			resultCh <- r
		}()
	}

	// Verify that none of the workers encountered an error.
	var results []result
	for len(results) < *numNodes {
		select {
		case <-stopper:
			t.Fatalf("interrupted")
		case r := <-resultCh:
			if r.err != nil {
				t.Fatal(r.err)
			}
			results = append(results, r)
		case <-time.After(1 * time.Second):
			// Periodically print out progress so that we know the test is still
			// running.
			log.Infof("%d", atomic.LoadInt64(&expected))
		}
	}

	// Verify the resulting value stored at the key is what we expect.
	r, err := db.Get(key)
	if err != nil {
		t.Fatal(err)
	}
	var v testVal
	if err := v.UnmarshalBinary(r.ValueBytes()); err != nil {
		t.Fatal(err)
	}
	if expected != int64(v) {
		t.Fatalf("expected %d, but found %d", expected, v)
	}
	var maxLatency []time.Duration
	for _, r := range results {
		maxLatency = append(maxLatency, r.maxLatency)
	}
	log.Infof("%d increments: %s", v, maxLatency)
}
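// testVal is defined elsewhere in this package; below is a minimal sketch
// of a type satisfying the usage above (an integer counter with binary
// marshaling), assuming varint encoding — the real encoding may differ.
type testValSketch int64

// MarshalBinary encodes the counter as a varint.
func (v testValSketch) MarshalBinary() ([]byte, error) {
	buf := make([]byte, binary.MaxVarintLen64)
	n := binary.PutVarint(buf, int64(v))
	return buf[:n], nil
}

// UnmarshalBinary decodes a varint-encoded counter.
func (v *testValSketch) UnmarshalBinary(data []byte) error {
	i, n := binary.Varint(data)
	if n <= 0 {
		return fmt.Errorf("bad testVal: %v", data)
	}
	*v = testValSketch(i)
	return nil
}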