Example 1
func TestWordCount(t *testing.T) {
	t.Parallel()
	cache := etcache.NewTestCache()
	// Set up 2 shards
	shard1 := NewShard("", "TestWordCountData-0-2", "TestWordCountPipelines-0-2", 0, 2, cache)
	require.NoError(t, shard1.EnsureRepos())
	s1 := httptest.NewServer(NewShardHTTPHandler(shard1))
	defer s1.Close()
	shard2 := NewShard("", "TestWordCountData-1-2", "TestWordCountPipelines-1-2", 1, 2, cache)
	require.NoError(t, shard2.EnsureRepos())
	s2 := httptest.NewServer(NewShardHTTPHandler(shard2))
	defer s2.Close()

	checkWriteFile(t, s1.URL, path.Join("data", "1"), "master",
		`Mr and Mrs Dursley, of number four, Privet Drive, were proud to say that they
were perfectly normal, thank you very much. They were the last people you'd
expect to be involved in anything strange or mysterious, because they just
didn't hold with such nonsense.`)
	checkWriteFile(t, s2.URL, path.Join("data", "2"), "master",
		`Mr Dursley was the director of a firm called Grunnings, which made drills.
He was a big, beefy man with hardly any neck, although he did have a very
large moustache. Mrs Dursley was thin and blonde and had nearly twice the
usual amount of neck, which came in very useful as she spent so much of her
time craning over garden fences, spying on the neighbours. The Dursleys had
a small son called Dudley and in their opinion there was no finer boy
anywhere.`)

	// Spoof the shards in etcache
	cache.SpoofMany("/pfs/master", []string{s1.URL, s2.URL}, false)

	pipeline := `
image ubuntu

input data

run mkdir /out/counts
run cat /in/data/* | tr -cs "A-Za-z'" "\n" | sort | uniq -c | sort -n -r | while read count; do echo ${count% *} >/out/counts/${count#* }; done
shuffle counts
run find /out/counts | while read count; do cat $count | awk '{ sum+=$1} END {print sum}' >/tmp/count; mv /tmp/count $count; done
`
	res, err := http.Post(s1.URL+"/pipeline/wc", "application/text", strings.NewReader(pipeline))
	require.NoError(t, err)
	res.Body.Close()
	res, err = http.Post(s2.URL+"/pipeline/wc", "application/text", strings.NewReader(pipeline))
	require.NoError(t, err)
	res.Body.Close()

	res, err = http.Post(s1.URL+"/commit?commit=commit1", "", nil)
	require.NoError(t, err)
	res.Body.Close()
	res, err = http.Post(s2.URL+"/commit?commit=commit1", "", nil)
	require.NoError(t, err)
	res.Body.Close()

	// There should be 3 occurrences of Dursley
	checkFile(t, s1.URL+"/pipeline/wc", path.Join("counts", "Dursley"), "commit1", "3\n")
}
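The checkWriteFile and checkFile helpers are defined elsewhere in the test suite. A minimal sketch of what they might look like, assuming the shard HTTP handler serves files under a /file/<path> route with branch and commit query parameters and using testify's require package; the route, signatures, and import are inferred from the call sites above, not taken from the real implementation:

package pfs_test

import (
	"io/ioutil"
	"net/http"
	"strings"
	"testing"

	"github.com/stretchr/testify/require"
)

// checkWriteFile posts data to a file on one shard and fails the test on any
// error. The /file/<path>?branch=... route is an assumption, not the real API.
func checkWriteFile(t *testing.T, url, name, branch, data string) {
	res, err := http.Post(url+"/file/"+name+"?branch="+branch, "application/text", strings.NewReader(data))
	require.NoError(t, err)
	defer res.Body.Close()
	require.Equal(t, http.StatusOK, res.StatusCode)
}

// checkFile reads a file back at a given commit and compares it to expected.
func checkFile(t *testing.T, url, name, commit, expected string) {
	res, err := http.Get(url + "/file/" + name + "?commit=" + commit)
	require.NoError(t, err)
	defer res.Body.Close()
	content, err := ioutil.ReadAll(res.Body)
	require.NoError(t, err)
	require.Equal(t, expected, string(content))
}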
Example 2
func TestShuffle(t *testing.T) {
	t.Parallel()
	cache := etcache.NewTestCache()
	// Set up 2 shards
	shard1 := NewShard("", "TestShuffleData-0-2", "TestShufflePipelines-0-2", 0, 2, cache)
	require.NoError(t, shard1.EnsureRepos())
	s1 := httptest.NewServer(NewShardHTTPHandler(shard1))
	defer s1.Close()
	shard2 := NewShard("", "TestShuffleData-1-2", "TestShufflePipelines-1-2", 1, 2, cache)
	require.NoError(t, shard2.EnsureRepos())
	s2 := httptest.NewServer(NewShardHTTPHandler(shard2))
	defer s2.Close()

	files := []string{"foo", "bar", "fizz", "buzz"}

	for _, file := range files {
		checkWriteFile(t, s1.URL, path.Join("data", file), "master", file)
		checkWriteFile(t, s2.URL, path.Join("data", file), "master", file)
	}

	// Spoof the shards in etcache
	cache.SpoofMany("/pfs/master", []string{s1.URL, s2.URL}, false)

	pipeline := `
image ubuntu

input data

run cp -r /in/data /out

shuffle data
`
	res, err := http.Post(s1.URL+"/pipeline/shuffle", "application/text", strings.NewReader(pipeline))
	require.NoError(t, err)
	res.Body.Close()
	res, err = http.Post(s2.URL+"/pipeline/shuffle", "application/text", strings.NewReader(pipeline))
	require.NoError(t, err)
	res.Body.Close()

	res, err = http.Post(s1.URL+"/commit?commit=commit1", "", nil)
	require.NoError(t, err)
	res, err = http.Post(s2.URL+"/commit?commit=commit1", "", nil)
	require.NoError(t, err)

	for _, file := range files {
		match, err := route.Match(path.Join("data", file), "0-2")
		require.NoError(t, err)
		if match {
			log.Print("shard: s1 file: ", file)
			checkFile(t, s1.URL+"/pipeline/shuffle", path.Join("data", file), "commit1", file+file)
		} else {
			log.Print("shard: s2 file: ", file)
			checkFile(t, s2.URL+"/pipeline/shuffle", path.Join("data", file), "commit1", file+file)
		}
	}
}
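After the shuffle step, each output file lives on exactly one shard, and the test uses route.Match to decide whether to look for it on s1 or s2. A rough sketch of that idea, assuming a shard descriptor of the form "index-modulos" (so "0-2" means shard 0 of 2) and a simple hash of the file path; the hash function and string format are assumptions for illustration, not the real route package:

package route_sketch

import (
	"fmt"
	"hash/adler32"
	"strconv"
	"strings"
)

// matchSketch reports whether a file path belongs to shard "index-modulos".
func matchSketch(file, shard string) (bool, error) {
	parts := strings.Split(shard, "-")
	if len(parts) != 2 {
		return false, fmt.Errorf("malformed shard %q", shard)
	}
	index, err := strconv.ParseUint(parts[0], 10, 64)
	if err != nil {
		return false, err
	}
	modulos, err := strconv.ParseUint(parts[1], 10, 64)
	if err != nil {
		return false, err
	}
	if modulos == 0 {
		return false, fmt.Errorf("modulos must be nonzero")
	}
	return uint64(adler32.Checksum([]byte(file)))%modulos == index, nil
}

Because each path hashes to exactly one bucket, every file is checked on one shard or the other, never both; the expected content file+file comes from the two identical copies written to the two input shards being merged by the shuffle.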
Example 3
func TestWordCount(t *testing.T) {
	t.Parallel()
	maxCount := 2
	if testing.Short() {
		maxCount = 1
	}
	// First set up the WordCount pipeline
	pipeline := `
image ubuntu

input data

run mkdir -p /out/counts
run cat /in/data/* | tr -cs "A-Za-z'" "\n" | sort | uniq -c | sort -n -r | while read count; do echo ${count% *} >/out/counts/${count#* }; done
shuffle counts
run find /out/counts | while read count; do cat $count | awk '{ sum+=$1} END {print sum}' >/tmp/count; mv /tmp/count $count; done
`
	// used to prevent collisions
	counter := 0
	f := func(w traffic.Workload) bool {
		defer func() { counter++ }()
		cluster := newCluster(t, fmt.Sprintf("TestWordCount-%d", counter), 4, etcache.NewTestCache())
		defer cluster.Close()
		// Run the workload
		storage.RunWorkload(t, cluster.router.URL, w)
		// Install the pipeline
		response, err := http.Post(cluster.router.URL+"/pipeline/wc", "application/text", strings.NewReader(pipeline))
		assert.NoError(t, err)
		defer response.Body.Close()
		// Make a commit
		response, err = http.Post(cluster.router.URL+"/commit?commit=commit1", "", nil)
		assert.NoError(t, err)
		defer response.Body.Close()
		// TODO(jd) make this check for correctness, not just that the request
		// completes. It's a bit hard because the input is random. Probably the
		// right idea is to modify the traffic package so that it keeps track of
		// this.
		response, err = http.Get(cluster.router.URL + "/pipeline/wc/file/counts/*?commit=commit1")
		assert.NoError(t, err)
		defer response.Body.Close()
		require.Equal(t, http.StatusOK, response.StatusCode)
		return true
	}
	if err := quick.Check(f, &quick.Config{MaxCount: maxCount}); err != nil {
		t.Error(err)
	}
}
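quick.Check drives this test: it generates random traffic.Workload values and calls f with each one, up to MaxCount times. For a custom type to be generated that way it must either be composed of types testing/quick already knows how to generate or implement the quick.Generator interface. A self-contained sketch of that pattern, with a stand-in Workload type rather than the real traffic package:

package quick_sketch

import (
	"math/rand"
	"reflect"
	"testing"
	"testing/quick"
)

// Workload is a stand-in for traffic.Workload; here it is just a list of
// file names.
type Workload []string

// Generate satisfies quick.Generator by producing a workload of random
// single-letter names. The real traffic package presumably generates
// something richer (writes, reads, commits).
func (Workload) Generate(r *rand.Rand, size int) reflect.Value {
	w := make(Workload, size)
	for i := range w {
		w[i] = string(rune('a' + r.Intn(26)))
	}
	return reflect.ValueOf(w)
}

func TestPropertySketch(t *testing.T) {
	f := func(w Workload) bool {
		return len(w) >= 0 // a trivially true property, standing in for the real check
	}
	if err := quick.Check(f, &quick.Config{MaxCount: 2}); err != nil {
		t.Error(err)
	}
}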
Example 4
func TestTwoShards(t *testing.T) {
	t.Parallel()
	maxCount := 5
	if testing.Short() {
		maxCount = 1
	}
	// used to prevent collisions
	counter := 0
	f := func(w traffic.Workload) bool {
		defer func() { counter++ }()
		cluster := newCluster(t, fmt.Sprintf("TestTwoShards-%d", counter), 2, etcache.NewTestCache())
		defer cluster.Close()
		// Run the workload
		storage.RunWorkload(t, cluster.router.URL, w)
		// Make sure we see the changes we should
		facts := w.Facts()
		storage.RunWorkload(t, cluster.router.URL, facts)
		// the counter is incremented by the deferred function above
		return true
	}
	if err := quick.Check(f, &quick.Config{MaxCount: maxCount}); err != nil {
		t.Error(err)
	}
}
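The verification step relies on the traffic package's write-then-verify shape: a randomly generated workload records what it wrote, and w.Facts() returns a derived workload that only reads back and asserts that state, so replaying it against the same cluster checks the first run's effects. A minimal sketch of that shape, with made-up types standing in for the real traffic package:

package traffic_sketch

// Op is a single read or write against the cluster; a stand-in for whatever
// the real traffic package uses internally.
type Op struct {
	Path     string
	Data     string
	ReadOnly bool
}

// Workload is an ordered list of operations.
type Workload []Op

// Facts turns every write into a read that expects the written data back,
// so running the result verifies the effects of the original workload.
func (w Workload) Facts() Workload {
	facts := make(Workload, 0, len(w))
	for _, op := range w {
		if op.ReadOnly {
			continue
		}
		facts = append(facts, Op{Path: op.Path, Data: op.Data, ReadOnly: true})
	}
	return facts
}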