func TestShardFilter(t *testing.T) { t.Parallel() shard := NewShard("", "TestShardFilterData", "TestShardFilterPipelines", 0, 1, etcache.NewCache()) require.NoError(t, shard.EnsureRepos()) s := httptest.NewServer(NewShardHTTPHandler(shard)) defer s.Close() res, err := http.Post(s.URL+"/pipeline/files", "application/text", strings.NewReader(` image ubuntu run touch /out/foo run touch /out/bar run touch /out/buzz run touch /out/bizz `)) require.NoError(t, err) res.Body.Close() res, err = http.Post(s.URL+"/commit?commit=commit1", "", nil) require.NoError(t, err) // Map to store files we receive files := make(map[string]struct{}) res, err = http.Get(s.URL + path.Join("/pipeline", "files", "file", "*") + "?commit=commit1&shard=0-2") require.NoError(t, err) require.Equal(t, http.StatusOK, res.StatusCode) reader := multipart.NewReader(res.Body, res.Header.Get("Boundary")) for p, err := reader.NextPart(); err != io.EOF; p, err = reader.NextPart() { require.NoError(t, err) match, err := route.Match(p.FileName(), "0-2") require.NoError(t, err) require.True(t, match, fmt.Sprintf("%s should match", p.FileName())) if _, ok := files[p.FileName()]; ok == true { t.Fatalf("File: %s received twice.", p.FileName()) } files[p.FileName()] = struct{}{} } res, err = http.Get(s.URL + path.Join("/pipeline", "files", "file", "*") + "?commit=commit1&shard=1-2") require.NoError(t, err) require.Equal(t, http.StatusOK, res.StatusCode) reader = multipart.NewReader(res.Body, res.Header.Get("Boundary")) for p, err := reader.NextPart(); err != io.EOF; p, err = reader.NextPart() { require.NoError(t, err) match, err := route.Match(p.FileName(), "1-2") require.NoError(t, err) require.True(t, match, fmt.Sprintf("%s should match", p.FileName())) if _, ok := files[p.FileName()]; ok == true { t.Fatalf("File: %s received twice.", p.FileName()) } files[p.FileName()] = struct{}{} } }
func (s *shard) PipelineFileGetAll(pipelineName string, fileName string, commit string, shard string) ([]File, error) { matches, err := btrfs.Glob(path.Join(s.pipelinePrefix, pipelineName, commit, fileName)) if err != nil { return nil, err } var result []File for _, match := range matches { prefix := path.Join("/", s.pipelinePrefix, pipelineName, commit) if !strings.HasSuffix(prefix, "/") { prefix = prefix + "/" } name := strings.TrimPrefix(match, prefix) if shard != "" { ok, err := route.Match(name, shard) if err != nil { return nil, err } if !ok { continue } } file, err := s.PipelineFileGet(pipelineName, name, commit) if err == ErrIsDirectory { continue } if err != nil { return nil, err } result = append(result, file) } return result, nil }
func TestShuffle(t *testing.T) { t.Parallel() cache := etcache.NewTestCache() // Setup 2 shards shard1 := NewShard("", "TestShuffleData-0-2", "TestShufflePipelines-0-2", 0, 2, cache) require.NoError(t, shard1.EnsureRepos()) s1 := httptest.NewServer(NewShardHTTPHandler(shard1)) defer s1.Close() shard2 := NewShard("", "TestShuffleData-1-2", "TestShufflePipelines-1-2", 1, 2, cache) require.NoError(t, shard2.EnsureRepos()) s2 := httptest.NewServer(NewShardHTTPHandler(shard2)) defer s2.Close() files := []string{"foo", "bar", "fizz", "buzz"} for _, file := range files { checkWriteFile(t, s1.URL, path.Join("data", file), "master", file) checkWriteFile(t, s2.URL, path.Join("data", file), "master", file) } // Spoof the shards in etcache cache.SpoofMany("/pfs/master", []string{s1.URL, s2.URL}, false) pipeline := ` image ubuntu input data run cp -r /in/data /out shuffle data ` res, err := http.Post(s1.URL+"/pipeline/shuffle", "application/text", strings.NewReader(pipeline)) require.NoError(t, err) res.Body.Close() res, err = http.Post(s2.URL+"/pipeline/shuffle", "application/text", strings.NewReader(pipeline)) require.NoError(t, err) res.Body.Close() res, err = http.Post(s1.URL+"/commit?commit=commit1", "", nil) require.NoError(t, err) res, err = http.Post(s2.URL+"/commit?commit=commit1", "", nil) require.NoError(t, err) for _, file := range files { match, err := route.Match(path.Join("data", file), "0-2") require.NoError(t, err) if match { log.Print("shard: s1 file: ", file) checkFile(t, s1.URL+"/pipeline/shuffle", path.Join("data", file), "commit1", file+file) } else { log.Print("shard: s2 file: ", file) checkFile(t, s2.URL+"/pipeline/shuffle", path.Join("data", file), "commit1", file+file) } } }
// inject injects data from an external source into the output directory func (p *pipeline) inject(name string, public bool) error { switch { case strings.HasPrefix(name, "s3://"): bucket, err := s3utils.GetBucket(name) if err != nil { return err } client := s3utils.NewClient(public) var wg sync.WaitGroup s3utils.ForEachFile(name, public, "", func(file string, modtime time.Time) error { // Grab the path, it's handy later _path, err := s3utils.GetPath(name) if err != nil { return err } if err != nil { return err } // Check if the file belongs on shit shard match, err := route.Match(file, p.shard) if err != nil { return err } if !match { return nil } // Check if the file has changed changed, err := btrfs.Changed(path.Join(p.outRepo, p.branch, strings.TrimPrefix(file, _path)), modtime) if err != nil { return err } if !changed { return nil } // TODO match the on disk timestamps to s3's timestamps and make // sure we only pull data that has changed wg.Add(1) go func() { defer wg.Done() response, err := client.GetObject(&s3.GetObjectInput{ Bucket: &bucket, Key: &file, }) if err != nil { return } src := response.Body dst, err := btrfs.CreateAll(path.Join(p.outRepo, p.branch, strings.TrimPrefix(file, _path))) if err != nil { return } defer dst.Close() _, err = io.Copy(dst, src) if err != nil { return } err = btrfs.Chtimes(path.Join(p.outRepo, p.branch, strings.TrimPrefix(file, _path)), modtime, modtime) if err != nil { return } }() return nil }) wg.Wait() default: log.Print("Unknown protocol: ", name) return ErrUnknownProtocol } return nil }