// TestRecover runs a pipeline with an error. Then fixes the pipeline to not // include an error and reruns it. func TestRecover(t *testing.T) { t.Parallel() inRepo := "TestRecover_in" require.NoError(t, btrfs.Init(inRepo)) outPrefix := "TestRecover_out" // Create the Pachfile require.NoError(t, btrfs.WriteFile(path.Join(inRepo, "master", "pipeline", "recover"), []byte(` image ubuntu run touch /out/foo run touch /out/bar && cp /in/foo /out/bar `))) // Last line should fail here. // Commit to the inRepo require.NoError(t, btrfs.Commit(inRepo, "commit1", "master")) // Run the pipelines err := RunPipelines("pipeline", inRepo, outPrefix, "commit1", "master", "0-1", etcache.NewCache()) require.Error(t, err, "Running pipeline should error.") // Fix the Pachfile require.NoError(t, btrfs.WriteFile(path.Join(inRepo, "master", "pipeline", "recover"), []byte(` image ubuntu run touch /out/foo run touch /out/bar `))) // Commit to the inRepo require.NoError(t, btrfs.Commit(inRepo, "commit2", "master")) // Run the pipelines err = RunPipelines("pipeline", inRepo, outPrefix, "commit2", "master", "0-1", etcache.NewCache()) // this time the pipelines should not err require.NoError(t, err) // These are the most important 2 checks: // If this one fails it means that dirty state isn't properly saved checkExists(t, path.Join(outPrefix, "recover", "commit1-fail/bar")) // If this one fails it means that dirty state isn't properly cleared checkNoExists(t, path.Join(outPrefix, "recover", "commit2-0/bar")) // These commits are mostly covered by other tests checkExists(t, path.Join(outPrefix, "recover", "commit1-fail/foo")) checkExists(t, path.Join(outPrefix, "recover", "commit1-0/foo")) checkNoExists(t, path.Join(outPrefix, "recover", "commit1-1")) checkNoExists(t, path.Join(outPrefix, "recover", "commit1")) checkExists(t, path.Join(outPrefix, "recover", "commit2-0/foo")) checkExists(t, path.Join(outPrefix, "recover", "commit2-1/foo")) checkExists(t, path.Join(outPrefix, "recover", 
"commit2-1/bar")) checkExists(t, path.Join(outPrefix, "recover", "commit2/foo")) checkExists(t, path.Join(outPrefix, "recover", "commit2/bar")) }
func TestInputOutput(t *testing.T) { t.Parallel() // create the in repo pipeline := newTestPipeline(t, "inputOutput", "commit", "master", "0-1", true) // add data to it err := btrfs.WriteFile(path.Join(pipeline.inRepo, "master", "data", "foo"), []byte("foo")) require.NoError(t, err) // commit data err = btrfs.Commit(pipeline.inRepo, "commit", "master") require.NoError(t, err) pachfile := ` image ubuntu input data run cp /in/data/foo /out/foo ` err = pipeline.runPachFile(strings.NewReader(pachfile)) require.NoError(t, err) exists, err := btrfs.FileExists(path.Join(pipeline.outRepo, "commit-0", "foo")) require.NoError(t, err) require.True(t, exists, "File `foo` doesn't exist when it should.") }
func TestRunnerInputs(t *testing.T) { t.Parallel() inRepo := "TestRunnerInputs_in" require.NoError(t, btrfs.Init(inRepo)) p1 := ` image ubuntu input foo input bar ` require.NoError(t, btrfs.WriteFile(path.Join(inRepo, "master", "pipeline", "p1"), []byte(p1))) p2 := ` image ubuntu input fizz input buzz ` require.NoError(t, btrfs.WriteFile(path.Join(inRepo, "master", "pipeline", "p2"), []byte(p2))) require.NoError(t, btrfs.Commit(inRepo, "commit", "master")) outPrefix := "TestRunnerInputs" runner := NewRunner("pipeline", inRepo, outPrefix, "commit", "master", "0-1", etcache.NewCache()) inputs, err := runner.Inputs() require.NoError(t, err) require.Equal(t, []string{"foo", "bar", "fizz", "buzz"}, inputs) }
func TestDependency(t *testing.T) { t.Parallel() inRepo := "TestDependency_in" require.NoError(t, btrfs.Init(inRepo)) p1 := ` image ubuntu run echo foo >/out/foo ` require.NoError(t, btrfs.WriteFile(path.Join(inRepo, "master", "pipeline", "p1"), []byte(p1))) p2 := ` image ubuntu input pps://p1 run cp /in/p1/foo /out/foo ` require.NoError(t, btrfs.WriteFile(path.Join(inRepo, "master", "pipeline", "p2"), []byte(p2))) require.NoError(t, btrfs.Commit(inRepo, "commit", "master")) outPrefix := "TestDependency" runner := NewRunner("pipeline", inRepo, outPrefix, "commit", "master", "0-1", etcache.NewCache()) require.NoError(t, runner.Run()) res, err := btrfs.ReadFile(path.Join(outPrefix, "p2", "commit", "foo")) require.NoError(t, err) require.Equal(t, "foo\n", string(res)) }
func TestCancel(t *testing.T) { t.Parallel() inRepo := "TestCancel_in" require.NoError(t, btrfs.Init(inRepo)) outPrefix := "TestCancel_out" // Create the Pachfile require.NoError(t, btrfs.WriteFile(path.Join(inRepo, "master", "pipeline", "cancel"), []byte(` image ubuntu run sleep 100 `))) require.NoError(t, btrfs.Commit(inRepo, "commit", "master")) r := NewRunner("pipeline", inRepo, outPrefix, "commit", "master", "0-1", etcache.NewCache()) go func() { err := r.Run() require.Equal(t, ErrCancelled, err) }() // This is just to make sure we don't trigger the early exit case in Run // and actually exercise the code. time.Sleep(time.Second * 2) require.NoError(t, r.Cancel()) }
// TestError makes sure that we handle commands that error correctly. func TestError(t *testing.T) { t.Parallel() inRepo := "TestError_in" require.NoError(t, btrfs.Init(inRepo)) outPrefix := "TestError_out" // Create the Pachfile require.NoError(t, btrfs.WriteFile(path.Join(inRepo, "master", "pipeline", "error"), []byte(` image ubuntu run touch /out/foo run cp /in/foo /out/bar `))) // Last line should fail here. // Commit to the inRepo require.NoError(t, btrfs.Commit(inRepo, "commit", "master")) err := RunPipelines("pipeline", inRepo, outPrefix, "commit", "master", "0-1", etcache.NewCache()) require.Error(t, err, "Running pipeline should error.") // Check that foo exists exists, err := btrfs.FileExists(path.Join(outPrefix, "error", "commit-0", "foo")) require.NoError(t, err) require.True(t, exists, "File foo should exist.") // Check that commit doesn't exist exists, err = btrfs.FileExists(path.Join(outPrefix, "error", "commit")) require.NoError(t, err) require.False(t, exists, "Commit \"commit\" should not get created when a command fails.") }
// TestPipelines runs a 2 step pipeline. func TestPipelines(t *testing.T) { t.Parallel() inRepo := "TestPipelines_in" require.NoError(t, btrfs.Init(inRepo)) outPrefix := "TestPipelines_out" // Create a data file: require.NoError(t, btrfs.WriteFile(path.Join(inRepo, "master", "data", "foo"), []byte("foo"))) // Create the Pachfile require.NoError(t, btrfs.WriteFile(path.Join(inRepo, "master", "pipeline", "cp"), []byte(` image ubuntu input data run cp /in/data/foo /out/foo run echo "foo" `))) require.NoError(t, btrfs.Commit(inRepo, "commit", "master")) require.NoError(t, RunPipelines("pipeline", inRepo, outPrefix, "commit", "master", "0-1", etcache.NewCache())) data, err := btrfs.ReadFile(path.Join(outPrefix, "cp", "commit", "foo")) require.NoError(t, err) require.Equal(t, "foo", string(data)) }
// TestScrape tests a the scraper pipeline func TestScrape(t *testing.T) { // TODO(any): what?? wget is not found in the container if parallel is set //t.Parallel() pipeline := newTestPipeline(t, "scrape", "commit", "master", "0-1", true) // Create a url to scrape require.NoError(t, btrfs.WriteFile(path.Join(pipeline.inRepo, "master", "urls", "1"), []byte("pachyderm.io"))) // Commit the data require.NoError(t, btrfs.Commit(pipeline.inRepo, "commit", "master")) // Create a pipeline to run pachfile := ` image pachyderm/scraper input urls run cat /in/urls/* | xargs wget -P /out ` err := pipeline.runPachFile(strings.NewReader(pachfile)) exists, err := btrfs.FileExists(path.Join(pipeline.outRepo, "commit", "index.html")) require.NoError(t, err) require.True(t, exists, "pachyderm.io should exist") }
func (s *shard) CommitCreate(name string, branch string) (Commit, error) { if err := btrfs.Commit(s.dataRepo, name, branch); err != nil { return Commit{}, err } // We lock the guard so that we can remove the oldRunner from the map // and add the newRunner in. s.guard.Lock() oldRunner, ok := s.runners[branch] newRunner := pipeline.NewRunner("pipeline", s.dataRepo, s.pipelinePrefix, name, branch, s.shardStr, s.cache) s.runners[branch] = newRunner s.guard.Unlock() go func() { // cancel oldRunner if it exists if ok { err := oldRunner.Cancel() if err != nil { log.Print(err) } } err := newRunner.Run() if err != nil { log.Print(err) } }() go s.syncToPeers() return s.CommitGet(name) }
// finish makes the final commit for the pipeline func (p *pipeline) finish() error { exists, err := btrfs.FileExists(path.Join(p.outRepo, p.commit)) if err != nil { return err } if exists { return nil } return btrfs.Commit(p.outRepo, p.commit, p.branch) }
// Run runs a command in the container, it assumes that `branch` has already
// been created.
// Notice that any failure in this function leads to the branch having
// uncommitted dirty changes. This state needs to be cleaned up before the
// pipeline is rerun. The reason we don't do it here is that even if we try our
// best the process crashing at the wrong time could still leave it in an
// inconsistent state.
func (p *pipeline) run(cmd []string) error {
	// this function always increments counter
	defer func() { p.counter++ }()
	// Check if the commit already exists
	exists, err := btrfs.FileExists(path.Join(p.outRepo, p.runCommit()))
	if err != nil {
		return err
	}
	// if the commit exists there's no work to be done
	if exists {
		return nil
	}
	// Set the command. The real command is delivered over stdin below;
	// here we just start an interactive shell to receive it.
	p.config.Config.Cmd = []string{"sh"}
	//p.config.Config.Volumes["/out"] = emptyStruct()
	// Map the out directory in as a bind
	hostPath := btrfs.HostPath(path.Join(p.outRepo, p.branch))
	bind := fmt.Sprintf("%s:/out", hostPath)
	p.config.HostConfig.Binds = append(p.config.HostConfig.Binds, bind)
	log.Print(p.config.HostConfig.Binds)
	// Make sure this bind is only visible for the duration of run
	// (pop the bind we just appended; runs before the counter-increment
	// defer above, per LIFO defer order).
	defer func() { p.config.HostConfig.Binds = p.config.HostConfig.Binds[:len(p.config.HostConfig.Binds)-1] }()
	// Start the container
	p.container, err = startContainer(p.config)
	if err != nil {
		return err
	}
	// Feed the joined command to the shell via stdin.
	if err := pipeToStdin(p.container, strings.NewReader(strings.Join(cmd, " ")+"\n")); err != nil {
		return err
	}
	// Create a place to put the logs
	f, err := btrfs.CreateAll(path.Join(p.outRepo, p.branch, ".log"))
	if err != nil {
		return err
	}
	defer f.Close()
	// Copy the logs from the container in to the file.
	if err = containerLogs(p.container, f); err != nil {
		return err
	}
	// Wait for the command to finish:
	exit, err := waitContainer(p.container)
	if err != nil {
		return err
	}
	if exit != 0 {
		// The command errored
		return fmt.Errorf("Command:\n\t%s\nhad exit code: %d.\n", strings.Join(cmd, " "), exit)
	}
	// Success: seal this run's output as a commit.
	return btrfs.Commit(p.outRepo, p.runCommit(), p.branch)
}
// Shuffle rehashes an output directory. // If 2 shards each have a copy of the file `foo` with the content: `bar`. // Then after shuffling 1 of those nodes will have a file `foo` with content // `barbar` and the other will have no file foo. func (p *pipeline) shuffle(dir string) error { // this function always increments counter defer func() { p.counter++ }() // First we clear the directory, notice that the previous commit from // which we're pulling has already been made so this doesn't destroy the // data that others are trying to pull. // TODO(jd) #performance this is a seriously unperformant part of the code // since it messes up our ability to do incremental results. We should do // something smarter here. if err := btrfs.RemoveAll(path.Join(p.outRepo, p.branch, dir)); err != nil { return err } if err := btrfs.MkdirAll(path.Join(p.outRepo, p.branch, dir)); err != nil { return err } // We want to pull files from the previous commit commit := fmt.Sprintf("%s-%d", p.commit, p.counter-1) // Notice we're just passing "host" here. Multicast will fill in the host // field so we don't actually need to specify it. req, err := http.NewRequest("GET", "http://host/"+path.Join("pipeline", p.name, "file", dir, "*")+"?commit="+commit+"&shard="+p.shard, nil) if err != nil { return err } // Dispatch the request resps, err := route.Multicast(p.cache, req, "/pfs/master") if err != nil { return err } // Set up some concurrency structures. errors := make(chan error, len(resps)) var wg sync.WaitGroup wg.Add(len(resps)) lock := util.NewPathLock() // for _, resp := range resps { // We used to iterate like the above but it exhibited racy behavior. I // don't fully understand why this was. Something to look in to. 
for _, resp := range resps { go func(resp *http.Response) { defer wg.Done() reader := multipart.NewReader(resp.Body, resp.Header.Get("Boundary")) for part, err := reader.NextPart(); err != io.EOF; part, err = reader.NextPart() { lock.Lock(part.FileName()) _, err := btrfs.Append(path.Join(p.outRepo, p.branch, part.FileName()), part) lock.Unlock(part.FileName()) if err != nil { errors <- err return } } }(resp) } wg.Wait() close(errors) // Check for errors for err := range errors { if err != nil { return err } } return btrfs.Commit(p.outRepo, p.runCommit(), p.branch) }