Ejemplo n.º 1
0
// TestRecover first runs a pipeline whose Pachfile contains a failing command,
// then commits a corrected Pachfile and runs the pipeline again, checking what
// was left behind by both runs.
func TestRecover(t *testing.T) {
	t.Parallel()
	const (
		inRepo    = "TestRecover_in"
		outPrefix = "TestRecover_out"
	)
	require.NoError(t, btrfs.Init(inRepo))

	pachfilePath := path.Join(inRepo, "master", "pipeline", "recover")
	// out builds a path below the "recover" pipeline's output.
	out := func(rel string) string { return path.Join(outPrefix, "recover", rel) }

	// First Pachfile: its last line is expected to fail.
	broken := []byte(`
image ubuntu

run touch /out/foo
run touch /out/bar && cp /in/foo /out/bar
`)
	require.NoError(t, btrfs.WriteFile(pachfilePath, broken))
	require.NoError(t, btrfs.Commit(inRepo, "commit1", "master"))
	require.Error(t,
		RunPipelines("pipeline", inRepo, outPrefix, "commit1", "master", "0-1", etcache.NewCache()),
		"Running pipeline should error.")

	// Second Pachfile: same steps without the failing copy.
	fixed := []byte(`
image ubuntu

run touch /out/foo
run touch /out/bar
`)
	require.NoError(t, btrfs.WriteFile(pachfilePath, fixed))
	require.NoError(t, btrfs.Commit(inRepo, "commit2", "master"))
	// The rerun must succeed.
	require.NoError(t,
		RunPipelines("pipeline", inRepo, outPrefix, "commit2", "master", "0-1", etcache.NewCache()))

	// The checks run in the listed order. The first two are the important
	// ones; the rest is mostly covered by other tests.
	checks := []struct {
		rel    string
		exists bool
	}{
		// Fails if dirty state isn't properly saved.
		{"commit1-fail/bar", true},
		// Fails if dirty state isn't properly cleared.
		{"commit2-0/bar", false},

		{"commit1-fail/foo", true},
		{"commit1-0/foo", true},
		{"commit1-1", false},
		{"commit1", false},
		{"commit2-0/foo", true},
		{"commit2-1/foo", true},
		{"commit2-1/bar", true},
		{"commit2/foo", true},
		{"commit2/bar", true},
	}
	for _, c := range checks {
		if c.exists {
			checkExists(t, out(c.rel))
		} else {
			checkNoExists(t, out(c.rel))
		}
	}
}
Ejemplo n.º 2
0
// TestInputOutput runs a one-step Pachfile that copies a file from the
// declared input `data` to the output and checks that the copy shows up in the
// output repo.
func TestInputOutput(t *testing.T) {
	t.Parallel()
	const pachfile = `
image ubuntu

input data

run cp /in/data/foo /out/foo
`
	// Set up the test pipeline (presumably this also creates the input repo).
	tp := newTestPipeline(t, "inputOutput", "commit", "master", "0-1", true)

	// Put a data file in the input repo and commit it.
	require.NoError(t, btrfs.WriteFile(path.Join(tp.inRepo, "master", "data", "foo"), []byte("foo")))
	require.NoError(t, btrfs.Commit(tp.inRepo, "commit", "master"))

	require.NoError(t, tp.runPachFile(strings.NewReader(pachfile)))

	// The copy must be present in the commit made for the first run step.
	found, err := btrfs.FileExists(path.Join(tp.outRepo, "commit-0", "foo"))
	require.NoError(t, err)
	require.True(t, found, "File `foo` doesn't exist when it should.")
}
Ejemplo n.º 3
0
// TestRunnerInputs commits two Pachfiles that each declare two inputs and
// checks that Runner.Inputs reports all four of them in order.
func TestRunnerInputs(t *testing.T) {
	t.Parallel()
	const (
		inRepo    = "TestRunnerInputs_in"
		outPrefix = "TestRunnerInputs"
	)
	require.NoError(t, btrfs.Init(inRepo))

	// Pachfiles are written in the listed order.
	pachfiles := []struct{ name, body string }{
		{"p1", `
image ubuntu

input foo
input bar
`},
		{"p2", `
image ubuntu

input fizz
input buzz
`},
	}
	for _, pf := range pachfiles {
		require.NoError(t, btrfs.WriteFile(path.Join(inRepo, "master", "pipeline", pf.name), []byte(pf.body)))
	}
	require.NoError(t, btrfs.Commit(inRepo, "commit", "master"))

	got, err := NewRunner("pipeline", inRepo, outPrefix, "commit", "master", "0-1", etcache.NewCache()).Inputs()
	require.NoError(t, err)
	require.Equal(t, []string{"foo", "bar", "fizz", "buzz"}, got)
}
Ejemplo n.º 4
0
// TestDependency runs two pipelines where p2 declares the output of p1 as its
// input (`pps://p1`) and checks that p2's final commit contains the file that
// p1 produced.
func TestDependency(t *testing.T) {
	t.Parallel()
	const (
		inRepo    = "TestDependency_in"
		outPrefix = "TestDependency"
	)
	require.NoError(t, btrfs.Init(inRepo))

	// Pachfiles are written in the listed order.
	pachfiles := []struct{ name, body string }{
		{"p1", `
image ubuntu

run echo foo >/out/foo
`},
		{"p2", `
image ubuntu

input pps://p1

run cp /in/p1/foo /out/foo
`},
	}
	for _, pf := range pachfiles {
		require.NoError(t, btrfs.WriteFile(path.Join(inRepo, "master", "pipeline", pf.name), []byte(pf.body)))
	}
	require.NoError(t, btrfs.Commit(inRepo, "commit", "master"))

	require.NoError(t,
		NewRunner("pipeline", inRepo, outPrefix, "commit", "master", "0-1", etcache.NewCache()).Run())

	// p2 must have received the file written by p1.
	content, err := btrfs.ReadFile(path.Join(outPrefix, "p2", "commit", "foo"))
	require.NoError(t, err)
	require.Equal(t, "foo\n", string(content))
}
Ejemplo n.º 5
0
// TestCancel starts a pipeline that sleeps for a long time, cancels it and
// checks that Run returns ErrCancelled.
func TestCancel(t *testing.T) {
	t.Parallel()
	inRepo := "TestCancel_in"
	require.NoError(t, btrfs.Init(inRepo))
	outPrefix := "TestCancel_out"

	// Create the Pachfile
	require.NoError(t, btrfs.WriteFile(path.Join(inRepo, "master", "pipeline", "cancel"), []byte(`
image ubuntu

run sleep 100
`)))
	require.NoError(t, btrfs.Commit(inRepo, "commit", "master"))

	r := NewRunner("pipeline", inRepo, outPrefix, "commit", "master", "0-1", etcache.NewCache())
	// Run blocks, so it gets its own goroutine. Its result is handed back
	// over a channel instead of being asserted in that goroutine: require
	// ends the test through t.FailNow, which must be called from the goroutine
	// running the test, and the test must not return before the result has
	// been checked.
	runErr := make(chan error, 1)
	go func() { runErr <- r.Run() }()

	// This is just to make sure we don't trigger the early exit case in Run
	// and actually exercise the code.
	time.Sleep(time.Second * 2)
	require.NoError(t, r.Cancel())

	// Bound the wait so a Cancel that doesn't stop Run fails the test instead
	// of hanging it.
	select {
	case err := <-runErr:
		require.Equal(t, ErrCancelled, err)
	case <-time.After(time.Minute):
		t.Fatal("Run did not return after Cancel.")
	}
}
Ejemplo n.º 6
0
// TestError checks how a failing command is handled: the pipeline run reports
// an error, the commit of the step before the failure is kept, and the final
// commit is not created.
func TestError(t *testing.T) {
	t.Parallel()
	const (
		inRepo    = "TestError_in"
		outPrefix = "TestError_out"
	)
	require.NoError(t, btrfs.Init(inRepo))

	// Pachfile whose last line is expected to fail.
	pachfile := []byte(`
image ubuntu

run touch /out/foo
run cp /in/foo /out/bar
`)
	require.NoError(t, btrfs.WriteFile(path.Join(inRepo, "master", "pipeline", "error"), pachfile))
	require.NoError(t, btrfs.Commit(inRepo, "commit", "master"))

	require.Error(t,
		RunPipelines("pipeline", inRepo, outPrefix, "commit", "master", "0-1", etcache.NewCache()),
		"Running pipeline should error.")

	// The first step succeeded, so its commit must contain foo.
	fooExists, err := btrfs.FileExists(path.Join(outPrefix, "error", "commit-0", "foo"))
	require.NoError(t, err)
	require.True(t, fooExists, "File foo should exist.")

	// The final commit must be absent.
	commitExists, err := btrfs.FileExists(path.Join(outPrefix, "error", "commit"))
	require.NoError(t, err)
	require.False(t, commitExists, "Commit \"commit\" should not get created when a command fails.")
}
Ejemplo n.º 7
0
// TestPipelines runs a pipeline with two run steps through RunPipelines and
// checks that the data file copied by the first step is in the final commit.
func TestPipelines(t *testing.T) {
	t.Parallel()
	const (
		inRepo    = "TestPipelines_in"
		outPrefix = "TestPipelines_out"
	)
	require.NoError(t, btrfs.Init(inRepo))

	// Input data for the pipeline.
	require.NoError(t, btrfs.WriteFile(path.Join(inRepo, "master", "data", "foo"), []byte("foo")))

	// The Pachfile: copy the data file, then run a second, unrelated step.
	pachfile := []byte(`
image ubuntu

input data

run cp /in/data/foo /out/foo
run echo "foo"
`)
	require.NoError(t, btrfs.WriteFile(path.Join(inRepo, "master", "pipeline", "cp"), pachfile))
	require.NoError(t, btrfs.Commit(inRepo, "commit", "master"))

	require.NoError(t, RunPipelines("pipeline", inRepo, outPrefix, "commit", "master", "0-1", etcache.NewCache()))

	content, err := btrfs.ReadFile(path.Join(outPrefix, "cp", "commit", "foo"))
	require.NoError(t, err)
	require.Equal(t, "foo", string(content))
}
Ejemplo n.º 8
0
// TestScrape tests the scraper pipeline: it feeds a single URL to a Pachfile
// that downloads every listed URL with wget and checks that the downloaded
// page ends up in the output.
func TestScrape(t *testing.T) {
	// TODO(any): what?? wget is not found in the container if parallel is set
	//t.Parallel()
	pipeline := newTestPipeline(t, "scrape", "commit", "master", "0-1", true)

	// Create a url to scrape
	require.NoError(t, btrfs.WriteFile(path.Join(pipeline.inRepo, "master", "urls", "1"), []byte("pachyderm.io")))

	// Commit the data
	require.NoError(t, btrfs.Commit(pipeline.inRepo, "commit", "master"))

	// Create a pipeline to run
	pachfile := `
image pachyderm/scraper

input urls

run cat /in/urls/* | xargs wget -P /out
`
	// Fail right here if the pipeline itself errors, rather than dropping the
	// error and only noticing a missing file below.
	require.NoError(t, pipeline.runPachFile(strings.NewReader(pachfile)))

	exists, err := btrfs.FileExists(path.Join(pipeline.outRepo, "commit", "index.html"))
	require.NoError(t, err)
	require.True(t, exists, "pachyderm.io should exist")
}
Ejemplo n.º 9
0
// CommitCreate commits `branch` of the shard's data repo as `name`, installs a
// new pipeline runner for the branch and starts it in the background. Any
// runner previously registered for the branch is cancelled before the new one
// runs. A sync to the peers is started as well, and the new commit is looked
// up and returned.
func (s *shard) CommitCreate(name string, branch string) (Commit, error) {
	err := btrfs.Commit(s.dataRepo, name, branch)
	if err != nil {
		return Commit{}, err
	}

	// Swap the branch's runner while holding the guard, remembering the
	// runner that was there before (if any).
	s.guard.Lock()
	prev, hadPrev := s.runners[branch]
	next := pipeline.NewRunner("pipeline", s.dataRepo, s.pipelinePrefix, name, branch, s.shardStr, s.cache)
	s.runners[branch] = next
	s.guard.Unlock()

	go func() {
		// Stop the previous runner first; errors are only logged.
		if hadPrev {
			if cancelErr := prev.Cancel(); cancelErr != nil {
				log.Print(cancelErr)
			}
		}
		if runErr := next.Run(); runErr != nil {
			log.Print(runErr)
		}
	}()
	go s.syncToPeers()

	return s.CommitGet(name)
}
Ejemplo n.º 10
0
// finish creates the pipeline's final commit. If a commit named p.commit is
// already present in the output repo nothing is done.
func (p *pipeline) finish() error {
	switch committed, err := btrfs.FileExists(path.Join(p.outRepo, p.commit)); {
	case err != nil:
		return err
	case committed:
		// Already committed.
		return nil
	}
	return btrfs.Commit(p.outRepo, p.commit, p.branch)
}
Ejemplo n.º 11
0
// run executes cmd in the pipeline's container and commits the result. It
// assumes that `branch` has already been created.
//
// A failure anywhere in this function leaves the branch with uncommitted,
// dirty changes, and that state has to be cleaned up before the pipeline is
// rerun. The cleanup deliberately doesn't happen here: however hard we try,
// the process crashing at the wrong moment could still leave things
// inconsistent.
func (p *pipeline) run(cmd []string) error {
	// The counter advances on every call, whatever the outcome.
	defer func() { p.counter++ }()

	// If this step's commit is already there, the work has been done.
	done, err := btrfs.FileExists(path.Join(p.outRepo, p.runCommit()))
	if err != nil {
		return err
	}
	if done {
		return nil
	}

	// The container runs a shell; the actual command is piped to its stdin.
	p.config.Config.Cmd = []string{"sh"}
	//p.config.Config.Volumes["/out"] = emptyStruct()

	// Bind the branch's directory into the container as /out, and drop the
	// bind again when run returns so it is only visible for this call.
	p.config.HostConfig.Binds = append(
		p.config.HostConfig.Binds,
		fmt.Sprintf("%s:/out", btrfs.HostPath(path.Join(p.outRepo, p.branch))),
	)
	log.Print(p.config.HostConfig.Binds)
	defer func() {
		binds := p.config.HostConfig.Binds
		p.config.HostConfig.Binds = binds[:len(binds)-1]
	}()

	if p.container, err = startContainer(p.config); err != nil {
		return err
	}
	script := strings.Join(cmd, " ")
	if err := pipeToStdin(p.container, strings.NewReader(script+"\n")); err != nil {
		return err
	}

	// The container's logs are copied to .log inside the branch.
	logFile, err := btrfs.CreateAll(path.Join(p.outRepo, p.branch, ".log"))
	if err != nil {
		return err
	}
	defer logFile.Close()
	if err = containerLogs(p.container, logFile); err != nil {
		return err
	}

	// Wait for the command to finish; a non-zero exit code is an error.
	code, err := waitContainer(p.container)
	if err != nil {
		return err
	}
	if code != 0 {
		return fmt.Errorf("Command:\n\t%s\nhad exit code: %d.\n", script, code)
	}
	return btrfs.Commit(p.outRepo, p.runCommit(), p.branch)
}
Ejemplo n.º 12
0
// shuffle rehashes an output directory.
// If 2 shards each have a copy of the file `foo` with the content: `bar`.
// Then after shuffling 1 of those nodes will have a file `foo` with content
// `barbar` and the other will have no file foo.
//
// The directory is emptied, the files of the previous step's commit are
// requested via multicast, and every part of every multipart response is
// appended to the file of the same name in the branch. The result is committed
// as this step's commit. The first error encountered while reading a response
// or appending a part is returned, in which case nothing is committed.
func (p *pipeline) shuffle(dir string) error {
	// this function always increments counter
	defer func() { p.counter++ }()
	// First we clear the directory, notice that the previous commit from
	// which we're pulling has already been made so this doesn't destroy the
	// data that others are trying to pull.
	// TODO(jd) #performance this is a seriously unperformant part of the code
	// since it messes up our ability to do incremental results. We should do
	// something smarter here.
	if err := btrfs.RemoveAll(path.Join(p.outRepo, p.branch, dir)); err != nil {
		return err
	}
	if err := btrfs.MkdirAll(path.Join(p.outRepo, p.branch, dir)); err != nil {
		return err
	}
	// We want to pull files from the previous commit
	commit := fmt.Sprintf("%s-%d", p.commit, p.counter-1)
	// Notice we're just passing "host" here. Multicast will fill in the host
	// field so we don't actually need to specify it.
	req, err := http.NewRequest("GET", "http://host/"+path.Join("pipeline", p.name, "file", dir, "*")+"?commit="+commit+"&shard="+p.shard, nil)
	if err != nil {
		return err
	}
	// Dispatch the request
	resps, err := route.Multicast(p.cache, req, "/pfs/master")
	if err != nil {
		return err
	}

	// One goroutine per response. Each goroutine sends at most one error, so
	// a buffer of len(resps) guarantees that sends never block.
	errs := make(chan error, len(resps))
	var wg sync.WaitGroup
	wg.Add(len(resps))
	// Appends to the same file from different responses are serialized.
	lock := util.NewPathLock()
	for _, resp := range resps {
		// resp is passed as an argument so that every goroutine works on its
		// own response rather than on a shared loop variable.
		go func(resp *http.Response) {
			defer wg.Done()
			// The body is ours to close once we're done reading it.
			defer resp.Body.Close()
			reader := multipart.NewReader(resp.Body, resp.Header.Get("Boundary"))

			for {
				part, err := reader.NextPart()
				if err == io.EOF {
					// No more parts in this response.
					return
				}
				if err != nil {
					// part is nil here; report the error instead of
					// dereferencing it.
					errs <- err
					return
				}
				name := part.FileName()
				lock.Lock(name)
				_, appendErr := btrfs.Append(path.Join(p.outRepo, p.branch, name), part)
				lock.Unlock(name)
				if appendErr != nil {
					errs <- appendErr
					return
				}
			}
		}(resp)
	}
	wg.Wait()
	close(errs)

	// Check for errors
	for err := range errs {
		if err != nil {
			return err
		}
	}
	return btrfs.Commit(p.outRepo, p.runCommit(), p.branch)
}