Example #1
// shuffle rehashes an output directory.
// For example, if two shards each have a copy of the file `foo` with the
// content `bar`, then after shuffling one of those shards will have a file
// `foo` with content `barbar` and the other will have no file `foo`.
func (p *pipeline) shuffle(dir string) error {
	// this function always increments p.counter, even on error returns
	defer func() { p.counter++ }()
	// First we clear the directory, notice that the previous commit from
	// which we're pulling has already been made so this doesn't destroy the
	// data that others are trying to pull.
	// TODO(jd) #performance this is a seriously unperformant part of the code
	// since it messes up our ability to do incremental results. We should do
	// something smarter here.
	if err := btrfs.RemoveAll(path.Join(p.outRepo, p.branch, dir)); err != nil {
		return err
	}
	if err := btrfs.MkdirAll(path.Join(p.outRepo, p.branch, dir)); err != nil {
		return err
	}
	// We want to pull files from the previous commit
	commit := fmt.Sprintf("%s-%d", p.commit, p.counter-1)
	// Notice we're just passing "host" here. Multicast will fill in the host
	// field so we don't actually need to specify it.
	req, err := http.NewRequest("GET", "http://host/"+path.Join("pipeline", p.name, "file", dir, "*")+"?commit="+commit+"&shard="+p.shard, nil)
	if err != nil {
		return err
	}
	// Dispatch the request
	resps, err := route.Multicast(p.cache, req, "/pfs/master")
	if err != nil {
		return err
	}

	// Set up some concurrency structures.
	errors := make(chan error, len(resps))
	var wg sync.WaitGroup
	wg.Add(len(resps))
	lock := util.NewPathLock()
	// This loop used to let each goroutine close over the shared loop
	// variable `resp` directly, which was racy: before Go 1.22 a range loop
	// reuses a single variable across iterations, so the goroutines could
	// all observe the last response. Passing resp as an argument below
	// copies it per iteration.
	for _, resp := range resps {
		go func(resp *http.Response) {
			defer wg.Done()
			reader := multipart.NewReader(resp.Body, resp.Header.Get("Boundary"))

			for {
				part, err := reader.NextPart()
				if err == io.EOF {
					break
				}
				if err != nil {
					// Report non-EOF errors; part is nil in that case.
					errors <- err
					return
				}
				lock.Lock(part.FileName())
				_, err = btrfs.Append(path.Join(p.outRepo, p.branch, part.FileName()), part)
				lock.Unlock(part.FileName())
				if err != nil {
					errors <- err
					return
				}
			}
		}(resp)
	}
	wg.Wait()
	close(errors)

	// Check for errors
	for err := range errors {
		if err != nil {
			return err
		}
	}
	return btrfs.Commit(p.outRepo, p.runCommit(), p.branch)
}
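
The race noted in shuffle's fan-out loop is worth seeing in isolation. Below is a minimal, self-contained sketch (not from this codebase): before Go 1.22 a range loop reuses a single variable across iterations, so goroutines that close over it may all observe the final value, while passing the value as an argument, as shuffle does, copies it per iteration.

package main

import (
	"fmt"
	"sync"
)

func main() {
	resps := []string{"a", "b", "c"}
	var wg sync.WaitGroup

	// Racy before Go 1.22: every goroutine closes over the same variable.
	for _, resp := range resps {
		wg.Add(1)
		go func() {
			defer wg.Done()
			fmt.Println("closure sees:", resp) // may print "c" three times
		}()
	}
	wg.Wait()

	// Fixed: the parameter receives a per-iteration copy.
	for _, resp := range resps {
		wg.Add(1)
		go func(resp string) {
			defer wg.Done()
			fmt.Println("argument sees:", resp) // prints a, b, c in some order
		}(resp)
	}
	wg.Wait()
}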
Example #2
// Run runs all of the pipelines it finds in pipelineDir. Returns the
// first error it encounters.
func (r *Runner) Run() error {
	if err := btrfs.MkdirAll(r.outPrefix); err != nil {
		return err
	}
	if err := r.startInputPipelines(); err != nil {
		return err
	}
	pipelines, err := btrfs.ReadDir(path.Join(r.inRepo, r.commit, r.pipelineDir))
	if err != nil {
		// Notice we don't return this error; it's fine not to have a
		// pipeline dir, so we treat it as a no-op.
		return nil
	}
	// A channel for the errors. Notice that its capacity is the same as the
	// number of pipelines. The code below should ensure that each pipeline
	// sends at most 1 error, otherwise deadlock may occur.
	errors := make(chan error, len(pipelines))
	// Make sure we don't race with cancel; the lock is held while we add
	// pipelines.
	r.lock.Lock()
	if r.cancelled {
		// we were cancelled before we even started
		r.lock.Unlock()
		return ErrCancelled
	}
	for _, pInfo := range pipelines {
		if err := r.makeOutRepo(pInfo.Name()); err != nil {
			// Don't return while holding the lock.
			r.lock.Unlock()
			return err
		}
		p := newPipeline(pInfo.Name(), r.inRepo, path.Join(r.outPrefix, pInfo.Name()), r.commit, r.branch, r.shard, r.outPrefix, r.cache)
		r.pipelines = append(r.pipelines, p)
	}
	// unlocker lets us both defer unlocking (for early returns) and unlock
	// explicitly below; sync.Once guarantees the Unlock runs only once.
	var unlocker sync.Once
	defer unlocker.Do(r.lock.Unlock)
	for _, p := range r.pipelines {
		p := p // capture the loop variable for the goroutine below
		r.wait.Add(1)
		go func() {
			defer r.wait.Done()
			f, err := btrfs.Open(path.Join(r.inRepo, r.commit, r.pipelineDir, p.name))
			if err != nil {
				errors <- err
				return
			}
			defer f.Close()
			err = p.runPachFile(f)
			if err != nil {
				errors <- err
				return
			}
		}()
	}
	// We're done adding pipelines so unlock
	unlocker.Do(r.lock.Unlock)
	// Wait for the pipelines to finish
	r.wait.Wait()
	close(errors)
	if r.cancelled {
		// Pipelines finished because we were cancelled
		return ErrCancelled
	}
	// Return the first error a pipeline reported, if any.
	for err := range errors {
		return err
	}
	return nil
}
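
Run's unlocker is a reusable idiom: a lock that must be released exactly once, either by a defer on early error returns or explicitly once the critical section ends. A minimal sketch of the idiom, assuming nothing from this codebase:

package main

import (
	"fmt"
	"sync"
)

// criticalThenSlow holds mu only while registering items; sync.Once
// guarantees exactly one Unlock whether we leave via the defer or the
// explicit call before the slow phase.
func criticalThenSlow(mu *sync.Mutex, items []string) error {
	mu.Lock()
	var unlocker sync.Once
	defer unlocker.Do(mu.Unlock) // covers early returns

	for _, item := range items {
		if item == "" {
			return fmt.Errorf("empty item") // the defer unlocks for us
		}
		fmt.Println("registering", item)
	}
	unlocker.Do(mu.Unlock) // release before the long-running phase
	fmt.Println("running without the lock held")
	return nil
}

func main() {
	var mu sync.Mutex
	if err := criticalThenSlow(&mu, []string{"a", "b"}); err != nil {
		fmt.Println("error:", err)
	}
}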
Example #3
// FileCreate creates a file called name on branch with the data from content.
func (s *shard) FileCreate(name string, content io.Reader, branch string) error {
	filePath := path.Join(s.dataRepo, branch, name)
	if err := btrfs.MkdirAll(path.Dir(filePath)); err != nil {
		return err
	}
	_, err := btrfs.CreateFromReader(filePath, content)
	return err
}