// Walks `basePath`, hashing it, pushing the encoded tar to `file`, and returning the final hash. func Save(file io.Writer, basePath string, filterset filter.FilterSet, hasherFactory func() hash.Hash) string { // walk filesystem, copying and accumulating data for integrity check bucket := &fshash.MemoryBucket{} tarWriter := tar.NewWriter(file) defer tarWriter.Close() if err := saveWalk(basePath, tarWriter, filterset, bucket, hasherFactory); err != nil { panic(err) // TODO this is not well typed, and does not clearly indicate whether scanning or committing had the problem } // hash whole tree actualTreeHash := fshash.Hash(bucket, hasherFactory) // report return base64.URLEncoding.EncodeToString(actualTreeHash) }
/* Arenas produced by Dir Transmats may be relocated by simple `mv`. */ func (t *DirTransmat) Materialize( kind integrity.TransmatKind, dataHash integrity.CommitID, siloURIs []integrity.SiloURI, options ...integrity.MaterializerConfigurer, ) integrity.Arena { var arena dirArena try.Do(func() { // Basic validation and config config := integrity.EvaluateConfig(options...) if kind != Kind { panic(errors.ProgrammerError.New("This transmat supports definitions of type %q, not %q", Kind, kind)) } // Ping silos if len(siloURIs) < 1 { panic(integrity.ConfigError.New("Materialization requires at least one data source!")) // Note that it's possible a caching layer will satisfy things even without data sources... // but if that was going to happen, it already would have by now. } // Our policy is to take the first path that exists. // This lets you specify a series of potential locations, var siloURI integrity.SiloURI for _, givenURI := range siloURIs { // TODO still assuming all local paths and not doing real uri parsing localPath := string(givenURI) _, err := os.Stat(localPath) if os.IsNotExist(err) { continue } siloURI = givenURI break } if siloURI == "" { panic(integrity.WarehouseConnectionError.New("No warehouses were available!")) } // Create staging arena to produce data into. var err error arena.path, err = ioutil.TempDir(t.workPath, "") if err != nil { panic(integrity.TransmatError.New("Unable to create arena: %s", err)) } // walk filesystem, copying and accumulating data for integrity check hasherFactory := sha512.New384 bucket := &fshash.MemoryBucket{} localPath := string(siloURI) if err := fshash.FillBucket(localPath, arena.Path(), bucket, filter.FilterSet{}, hasherFactory); err != nil { panic(err) } // hash whole tree actualTreeHash := fshash.Hash(bucket, hasherFactory) // verify total integrity expectedTreeHash, err := base64.URLEncoding.DecodeString(string(dataHash)) if err != nil { panic(integrity.ConfigError.New("Could not parse hash: %s", err)) } if bytes.Equal(actualTreeHash, expectedTreeHash) { // excellent, got what we asked for. arena.hash = dataHash } else { // this may or may not be grounds for panic, depending on configuration. if config.AcceptHashMismatch { // if we're tolerating mismatches, report the actual hash through different mechanisms. // you probably only ever want to use this in tests or debugging; in prod it's just asking for insanity. arena.hash = integrity.CommitID(actualTreeHash) } else { panic(integrity.NewHashMismatchError(string(dataHash), base64.URLEncoding.EncodeToString(actualTreeHash))) } } }).Catch(integrity.Error, func(err *errors.Error) { panic(err) }).CatchAll(func(err error) { panic(integrity.UnknownError.Wrap(err)) }).Done() return arena }
/* Arenas produced by Dir Transmats may be relocated by simple `mv`. */ func (t *S3Transmat) Materialize( kind integrity.TransmatKind, dataHash integrity.CommitID, siloURIs []integrity.SiloURI, options ...integrity.MaterializerConfigurer, ) integrity.Arena { var arena dirArena try.Do(func() { // Basic validation and config config := integrity.EvaluateConfig(options...) if kind != Kind { panic(errors.ProgrammerError.New("This transmat supports definitions of type %q, not %q", Kind, kind)) } // Parse URI; Find warehouses. if len(siloURIs) < 1 { panic(integrity.ConfigError.New("Materialization requires at least one data source!")) // Note that it's possible a caching layer will satisfy things even without data sources... // but if that was going to happen, it already would have by now. } // Our policy is to take the first path that exists. // This lets you specify a series of potential locations, and if one is unavailable we'll just take the next. var warehouseBucketName string var warehousePathPrefix string var warehouseCtntAddr bool for _, givenURI := range siloURIs { u, err := url.Parse(string(givenURI)) if err != nil { panic(integrity.ConfigError.New("failed to parse URI: %s", err)) } warehouseBucketName = u.Host warehousePathPrefix = u.Path switch u.Scheme { case "s3": warehouseCtntAddr = false case "s3+splay": warehouseCtntAddr = true default: panic(integrity.ConfigError.New("unrecognized scheme: %q", u.Scheme)) } // TODO figure out how to check for data (or at least warehouse!) presence; // currently just assuming the first one's golden, and blowing up later if it's not. break } if warehouseBucketName == "" { panic(integrity.WarehouseConnectionError.New("No warehouses were available!")) } // load keys from env // TODO someday URIs should grow smart enough to control this in a more general fashion -- but for now, host ENV is actually pretty feasible and plays easily with others. // TODO should not require keys! we're just reading, after all; anon access is 100% valid. // Buuuuut s3gof3r doesn't seem to understand empty keys; it still sends them as if to login, and AWS says 403. So, foo. keys, err := s3gof3r.EnvKeys() if err != nil { panic(S3CredentialsMissingError.Wrap(err)) } // initialize reader from s3! getPath := warehousePathPrefix if warehouseCtntAddr { getPath = path.Join(warehousePathPrefix, string(dataHash)) } s3reader := makeS3reader(warehouseBucketName, getPath, keys) defer s3reader.Close() // prepare decompression as necessary reader, err := tartrans.Decompress(s3reader) if err != nil { panic(integrity.WarehouseConnectionError.New("could not start decompressing: %s", err)) } tarReader := tar.NewReader(reader) // Create staging arena to produce data into. arena.path, err = ioutil.TempDir(t.workPath, "") if err != nil { panic(integrity.TransmatError.New("Unable to create arena: %s", err)) } // walk input tar stream, placing data and accumulating hashes and metadata for integrity check bucket := &fshash.MemoryBucket{} tartrans.Extract(tarReader, arena.Path(), bucket, hasherFactory) // bucket processing may have created a root node if missing. if so, we need to apply its props. fs.PlaceFile(arena.Path(), bucket.Root().Metadata, nil) // hash whole tree actualTreeHash := fshash.Hash(bucket, hasherFactory) // verify total integrity expectedTreeHash, err := base64.URLEncoding.DecodeString(string(dataHash)) if err != nil { panic(integrity.ConfigError.New("Could not parse hash: %s", err)) } if bytes.Equal(actualTreeHash, expectedTreeHash) { // excellent, got what we asked for. arena.hash = dataHash } else { // this may or may not be grounds for panic, depending on configuration. if config.AcceptHashMismatch { // if we're tolerating mismatches, report the actual hash through different mechanisms. // you probably only ever want to use this in tests or debugging; in prod it's just asking for insanity. arena.hash = integrity.CommitID(actualTreeHash) } else { panic(integrity.NewHashMismatchError(string(dataHash), base64.URLEncoding.EncodeToString(actualTreeHash))) } } }).Catch(integrity.Error, func(err *errors.Error) { panic(err) }).CatchAll(func(err error) { panic(integrity.UnknownError.Wrap(err)) }).Done() return arena }
func (t DirTransmat) Scan( kind integrity.TransmatKind, subjectPath string, siloURIs []integrity.SiloURI, options ...integrity.MaterializerConfigurer, ) integrity.CommitID { var commitID integrity.CommitID try.Do(func() { // Basic validation and config config := integrity.EvaluateConfig(options...) if kind != Kind { panic(errors.ProgrammerError.New("This transmat supports definitions of type %q, not %q", Kind, kind)) } // If scan area doesn't exist, bail immediately. // No need to even start dialing warehouses if we've got nothing for em. _, err := os.Stat(subjectPath) if err != nil { if os.IsNotExist(err) { return // empty commitID } else { panic(err) } } // Parse save locations. // This transmat only supports one output location at a time due // to Old code we haven't invested in refactoring yet. var localPath string if len(siloURIs) == 0 { localPath = "" // empty string is a well known value to `fshash.FillBucket`: means just hash, don't copy. } else if len(siloURIs) == 1 { // TODO still assuming all local paths and not doing real uri parsing localPath = string(siloURIs[0]) err := os.MkdirAll(filepath.Dir(localPath), 0755) if err != nil { panic(integrity.WarehouseConnectionError.New("Unable to write file: %s", err)) } } else { panic(integrity.ConfigError.New("%s transmat only supports shipping to 1 warehouse", Kind)) } // walk filesystem, copying and accumulating data for integrity check bucket := &fshash.MemoryBucket{} err = fshash.FillBucket(subjectPath, localPath, bucket, config.FilterSet, hasherFactory) if err != nil { panic(err) // TODO this is not well typed, and does not clearly indicate whether scanning or committing had the problem } // hash whole tree actualTreeHash := fshash.Hash(bucket, hasherFactory) // report commitID = integrity.CommitID(base64.URLEncoding.EncodeToString(actualTreeHash)) }).Catch(integrity.Error, func(err *errors.Error) { panic(err) }).CatchAll(func(err error) { panic(integrity.UnknownError.Wrap(err)) }).Done() return commitID }
/* Arenas produced by Tar Transmats may be relocated by simple `mv`. */ func (t *TarTransmat) Materialize( kind integrity.TransmatKind, dataHash integrity.CommitID, siloURIs []integrity.SiloURI, options ...integrity.MaterializerConfigurer, ) integrity.Arena { var arena tarArena try.Do(func() { // Basic validation and config config := integrity.EvaluateConfig(options...) if kind != Kind { panic(errors.ProgrammerError.New("This transmat supports definitions of type %q, not %q", Kind, kind)) } // Ping silos if len(siloURIs) < 1 { panic(integrity.ConfigError.New("Materialization requires at least one data source!")) // Note that it's possible a caching layer will satisfy things even without data sources... // but if that was going to happen, it already would have by now. } // Our policy is to take the first path that exists. // This lets you specify a series of potential locations, and if one is unavailable we'll just take the next. var stream io.Reader for _, uri := range siloURIs { try.Do(func() { stream = makeReader(dataHash, uri) }).Catch(integrity.DataDNE, func(err *errors.Error) { // fine, we'll just try the next one // TODO LOGGING }).Catch(integrity.WarehouseConnectionError, func(err *errors.Error) { // ... this does kind of seem to indicate we should have "warehouse offline or DNE" be separate from "tcp flaked after we shook on it yo" // for now we consider both fatal. revist this when we get smarter logging, etc panic(err) }).Done() if stream != nil { break } } if stream == nil { panic(integrity.WarehouseConnectionError.New("No warehouses were available!")) } // Wrap input stream with decompression as necessary reader, err := Decompress(stream) if err != nil { panic(integrity.WarehouseConnectionError.New("could not start decompressing: %s", err)) } tarReader := tar.NewReader(reader) // Create staging arena to produce data into. arena.path, err = ioutil.TempDir(t.workPath, "") if err != nil { panic(integrity.TransmatError.New("Unable to create arena: %s", err)) } // walk input tar stream, placing data and accumulating hashes and metadata for integrity check bucket := &fshash.MemoryBucket{} Extract(tarReader, arena.Path(), bucket, hasherFactory) // bucket processing may have created a root node if missing. if so, we need to apply its props. fs.PlaceFile(arena.Path(), bucket.Root().Metadata, nil) // hash whole tree actualTreeHash := fshash.Hash(bucket, hasherFactory) // verify total integrity expectedTreeHash, err := base64.URLEncoding.DecodeString(string(dataHash)) if err != nil { panic(integrity.ConfigError.New("Could not parse hash: %s", err)) } if bytes.Equal(actualTreeHash, expectedTreeHash) { // excellent, got what we asked for. arena.hash = dataHash } else { // this may or may not be grounds for panic, depending on configuration. if config.AcceptHashMismatch { // if we're tolerating mismatches, report the actual hash through different mechanisms. // you probably only ever want to use this in tests or debugging; in prod it's just asking for insanity. arena.hash = integrity.CommitID(actualTreeHash) } else { panic(integrity.NewHashMismatchError(string(dataHash), base64.URLEncoding.EncodeToString(actualTreeHash))) } } }).Catch(integrity.Error, func(err *errors.Error) { panic(err) }).CatchAll(func(err error) { panic(integrity.UnknownError.Wrap(err)) }).Done() return arena }