/* Walks a filesystem. This is much like the standard library's `path/filepath.Walk`, except it's based on `treewalk`, which means it supports both pre- and post-order traversals. All paths begin in `./`, and directory names are slash-suffixed. E.g. you'll see a series like `{"./", "./a/", "./a/b"}`, etc. This matches the behaviors described by `Normalize` in the `lib/fshash`. Symlinks are not followed. The traversal order of siblings is *not* guaranteed, and is *not* necessarily stable. Caveat: calling `node.NextChild()` during your walk results in undefined behavior. */ func Walk(basePath string, preVisit WalkFunc, postVisit WalkFunc) error { return treewalk.Walk( newFileWalkNode(basePath, "./"), func(node treewalk.Node) error { filenode := node.(*FilewalkNode) if preVisit != nil { if err := preVisit(filenode); err != nil { return err } } return filenode.prepareChildren(basePath) }, func(node treewalk.Node) error { filenode := node.(*FilewalkNode) var err error if postVisit != nil { err = postVisit(filenode) } filenode.forgetChildren() return err }, ) }
/* Walks the tree of files and metadata arrayed in a `Bucket` and constructs a tree hash over them. The root of the tree hash is returned. The returned root has can be said to verify the integrity of the entire tree (much like a Merkle tree). The serial structure is expressed something like the following: {"node": $dir.metadata.hash, "leaves": [ {"node": $file1.metadata.hash, "content": $file1.contentHash}, {"node": $subdir.metadata.hash, "leaves": [ ... ]}, ] } This expression is made in cbor (rfc7049) format with indefinite-length arrays and a fixed order for all map fields. Every structure starting with "node" is itself hashed and that value substituted in before hashing the parent. Since the metadata hash contains the file/dir name, and the tree itself is traversed in sorted order, the entire structure is computed deterministically and unambiguously. */ func Hash(bucket Bucket, hasherFactory func() hash.Hash) []byte { // Hack around codec not exporting things very usefully -.- const magic_RAW = 0 const magic_UTF8 = 1 // Keep a count of how many nodes visited. Cheap sanity check. var visitCount int // At every point in the visitation, children need to submit their hashes back up the tree. // Prime the pump with a special reaction for when the root returns; every directory preVisit attaches hoppers for children thereon. upsubs := make(upsubStack, 0) var finalAnswer []byte upsubs.Push(func(x []byte) { finalAnswer = x }) // Also keep a stack of hashers in use because they jump across the pre/post visit gap. hashers := make(hasherStack, 0) // Visitor definitions preVisit := func(node treewalk.Node) error { record := node.(RecordIterator).Record() visitCount++ hasher := hasherFactory() _, enc := codec.GenHelperEncoder(codec.NewEncoder(hasher, new(codec.CborHandle))) enc.EncodeMapStart(2) // either way it's header + one of leaves or contenthash enc.EncodeString(magic_UTF8, "m") record.Metadata.Marshal(hasher) //fmt.Printf(":::: %q ->\n\t%#v\n\t%s\n", record.Metadata.Name, record.Metadata, base64.URLEncoding.EncodeToString(hasher.Sum(nil))) // non-cascading if record.Metadata.Typeflag == tar.TypeDir { // open the "leaves" array // this may end up being an empty dir, but we act the same regardless // (and we don't have that information here since the iterator has tunnel vision) enc.EncodeString(magic_UTF8, "l") hasher.Write([]byte{codec.CborStreamArray}) upsubs.Push(func(x []byte) { enc.EncodeStringBytes(magic_RAW, x) }) hashers.Push(hasher) } else { // heap the object's content hash in enc.EncodeString(magic_UTF8, "h") enc.EncodeStringBytes(magic_RAW, record.ContentHash) // finalize our hash here and upsub to save us the work of hanging onto the hasher until the postvisit call upsubs.Peek()(hasher.Sum(nil)) } return nil } postVisit := func(node treewalk.Node) error { record := node.(RecordIterator).Record() if record.Metadata.Typeflag == tar.TypeDir { hasher := hashers.Pop() // close off the "leaves" array // No map-close necessary because we used a fixed length map. hasher.Write([]byte{0xff}) // should be `codec.CborStreamBreak` but upstream has an export bug :/ hash := hasher.Sum(nil) // debug // if len(strings.Split(record.Metadata.Name, "/")) == 3 { // fmt.Printf("::: hashing -- %q \t=> %s\n", record.Metadata.Name, base64.URLEncoding.EncodeToString(hash)) // } // pop out this dir's hoppers for children data upsubs.Pop() // hash and upsub upsubs.Peek()(hash) } return nil } // Traverse if err := treewalk.Walk(bucket.Iterator(), preVisit, postVisit); err != nil { panic(err) // none of our code has known believable error returns. } // Sanity check no node left behind _ = upsubs.Pop() if !upsubs.Empty() || !hashers.Empty() { panic(errors.ProgrammerError.New("invariant failed after bucket records walk: stacks not empty")) } if visitCount != bucket.Length() { panic(errors.ProgrammerError.New("invariant failed after bucket records walk: visited %d of %d nodes", visitCount, bucket.Length())) } // return the result upsubbed by the root return finalAnswer }
func Extract(tr *tar.Reader, destBasePath string, bucket fshash.Bucket, hasherFactory func() hash.Hash) { for { thdr, err := tr.Next() if err == io.EOF { break // end of archive } if err != nil { panic(integrity.WarehouseConnectionError.New("corrupt tar: %s", err)) } hdr := fs.Metadata(*thdr) // filter/sanify values: // - names must be clean, relative dot-slash prefixed, and dirs slash-suffixed // - times should never be go's zero value; replace those with epoch // Note that names at this point should be handled by `path` (not `filepath`; these are canonical form for feed to hashing) hdr.Name = path.Clean(hdr.Name) if strings.HasPrefix(hdr.Name, "../") { panic(integrity.WarehouseConnectionError.New("corrupt tar: paths that use '../' to leave the base dir are invalid")) } if hdr.Name != "." { hdr.Name = "./" + hdr.Name } if hdr.ModTime.IsZero() { hdr.ModTime = def.Epochwhen } if hdr.AccessTime.IsZero() { hdr.AccessTime = def.Epochwhen } // conjure parents, if necessary. tar format allows implicit parent dirs. // Note that if any of the implicitly conjured dirs is specified later, unpacking won't notice, // but bucket hashing iteration will (correctly) blow up for repeat entries. // It may well be possible to construct a tar like that, but it's already well established that // tars with repeated filenames are just asking for trouble and shall be rejected without // ceremony because they're just a ridiculous idea. parts := strings.Split(hdr.Name, "/") for i := range parts[:len(parts)-1] { i++ _, err := os.Lstat(filepath.Join(append([]string{destBasePath}, parts[:i]...)...)) // if it already exists, move along; if the error is anything interesting, let PlaceFile decide how to deal with it if err == nil || !os.IsNotExist(err) { continue } // if we're missing a dir, conjure a node with defaulted values (same as we do for "./") conjuredHdr := fshash.DefaultDirRecord().Metadata conjuredHdr.Name = strings.Join(parts[:i], "/") + "/" // path.Join does cleaning; unwanted. fs.PlaceFile(destBasePath, conjuredHdr, nil) bucket.Record(conjuredHdr, nil) } // place the file switch hdr.Typeflag { case tar.TypeReg: reader := &flak.HashingReader{tr, hasherFactory()} fs.PlaceFile(destBasePath, hdr, reader) bucket.Record(hdr, reader.Hasher.Sum(nil)) case tar.TypeDir: hdr.Name += "/" fallthrough default: fs.PlaceFile(destBasePath, hdr, nil) bucket.Record(hdr, nil) } } // cleanup dir times with a post-order traversal over the bucket if err := treewalk.Walk(bucket.Iterator(), nil, func(node treewalk.Node) error { record := node.(fshash.RecordIterator).Record() if record.Metadata.Typeflag == tar.TypeDir { fs.PlaceDirTime(destBasePath, record.Metadata) } return nil }); err != nil { panic(err) } }