func blobDetails(contents io.ReadSeeker) (bref blob.Ref, size uint32, err error) {
	s1 := sha1.New()
	if _, err = contents.Seek(0, 0); err != nil {
		return
	}
	defer func() {
		if _, seekErr := contents.Seek(0, 0); seekErr != nil {
			if err == nil {
				err = seekErr
			} else {
				err = fmt.Errorf("%s, cannot seek back: %v", err, seekErr)
			}
		}
	}()
	sz, err := io.CopyN(s1, contents, constants.MaxBlobSize+1)
	if err == nil || err == io.EOF {
		bref, err = blob.RefFromHash(s1), nil
	} else {
		err = fmt.Errorf("error reading contents: %v", err)
		return
	}
	if sz > constants.MaxBlobSize {
		err = fmt.Errorf("blob size cannot be bigger than %d", constants.MaxBlobSize)
	}
	size = uint32(sz)
	return
}
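// Illustrative use of blobDetails, not part of the original listing. Any
// io.ReadSeeker works; the function seeks back to the start both before and
// after hashing, so the caller can reuse the reader for the actual upload.
// Assumes the bytes package is imported in this file; fmt already is.
func exampleBlobDetails() error {
	r := bytes.NewReader([]byte("hello, world"))
	ref, size, err := blobDetails(r)
	if err != nil {
		return err
	}
	fmt.Printf("ref %v, %d bytes; reader rewound for upload\n", ref, size)
	return nil
}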
func (pt *packTest) testOpenWholeRef(t *testing.T, wholeRef blob.Ref, wantSize int64) {
	rc, gotSize, err := pt.sto.OpenWholeRef(wholeRef, 0)
	if err != nil {
		t.Errorf("OpenWholeRef = %v", err)
		return
	}
	defer rc.Close()
	if gotSize != wantSize {
		t.Errorf("OpenWholeRef size = %v; want %v", gotSize, wantSize)
		return
	}
	h := blob.NewHash()
	n, err := io.Copy(h, rc)
	if err != nil {
		t.Errorf("OpenWholeRef read error: %v", err)
		return
	}
	if n != wantSize {
		t.Errorf("OpenWholeRef read %v bytes; want %v", n, wantSize)
		return
	}
	gotRef := blob.RefFromHash(h)
	if gotRef != wholeRef {
		t.Errorf("OpenWholeRef read contents = %v; want %v", gotRef, wholeRef)
	}
}
// This is the simple 1MB chunk version. The rolling checksum version is below.
func writeFileMapOld(bs blobserver.StatReceiver, file *Builder, r io.Reader) (blob.Ref, error) {
	parts, size := []BytesPart{}, int64(0)

	var buf bytes.Buffer
	for {
		buf.Reset()
		n, err := io.Copy(&buf, io.LimitReader(r, maxBlobSize))
		if err != nil {
			return blob.Ref{}, err
		}
		if n == 0 {
			break
		}

		hash := blob.NewHash()
		io.Copy(hash, bytes.NewReader(buf.Bytes()))
		br := blob.RefFromHash(hash)
		hasBlob, err := serverHasBlob(bs, br)
		if err != nil {
			return blob.Ref{}, err
		}
		if !hasBlob {
			sb, err := bs.ReceiveBlob(br, &buf)
			if err != nil {
				return blob.Ref{}, err
			}
			if want := (blob.SizedRef{br, uint32(n)}); sb != want {
				return blob.Ref{}, fmt.Errorf("schema/filewriter: wrote %s, expected %s", sb, want)
			}
		}

		size += n
		parts = append(parts, BytesPart{
			BlobRef: br,
			Size:    uint64(n),
			Offset:  0, // into BlobRef to read from (not of dest)
		})
	}

	err := file.PopulateParts(size, parts)
	if err != nil {
		return blob.Ref{}, err
	}

	json := file.Blob().JSON()
	if err != nil {
		return blob.Ref{}, err
	}
	br := blob.SHA1FromString(json)
	sb, err := bs.ReceiveBlob(br, strings.NewReader(json))
	if err != nil {
		return blob.Ref{}, err
	}
	if expect := (blob.SizedRef{br, uint32(len(json))}); expect != sb {
		return blob.Ref{}, fmt.Errorf("schema/filewriter: wrote %s bytes, got %s ack'd", expect, sb)
	}

	return br, nil
}
func (pk *packer) pack() error {
	if err := pk.scanChunks(); err != nil {
		return err
	}

	// TODO: decide as a function of schemaRefs and dataRefs
	// already in s.large whether it makes sense to still compact
	// this from a savings standpoint. For now we just always do.
	// Maybe we'd have knobs in the future. Ideally not.

	// Don't pack a file if we already have its wholeref stored
	// otherwise (perhaps under a different filename). But that
	// means we have to compute its wholeref first. We assume the
	// blob source will cache these lookups so it's not too
	// expensive to do two passes over the input.
	h := blob.NewHash()
	var err error
	pk.wholeSize, err = io.Copy(h, pk.fr)
	if err != nil {
		return err
	}
	pk.wholeRef = blob.RefFromHash(h)
	wholeKey := wholeMetaPrefix + pk.wholeRef.String()
	_, err = pk.s.meta.Get(wholeKey)
	if err == nil {
		// Nil error means there was some knowledge of this wholeref.
		return fmt.Errorf("already have wholeref %v packed; not packing again", pk.wholeRef)
	} else if err != sorted.ErrNotFound {
		return err
	}

	pk.chunksRemain = pk.dataRefs
	var trunc blob.Ref
MakingZips:
	for len(pk.chunksRemain) > 0 {
		if err := pk.writeAZip(trunc); err != nil {
			if needTrunc, ok := err.(needsTruncatedAfterError); ok {
				trunc = needTrunc.Ref
				if fn := testHookSawTruncate; fn != nil {
					fn(trunc)
				}
				continue MakingZips
			}
			return err
		}
		trunc = blob.Ref{}
	}

	// Record the final wholeMetaPrefix record:
	err = pk.s.meta.Set(wholeKey, fmt.Sprintf("%d %d", pk.wholeSize, len(pk.zips)))
	if err != nil {
		return fmt.Errorf("Error setting %s: %v", wholeKey, err)
	}

	return nil
}
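// Minimal sketch, not from the original source: what the needsTruncatedAfterError
// used above must look like to satisfy both the type assertion in pack() and the
// needsTruncatedAfterError{dr} construction in writeAZip below, i.e. a blob.Ref
// carried inside an error. The exact Error() text is an assumption.
type needsTruncatedAfterError struct {
	Ref blob.Ref // the chunk after which the next writeAZip attempt should truncate
}

func (e needsTruncatedAfterError) Error() string {
	return "needs truncation after " + e.Ref.String()
}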
func blobDetails(contents io.ReadSeeker) (bref blob.Ref, size int64, err error) {
	s1 := sha1.New()
	contents.Seek(0, 0)
	size, err = io.Copy(s1, contents)
	if err == nil {
		bref = blob.RefFromHash(s1)
	}
	contents.Seek(0, 0)
	return
}
func (d *defaultStatHasher) Hash(fileName string) (blob.Ref, error) {
	s1 := sha1.New()
	file, err := os.Open(fileName)
	if err != nil {
		return blob.Ref{}, err
	}
	defer file.Close()
	_, err = io.Copy(s1, file)
	if err != nil {
		return blob.Ref{}, err
	}
	return blob.RefFromHash(s1), nil
}
func testSizedBlob(t *testing.T, r io.Reader, b1 blob.Ref, size int64) {
	h := b1.Hash()
	n, err := io.Copy(h, r)
	if err != nil {
		t.Fatalf("error reading from %s: %v", r, err)
	}
	if n != size {
		t.Fatalf("read %d bytes from %s, metadata said %d!", n, r, size)
	}
	b2 := blob.RefFromHash(h)
	if b2 != b1 {
		t.Fatalf("content mismatch (awaited %s, got %s)", b1, b2)
	}
}
func (h *DeployHandler) storeInstanceConf(conf *InstanceConf) (blob.Ref, error) {
	contents, err := json.Marshal(conf)
	if err != nil {
		return blob.Ref{}, fmt.Errorf("could not json encode instance config: %v", err)
	}
	hash := blob.NewHash()
	_, err = io.Copy(hash, bytes.NewReader(contents))
	if err != nil {
		return blob.Ref{}, fmt.Errorf("could not hash blob contents: %v", err)
	}
	br := blob.RefFromHash(hash)
	if _, err := blobserver.Receive(h.instConf, br, bytes.NewReader(contents)); err != nil {
		return blob.Ref{}, fmt.Errorf("could not store instance config blob: %v", err)
	}
	return br, nil
}
// Blob builds the Blob. The builder continues to be usable after a call to Blob.
func (bb *Builder) Blob() *Blob {
	json, err := mapJSON(bb.m)
	if err != nil {
		panic(err)
	}
	ss, err := parseSuperset(strings.NewReader(json))
	if err != nil {
		panic(err)
	}
	h := blob.NewHash()
	h.Write([]byte(json))
	return &Blob{
		str: json,
		ss:  ss,
		br:  blob.RefFromHash(h),
	}
}
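// Illustrative sketch, not part of the original listing: how Builder.Blob()
// above is typically combined with the rest of this package as seen elsewhere
// in this file (NewFileMap, Blob.JSON, Blob.BlobRef). The function name is
// hypothetical.
func exampleBuilderBlob() {
	bb := NewFileMap("example.dat") // *Builder
	b := bb.Blob()                  // immutable snapshot; bb stays usable afterwards
	_ = b.JSON()                    // the serialized schema blob
	_ = b.BlobRef()                 // ref of those JSON bytes, as hashed in Blob() above
}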
func TestPackLarge(t *testing.T) { if testing.Short() { t.Skip("skipping in short mode") } const fileSize = 17 << 20 // more than 16 MB, so more than one zip const fileName = "foo.dat" fileContents := randBytes(fileSize) hash := blob.NewHash() hash.Write(fileContents) wholeRef := blob.RefFromHash(hash) pt := testPack(t, func(sto blobserver.Storage) error { _, err := schema.WriteFileFromReader(sto, fileName, bytes.NewReader(fileContents)) return err }, wantNumLargeBlobs(2), wantNumSmallBlobs(0), ) // Verify we wrote the correct "w:*" meta rows. got := map[string]string{} want := map[string]string{ "w:" + wholeRef.String(): "17825792 2", "w:" + wholeRef.String() + ":0": "sha1-9b4a3d114c059988075c87293c86ee7cbc6f4af5 37 0 16709479", "w:" + wholeRef.String() + ":1": "sha1-fe6326ac6b389ffe302623e4a501bfc8c6272e8e 37 16709479 1116313", } if err := sorted.Foreach(pt.sto.meta, func(key, value string) error { if strings.HasPrefix(key, "b:") { return nil } got[key] = value return nil }); err != nil { t.Fatal(err) } if !reflect.DeepEqual(got, want) { t.Errorf("'w:*' meta rows = %v; want %v", got, want) } // And verify we can read it back out. pt.testOpenWholeRef(t, wholeRef, fileSize) }
func stdinBlobHandle() (uh *client.UploadHandle, err error) {
	var buf bytes.Buffer
	size, err := io.Copy(&buf, cmdmain.Stdin)
	if err != nil {
		return
	}
	// TODO(bradfitz,mpl): limit this buffer size?
	file := buf.Bytes()
	h := blob.NewHash()
	size, err = io.Copy(h, bytes.NewReader(file))
	if err != nil {
		return
	}
	return &client.UploadHandle{
		BlobRef:  blob.RefFromHash(h),
		Size:     size,
		Contents: io.LimitReader(bytes.NewReader(file), size),
	}, nil
}
func TestPackNormal(t *testing.T) {
	const fileSize = 5 << 20
	const fileName = "foo.dat"
	fileContents := randBytes(fileSize)

	hash := blob.NewHash()
	hash.Write(fileContents)
	wholeRef := blob.RefFromHash(hash)

	pt := testPack(t,
		func(sto blobserver.Storage) error {
			_, err := schema.WriteFileFromReader(sto, fileName, bytes.NewReader(fileContents))
			return err
		},
		wantNumLargeBlobs(1),
		wantNumSmallBlobs(0),
	)

	// And verify we can read it back out.
	pt.testOpenWholeRef(t, wholeRef, fileSize)
}
func stdinBlobHandle() (uh *client.UploadHandle, err error) {
	var buf bytes.Buffer
	size, err := io.CopyN(&buf, cmdmain.Stdin, constants.MaxBlobSize+1)
	if err == io.EOF {
		// Stdin ended before MaxBlobSize+1 bytes; that's the normal case.
		err = nil
	}
	if err != nil {
		return
	}
	if size > constants.MaxBlobSize {
		err = fmt.Errorf("blob size cannot be bigger than %d", constants.MaxBlobSize)
		return
	}
	file := buf.Bytes()
	h := blob.NewHash()
	size, err = io.Copy(h, bytes.NewReader(file))
	if err != nil {
		return
	}
	return &client.UploadHandle{
		BlobRef:  blob.RefFromHash(h),
		Size:     uint32(size),
		Contents: io.LimitReader(bytes.NewReader(file), size),
	}, nil
}
func TestReindex(t *testing.T) { if testing.Short() { t.Skip("skipping in short mode") } type file struct { size int64 name string contents []byte } files := []file{ {17 << 20, "foo.dat", randBytesSrc(17<<20, 42)}, {10 << 20, "bar.dat", randBytesSrc(10<<20, 43)}, {5 << 20, "baz.dat", randBytesSrc(5<<20, 44)}, } pt := testPack(t, func(sto blobserver.Storage) error { for _, f := range files { if _, err := schema.WriteFileFromReader(sto, f.name, bytes.NewReader(f.contents)); err != nil { return err } } return nil }, wantNumLargeBlobs(4), wantNumSmallBlobs(0), ) // backup the meta that is supposed to be lost/erased. // pt.sto.reindex allocates a new pt.sto.meta, so meta != pt.sto.meta after it is called. meta := pt.sto.meta // and build new meta index if err := pt.sto.reindex(context.TODO(), func() (sorted.KeyValue, error) { return sorted.NewMemoryKeyValue(), nil }); err != nil { t.Fatal(err) } // check that new meta is identical to "lost" one newRows := 0 if err := sorted.Foreach(pt.sto.meta, func(key, newValue string) error { oldValue, err := meta.Get(key) if err != nil { t.Fatalf("Could not get value for %v in old meta: %v", key, err) } if oldValue != newValue { t.Fatalf("Reindexing error: for key %v, got %v, want %v", key, newValue, oldValue) } newRows++ return nil }); err != nil { t.Fatal(err) } // make sure they have the same number of entries too, to be sure that the reindexing // did not miss entries that the old meta had. oldRows := countSortedRows(t, meta) if oldRows != newRows { t.Fatalf("index number of entries mismatch: got %d entries in new index, wanted %d (as in index before reindexing)", newRows, oldRows) } // And verify we can read one of the files back out. hash := blob.NewHash() hash.Write(files[0].contents) pt.testOpenWholeRef(t, blob.RefFromHash(hash), files[0].size) }
// b: the parsed file schema blob
// mm: keys to populate
func (ix *Index) populateFile(fetcher blob.Fetcher, b *schema.Blob, mm *mutationMap) (err error) {
	var times []time.Time // all creation or mod times seen; may be zero
	times = append(times, b.ModTime())

	blobRef := b.BlobRef()
	fr, err := b.NewFileReader(fetcher)
	if err != nil {
		return err
	}
	defer fr.Close()
	mime, reader := magic.MIMETypeFromReader(fr)

	sha1 := sha1.New()
	var copyDest io.Writer = sha1
	var imageBuf *keepFirstN // or nil
	if strings.HasPrefix(mime, "image/") {
		// Empirically derived 1MiB assuming CR2 images require more than any
		// other filetype we support:
		// https://gist.github.com/wathiede/7982372
		imageBuf = &keepFirstN{N: 1 << 20}
		copyDest = io.MultiWriter(copyDest, imageBuf)
	}
	size, err := io.Copy(copyDest, reader)
	if err != nil {
		return err
	}
	wholeRef := blob.RefFromHash(sha1)

	if imageBuf != nil {
		if conf, err := images.DecodeConfig(bytes.NewReader(imageBuf.Bytes)); err == nil {
			mm.Set(keyImageSize.Key(blobRef), keyImageSize.Val(fmt.Sprint(conf.Width), fmt.Sprint(conf.Height)))
		}
		if ft, err := schema.FileTime(bytes.NewReader(imageBuf.Bytes)); err == nil {
			log.Printf("filename %q exif = %v, %v", b.FileName(), ft, err)
			times = append(times, ft)
		} else {
			log.Printf("filename %q exif = %v, %v", b.FileName(), ft, err)
		}
		indexEXIF(wholeRef, imageBuf.Bytes, mm)
	}

	var sortTimes []time.Time
	for _, t := range times {
		if !t.IsZero() {
			sortTimes = append(sortTimes, t)
		}
	}
	sort.Sort(types.ByTime(sortTimes))
	var time3339s string
	switch {
	case len(sortTimes) == 1:
		time3339s = types.Time3339(sortTimes[0]).String()
	case len(sortTimes) >= 2:
		oldest, newest := sortTimes[0], sortTimes[len(sortTimes)-1]
		time3339s = types.Time3339(oldest).String() + "," + types.Time3339(newest).String()
	}

	mm.Set(keyWholeToFileRef.Key(wholeRef, blobRef), "1")
	mm.Set(keyFileInfo.Key(blobRef), keyFileInfo.Val(size, b.FileName(), mime))
	mm.Set(keyFileTimes.Key(blobRef), keyFileTimes.Val(time3339s))

	if strings.HasPrefix(mime, "audio/") {
		indexMusic(io.NewSectionReader(fr, 0, fr.Size()), wholeRef, mm)
	}

	return nil
}
// b: the parsed file schema blob // mm: keys to populate func (ix *Index) populateFile(fetcher blob.Fetcher, b *schema.Blob, mm *mutationMap) (err error) { var times []time.Time // all creation or mod times seen; may be zero times = append(times, b.ModTime()) blobRef := b.BlobRef() fr, err := b.NewFileReader(fetcher) if err != nil { return err } defer fr.Close() mime, mr := magic.MIMETypeFromReader(fr) sha1 := sha1.New() var copyDest io.Writer = sha1 var imageBuf *keepFirstN // or nil if strings.HasPrefix(mime, "image/") { imageBuf = &keepFirstN{N: 512 << 10} copyDest = io.MultiWriter(copyDest, imageBuf) } size, err := io.Copy(copyDest, mr) if err != nil { return err } wholeRef := blob.RefFromHash(sha1) if imageBuf != nil { var conf images.Config decodeConfig := func(r filePrefixReader) error { conf, err = images.DecodeConfig(r) return err } if err := readPrefixOrFile(imageBuf.Bytes, fetcher, b, decodeConfig); err == nil { mm.Set(keyImageSize.Key(blobRef), keyImageSize.Val(fmt.Sprint(conf.Width), fmt.Sprint(conf.Height))) } var ft time.Time fileTime := func(r filePrefixReader) error { ft, err = schema.FileTime(r) return err } if err = readPrefixOrFile(imageBuf.Bytes, fetcher, b, fileTime); err == nil { times = append(times, ft) } if exifDebug { log.Printf("filename %q exif = %v, %v", b.FileName(), ft, err) } // TODO(mpl): find (generate?) more broken EXIF images to experiment with. indexEXIFData := func(r filePrefixReader) error { return indexEXIF(wholeRef, r, mm) } if err = readPrefixOrFile(imageBuf.Bytes, fetcher, b, indexEXIFData); err != nil { if exifDebug { log.Printf("error parsing EXIF: %v", err) } } } var sortTimes []time.Time for _, t := range times { if !t.IsZero() { sortTimes = append(sortTimes, t) } } sort.Sort(types.ByTime(sortTimes)) var time3339s string switch { case len(sortTimes) == 1: time3339s = types.Time3339(sortTimes[0]).String() case len(sortTimes) >= 2: oldest, newest := sortTimes[0], sortTimes[len(sortTimes)-1] time3339s = types.Time3339(oldest).String() + "," + types.Time3339(newest).String() } mm.Set(keyWholeToFileRef.Key(wholeRef, blobRef), "1") mm.Set(keyFileInfo.Key(blobRef), keyFileInfo.Val(size, b.FileName(), mime, wholeRef)) mm.Set(keyFileTimes.Key(blobRef), keyFileTimes.Val(time3339s)) if strings.HasPrefix(mime, "audio/") { indexMusic(io.NewSectionReader(fr, 0, fr.Size()), wholeRef, mm) } return nil }
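// readPrefixOrFile and filePrefixReader are used above but not shown in this
// listing. The following is a hedged reconstruction, not the original source,
// based on the inline fallback logic in the older populateFile variant later
// in this file: run the callback on the in-memory image prefix first, and if
// it fails because the prefix was too short, retry with a full FileReader.
// The real signature and the exact set of retried errors may differ.
type filePrefixReader interface {
	io.Reader
	io.ReaderAt
}

func readPrefixOrFile(prefix []byte, fetcher blob.Fetcher, b *schema.Blob, fn func(filePrefixReader) error) (err error) {
	err = fn(bytes.NewReader(prefix))
	if err == io.EOF || err == io.ErrUnexpectedEOF {
		var fr *schema.FileReader
		fr, err = b.NewFileReader(fetcher)
		if err == nil {
			err = fn(fr)
			fr.Close()
		}
	}
	return err
}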
// b: the parsed file schema blob
// mm: keys to populate
func (ix *Index) populateFile(b *schema.Blob, mm *mutationMap) (err error) {
	var times []time.Time // all creation or mod times seen; may be zero
	times = append(times, b.ModTime())

	blobRef := b.BlobRef()
	fetcher := &seekFetcherMissTracker{
		// TODO(bradfitz): cache this SeekFetcher on ix so it
		// doesn't have to be re-made each time? Probably small.
		src: blob.SeekerFromStreamingFetcher(ix.BlobSource),
	}
	defer func() {
		if err == nil {
			return
		}
		fetcher.mu.Lock()
		defer fetcher.mu.Unlock()
		if len(fetcher.missing) == 0 {
			return
		}
		// TODO(bradfitz): there was an error indexing this file, and
		// we failed to load the blobs in f.missing. Add those as dependencies
		// somewhere so when we get one of those missing blobs, we kick off
		// a re-index of this file for whenever the indexer is idle.
	}()
	fr, err := b.NewFileReader(fetcher)
	if err != nil {
		// TODO(bradfitz): propagate up a transient failure
		// error type, so we can retry indexing files in the
		// future if blobs are only temporarily unavailable.
		// Basically the same as the TODO just below.
		//
		// We'll also want to bump the schemaVersion after this,
		// to fix anybody's index which is only partial due to
		// this old bug where it would return nil instead of doing
		// the necessary work.
		log.Printf("index: error indexing file, creating NewFileReader %s: %v", blobRef, err)
		return nil
	}
	defer fr.Close()
	mime, reader := magic.MIMETypeFromReader(fr)

	sha1 := sha1.New()
	var copyDest io.Writer = sha1
	var imageBuf *keepFirstN // or nil
	if strings.HasPrefix(mime, "image/") {
		// Empirically derived 1MiB assuming CR2 images require more than any
		// other filetype we support:
		// https://gist.github.com/wathiede/7982372
		imageBuf = &keepFirstN{N: 1 << 20}
		copyDest = io.MultiWriter(copyDest, imageBuf)
	}
	size, err := io.Copy(copyDest, reader)
	if err != nil {
		// TODO: job scheduling system to retry this spaced
		// out max n times. Right now our options are
		// ignoring this error (forever) or returning the
		// error and making the indexing try again (likely
		// forever failing). Both options suck. For now just
		// log and act like all's okay.
		//
		// See TODOs above, and the fetcher.missing stuff.
		log.Printf("index: error indexing file %s: %v", blobRef, err)
		return nil
	}
	wholeRef := blob.RefFromHash(sha1)

	if imageBuf != nil {
		if conf, err := images.DecodeConfig(bytes.NewReader(imageBuf.Bytes)); err == nil {
			mm.Set(keyImageSize.Key(blobRef), keyImageSize.Val(fmt.Sprint(conf.Width), fmt.Sprint(conf.Height)))
		}
		if ft, err := schema.FileTime(bytes.NewReader(imageBuf.Bytes)); err == nil {
			log.Printf("filename %q exif = %v, %v", b.FileName(), ft, err)
			times = append(times, ft)
		} else {
			log.Printf("filename %q exif = %v, %v", b.FileName(), ft, err)
		}
		indexEXIF(wholeRef, imageBuf.Bytes, mm)
	}

	var sortTimes []time.Time
	for _, t := range times {
		if !t.IsZero() {
			sortTimes = append(sortTimes, t)
		}
	}
	sort.Sort(types.ByTime(sortTimes))
	var time3339s string
	switch {
	case len(sortTimes) == 1:
		time3339s = types.Time3339(sortTimes[0]).String()
	case len(sortTimes) >= 2:
		oldest, newest := sortTimes[0], sortTimes[len(sortTimes)-1]
		time3339s = types.Time3339(oldest).String() + "," + types.Time3339(newest).String()
	}

	mm.Set(keyWholeToFileRef.Key(wholeRef, blobRef), "1")
	mm.Set(keyFileInfo.Key(blobRef), keyFileInfo.Val(size, b.FileName(), mime))
	mm.Set(keyFileTimes.Key(blobRef), keyFileTimes.Val(time3339s))

	if strings.HasPrefix(mime, "audio/") {
		indexMusic(io.NewSectionReader(fr, 0, fr.Size()), wholeRef, mm)
	}

	return nil
}
func (r *run) updatePhotoInAlbum(ctx context.Context, albumNode *importer.Object, photo picago.Photo) (ret error) { if photo.ID == "" { return errors.New("photo has no ID") } getMediaBytes := func() (io.ReadCloser, error) { log.Printf("Importing media from %v", photo.URL) resp, err := ctxutil.Client(ctx).Get(photo.URL) if err != nil { return nil, fmt.Errorf("importing photo %s: %v", photo.ID, err) } if resp.StatusCode != http.StatusOK { resp.Body.Close() return nil, fmt.Errorf("importing photo %s: status code = %d", photo.ID, resp.StatusCode) } return resp.Body, nil } var fileRefStr string idFilename := photo.ID + "-" + photo.Filename photoNode, err := albumNode.ChildPathObjectOrFunc(idFilename, func() (*importer.Object, error) { h := blob.NewHash() rc, err := getMediaBytes() if err != nil { return nil, err } fileRef, err := schema.WriteFileFromReader(r.Host.Target(), photo.Filename, io.TeeReader(rc, h)) if err != nil { return nil, err } fileRefStr = fileRef.String() wholeRef := blob.RefFromHash(h) if pn, err := findExistingPermanode(r.Host.Searcher(), wholeRef); err == nil { return r.Host.ObjectFromRef(pn) } return r.Host.NewObject() }) if err != nil { return err } const attrMediaURL = "picasaMediaURL" if fileRefStr == "" { fileRefStr = photoNode.Attr(nodeattr.CamliContent) // Only re-download the source photo if its URL has changed. // Empirically this seems to work: cropping a photo in the // photos.google.com UI causes its URL to change. And it makes // sense, looking at the ugliness of the URLs with all their // encoded/signed state. if !mediaURLsEqual(photoNode.Attr(attrMediaURL), photo.URL) { rc, err := getMediaBytes() if err != nil { return err } fileRef, err := schema.WriteFileFromReader(r.Host.Target(), photo.Filename, rc) rc.Close() if err != nil { return err } fileRefStr = fileRef.String() } } title := strings.TrimSpace(photo.Description) if strings.Contains(title, "\n") { title = title[:strings.Index(title, "\n")] } if title == "" && schema.IsInterestingTitle(photo.Filename) { title = photo.Filename } // TODO(tgulacsi): add more attrs (comments ?) // for names, see http://schema.org/ImageObject and http://schema.org/CreativeWork attrs := []string{ nodeattr.CamliContent, fileRefStr, attrPicasaId, photo.ID, nodeattr.Title, title, nodeattr.Description, photo.Description, nodeattr.LocationText, photo.Location, nodeattr.DateModified, schema.RFC3339FromTime(photo.Updated), nodeattr.DatePublished, schema.RFC3339FromTime(photo.Published), nodeattr.URL, photo.PageURL, } if photo.Latitude != 0 || photo.Longitude != 0 { attrs = append(attrs, nodeattr.Latitude, fmt.Sprintf("%f", photo.Latitude), nodeattr.Longitude, fmt.Sprintf("%f", photo.Longitude), ) } if err := photoNode.SetAttrs(attrs...); err != nil { return err } if err := photoNode.SetAttrValues("tag", photo.Keywords); err != nil { return err } if photo.Position > 0 { if err := albumNode.SetAttr( nodeattr.CamliPathOrderColon+strconv.Itoa(photo.Position-1), photoNode.PermanodeRef().String()); err != nil { return err } } // Do this last, after we're sure the "camliContent" attribute // has been saved successfully, because this is the one that // causes us to do it again in the future or not. if err := photoNode.SetAttrs(attrMediaURL, photo.URL); err != nil { return err } return nil }
func TestPackLarge(t *testing.T) {
	if testing.Short() {
		t.Skip("skipping in short mode")
	}
	const fileSize = 17 << 20 // more than 16 MB, so more than one zip
	const fileName = "foo.dat"
	fileContents := randBytes(fileSize)

	hash := blob.NewHash()
	hash.Write(fileContents)
	wholeRef := blob.RefFromHash(hash)

	pt := testPack(t,
		func(sto blobserver.Storage) error {
			_, err := schema.WriteFileFromReader(sto, fileName, bytes.NewReader(fileContents))
			return err
		},
		wantNumLargeBlobs(2),
		wantNumSmallBlobs(0),
	)

	// Gather the "w:*" meta rows we wrote.
	got := map[string]string{}
	if err := sorted.Foreach(pt.sto.meta, func(key, value string) error {
		if strings.HasPrefix(key, "b:") {
			return nil
		}
		got[key] = value
		return nil
	}); err != nil {
		t.Fatal(err)
	}

	// Verify the two zips are correctly described.
	// There should be one row to say that we have two zips, and
	// that the overall file is 17MB:
	keyBase := "w:" + wholeRef.String()
	if g, w := got[keyBase], "17825792 2"; g != w {
		t.Fatalf("meta row for key %q = %q; want %q", keyBase, g, w)
	}

	// ... (and a little helper) ...
	parseMeta := func(n int) (zipOff, dataOff, dataLen int64) {
		key := keyBase + ":" + strconv.Itoa(n)
		v := got[key]
		f := strings.Fields(v)
		if len(f) != 4 {
			t.Fatalf("meta for key %q = %q; expected 4 space-separated fields", key, v)
		}
		i64 := func(n int) int64 {
			i, err := strconv.ParseInt(f[n], 10, 64)
			if err != nil {
				t.Fatalf("error parsing int64 %q in field index %d of meta key %q (value %q): %v", f[n], n, key, v, err)
			}
			return i
		}
		zipOff, dataOff, dataLen = i64(1), i64(2), i64(3)
		return
	}

	// And then verify if we have the two "w:<wholeref>:0" and
	// "w:<wholeref>:1" rows and that they're consistent.
	z0, d0, l0 := parseMeta(0)
	z1, d1, l1 := parseMeta(1)
	if z0 != z1 {
		t.Errorf("expected zip offset in zip0 and zip1 to match. got %d and %d", z0, z1)
	}
	if d0 != 0 {
		t.Errorf("zip0's data offset = %d; want 0", d0)
	}
	if d1 != l0 {
		t.Errorf("zip1 data offset %d != zip0 data length %d", d1, l0)
	}
	if d1+l1 != fileSize {
		t.Errorf("zip1's offset %d + length %d = %d; want %d (fileSize)", d1, l1, d1+l1, fileSize)
	}

	// And verify we can read it back out.
	pt.testOpenWholeRef(t, wholeRef, fileSize)
}
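// Illustrative helper, not part of the original tests: it spells out the "w:"
// meta row layout that parseMeta above indexes by field number, matching what
// writeAZip stores later in this listing:
//
//	w:<wholeRef>     -> "<wholeSize> <zipCount>"
//	w:<wholeRef>:<n> -> "<zipRef> <data offset in zip> <offset in whole file> <data length>"
//
// The struct and function names below are hypothetical.
type wholeZipRow struct {
	zipRef            string // blobref of the packed zip
	dataStartInZip    int64  // where the file's data begins inside that zip
	offsetInWholeFile int64  // where this zip's data falls within the original file
	dataLen           int64  // number of file data bytes stored in this zip
}

func parseWholeZipRow(v string) (r wholeZipRow, err error) {
	_, err = fmt.Sscanf(v, "%s %d %d %d", &r.zipRef, &r.dataStartInZip, &r.offsetInWholeFile, &r.dataLen)
	return
}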
func TestReindex(t *testing.T) { if testing.Short() { t.Skip("skipping in short mode") } type file struct { size int64 name string contents []byte } files := []file{ {17 << 20, "foo.dat", randBytesSrc(17<<20, 42)}, {10 << 20, "bar.dat", randBytesSrc(10<<20, 43)}, {5 << 20, "baz.dat", randBytesSrc(5<<20, 44)}, } pt := testPack(t, func(sto blobserver.Storage) error { for _, f := range files { if _, err := schema.WriteFileFromReader(sto, f.name, bytes.NewReader(f.contents)); err != nil { return err } } return nil }, wantNumLargeBlobs(4), wantNumSmallBlobs(0), ) // backup the meta that is supposed to be lost/erased. // pt.sto.reindex allocates a new pt.sto.meta, so meta != pt.sto.meta after it is called. meta := pt.sto.meta // and build new meta index if err := pt.sto.reindex(context.TODO(), func() (sorted.KeyValue, error) { return sorted.NewMemoryKeyValue(), nil }); err != nil { t.Fatal(err) } validBlobKey := func(key, value string) error { if !strings.HasPrefix(key, "b:") { return errors.New("not a blob meta key") } wantRef, ok := blob.Parse(key[2:]) if !ok { return errors.New("bogus blobref in key") } m, err := parseMetaRow([]byte(value)) if err != nil { return err } rc, err := pt.large.SubFetch(m.largeRef, int64(m.largeOff), int64(m.size)) if err != nil { return err } defer rc.Close() h := wantRef.Hash() n, err := io.Copy(h, rc) if err != nil { return err } if !wantRef.HashMatches(h) { return errors.New("content doesn't match") } if n != int64(m.size) { return errors.New("size doesn't match") } return nil } // check that new meta is identical to "lost" one newRows := 0 if err := sorted.Foreach(pt.sto.meta, func(key, newValue string) error { oldValue, err := meta.Get(key) if err != nil { t.Fatalf("Could not get value for %v in old meta: %v", key, err) } newRows++ // Exact match is fine. if oldValue == newValue { return nil } // If it differs, it should at least be correct. (blob metadata // can now point to different packed zips, depending on sorting) err = validBlobKey(key, newValue) if err == nil { return nil } t.Errorf("Reindexing error: for key %v: %v\n got: %q\nwant: %q", key, err, newValue, oldValue) return nil // keep enumerating, regardless of errors }); err != nil { t.Fatal(err) } // make sure they have the same number of entries too, to be sure that the reindexing // did not miss entries that the old meta had. oldRows := countSortedRows(t, meta) if oldRows != newRows { t.Fatalf("index number of entries mismatch: got %d entries in new index, wanted %d (as in index before reindexing)", newRows, oldRows) } // And verify we can read one of the files back out. hash := blob.NewHash() hash.Write(files[0].contents) pt.testOpenWholeRef(t, blob.RefFromHash(hash), files[0].size) }
// blobref: of the file or schema blob // blob: the parsed file schema blob // bm: keys to populate func (ix *Index) populateFile(b *schema.Blob, bm BatchMutation) error { var times []time.Time // all creation or mod times seen; may be zero times = append(times, b.ModTime()) blobRef := b.BlobRef() seekFetcher := blob.SeekerFromStreamingFetcher(ix.BlobSource) fr, err := b.NewFileReader(seekFetcher) if err != nil { // TODO(bradfitz): propagate up a transient failure // error type, so we can retry indexing files in the // future if blobs are only temporarily unavailable. // Basically the same as the TODO just below. log.Printf("index: error indexing file, creating NewFileReader %s: %v", blobRef, err) return nil } defer fr.Close() mime, reader := magic.MIMETypeFromReader(fr) sha1 := sha1.New() var copyDest io.Writer = sha1 var imageBuf *keepFirstN // or nil if strings.HasPrefix(mime, "image/") { imageBuf = &keepFirstN{N: 256 << 10} copyDest = io.MultiWriter(copyDest, imageBuf) } size, err := io.Copy(copyDest, reader) if err != nil { // TODO: job scheduling system to retry this spaced // out max n times. Right now our options are // ignoring this error (forever) or returning the // error and making the indexing try again (likely // forever failing). Both options suck. For now just // log and act like all's okay. log.Printf("index: error indexing file %s: %v", blobRef, err) return nil } if imageBuf != nil { if conf, err := images.DecodeConfig(bytes.NewReader(imageBuf.Bytes)); err == nil { bm.Set(keyImageSize.Key(blobRef), keyImageSize.Val(fmt.Sprint(conf.Width), fmt.Sprint(conf.Height))) } if ft, err := schema.FileTime(bytes.NewReader(imageBuf.Bytes)); err == nil { log.Printf("filename %q exif = %v, %v", b.FileName(), ft, err) times = append(times, ft) } else { log.Printf("filename %q exif = %v, %v", b.FileName(), ft, err) } } var sortTimes []time.Time for _, t := range times { if !t.IsZero() { sortTimes = append(sortTimes, t) } } sort.Sort(types.ByTime(sortTimes)) var time3339s string switch { case len(sortTimes) == 1: time3339s = types.Time3339(sortTimes[0]).String() case len(sortTimes) >= 2: oldest, newest := sortTimes[0], sortTimes[len(sortTimes)-1] time3339s = types.Time3339(oldest).String() + "," + types.Time3339(newest).String() } wholeRef := blob.RefFromHash(sha1) bm.Set(keyWholeToFileRef.Key(wholeRef, blobRef), "1") bm.Set(keyFileInfo.Key(blobRef), keyFileInfo.Val(size, b.FileName(), mime)) bm.Set(keyFileTimes.Key(blobRef), keyFileTimes.Val(time3339s)) if strings.HasPrefix(mime, "audio/") { tag, err := taglib.Decode(fr, fr.Size()) if err == nil { indexMusic(tag, wholeRef, bm) } else { log.Print("index: error parsing tag: ", err) } } return nil }
func (tb *Blob) BlobRef() blob.Ref {
	h := sha1.New()
	h.Write([]byte(tb.Contents))
	return blob.RefFromHash(h)
}
func TestPackerBoundarySplits(t *testing.T) {
	if testing.Short() {
		t.Skip("skipping slow test")
	}
	// Test a file of three chunk sizes, totalling near the 16 MB
	// boundary:
	// - 1st chunk is 6 MB. ("blobA")
	// - 2nd chunk is 6 MB. ("blobB")
	// - 3rd chunk ("blobC") is binary-searched (up to 4MB) to find
	// which size causes the packer to write two zip files.
	// During the test we set zip overhead boundaries to 0, to
	// force the test into its pathological misprediction code paths,
	// where it needs to back up and rewrite the zip with one part less.
	// That's why the test starts with two zip files: so there's at
	// least one that can be removed to make room.
	defer setIntTemporarily(&zipPerEntryOverhead, 0)()

	const sizeAB = 12 << 20
	const maxBlobSize = 16 << 20
	bytesAB := randBytes(sizeAB)
	blobA := &test.Blob{string(bytesAB[:sizeAB/2])}
	blobB := &test.Blob{string(bytesAB[sizeAB/2:])}
	refA := blobA.BlobRef()
	refB := blobB.BlobRef()
	bytesCFull := randBytes(maxBlobSize - sizeAB) // will be sliced down

	// Mechanism to verify we hit the back-up code path:
	var (
		mu                    sync.Mutex
		sawTruncate           blob.Ref
		stoppedBeforeOverflow bool
	)
	testHookSawTruncate = func(after blob.Ref) {
		if after != refB {
			t.Errorf("unexpected truncate point %v", after)
		}
		mu.Lock()
		defer mu.Unlock()
		sawTruncate = after
	}
	testHookStopBeforeOverflowing = func() {
		mu.Lock()
		defer mu.Unlock()
		stoppedBeforeOverflow = true
	}
	defer func() {
		testHookSawTruncate = nil
		testHookStopBeforeOverflowing = nil
	}()

	generatesTwoZips := func(sizeC int) (ret bool) {
		large := new(test.Fetcher)
		s := &storage{
			small: new(test.Fetcher),
			large: large,
			meta:  sorted.NewMemoryKeyValue(),
			log: test.NewLogger(t, "blobpacked: ",
				// Ignore these phrases:
				"Packing file ",
				"Packed file ",
			),
		}
		s.init()

		// Upload first two chunks
		blobA.MustUpload(t, s)
		blobB.MustUpload(t, s)

		// Upload third chunk
		bytesC := bytesCFull[:sizeC]
		h := blob.NewHash()
		h.Write(bytesC)
		refC := blob.RefFromHash(h)
		_, err := s.ReceiveBlob(refC, bytes.NewReader(bytesC))
		if err != nil {
			t.Fatal(err)
		}

		// Upload the file schema blob.
		m := schema.NewFileMap("foo.dat")
		m.PopulateParts(sizeAB+int64(sizeC), []schema.BytesPart{
			schema.BytesPart{
				Size:    sizeAB / 2,
				BlobRef: refA,
			},
			schema.BytesPart{
				Size:    sizeAB / 2,
				BlobRef: refB,
			},
			schema.BytesPart{
				Size:    uint64(sizeC),
				BlobRef: refC,
			},
		})
		fjson, err := m.JSON()
		if err != nil {
			t.Fatalf("schema filemap JSON: %v", err)
		}
		fb := &test.Blob{Contents: fjson}
		fb.MustUpload(t, s)
		num := large.NumBlobs()
		if num < 1 || num > 2 {
			t.Fatalf("for size %d, num packed zip blobs = %d; want 1 or 2", sizeC, num)
		}
		return num == 2
	}
	maxC := maxBlobSize - sizeAB
	smallestC := sort.Search(maxC, generatesTwoZips)
	if smallestC == maxC {
		t.Fatalf("never found a point at which we generated 2 zip files")
	}
	t.Logf("After 12 MB of data (in 2 chunks), the smallest blob that generates two zip files is %d bytes (%.03f MB)", smallestC, float64(smallestC)/(1<<20))
	t.Logf("Zip overhead (for this two chunk file) = %d bytes", maxBlobSize-1-smallestC-sizeAB)

	mu.Lock()
	if sawTruncate != refB {
		t.Errorf("truncate after = %v; want %v", sawTruncate, refB)
	}
	if !stoppedBeforeOverflow {
		t.Error("never hit the code path where it calculates that another data chunk would push it over the 16MB boundary")
	}
}
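// setIntTemporarily is used by the test above but not shown in this listing.
// A minimal sketch of what its usage pattern
// (`defer setIntTemporarily(&zipPerEntryOverhead, 0)()`) implies: set the int,
// return a restore func to be deferred. The real helper may differ.
func setIntTemporarily(v *int, tmp int) (restore func()) {
	old := *v
	*v = tmp
	return func() { *v = old }
}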
// b: the parsed file schema blob // mm: keys to populate func (ix *Index) populateFile(fetcher blob.Fetcher, b *schema.Blob, mm *mutationMap) (err error) { var times []time.Time // all creation or mod times seen; may be zero times = append(times, b.ModTime()) blobRef := b.BlobRef() fr, err := b.NewFileReader(fetcher) if err != nil { return err } defer fr.Close() mime, reader := magic.MIMETypeFromReader(fr) sha1 := sha1.New() var copyDest io.Writer = sha1 var imageBuf *keepFirstN // or nil if strings.HasPrefix(mime, "image/") { imageBuf = &keepFirstN{N: 512 << 10} copyDest = io.MultiWriter(copyDest, imageBuf) } size, err := io.Copy(copyDest, reader) if err != nil { return err } wholeRef := blob.RefFromHash(sha1) if imageBuf != nil { conf, err := images.DecodeConfig(bytes.NewReader(imageBuf.Bytes)) // If our optimistic 512KB in-memory prefix from above was too short to get the dimensions, pass the whole thing instead and try again. if err == io.ErrUnexpectedEOF { var fr *schema.FileReader fr, err = b.NewFileReader(fetcher) if err == nil { conf, err = images.DecodeConfig(fr) fr.Close() } } if err == nil { mm.Set(keyImageSize.Key(blobRef), keyImageSize.Val(fmt.Sprint(conf.Width), fmt.Sprint(conf.Height))) } if ft, err := schema.FileTime(bytes.NewReader(imageBuf.Bytes)); err == nil { log.Printf("filename %q exif = %v, %v", b.FileName(), ft, err) times = append(times, ft) } else { log.Printf("filename %q exif = %v, %v", b.FileName(), ft, err) } // TODO(mpl): find (generate?) more broken EXIF images to experiment with. err = indexEXIF(wholeRef, bytes.NewReader(imageBuf.Bytes), mm) if err == io.EOF { var fr *schema.FileReader fr, err = b.NewFileReader(fetcher) if err == nil { err = indexEXIF(wholeRef, fr, mm) fr.Close() } } if err != nil { log.Printf("error parsing EXIF: %v", err) } } var sortTimes []time.Time for _, t := range times { if !t.IsZero() { sortTimes = append(sortTimes, t) } } sort.Sort(types.ByTime(sortTimes)) var time3339s string switch { case len(sortTimes) == 1: time3339s = types.Time3339(sortTimes[0]).String() case len(sortTimes) >= 2: oldest, newest := sortTimes[0], sortTimes[len(sortTimes)-1] time3339s = types.Time3339(oldest).String() + "," + types.Time3339(newest).String() } mm.Set(keyWholeToFileRef.Key(wholeRef, blobRef), "1") mm.Set(keyFileInfo.Key(blobRef), keyFileInfo.Val(size, b.FileName(), mime, wholeRef)) mm.Set(keyFileTimes.Key(blobRef), keyFileTimes.Val(time3339s)) if strings.HasPrefix(mime, "audio/") { indexMusic(io.NewSectionReader(fr, 0, fr.Size()), wholeRef, mm) } return nil }
// indexMusic adds mutations to index the wholeRef by attached metadata and other properties. func indexMusic(r types.SizeReaderAt, wholeRef blob.Ref, mm *mutationMap) { tag, err := taglib.Decode(r, r.Size()) if err != nil { log.Print("index: error parsing tag: ", err) return } var footerLength int64 = 0 if hasTag, err := media.HasID3v1Tag(r); err != nil { log.Print("index: unable to check for ID3v1 tag: ", err) return } else if hasTag { footerLength = media.ID3v1TagLength } // Generate a hash of the audio portion of the file (i.e. excluding ID3v1 and v2 tags). audioStart := int64(tag.TagSize()) audioSize := r.Size() - audioStart - footerLength hash := sha1.New() if _, err := io.Copy(hash, io.NewSectionReader(r, audioStart, audioSize)); err != nil { log.Print("index: error generating SHA1 from audio data: ", err) return } mediaRef := blob.RefFromHash(hash) duration, err := media.GetMPEGAudioDuration(io.NewSectionReader(r, audioStart, audioSize)) if err != nil { log.Print("index: unable to calculate audio duration: ", err) duration = 0 } var yearStr, trackStr, discStr, durationStr string if !tag.Year().IsZero() { const justYearLayout = "2006" yearStr = tag.Year().Format(justYearLayout) } if tag.Track() != 0 { trackStr = fmt.Sprintf("%d", tag.Track()) } if tag.Disc() != 0 { discStr = fmt.Sprintf("%d", tag.Disc()) } if duration != 0 { durationStr = fmt.Sprintf("%d", duration/time.Millisecond) } // Note: if you add to this map, please update // pkg/search/query.go's MediaTagConstraint Tag docs. tags := map[string]string{ "title": tag.Title(), "artist": tag.Artist(), "album": tag.Album(), "genre": tag.Genre(), "musicbrainzalbumid": tag.CustomFrames()["MusicBrainz Album Id"], "year": yearStr, "track": trackStr, "disc": discStr, "mediaref": mediaRef.String(), "durationms": durationStr, } for tag, value := range tags { if value != "" { mm.Set(keyMediaTag.Key(wholeRef, tag), keyMediaTag.Val(value)) } } }
// trunc is a hint about which blob to truncate after. It may be zero. // If the returned error is of type 'needsTruncatedAfterError', then // the zip should be attempted to be written again, but truncating the // data after the listed blob. func (pk *packer) writeAZip(trunc blob.Ref) (err error) { defer func() { if e := recover(); e != nil { if v, ok := e.(error); ok && err == nil { err = v } else { panic(e) } } }() mf := Manifest{ WholeRef: pk.wholeRef, WholeSize: pk.wholeSize, WholePartIndex: len(pk.zips), } var zbuf bytes.Buffer cw := &countWriter{w: &zbuf} zw := zip.NewWriter(cw) var approxSize = zipFixedOverhead // can't use zbuf.Len because zw buffers var dataRefsWritten []blob.Ref var dataBytesWritten int64 var schemaBlobSeen = map[blob.Ref]bool{} var schemaBlobs []blob.Ref // to add after the main file baseFileName := pk.fr.FileName() if strings.Contains(baseFileName, "/") || strings.Contains(baseFileName, "\\") { return fmt.Errorf("File schema blob %v filename had a slash in it: %q", pk.fr.SchemaBlobRef(), baseFileName) } fh := &zip.FileHeader{ Name: baseFileName, Method: zip.Store, // uncompressed } fh.SetModTime(pk.fr.ModTime()) fh.SetMode(0644) fw, err := zw.CreateHeader(fh) check(err) check(zw.Flush()) dataStart := cw.n approxSize += zipPerEntryOverhead // for the first FileHeader w/ the data zipMax := pk.s.maxZipBlobSize() chunks := pk.chunksRemain chunkWholeHash := blob.NewHash() for len(chunks) > 0 { dr := chunks[0] // the next chunk to maybe write if trunc.Valid() && trunc == dr { if approxSize == 0 { return errors.New("first blob is too large to pack, once you add the zip overhead") } break } schemaBlobsSave := schemaBlobs for _, parent := range pk.schemaParent[dr] { if !schemaBlobSeen[parent] { schemaBlobSeen[parent] = true schemaBlobs = append(schemaBlobs, parent) approxSize += int(pk.schemaBlob[parent].Size()) + zipPerEntryOverhead } } thisSize := pk.dataSize[dr] approxSize += int(thisSize) if approxSize+mf.approxSerializedSize() > zipMax { if fn := testHookStopBeforeOverflowing; fn != nil { fn() } schemaBlobs = schemaBlobsSave // restore it break } // Copy the data to the zip. rc, size, err := pk.s.Fetch(dr) check(err) if size != thisSize { rc.Close() return errors.New("unexpected size") } if n, err := io.Copy(io.MultiWriter(fw, chunkWholeHash), rc); err != nil || n != int64(size) { rc.Close() return fmt.Errorf("copy to zip = %v, %v; want %v bytes", n, err, size) } rc.Close() dataRefsWritten = append(dataRefsWritten, dr) dataBytesWritten += int64(size) chunks = chunks[1:] } mf.DataBlobsOrigin = blob.RefFromHash(chunkWholeHash) // zipBlobs is where a schema or data blob is relative to the beginning // of the zip file. 
var zipBlobs []BlobAndPos var dataOffset int64 for _, br := range dataRefsWritten { size := pk.dataSize[br] mf.DataBlobs = append(mf.DataBlobs, BlobAndPos{blob.SizedRef{Ref: br, Size: size}, dataOffset}) zipBlobs = append(zipBlobs, BlobAndPos{blob.SizedRef{Ref: br, Size: size}, dataStart + dataOffset}) dataOffset += int64(size) } for _, br := range schemaBlobs { fw, err := zw.CreateHeader(&zip.FileHeader{ Name: "camlistore/" + br.String() + ".json", Method: zip.Store, // uncompressed }) check(err) check(zw.Flush()) b := pk.schemaBlob[br] zipBlobs = append(zipBlobs, BlobAndPos{blob.SizedRef{Ref: br, Size: b.Size()}, cw.n}) rc := b.Open() n, err := io.Copy(fw, rc) rc.Close() check(err) if n != int64(b.Size()) { return fmt.Errorf("failed to write all of schema blob %v: %d bytes, not wanted %d", br, n, b.Size()) } } // Manifest file fw, err = zw.Create(zipManifestPath) check(err) enc, err := json.MarshalIndent(mf, "", " ") check(err) _, err = fw.Write(enc) check(err) err = zw.Close() check(err) if zbuf.Len() > zipMax { // We guessed wrong. Back up. Find out how many blobs we went over. overage := zbuf.Len() - zipMax for i := len(dataRefsWritten) - 1; i >= 0; i-- { dr := dataRefsWritten[i] if overage <= 0 { return needsTruncatedAfterError{dr} } overage -= int(pk.dataSize[dr]) } return errors.New("file is unpackable; first blob is too big to fit") } zipRef := blob.SHA1FromBytes(zbuf.Bytes()) zipSB, err := blobserver.ReceiveNoHash(pk.s.large, zipRef, bytes.NewReader(zbuf.Bytes())) if err != nil { return err } bm := pk.s.meta.BeginBatch() bm.Set(fmt.Sprintf("%s%s:%d", wholeMetaPrefix, pk.wholeRef, len(pk.zips)), fmt.Sprintf("%s %d %d %d", zipRef, dataStart, pk.wholeBytesWritten, dataBytesWritten)) pk.wholeBytesWritten += dataBytesWritten pk.zips = append(pk.zips, writtenZip{ SizedRef: zipSB, dataRefs: dataRefsWritten, }) for _, zb := range zipBlobs { bm.Set(blobMetaPrefix+zb.Ref.String(), fmt.Sprintf("%d %v %d", zb.Size, zipRef, zb.Offset)) } if err := pk.s.meta.CommitBatch(bm); err != nil { return err } // Delete from small if !pk.s.skipDelete { toDelete := make([]blob.Ref, 0, len(dataRefsWritten)+len(schemaBlobs)) toDelete = append(toDelete, dataRefsWritten...) toDelete = append(toDelete, schemaBlobs...) if err := pk.s.small.RemoveBlobs(toDelete); err != nil { // Can't really do anything about it and doesn't really matter, so // just log for now. pk.s.Logf("Error removing blobs from %s: %v", pk.s.small, err) } } // On success, consume the chunks we wrote from pk.chunksRemain. pk.chunksRemain = pk.chunksRemain[len(dataRefsWritten):] return nil }
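// Illustrative sketch, not part of the original source: the per-blob meta rows
// written above ("b:<ref>" -> "<size> <zipRef> <offset in zip>") are enough to
// read a packed blob back out of the large storage, mirroring what validBlobKey
// does in the TestReindex variant earlier in this listing. The names below are
// hypothetical, the local interface only captures the SubFetch call used there,
// and io/ioutil is assumed to be imported.
type subFetcher interface {
	SubFetch(ref blob.Ref, offset, length int64) (io.ReadCloser, error)
}

func fetchPackedBlob(large subFetcher, zipRef blob.Ref, offsetInZip int64, size uint32) ([]byte, error) {
	rc, err := large.SubFetch(zipRef, offsetInZip, int64(size))
	if err != nil {
		return nil, err
	}
	defer rc.Close()
	return ioutil.ReadAll(rc)
}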