func populatePacked2(t *testing.T, s *storage) (wants []storagetest.StreamerTestOpt) { const fileSize = 1 << 20 data := randBytes(fileSize) _, err := schema.WriteFileFromReader(s, "first-half.dat", bytes.NewReader(data[:fileSize/2])) if err != nil { t.Fatalf("WriteFileFromReader: %v", err) } _, err = schema.WriteFileFromReader(s, "second-half.dat", bytes.NewReader(data[fileSize/2:])) if err != nil { t.Fatalf("WriteFileFromReader: %v", err) } return nil }
// urlFileRef slurps urlstr from the net, writes to a file and returns its
// fileref or "" on error
func (r *run) urlFileRef(urlstr, filename string) string {
	im := r.im
	// Fast path: return the cached fileref if this URL was already
	// fetched during this import run.
	im.mu.Lock()
	if br, ok := im.imageFileRef[urlstr]; ok {
		im.mu.Unlock()
		return br.String()
	}
	im.mu.Unlock()

	res, err := r.HTTPClient().Get(urlstr)
	if err != nil {
		log.Printf("couldn't get image: %v", err)
		return ""
	}
	defer res.Body.Close()

	// Stream the response body straight into the blob store as a
	// file schema blob named filename.
	fileRef, err := schema.WriteFileFromReader(r.Host.Target(), filename, res.Body)
	if err != nil {
		r.errorf("couldn't write file: %v", err)
		return ""
	}

	// Cache the result for subsequent calls with the same URL.
	im.mu.Lock()
	defer im.mu.Unlock()
	im.imageFileRef[urlstr] = fileRef
	return fileRef.String()
}
func TestHandlerRightRef(t *testing.T) { b := test.Blob{Contents: "Foo"} storage := new(test.Fetcher) ref, err := schema.WriteFileFromReader(storage, "", b.Reader()) if err != nil { t.Fatal(err) } if err != nil { t.Fatal(err) } ts := httptest.NewServer(createVideothumbnailHandler(ref, storage)) defer ts.Close() resp, err := http.Get(ts.URL + "/" + ref.String()) if err != nil { t.Fatal(err) } if resp.StatusCode != 200 { t.Fatalf("expected 200 status: %v", resp) } content, err := ioutil.ReadAll(resp.Body) if err != nil { t.Fatal(err) } if string(content) != b.Contents { t.Errorf("excepted handler to serve data") } }
// urlFileRef slurps urlstr from the net, writes to a file and returns its
// fileref or "" on error
func (r *run) urlFileRef(urlstr string) string {
	if urlstr == "" {
		return ""
	}
	im := r.im
	// Fast path: already fetched during this run.
	im.mu.Lock()
	if br, ok := im.urlFileRef[urlstr]; ok {
		im.mu.Unlock()
		return br.String()
	}
	im.mu.Unlock()

	res, err := r.HTTPClient().Get(urlstr)
	if err != nil {
		log.Printf("couldn't get file: %v", err)
		return ""
	}
	defer res.Body.Close()

	// Derive the stored filename from the last path segment of the URL.
	filename := urlstr[strings.LastIndex(urlstr, "/")+1:]
	fileRef, err := schema.WriteFileFromReader(r.Host.Target(), filename, res.Body)
	if err != nil {
		log.Printf("couldn't write file: %v", err)
		return ""
	}

	// Cache the fileref for subsequent calls with the same URL.
	im.mu.Lock()
	defer im.mu.Unlock()
	im.urlFileRef[urlstr] = fileRef
	return fileRef.String()
}
// Release uploads the handle's temp file to the blob store as a new file
// schema blob, points the mutable file at the new content and size, then
// closes and removes the temp file. Returns EIO on any failure.
func (h *mutFileHandle) Release(req *fuse.ReleaseRequest, intr fuse.Intr) fuse.Error {
	if h.tmp == nil {
		log.Printf("Release called on camli mutFileHandle without a tempfile set")
		return fuse.EIO
	}
	log.Printf("mutFileHandle release.")
	// Rewind so the entire temp file is uploaded.
	_, err := h.tmp.Seek(0, 0)
	if err != nil {
		log.Println("mutFileHandle.Release:", err)
		return fuse.EIO
	}
	// CountingReader records in n the number of bytes streamed to the
	// blob store; that count becomes the file's new size.
	var n int64
	br, err := schema.WriteFileFromReader(h.f.fs.client, h.f.name, readerutil.CountingReader{Reader: h.tmp, N: &n})
	if err != nil {
		log.Println("mutFileHandle.Release:", err)
		return fuse.EIO
	}
	h.f.setContent(br, n)

	// NOTE(review): on the error paths above the temp file is neither
	// closed nor removed, and setContent's result is not checked here
	// (unlike in Flush) — confirm whether that is intentional.
	h.tmp.Close()
	os.Remove(h.tmp.Name())
	h.tmp = nil

	return nil
}
// urlFileRef slurps urlstr from the net, writes to a file and returns its
// fileref or "" on error or if urlstr was empty.
func (r *run) urlFileRef(urlstr, filename string) string {
	im := r.im
	// Check the cache first; only after a miss do we reject the
	// empty URL.
	im.mu.Lock()
	if br, ok := im.imageFileRef[urlstr]; ok {
		im.mu.Unlock()
		return br.String()
	}
	im.mu.Unlock()

	if urlstr == "" {
		return ""
	}

	res, err := ctxutil.Client(r.Context()).Get(urlstr)
	if err != nil {
		log.Printf("foursquare: couldn't fetch image %q: %v", urlstr, err)
		return ""
	}
	defer res.Body.Close()

	fileRef, err := schema.WriteFileFromReader(r.Host.Target(), filename, res.Body)
	if err != nil {
		r.errorf("couldn't write file: %v", err)
		return ""
	}

	// Cache the result for subsequent calls with the same URL.
	im.mu.Lock()
	defer im.mu.Unlock()
	im.imageFileRef[urlstr] = fileRef
	return fileRef.String()
}
func (ih *ImageHandler) writeToCache(tr io.Reader, name string) (blob.Ref, error) { br, err := schema.WriteFileFromReader(ih.Cache, name, tr) if err != nil { return br, errors.New("failed to cache " + name + ": " + err.Error()) } if imageDebug { log.Printf("Image Cache: saved as %v\n", br) } return br, nil }
func populatePacked(t *testing.T, s *storage) (wants []storagetest.StreamerTestOpt) { const fileSize = 5 << 20 const fileName = "foo.dat" fileContents := randBytes(fileSize) _, err := schema.WriteFileFromReader(s, fileName, bytes.NewReader(fileContents)) if err != nil { t.Fatalf("WriteFileFromReader: %v", err) } return nil }
// TestPackTwoIdenticalfiles writes the same 1 MB contents under two
// different file names and verifies the packer dedupes the data: one
// packed (large) zip blob, plus a single loose small blob for the second
// file's schema blob.
func TestPackTwoIdenticalfiles(t *testing.T) {
	const fileSize = 1 << 20
	fileContents := randBytes(fileSize)
	testPack(t,
		func(sto blobserver.Storage) (err error) {
			if _, err = schema.WriteFileFromReader(sto, "a.txt", bytes.NewReader(fileContents)); err != nil {
				return
			}
			if _, err = schema.WriteFileFromReader(sto, "b.txt", bytes.NewReader(fileContents)); err != nil {
				return
			}
			return
		},
		func(pt *packTest) { pt.sto.packGate = syncutil.NewGate(1) }, // one pack at a time
		wantNumLargeBlobs(1),
		wantNumSmallBlobs(1), // just the "b.txt" file schema blob
		okayWithoutMeta("sha1-cb4399f6b3b31ace417e1ec9326f9818bb3f8387"),
	)
}
func storageAndBlobRef(t *testing.T) (blobserver.Storage, blob.Ref) { storage := new(test.Fetcher) inFile, err := os.Open(testFilepath) if err != nil { t.Fatal(err) } ref, err := schema.WriteFileFromReader(storage, "small.webm", inFile) if err != nil { t.Fatal(err) } return storage, ref }
// TestPackNoDelete packs a 1 MB file with deletion of the original loose
// blobs disabled (skipDelete), and verifies that both the packed zip and
// all the original small blobs remain.
func TestPackNoDelete(t *testing.T) {
	const fileSize = 1 << 20
	const fileName = "foo.dat"
	fileContents := randBytes(fileSize)
	testPack(t,
		func(sto blobserver.Storage) error {
			_, err := schema.WriteFileFromReader(sto, fileName, bytes.NewReader(fileContents))
			return err
		},
		func(pt *packTest) { pt.sto.skipDelete = true },
		wantNumLargeBlobs(1),
		wantNumSmallBlobs(15), // empirically
	)
}
// serveUploadHelper accepts a multipart/form-data upload, writes each
// file part to the blob store as a file schema blob, and replies with a
// JSON object listing the stored files (or an "error"/"errorType" pair).
func (ui *UIHandler) serveUploadHelper(rw http.ResponseWriter, req *http.Request) {
	ret := make(map[string]interface{})
	// The response is always JSON, written once when this function
	// returns, so the break/return paths below are both safe.
	defer httputil.ReturnJSON(rw, ret)

	if ui.root.Storage == nil {
		ret["error"] = "No BlobRoot configured"
		ret["errorType"] = "server"
		return
	}

	mr, err := req.MultipartReader()
	if err != nil {
		ret["error"] = "reading body: " + err.Error()
		ret["errorType"] = "server"
		return
	}

	got := make([]map[string]interface{}, 0)
	for {
		part, err := mr.NextPart()
		if err == io.EOF {
			break
		}
		if err != nil {
			ret["error"] = "reading body: " + err.Error()
			ret["errorType"] = "server"
			break
		}
		fileName := part.FileName()
		if fileName == "" {
			// Skip non-file form fields.
			continue
		}
		br, err := schema.WriteFileFromReader(ui.root.Storage, fileName, part)
		if err == nil {
			got = append(got, map[string]interface{}{
				"filename": part.FileName(),
				"formname": part.FormName(),
				"fileref":  br.String(),
			})
		} else {
			ret["error"] = "writing to blobserver: " + err.Error()
			return
		}
	}
	ret["got"] = got
}
func writeToCache(cache blobserver.Storage, thumbBytes []byte, name string) (br blob.Ref, err error) { tr := bytes.NewReader(thumbBytes) if len(thumbBytes) < constants.MaxBlobSize { br = blob.SHA1FromBytes(thumbBytes) _, err = blobserver.Receive(cache, br, tr) } else { // TODO: don't use rolling checksums when writing this. Tell // the filewriter to use 16 MB chunks instead. br, err = schema.WriteFileFromReader(cache, name, tr) } if err != nil { return br, errors.New("failed to cache " + name + ": " + err.Error()) } if imageDebug { log.Printf("Image Cache: saved as %v\n", br) } return br, nil }
// TestPackLarge packs a 17 MB file — just over the 16 MB zip limit, so
// it must span two zips — then verifies the expected "w:*" (wholeref)
// meta rows were written and that the whole file reads back correctly.
func TestPackLarge(t *testing.T) {
	if testing.Short() {
		t.Skip("skipping in short mode")
	}
	const fileSize = 17 << 20 // more than 16 MB, so more than one zip
	const fileName = "foo.dat"
	fileContents := randBytes(fileSize)

	hash := blob.NewHash()
	hash.Write(fileContents)
	wholeRef := blob.RefFromHash(hash)

	pt := testPack(t,
		func(sto blobserver.Storage) error {
			_, err := schema.WriteFileFromReader(sto, fileName, bytes.NewReader(fileContents))
			return err
		},
		wantNumLargeBlobs(2),
		wantNumSmallBlobs(0),
	)

	// Verify we wrote the correct "w:*" meta rows.
	got := map[string]string{}
	want := map[string]string{
		"w:" + wholeRef.String():        "17825792 2",
		"w:" + wholeRef.String() + ":0": "sha1-9b4a3d114c059988075c87293c86ee7cbc6f4af5 37 0 16709479",
		"w:" + wholeRef.String() + ":1": "sha1-fe6326ac6b389ffe302623e4a501bfc8c6272e8e 37 16709479 1116313",
	}
	if err := sorted.Foreach(pt.sto.meta, func(key, value string) error {
		if strings.HasPrefix(key, "b:") {
			// Per-blob ("b:") rows aren't under test here.
			return nil
		}
		got[key] = value
		return nil
	}); err != nil {
		t.Fatal(err)
	}
	if !reflect.DeepEqual(got, want) {
		t.Errorf("'w:*' meta rows = %v; want %v", got, want)
	}

	// And verify we can read it back out.
	pt.testOpenWholeRef(t, wholeRef, fileSize)
}
func (im *imp) Run(intr importer.Interrupt) (err error) { log.Printf("Running dummy importer.") defer func() { log.Printf("Dummy importer returned: %v", err) }() root, err := im.host.RootObject() if err != nil { return err } fileRef, err := schema.WriteFileFromReader(im.host.Target(), "foo.txt", strings.NewReader("Some file.\n")) if err != nil { return err } obj, err := root.ChildPathObject("foo.txt") if err != nil { return err } return obj.SetAttr("camliContent", fileRef.String()) }
// urlFileRef slurps urlstr from the net, writes to a file and returns its
// fileref or "" on error
func (im *imp) urlFileRef(urlstr string) string {
	// NOTE(review): unlike the other importers' urlFileRef variants,
	// this cache map is read and written without a mutex — confirm the
	// importer only calls this from a single goroutine.
	if br, ok := im.imageFileRef[urlstr]; ok {
		return br.String()
	}

	res, err := im.host.HTTPClient().Get(urlstr)
	if err != nil {
		log.Printf("couldn't get image: %v", err)
		return ""
	}
	defer res.Body.Close()

	// All fetches are stored under the fixed name "category.png".
	fileRef, err := schema.WriteFileFromReader(im.host.Target(), "category.png", res.Body)
	if err != nil {
		log.Printf("couldn't write file: %v", err)
		return ""
	}

	im.imageFileRef[urlstr] = fileRef
	return fileRef.String()
}
// TestPackNormal packs a single 5 MB file into exactly one zip (with no
// loose small blobs left behind) and verifies the whole file can be read
// back by its wholeref.
func TestPackNormal(t *testing.T) {
	const fileSize = 5 << 20
	const fileName = "foo.dat"
	fileContents := randBytes(fileSize)

	hash := blob.NewHash()
	hash.Write(fileContents)
	wholeRef := blob.RefFromHash(hash)

	pt := testPack(t,
		func(sto blobserver.Storage) error {
			_, err := schema.WriteFileFromReader(sto, fileName, bytes.NewReader(fileContents))
			return err
		},
		wantNumLargeBlobs(1),
		wantNumSmallBlobs(0),
	)
	// And verify we can read it back out.
	pt.testOpenWholeRef(t, wholeRef, fileSize)
}
// TODO(aa):
// * Parallelize: http://golang.org/doc/effective_go.html#concurrency
// * Do more than one "page" worth of results
// * Report progress and errors back through host interface
// * All the rest of the metadata (see photoMeta)
// * Conflicts: For all metadata changes, prefer any non-imported claims
// * Test!

// importPhoto creates or updates the child node of parent for one Flickr
// photo: it sets the metadata attributes on every run and downloads the
// image bytes only when the photo's lastupdate differs from the stored
// value.
func (im *imp) importPhoto(parent *importer.Object, photo *photosSearchItem) error {
	filename := fmt.Sprintf("%s.%s", photo.Id, photo.Originalformat)
	photoNode, err := parent.ChildPathObject(filename)
	if err != nil {
		return err
	}

	// Import all the metadata. SetAttrs() is a no-op if the value hasn't
	// changed, so there's no cost to doing these on every run. And this
	// way if we add more things to import, they will get picked up.
	if err := photoNode.SetAttrs(
		"flickrId", photo.Id,
		"title", photo.Title,
		"description", photo.Description.Content); err != nil {
		return err
	}

	// Import the photo itself. Since it is expensive to fetch the image,
	// we store its lastupdate and only refetch if it might have changed.
	if photoNode.Attr("flickrLastupdate") == photo.Lastupdate {
		return nil
	}
	res, err := im.flickrRequest(photo.URL, url.Values{})
	if err != nil {
		log.Printf("Flickr importer: Could not fetch %s: %s", photo.URL, err)
		return err
	}
	defer res.Body.Close()

	fileRef, err := schema.WriteFileFromReader(im.host.Target(), filename, res.Body)
	if err != nil {
		return err
	}
	if err := photoNode.SetAttr("camliContent", fileRef.String()); err != nil {
		return err
	}
	// Write lastupdate last, so that if any of the preceding fails, we
	// will try again next time.
	if err := photoNode.SetAttr("flickrLastupdate", photo.Lastupdate); err != nil {
		return err
	}
	return nil
}
func storePhoto(p photo) (string, error) { srcFile := localPathOf(p) f, err := os.Open(srcFile) if err != nil { return "", err } defer f.Close() fileRef, err := schema.WriteFileFromReader(camliClient, p.Id+"."+p.Extension, f) res, err := camliClient.UploadNewPermanode() if err != nil { return "", err } perma := res.BlobRef p.Description = cleanHTML(p.Description) claims := []*schema.Builder{} claims = append(claims, schema.NewSetAttributeClaim(perma, "camliContent", fileRef.String())) claims = append(claims, schema.NewSetAttributeClaim(perma, "title", mkTitle(p.Description))) claims = append(claims, schema.NewSetAttributeClaim(perma, "description", p.Description)) for _, t := range p.Tags { claims = append(claims, schema.NewAddAttributeClaim(perma, "tag", t)) } if p.Cat == "Public" { claims = append(claims, schema.NewSetAttributeClaim(perma, "camliAccess", "public")) } grp := syncutil.Group{} for _, claimBuilder := range claims { claim := claimBuilder.Blob() grp.Go(func() error { _, err := camliClient.UploadAndSignBlob(claim) return err }) } return perma.String(), grp.Err() }
func (ui *UIHandler) serveUploadHelper(rw http.ResponseWriter, req *http.Request) { if ui.root.Storage == nil { httputil.ServeJSONError(rw, httputil.ServerError("No BlobRoot configured")) return } mr, err := req.MultipartReader() if err != nil { httputil.ServeJSONError(rw, httputil.ServerError("reading body: "+err.Error())) return } var got []*uploadHelperGotItem for { part, err := mr.NextPart() if err == io.EOF { break } if err != nil { httputil.ServeJSONError(rw, httputil.ServerError("reading body: "+err.Error())) break } fileName := part.FileName() if fileName == "" { continue } br, err := schema.WriteFileFromReader(ui.root.Storage, fileName, part) if err != nil { httputil.ServeJSONError(rw, httputil.ServerError("writing to blobserver: "+err.Error())) return } got = append(got, &uploadHelperGotItem{ FileName: part.FileName(), FormName: part.FormName(), FileRef: br, }) } httputil.ReturnJSON(rw, &uploadHelperResponse{Got: got}) }
func (r *run) importItem(parent *importer.Object, item *item) error { itemNode, err := parent.ChildPathObject(item.ID) if err != nil { return err } fileRef, err := schema.WriteFileFromReader(r.Host.Target(), "", bytes.NewBufferString(item.Content)) if err != nil { return err } if err := itemNode.SetAttrs( "feedItemId", item.ID, "camliNodeType", "feed:item", "title", item.Title, "link", item.Link, "author", item.Author, "camliContent", fileRef.String(), "feedMediaContentURL", item.MediaContent, ); err != nil { return err } return nil }
// Run performs the dummy import: it writes a "foo.txt" file under the
// root node, bumps a per-account run counter, and updates the root's
// title to reflect the run number. Entry and exit (with the final
// error) are logged.
func (im *imp) Run(ctx *importer.RunContext) (err error) {
	log.Printf("Running dummy importer.")
	defer func() {
		log.Printf("Dummy importer returned: %v", err)
	}()
	root := ctx.RootNode()
	fileRef, err := schema.WriteFileFromReader(ctx.Host.Target(), "foo.txt", strings.NewReader("Some file.\n"))
	if err != nil {
		return err
	}
	obj, err := root.ChildPathObject("foo.txt")
	if err != nil {
		return err
	}
	if err = obj.SetAttr("camliContent", fileRef.String()); err != nil {
		return err
	}
	// Count import runs on the account node. Atoi's error is ignored,
	// defaulting n to 0 on first run; the SetAttr error is also ignored
	// here — presumably best-effort bookkeeping in a demo importer, but
	// TODO(review): confirm.
	n, _ := strconv.Atoi(ctx.AccountNode().Attr(acctAttrRunNumber))
	n++
	ctx.AccountNode().SetAttr(acctAttrRunNumber, fmt.Sprint(n))
	// Update the title each time, just to show it working. You
	// wouldn't actually do this:
	return root.SetAttr("title", fmt.Sprintf("dummy: %s import #%d", ctx.AccountNode().Attr(acctAttrUsername), n))
}
// Flush is called to let the file system clean up any data buffers
// and to pass any errors in the process of closing a file to the user
// application.
//
// Flush *may* be called more than once in the case where a file is
// opened more than once, but it's not possible to detect from the
// call itself whether this is a final flush.
//
// This is generally the last opportunity to finalize data and the
// return value sets the return value of the Close that led to the
// calling of Flush.
//
// Note that this is distinct from Fsync -- which is a user-requested
// flush (fsync, etc...)
func (h *mutFileHandle) Flush(*fuse.FlushRequest, fuse.Intr) fuse.Error {
	if h.tmp == nil {
		log.Printf("Flush called on camli mutFileHandle without a tempfile set")
		return fuse.EIO
	}
	// Rewind so the entire temp file is uploaded.
	_, err := h.tmp.Seek(0, 0)
	if err != nil {
		log.Println("mutFileHandle.Flush:", err)
		return fuse.EIO
	}
	// CountingReader records in n the number of bytes streamed to the
	// blob store; that count becomes the file's new size.
	var n int64
	br, err := schema.WriteFileFromReader(h.f.fs.client, h.f.name, readerutil.CountingReader{Reader: h.tmp, N: &n})
	if err != nil {
		log.Println("mutFileHandle.Flush:", err)
		return fuse.EIO
	}
	// Point the mutable file at the newly written content.
	err = h.f.setContent(br, n)
	if err != nil {
		log.Printf("mutFileHandle.Flush: %v", err)
		return fuse.EIO
	}

	return nil
}
// TestReindex packs three files, rebuilds the meta index from scratch
// with reindex, and verifies the new index matches the old one: every
// row is either identical or still points at valid packed data, and the
// row counts agree. Finally it reads one of the files back by wholeref.
func TestReindex(t *testing.T) {
	if testing.Short() {
		t.Skip("skipping in short mode")
	}

	type file struct {
		size     int64
		name     string
		contents []byte
	}
	files := []file{
		{17 << 20, "foo.dat", randBytesSrc(17<<20, 42)},
		{10 << 20, "bar.dat", randBytesSrc(10<<20, 43)},
		{5 << 20, "baz.dat", randBytesSrc(5<<20, 44)},
	}

	pt := testPack(t,
		func(sto blobserver.Storage) error {
			for _, f := range files {
				if _, err := schema.WriteFileFromReader(sto, f.name, bytes.NewReader(f.contents)); err != nil {
					return err
				}
			}
			return nil
		},
		wantNumLargeBlobs(4),
		wantNumSmallBlobs(0),
	)

	// backup the meta that is supposed to be lost/erased.
	// pt.sto.reindex allocates a new pt.sto.meta, so meta != pt.sto.meta after it is called.
	meta := pt.sto.meta

	// and build new meta index
	if err := pt.sto.reindex(context.TODO(), func() (sorted.KeyValue, error) {
		return sorted.NewMemoryKeyValue(), nil
	}); err != nil {
		t.Fatal(err)
	}

	// validBlobKey reports whether a "b:" meta row correctly describes a
	// blob stored in a packed zip: the referenced byte range must exist,
	// hash to the blobref in the key, and have the recorded size.
	validBlobKey := func(key, value string) error {
		if !strings.HasPrefix(key, "b:") {
			return errors.New("not a blob meta key")
		}
		wantRef, ok := blob.Parse(key[2:])
		if !ok {
			return errors.New("bogus blobref in key")
		}
		m, err := parseMetaRow([]byte(value))
		if err != nil {
			return err
		}

		rc, err := pt.large.SubFetch(m.largeRef, int64(m.largeOff), int64(m.size))
		if err != nil {
			return err
		}
		defer rc.Close()
		h := wantRef.Hash()
		n, err := io.Copy(h, rc)
		if err != nil {
			return err
		}

		if !wantRef.HashMatches(h) {
			return errors.New("content doesn't match")
		}
		if n != int64(m.size) {
			return errors.New("size doesn't match")
		}
		return nil
	}

	// check that new meta is identical to "lost" one
	newRows := 0
	if err := sorted.Foreach(pt.sto.meta, func(key, newValue string) error {
		oldValue, err := meta.Get(key)
		if err != nil {
			t.Fatalf("Could not get value for %v in old meta: %v", key, err)
		}
		newRows++
		// Exact match is fine.
		if oldValue == newValue {
			return nil
		}
		// If it differs, it should at least be correct. (blob metadata
		// can now point to different packed zips, depending on sorting)
		err = validBlobKey(key, newValue)
		if err == nil {
			return nil
		}
		t.Errorf("Reindexing error: for key %v: %v\n got: %q\nwant: %q", key, err, newValue, oldValue)
		return nil // keep enumerating, regardless of errors
	}); err != nil {
		t.Fatal(err)
	}

	// make sure they have the same number of entries too, to be sure that the reindexing
	// did not miss entries that the old meta had.
	oldRows := countSortedRows(t, meta)
	if oldRows != newRows {
		t.Fatalf("index number of entries mismatch: got %d entries in new index, wanted %d (as in index before reindexing)", newRows, oldRows)
	}

	// And verify we can read one of the files back out.
	hash := blob.NewHash()
	hash.Write(files[0].contents)
	pt.testOpenWholeRef(t, blob.RefFromHash(hash), files[0].size)
}
// Tests a bunch of rounds on a bunch of data. func TestArchiverStress(t *testing.T) { if testing.Short() { t.Skip("Skipping in short mode") } src := new(test.Fetcher) fileRef, err := schema.WriteFileFromReader(src, "random", io.LimitReader(randReader{}, 10<<20)) if err != nil { t.Fatal(err) } n0 := src.NumBlobs() t.Logf("Wrote %v in %d blobs", fileRef, n0) refs0 := src.BlobrefStrings() var zips [][]byte archived := map[blob.Ref]bool{} a := &Archiver{ Source: src, MinZipSize: 1 << 20, DeleteSourceAfterStore: true, Store: func(zipd []byte, brs []blob.SizedRef) error { zips = append(zips, zipd) for _, sbr := range brs { if archived[sbr.Ref] { t.Error("duplicate archive of %v", sbr.Ref) } archived[sbr.Ref] = true } return nil }, } for { err := a.RunOnce() if err == ErrSourceTooSmall { break } if err != nil { t.Fatal(err) } } if len(archived) == 0 { t.Errorf("unexpected small number of archived blobs = %d", len(archived)) } if len(zips) < 2 { t.Errorf("unexpected small number of zip files = %d", len(zips)) } if n1 := src.NumBlobs() + len(archived); n0 != n1 { t.Errorf("original %d blobs != %d after + %d archived (%d)", n0, src.NumBlobs(), len(archived), n1) } // And restore: for _, zipd := range zips { if err := foreachZipEntry(zipd, func(br blob.Ref, contents []byte) { tb := &test.Blob{Contents: string(contents)} if tb.BlobRef() != br { t.Fatal("corrupt zip callback") } src.AddBlob(tb) }); err != nil { t.Fatal(err) } } refs1 := src.BlobrefStrings() if !reflect.DeepEqual(refs0, refs1) { t.Error("Restore error.") } }
// TestForeachZipBlob packs one 2 MB file into a single zip and checks
// that foreachZipBlob enumerates exactly the blobs visible via normal
// enumeration, with matching sizes and with offsets that hash-validate
// against the raw zip bytes. It also logs the zip container overhead.
func TestForeachZipBlob(t *testing.T) {
	const fileSize = 2 << 20
	const fileName = "foo.dat"
	fileContents := randBytes(fileSize)

	ctx, cancel := context.WithCancel(context.TODO())
	defer cancel()

	pt := testPack(t,
		func(sto blobserver.Storage) error {
			_, err := schema.WriteFileFromReader(sto, fileName, bytes.NewReader(fileContents))
			return err
		},
		wantNumLargeBlobs(1),
		wantNumSmallBlobs(0),
	)

	zipBlob, err := singleBlob(pt.large)
	if err != nil {
		t.Fatal(err)
	}
	zipBytes := slurpBlob(t, pt.large, zipBlob.Ref)
	zipSize := len(zipBytes)

	// Gather the full logical blob set to compare against.
	all := map[blob.Ref]blob.SizedRef{}
	if err := blobserver.EnumerateAll(ctx, pt.logical, func(sb blob.SizedRef) error {
		all[sb.Ref] = sb
		return nil
	}); err != nil {
		t.Fatal(err)
	}
	foreachSaw := 0
	blobSizeSum := 0
	if err := pt.sto.foreachZipBlob(zipBlob.Ref, func(bap BlobAndPos) error {
		foreachSaw++
		blobSizeSum += int(bap.Size)
		want, ok := all[bap.Ref]
		if !ok {
			t.Errorf("unwanted blob ref returned from foreachZipBlob: %v", bap.Ref)
			return nil
		}
		// Each blob must be seen exactly once.
		delete(all, bap.Ref)
		if want.Size != bap.Size {
			t.Errorf("for %v, foreachZipBlob size = %d; want %d", bap.Ref, bap.Size, want.Size)
			return nil
		}
		// Verify the offset.
		h := bap.Ref.Hash()
		h.Write(zipBytes[bap.Offset : bap.Offset+int64(bap.Size)])
		if !bap.Ref.HashMatches(h) {
			return fmt.Errorf("foreachZipBlob returned blob %v at offset %d that failed validation", bap.Ref, bap.Offset)
		}
		return nil
	}); err != nil {
		t.Fatal(err)
	}

	t.Logf("foreachZipBlob enumerated %d blobs", foreachSaw)
	if len(all) > 0 {
		t.Errorf("foreachZipBlob forgot to enumerate %d blobs: %v", len(all), all)
	}
	// Calculate per-blobref zip overhead (zip file headers/TOC/manifest file, etc)
	zipOverhead := zipSize - blobSizeSum
	t.Logf("zip fixed overhead = %d bytes, for %d blobs (%d bytes each)", zipOverhead, foreachSaw, zipOverhead/foreachSaw)
}
// TODO(aa):
// * Parallelize: http://golang.org/doc/effective_go.html#concurrency
// * Do more than one "page" worth of results
// * Report progress and errors back through host interface
// * All the rest of the metadata (see photoMeta)
// * Conflicts: For all metadata changes, prefer any non-imported claims
// * Test!

// importPhoto creates or updates the child node of parent for one Flickr
// photo: it sets the metadata attributes on every run and downloads the
// image bytes only when the photo's last-update time differs from the
// stored one.
func (r *run) importPhoto(parent *importer.Object, photo *photosSearchItem) error {
	filename := fmt.Sprintf("%s.%s", photo.Id, photo.OriginalFormat)
	photoNode, err := parent.ChildPathObject(filename)
	if err != nil {
		return err
	}

	// https://www.flickr.com/services/api/misc.dates.html
	dateTaken, err := time.ParseInLocation("2006-01-02 15:04:05", photo.DateTaken, schema.UnknownLocation)
	if err != nil {
		// default to the published date otherwise
		log.Printf("Flickr importer: problem with date taken of photo %v, defaulting to published date instead.", photo.Id)
		seconds, err := strconv.ParseInt(photo.DateUpload, 10, 64)
		if err != nil {
			return fmt.Errorf("could not parse date upload time %q for image %v: %v", photo.DateUpload, photo.Id, err)
		}
		dateTaken = time.Unix(seconds, 0)
	}

	attrs := []string{
		attrFlickrId, photo.Id,
		nodeattr.DateCreated, schema.RFC3339FromTime(dateTaken),
		nodeattr.Description, photo.Description.Content,
	}
	if schema.IsInterestingTitle(photo.Title) {
		attrs = append(attrs, nodeattr.Title, photo.Title)
	}

	// Import all the metadata. SetAttrs() is a no-op if the value hasn't
	// changed, so there's no cost to doing these on every run. And this
	// way if we add more things to import, they will get picked up.
	if err := photoNode.SetAttrs(attrs...); err != nil {
		return err
	}

	// Import the photo itself. Since it is expensive to fetch the image,
	// we store its lastupdate and only refetch if it might have changed.
	// lastupdate is a Unix timestamp according to https://www.flickr.com/services/api/flickr.photos.getInfo.html
	seconds, err := strconv.ParseInt(photo.LastUpdate, 10, 64)
	if err != nil {
		return fmt.Errorf("could not parse lastupdate time for image %v: %v", photo.Id, err)
	}
	lastUpdate := time.Unix(seconds, 0)
	if lastUpdateString := photoNode.Attr(nodeattr.DateModified); lastUpdateString != "" {
		oldLastUpdate, err := time.Parse(time.RFC3339, lastUpdateString)
		if err != nil {
			return fmt.Errorf("could not parse last stored update time for image %v: %v", photo.Id, err)
		}
		if lastUpdate.Equal(oldLastUpdate) {
			// Image bytes unchanged; skip the expensive fetch.
			if err := r.updatePrimaryPhoto(photoNode); err != nil {
				return err
			}
			return nil
		}
	}

	form := url.Values{}
	form.Set("user_id", r.userID)
	res, err := r.fetch(photo.URL, form)
	if err != nil {
		log.Printf("Flickr importer: Could not fetch %s: %s", photo.URL, err)
		return err
	}
	defer res.Body.Close()

	fileRef, err := schema.WriteFileFromReader(r.Host.Target(), filename, res.Body)
	if err != nil {
		return err
	}
	if err := photoNode.SetAttr(nodeattr.CamliContent, fileRef.String()); err != nil {
		return err
	}
	if err := r.updatePrimaryPhoto(photoNode); err != nil {
		return err
	}
	// Write lastupdate last, so that if any of the preceding fails, we
	// will try again next time.
	if err := photoNode.SetAttr(nodeattr.DateModified, schema.RFC3339FromTime(lastUpdate)); err != nil {
		return err
	}
	return nil
}
// updatePhotoInAlbum creates or updates the node for one Picasa photo
// under albumNode. The media bytes are downloaded only when needed (new
// photo, or its source URL has changed), written to the blob store as a
// file schema blob, and the photo's metadata attributes are set on the
// node. The media-URL attribute is written last so an interrupted run
// retries the download.
func (r *run) updatePhotoInAlbum(ctx context.Context, albumNode *importer.Object, photo picago.Photo) (ret error) {
	if photo.ID == "" {
		return errors.New("photo has no ID")
	}

	// getMediaBytes fetches the photo bytes from photo.URL. The caller
	// must close the returned body.
	getMediaBytes := func() (io.ReadCloser, error) {
		log.Printf("Importing media from %v", photo.URL)
		resp, err := ctxutil.Client(ctx).Get(photo.URL)
		if err != nil {
			return nil, fmt.Errorf("importing photo %s: %v", photo.ID, err)
		}
		if resp.StatusCode != http.StatusOK {
			resp.Body.Close()
			return nil, fmt.Errorf("importing photo %s: status code = %d", photo.ID, resp.StatusCode)
		}
		return resp.Body, nil
	}

	var fileRefStr string
	idFilename := photo.ID + "-" + photo.Filename
	photoNode, err := albumNode.ChildPathObjectOrFunc(idFilename, func() (*importer.Object, error) {
		// First time we see this photo in this album: download it,
		// hashing the bytes as they stream to the blob store so we can
		// look for an existing permanode with the same content.
		h := blob.NewHash()
		rc, err := getMediaBytes()
		if err != nil {
			return nil, err
		}
		fileRef, err := schema.WriteFileFromReader(r.Host.Target(), photo.Filename, io.TeeReader(rc, h))
		if err != nil {
			return nil, err
		}
		fileRefStr = fileRef.String()
		wholeRef := blob.RefFromHash(h)
		if pn, err := findExistingPermanode(r.Host.Searcher(), wholeRef); err == nil {
			// Reuse the permanode that already carries this content.
			return r.Host.ObjectFromRef(pn)
		}
		return r.Host.NewObject()
	})
	if err != nil {
		return err
	}

	const attrMediaURL = "picasaMediaURL"
	if fileRefStr == "" {
		// The node already existed, so nothing was downloaded above.
		fileRefStr = photoNode.Attr(nodeattr.CamliContent)
		// Only re-download the source photo if its URL has changed.
		// Empirically this seems to work: cropping a photo in the
		// photos.google.com UI causes its URL to change. And it makes
		// sense, looking at the ugliness of the URLs with all their
		// encoded/signed state.
		if !mediaURLsEqual(photoNode.Attr(attrMediaURL), photo.URL) {
			rc, err := getMediaBytes()
			if err != nil {
				return err
			}
			fileRef, err := schema.WriteFileFromReader(r.Host.Target(), photo.Filename, rc)
			rc.Close()
			if err != nil {
				return err
			}
			fileRefStr = fileRef.String()
		}
	}

	// Use the first line of the description as the title, falling back
	// to the filename when it looks meaningful.
	title := strings.TrimSpace(photo.Description)
	if strings.Contains(title, "\n") {
		title = title[:strings.Index(title, "\n")]
	}
	if title == "" && schema.IsInterestingTitle(photo.Filename) {
		title = photo.Filename
	}

	// TODO(tgulacsi): add more attrs (comments ?)
	// for names, see http://schema.org/ImageObject and http://schema.org/CreativeWork
	attrs := []string{
		nodeattr.CamliContent, fileRefStr,
		attrPicasaId, photo.ID,
		nodeattr.Title, title,
		nodeattr.Description, photo.Description,
		nodeattr.LocationText, photo.Location,
		nodeattr.DateModified, schema.RFC3339FromTime(photo.Updated),
		nodeattr.DatePublished, schema.RFC3339FromTime(photo.Published),
		nodeattr.URL, photo.PageURL,
	}
	if photo.Latitude != 0 || photo.Longitude != 0 {
		attrs = append(attrs,
			nodeattr.Latitude, fmt.Sprintf("%f", photo.Latitude),
			nodeattr.Longitude, fmt.Sprintf("%f", photo.Longitude),
		)
	}
	if err := photoNode.SetAttrs(attrs...); err != nil {
		return err
	}
	if err := photoNode.SetAttrValues("tag", photo.Keywords); err != nil {
		return err
	}
	if photo.Position > 0 {
		// Record the photo's position within the album on the album node.
		if err := albumNode.SetAttr(
			nodeattr.CamliPathOrderColon+strconv.Itoa(photo.Position-1),
			photoNode.PermanodeRef().String()); err != nil {
			return err
		}
	}

	// Do this last, after we're sure the "camliContent" attribute
	// has been saved successfully, because this is the one that
	// causes us to do it again in the future or not.
	if err := photoNode.SetAttrs(attrMediaURL, photo.URL); err != nil {
		return err
	}
	return nil
}
// viaAPI is true if it came via the REST API, or false if it came via a zip file. func (r *run) importTweet(parent *importer.Object, tweet tweetItem, viaAPI bool) (dup bool, err error) { select { case <-r.Context().Done(): r.errorf("Twitter importer: interrupted") return false, r.Context().Err() default: } id := tweet.ID() tweetNode, err := parent.ChildPathObject(id) if err != nil { return false, err } // Because the zip format and the API format differ a bit, and // might diverge more in the future, never use the zip content // to overwrite data fetched via the API. If we add new // support for different fields in the future, we might want // to revisit this decision. Be wary of flip/flopping data if // modifying this, though. if tweetNode.Attr(attrImportMethod) == "api" && !viaAPI { return true, nil } // e.g. "2014-06-12 19:11:51 +0000" createdTime, err := timeParseFirstFormat(tweet.CreatedAt(), time.RubyDate, "2006-01-02 15:04:05 -0700") if err != nil { return false, fmt.Errorf("could not parse time %q: %v", tweet.CreatedAt(), err) } url := fmt.Sprintf("https://twitter.com/%s/status/%v", r.AccountNode().Attr(importer.AcctAttrUserName), id) attrs := []string{ "twitterId", id, nodeattr.Type, "twitter.com:tweet", nodeattr.StartDate, schema.RFC3339FromTime(createdTime), nodeattr.Content, tweet.Text(), nodeattr.URL, url, } if lat, long, ok := tweet.LatLong(); ok { attrs = append(attrs, nodeattr.Latitude, fmt.Sprint(lat), nodeattr.Longitude, fmt.Sprint(long), ) } if viaAPI { attrs = append(attrs, attrImportMethod, "api") } else { attrs = append(attrs, attrImportMethod, "zip") } for i, m := range tweet.Media() { filename := m.BaseFilename() if tweetNode.Attr("camliPath:"+filename) != "" && (i > 0 || tweetNode.Attr("camliContentImage") != "") { // Don't re-import media we've already fetched. 
continue } tried, gotMedia := 0, false for _, mediaURL := range m.URLs() { tried++ res, err := ctxutil.Client(r.Context()).Get(mediaURL) if err != nil { return false, fmt.Errorf("Error fetching %s for tweet %s : %v", mediaURL, url, err) } if res.StatusCode == http.StatusNotFound { continue } if res.StatusCode != 200 { return false, fmt.Errorf("HTTP status %d fetching %s for tweet %s", res.StatusCode, mediaURL, url) } if !viaAPI { log.Printf("For zip tweet %s, reading %v", url, mediaURL) } fileRef, err := schema.WriteFileFromReader(r.Host.Target(), filename, res.Body) res.Body.Close() if err != nil { return false, fmt.Errorf("Error fetching media %s for tweet %s: %v", mediaURL, url, err) } attrs = append(attrs, "camliPath:"+filename, fileRef.String()) if i == 0 { attrs = append(attrs, "camliContentImage", fileRef.String()) } log.Printf("Slurped %s as %s for tweet %s (%v)", mediaURL, fileRef.String(), url, tweetNode.PermanodeRef()) gotMedia = true break } if !gotMedia && tried > 0 { return false, fmt.Errorf("All media URLs 404s for tweet %s", url) } } changes, err := tweetNode.SetAttrs2(attrs...) if err == nil && changes { log.Printf("Imported tweet %s", url) } return !changes, err }
// TestRemoveBlobs exercises removal of packed blobs: after packing one
// file, it deletes every logical blob, verifies "d:" (deleted) meta rows
// appear, and checks that the zip pack itself can only be deleted once
// all of its contents are dead.
func TestRemoveBlobs(t *testing.T) {
	ctx, cancel := context.WithCancel(context.TODO())
	defer cancel()

	// The basic small cases are handled via storagetest in TestStorage,
	// so this only tests removing packed blobs.
	small := new(test.Fetcher)
	large := new(test.Fetcher)
	sto := &storage{
		small: small,
		large: large,
		meta:  sorted.NewMemoryKeyValue(),
		log:   test.NewLogger(t, "blobpacked: "),
	}
	sto.init()

	const fileSize = 1 << 20
	fileContents := randBytes(fileSize)
	if _, err := schema.WriteFileFromReader(sto, "foo.dat", bytes.NewReader(fileContents)); err != nil {
		t.Fatal(err)
	}
	// Everything should have been packed into the large store.
	if small.NumBlobs() != 0 || large.NumBlobs() == 0 {
		t.Fatalf("small, large counts == %d, %d; want 0, non-zero", small.NumBlobs(), large.NumBlobs())
	}
	var all []blob.SizedRef
	if err := blobserver.EnumerateAll(ctx, sto, func(sb blob.SizedRef) error {
		all = append(all, sb)
		return nil
	}); err != nil {
		t.Fatal(err)
	}

	// Find the zip
	zipBlob, err := singleBlob(sto.large)
	if err != nil {
		t.Fatalf("failed to find packed zip: %v", err)
	}

	// The zip file is in use, so verify we can't delete it.
	if err := sto.deleteZipPack(zipBlob.Ref); err == nil {
		t.Fatalf("zip pack blob deleted but it should not have been allowed")
	}

	// Delete everything
	for len(all) > 0 {
		del := all[0].Ref
		all = all[1:]
		if err := sto.RemoveBlobs([]blob.Ref{del}); err != nil {
			t.Fatalf("RemoveBlobs: %v", err)
		}
		// After each deletion, enumeration must show exactly the
		// remaining blobs.
		if err := storagetest.CheckEnumerate(sto, all); err != nil {
			t.Fatalf("After deleting %v, %v", del, err)
		}
	}

	// dRows counts the "d:" (deleted-blob) rows in the meta index.
	dRows := func() (n int) {
		if err := sorted.ForeachInRange(sto.meta, "d:", "", func(key, value string) error {
			if strings.HasPrefix(key, "d:") {
				n++
			}
			return nil
		}); err != nil {
			t.Fatalf("meta iteration error: %v", err)
		}
		return
	}

	if n := dRows(); n == 0 {
		t.Fatalf("expected a 'd:' row after deletes")
	}

	// TODO: test the background pack-deleter loop? figure out its design first.
	// With all contents deleted, removing the zip pack must now succeed
	// and clear the "d:" rows.
	if err := sto.deleteZipPack(zipBlob.Ref); err != nil {
		t.Errorf("error deleting zip %v: %v", zipBlob.Ref, err)
	}
	if n := dRows(); n != 0 {
		t.Errorf("expected the 'd:' row to be deleted")
	}
}