// GenericFileSave saves a Generic File record to Pharos. If the Generic
// File's ID is zero, this performs a POST to create a new record.
// For non-zero IDs, this performs a PUT to update the existing record.
// Either way, the record must have an IntellectualObject ID. The response
// object will have a new copy of the GenericFile if the save was successful.
func (client *PharosClient) GenericFileSave(obj *models.GenericFile) *PharosResponse {
	// Set up the response object
	resp := NewPharosResponse(PharosGenericFile)
	resp.files = make([]*models.GenericFile, 1)

	// URL and method
	relativeUrl := fmt.Sprintf("/api/%s/files/", client.apiVersion)
	httpMethod := "POST"
	if obj.Id > 0 {
		// PUT URL looks like /api/v2/files/college.edu%2Fobject_name%2Ffile.xml
		relativeUrl = fmt.Sprintf("%s%s", relativeUrl, escapeSlashes(obj.Identifier))
		httpMethod = "PUT"
	}
	absoluteUrl := client.BuildUrl(relativeUrl)

	// Prepare the JSON data. If serialization fails, don't send the request.
	postData, err := obj.SerializeForPharos()
	if err != nil {
		resp.Error = err
		return resp
	}

	// Run the request
	client.DoRequest(resp, httpMethod, absoluteUrl, bytes.NewBuffer(postData))
	if resp.Error != nil {
		return resp
	}

	// Parse the JSON from the response body
	gf := &models.GenericFile{}
	resp.Error = json.Unmarshal(resp.data, gf)
	if resp.Error == nil {
		resp.files[0] = gf
	}
	return resp
}
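// A minimal usage sketch for GenericFileSave, assuming a PharosClient has
// already been constructed and configured elsewhere (the constructor and any
// accessor for reading the saved record back are not shown in this excerpt
// and may differ in the real client). The point is simply the create-vs-update
// behavior driven by obj.Id: a zero Id produces a POST, a non-zero Id a PUT.
func exampleGenericFileSave(client *PharosClient, gf *models.GenericFile) error {
	resp := client.GenericFileSave(gf)
	if resp.Error != nil {
		return fmt.Errorf("save failed for %s: %v", gf.Identifier, resp.Error)
	}
	return nil
}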
// saveWithChecksums saves a data file from the tar archive to disk,
// streaming it through md5 and sha256 digests along the way, and records
// on the GenericFile the checksums and mime type we'll need to construct
// the GenericFile object in Fedora later.
func (reader *Reader) saveWithChecksums(gf *models.GenericFile) {
	// Set up a MultiWriter to stream data ONCE to file,
	// md5 and sha256. We don't want to process the stream
	// three separate times.
	err := os.MkdirAll(filepath.Dir(gf.IngestLocalPath), 0755)
	if err != nil {
		gf.IngestErrorMessage = err.Error()
		return
	}
	outputWriter, err := os.OpenFile(gf.IngestLocalPath, os.O_CREATE|os.O_WRONLY, 0644)
	if outputWriter != nil {
		defer outputWriter.Close()
	}
	if err != nil {
		gf.IngestErrorMessage = fmt.Sprintf("Error opening %s for writing: %v",
			gf.IngestLocalPath, err)
		return
	}
	md5Hash := md5.New()
	shaHash := sha256.New()
	multiWriter := io.MultiWriter(md5Hash, shaHash, outputWriter)
	_, err = io.Copy(multiWriter, reader.tarReader)
	if err != nil {
		gf.IngestErrorMessage = fmt.Sprintf("Error copying %s from tar archive: %v",
			gf.IngestLocalPath, err)
		return
	}
	gf.IngestMd5 = fmt.Sprintf("%x", md5Hash.Sum(nil))
	gf.IngestSha256 = fmt.Sprintf("%x", shaHash.Sum(nil))
	gf.IngestSha256GeneratedAt = time.Now().UTC()
	gf.FileFormat, _ = platform.GuessMimeType(gf.IngestLocalPath) // on err, defaults to application/binary
}
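// A self-contained sketch of the single-pass checksum pattern used above:
// io.MultiWriter fans one read of the source out to the destination file and
// both hash writers, so large files are never read more than once. Only
// standard library packages (crypto/md5, crypto/sha256, io, os, fmt) are
// involved; the function name and paths are illustrative, not part of the
// codebase.
func copyWithChecksums(srcPath, dstPath string) (md5sum, sha256sum string, err error) {
	src, err := os.Open(srcPath)
	if err != nil {
		return "", "", err
	}
	defer src.Close()
	dst, err := os.Create(dstPath)
	if err != nil {
		return "", "", err
	}
	defer dst.Close()
	md5Hash := md5.New()
	shaHash := sha256.New()
	// One pass over src writes to dst and both digests simultaneously.
	if _, err := io.Copy(io.MultiWriter(dst, md5Hash, shaHash), src); err != nil {
		return "", "", err
	}
	return fmt.Sprintf("%x", md5Hash.Sum(nil)), fmt.Sprintf("%x", shaHash.Sum(nil)), nil
}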
// initUploader initializes the uploader object with connection data and
// metadata for this specific GenericFile.
func (storer *APTStorer) initUploader(ingestState *models.IngestState, gf *models.GenericFile, sendWhere string) *network.S3Upload {
	var region string
	var bucket string
	if sendWhere == "s3" {
		region = storer.Context.Config.APTrustS3Region
		bucket = storer.Context.Config.PreservationBucket
	} else if sendWhere == "glacier" {
		region = storer.Context.Config.APTrustGlacierRegion
		bucket = storer.Context.Config.ReplicationBucket
	} else {
		ingestState.IngestManifest.StoreResult.AddError("Cannot save %s to %s because "+
			"storer doesn't know where %s is", gf.Identifier, sendWhere, sendWhere)
		ingestState.IngestManifest.StoreResult.ErrorIsFatal = true
		return nil
	}
	uploader := network.NewS3Upload(
		region,
		bucket,
		gf.IngestUUID,
		gf.FileFormat,
	)
	uploader.AddMetadata("institution", ingestState.IngestManifest.Object.Institution)
	uploader.AddMetadata("bag", ingestState.IngestManifest.Object.Identifier)
	uploader.AddMetadata("bagpath", gf.OriginalPath())
	uploader.AddMetadata("md5", gf.IngestMd5)
	uploader.AddMetadata("sha256", gf.IngestSha256)
	return uploader
}
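// A usage sketch for initUploader: the caller selects the storage target by
// name ("s3" or "glacier") and must treat a nil return as a fatal error that
// has already been recorded on StoreResult. The step that actually streams
// bytes to the uploader is elided, since the exact method on network.S3Upload
// isn't shown in this excerpt; the example method name is hypothetical.
func (storer *APTStorer) exampleInitUploader(ingestState *models.IngestState, gf *models.GenericFile) {
	uploader := storer.initUploader(ingestState, gf, "s3")
	if uploader == nil {
		// initUploader has already added a fatal error to StoreResult.
		return
	}
	// ... hand the file's read closer to the uploader here ...
}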
func TestInstitutionIdentifier(t *testing.T) {
	genericFile := models.GenericFile{}
	genericFile.Identifier = "uc.edu/cin.675812/data/object.properties"
	instId, err := genericFile.InstitutionIdentifier()
	if err != nil {
		t.Error(err)
		return
	}
	assert.Equal(t, "uc.edu", instId)
}
func (storer *APTStorer) markFileAsStored(gf *models.GenericFile, sendWhere, storageUrl string) {
	if sendWhere == "s3" {
		gf.IngestStoredAt = time.Now().UTC()
		gf.IngestStorageURL = storageUrl
		gf.URI = storageUrl
		events := gf.FindEventsByType(constants.EventIdentifierAssignment)
		var event *models.PremisEvent
		for i := range events {
			existingEvent := events[i]
			if strings.HasPrefix(existingEvent.OutcomeDetail, "http://") ||
				strings.HasPrefix(existingEvent.OutcomeDetail, "https://") {
				event = existingEvent
				break
			}
		}
		if event != nil {
			event.DateTime = time.Now().UTC()
		}
	} else if sendWhere == "glacier" {
		gf.IngestReplicatedAt = time.Now().UTC()
		gf.IngestReplicationURL = storageUrl
		events := gf.FindEventsByType(constants.EventReplication)
		if len(events) > 0 {
			events[0].DateTime = time.Now().UTC()
		}
	}
}
// Returns a reader that can read the file from within the tar archive.
// The S3 uploader uses this reader to stream data to S3 and Glacier.
func (storer *APTStorer) getReadCloser(ingestState *models.IngestState, gf *models.GenericFile) (*fileutil.TarFileIterator, io.ReadCloser) {
	tarFilePath := ingestState.IngestManifest.Object.IngestTarFilePath
	tfi, err := fileutil.NewTarFileIterator(tarFilePath)
	if err != nil {
		msg := fmt.Sprintf("Can't get TarFileIterator for %s: %v", tarFilePath, err)
		ingestState.IngestManifest.StoreResult.AddError(msg)
		return nil, nil
	}
	origPathWithBagName, err := gf.OriginalPathWithBagName()
	if err != nil {
		ingestState.IngestManifest.StoreResult.AddError(err.Error())
		return nil, nil
	}
	readCloser, err := tfi.Find(origPathWithBagName)
	if err != nil {
		msg := fmt.Sprintf("Can't get reader for %s: %v", gf.Identifier, err)
		ingestState.IngestManifest.StoreResult.AddError(msg)
		if readCloser != nil {
			readCloser.Close()
		}
		return nil, nil
	}
	return tfi, readCloser
}
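// For context, a minimal standard-library sketch of "find a single entry
// inside a tar archive and return a reader for it," which is the operation
// tfi.Find performs above. The real fileutil.TarFileIterator isn't shown in
// this excerpt and is certainly more capable; this only illustrates the
// underlying archive/tar mechanics. Note the caller must keep the *os.File
// open while reading the entry, and the function name here is illustrative.
func findInTar(tarFilePath, entryName string) (*os.File, io.Reader, error) {
	f, err := os.Open(tarFilePath)
	if err != nil {
		return nil, nil, err
	}
	tr := tar.NewReader(f)
	for {
		hdr, err := tr.Next()
		if err == io.EOF {
			f.Close()
			return nil, nil, fmt.Errorf("%s not found in %s", entryName, tarFilePath)
		}
		if err != nil {
			f.Close()
			return nil, nil, err
		}
		if hdr.Name == entryName {
			// tr now reads exactly this entry's contents.
			return f, tr, nil
		}
	}
}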
func TestPreservationStorageFileName(t *testing.T) {
	genericFile := models.GenericFile{}
	genericFile.URI = ""
	fileName, err := genericFile.PreservationStorageFileName()
	if err == nil {
		t.Errorf("PreservationStorageFileName() should have returned an error")
	}
	genericFile.URI = "https://s3.amazonaws.com/aptrust.test.preservation/a58a7c00-392f-11e4-916c-0800200c9a66"
	fileName, err = genericFile.PreservationStorageFileName()
	if err != nil {
		t.Errorf("PreservationStorageFileName() returned an error: %v", err)
		return
	}
	assert.Equal(t, "a58a7c00-392f-11e4-916c-0800200c9a66", fileName)
}
func TestOriginalPathWithBagName(t *testing.T) {
	genericFile := models.GenericFile{}
	genericFile.IntellectualObjectIdentifier = "uc.edu/cin.675812"

	// Top-level custom tag file
	genericFile.Identifier = "uc.edu/cin.675812/tagmanifest-sha256.txt"
	origPath, err := genericFile.OriginalPathWithBagName()
	require.Nil(t, err)
	assert.Equal(t, "cin.675812/tagmanifest-sha256.txt", origPath)

	// Payload file
	genericFile.Identifier = "uc.edu/cin.675812/data/object.properties"
	origPath, err = genericFile.OriginalPathWithBagName()
	require.Nil(t, err)
	assert.Equal(t, "cin.675812/data/object.properties", origPath)

	// Nested custom tag file
	genericFile.Identifier = "uc.edu/cin.675812/custom/tag/dir/special_info.xml"
	origPath, err = genericFile.OriginalPathWithBagName()
	require.Nil(t, err)
	assert.Equal(t, "cin.675812/custom/tag/dir/special_info.xml", origPath)
}
func (storer *APTStorer) saveFile(ingestState *models.IngestState, gf *models.GenericFile) {
	existingSha256, err := storer.getExistingSha256(gf.Identifier)
	if err != nil {
		storer.Context.MessageLog.Error(err.Error())
		ingestState.IngestManifest.StoreResult.AddError(err.Error())
		return
	}
	// Set this, for the record.
	if existingSha256 != nil {
		gf.IngestPreviousVersionExists = true
		gf.Id = existingSha256.GenericFileId
		uuid, err := storer.getUuidOfExistingFile(gf.Identifier)
		if err != nil {
			message := fmt.Sprintf("Cannot find existing UUID for %s: %v", gf.Identifier, err.Error())
			ingestState.IngestManifest.StoreResult.AddError(message)
			storer.Context.MessageLog.Error(message)
			// Probably not fatal, but treat it as such for now,
			// because we don't want to leave orphan objects in S3,
			// or have the GenericFile.URL not match the actual
			// storage URL. This should only happen if a depositor
			// deletes the existing version of a GenericFile while
			// we are processing this ingest. The window for that
			// to happen is usually between a few seconds and a few
			// hours.
			ingestState.IngestManifest.StoreResult.ErrorIsFatal = true
			return
		}
		if uuid == "" {
			message := fmt.Sprintf("Cannot find existing UUID for %s.", gf.Identifier)
			ingestState.IngestManifest.StoreResult.AddError(message)
			storer.Context.MessageLog.Error(message)
			// Probably not fatal, but treat it as such for now.
			// Same note as in previous if statement above.
			ingestState.IngestManifest.StoreResult.ErrorIsFatal = true
			return
		} else {
			// OK. Set the GenericFile's UUID to match the existing file's
			// UUID, so that we overwrite the existing file, and so the
			// GenericFile record in Pharos still has the correct URL.
			message := fmt.Sprintf("Resetting UUID for '%s' to '%s' so we can overwrite "+
				"the currently stored version of the file.", gf.Identifier, uuid)
			storer.Context.MessageLog.Info(message)
			// TODO: Test this in integration post test.
			gf.IngestUUID = uuid
		}
		if existingSha256.Digest == gf.IngestSha256 {
			storer.Context.MessageLog.Info(
				"GenericFile %s has same sha256. Does not need save.", gf.Identifier)
			gf.IngestNeedsSave = false
		}
	}
	// Now copy to storage only if the file has changed.
	if gf.IngestNeedsSave {
		storer.Context.MessageLog.Info("File %s needs save", gf.Identifier)
		if gf.IngestStoredAt.IsZero() || gf.IngestStorageURL == "" {
			storer.copyToLongTermStorage(ingestState, gf, "s3")
		}
		if gf.IngestReplicatedAt.IsZero() || gf.IngestReplicationURL == "" {
			storer.copyToLongTermStorage(ingestState, gf, "glacier")
		}
	}
}