Example #1
0
// GenericFileSave saves a Generic File record to Pharos. If the Generic
// File's ID is zero, this performs a POST to create a new record.
// For non-zero IDs, this performs a PUT to update the existing record.
// Either way, the record must have an IntellectualObject ID. The response
// object will have a new copy of the GenericFile if the save was successful.
func (client *PharosClient) GenericFileSave(obj *models.GenericFile) *PharosResponse {
	// Set up the response object
	resp := NewPharosResponse(PharosGenericFile)
	resp.files = make([]*models.GenericFile, 1)

	// URL and method
	relativeUrl := fmt.Sprintf("/api/%s/files/", client.apiVersion)
	httpMethod := "POST"
	if obj.Id > 0 {
		// PUT URL looks like /api/v2/files/college.edu%2Fobject_name%2Ffile.xml
		relativeUrl = fmt.Sprintf("%s%s", relativeUrl, escapeSlashes(obj.Identifier))
		httpMethod = "PUT"
	}
	absoluteUrl := client.BuildUrl(relativeUrl)

	// Prepare the JSON data. If serialization fails, return immediately:
	// there is no point in issuing an HTTP request with a nil payload,
	// and DoRequest could otherwise overwrite resp.Error.
	postData, err := obj.SerializeForPharos()
	if err != nil {
		resp.Error = err
		return resp
	}

	// Run the request
	client.DoRequest(resp, httpMethod, absoluteUrl, bytes.NewBuffer(postData))
	if resp.Error != nil {
		return resp
	}

	// Parse the JSON from the response body into a fresh GenericFile,
	// which becomes the single item in resp.files on success.
	gf := &models.GenericFile{}
	resp.Error = json.Unmarshal(resp.data, gf)
	if resp.Error == nil {
		resp.files[0] = gf
	}
	return resp
}
Example #2
0
// saveWithChecksums saves a data file from the tar archive to disk,
// recording md5/sha256 digests and other ingest data on gf, which we'll
// need to construct the GenericFile object in Fedora later. On failure,
// the error is recorded in gf.IngestErrorMessage and the function returns.
func (reader *Reader) saveWithChecksums(gf *models.GenericFile) {
	// Set up a MultiWriter to stream data ONCE to file,
	// md5 and sha256. We don't want to process the stream
	// three separate times.
	err := os.MkdirAll(filepath.Dir(gf.IngestLocalPath), 0755)
	if err != nil {
		gf.IngestErrorMessage = err.Error()
		return
	}
	outputWriter, err := os.OpenFile(gf.IngestLocalPath, os.O_CREATE|os.O_WRONLY, 0644)
	if outputWriter != nil {
		defer outputWriter.Close()
	}
	if err != nil {
		gf.IngestErrorMessage = fmt.Sprintf("Error opening %s for writing: %v", gf.IngestLocalPath, err)
		return
	}
	md5Hash := md5.New()
	shaHash := sha256.New()
	multiWriter := io.MultiWriter(md5Hash, shaHash, outputWriter)
	// A short or failed copy would leave a truncated file whose checksums
	// still look valid, so we must record the error and bail out here.
	if _, err = io.Copy(multiWriter, reader.tarReader); err != nil {
		gf.IngestErrorMessage = fmt.Sprintf("Error copying data to %s: %v", gf.IngestLocalPath, err)
		return
	}
	gf.IngestMd5 = fmt.Sprintf("%x", md5Hash.Sum(nil))
	gf.IngestSha256 = fmt.Sprintf("%x", shaHash.Sum(nil))
	gf.IngestSha256GeneratedAt = time.Now().UTC()
	gf.FileFormat, _ = platform.GuessMimeType(gf.IngestLocalPath) // on err, defaults to application/binary
}
Example #3
0
// initUploader initializes an S3Upload object with connection data and
// metadata for this specific GenericFile. sendWhere must be "s3" or
// "glacier"; for any other value, a fatal error is added to the
// StoreResult and nil is returned.
func (storer *APTStorer) initUploader(ingestState *models.IngestState, gf *models.GenericFile, sendWhere string) *network.S3Upload {
	var region string
	var bucket string
	if sendWhere == "s3" {
		region = storer.Context.Config.APTrustS3Region
		bucket = storer.Context.Config.PreservationBucket
	} else if sendWhere == "glacier" {
		region = storer.Context.Config.APTrustGlacierRegion
		bucket = storer.Context.Config.ReplicationBucket
	} else {
		// The format string has three %s verbs, so sendWhere must be
		// supplied twice (it fills both the destination and the
		// "doesn't know where" slots).
		ingestState.IngestManifest.StoreResult.AddError("Cannot save %s to %s because "+
			"storer doesn't know where %s is", gf.Identifier, sendWhere, sendWhere)
		ingestState.IngestManifest.StoreResult.ErrorIsFatal = true
		return nil
	}
	uploader := network.NewS3Upload(
		region,
		bucket,
		gf.IngestUUID,
		gf.FileFormat,
	)
	// Metadata travels with the object in long-term storage so the file
	// can be traced back to its institution, bag, and fixity values.
	uploader.AddMetadata("institution", ingestState.IngestManifest.Object.Institution)
	uploader.AddMetadata("bag", ingestState.IngestManifest.Object.Identifier)
	uploader.AddMetadata("bagpath", gf.OriginalPath())
	uploader.AddMetadata("md5", gf.IngestMd5)
	uploader.AddMetadata("sha256", gf.IngestSha256)
	return uploader
}
// TestInstitutionIdentifier verifies that InstitutionIdentifier extracts
// the institution domain from the front of a GenericFile identifier.
func TestInstitutionIdentifier(t *testing.T) {
	genericFile := models.GenericFile{}
	genericFile.Identifier = "uc.edu/cin.675812/data/object.properties"
	instId, err := genericFile.InstitutionIdentifier()
	if err != nil {
		// t.Error, not t.Errorf(err.Error()): passing a non-constant
		// format string to a printf-style func trips `go vet` and would
		// mangle any '%' characters in the error text.
		t.Error(err)
		return
	}
	assert.Equal(t, "uc.edu", instId)
}
Example #5
0
// markFileAsStored records on gf that a copy was saved to long-term
// storage. For "s3" it sets the storage timestamp/URLs and refreshes the
// matching identifier-assignment PREMIS event; for "glacier" it sets the
// replication timestamp/URL and refreshes the first replication event.
// Any other sendWhere value is silently ignored.
func (storer *APTStorer) markFileAsStored(gf *models.GenericFile, sendWhere, storageUrl string) {
	if sendWhere == "s3" {
		gf.IngestStoredAt = time.Now().UTC()
		gf.IngestStorageURL = storageUrl
		gf.URI = storageUrl
		// Find the identifier-assignment event whose OutcomeDetail is a
		// URL (as opposed to, e.g., a bare identifier) and stamp it.
		events := gf.FindEventsByType(constants.EventIdentifierAssignment)
		var event *models.PremisEvent
		for i := range events {
			existingEvent := events[i]
			if strings.HasPrefix(existingEvent.OutcomeDetail, "http://") ||
				strings.HasPrefix(existingEvent.OutcomeDetail, "https://") {
				event = existingEvent
				break
			}
		}
		if event != nil {
			event.DateTime = time.Now().UTC()
		}
	} else if sendWhere == "glacier" {
		gf.IngestReplicatedAt = time.Now().UTC()
		gf.IngestReplicationURL = storageUrl
		events := gf.FindEventsByType(constants.EventReplication)
		// len() of a nil slice is zero, so the explicit nil check was
		// redundant (staticcheck S1009).
		if len(events) > 0 {
			events[0].DateTime = time.Now().UTC()
		}
	}
}
Example #6
0
// getReadCloser returns a reader that can read the file from within the
// tar archive, along with the iterator that owns it. The S3 uploader uses
// this reader to stream data to S3 and Glacier. On error, it records the
// error in the StoreResult and returns (nil, nil); the caller is
// responsible for closing both returned values on success.
func (storer *APTStorer) getReadCloser(ingestState *models.IngestState, gf *models.GenericFile) (*fileutil.TarFileIterator, io.ReadCloser) {
	tarFilePath := ingestState.IngestManifest.Object.IngestTarFilePath
	tfi, err := fileutil.NewTarFileIterator(tarFilePath)
	if err != nil {
		msg := fmt.Sprintf("Can't get TarFileIterator for %s: %v", tarFilePath, err)
		ingestState.IngestManifest.StoreResult.AddError(msg)
		return nil, nil
	}
	origPathWithBagName, err := gf.OriginalPathWithBagName()
	if err != nil {
		ingestState.IngestManifest.StoreResult.AddError(err.Error())
		// Close the iterator we just opened, since the caller only gets
		// nil back and can't close it for us.
		tfi.Close()
		return nil, nil
	}
	readCloser, err := tfi.Find(origPathWithBagName)
	if err != nil {
		msg := fmt.Sprintf("Can't get reader for %s: %v", gf.Identifier, err)
		ingestState.IngestManifest.StoreResult.AddError(msg)
		if readCloser != nil {
			readCloser.Close()
		}
		// Same as above: don't leak the open tar file handle.
		tfi.Close()
		return nil, nil
	}
	return tfi, readCloser
}
// TestPreservationStorageFileName checks that the file name cannot be
// derived from an empty URI, and that for a well-formed preservation URL
// it is the trailing UUID component.
func TestPreservationStorageFileName(t *testing.T) {
	genericFile := models.GenericFile{}

	// With no URI, there is nothing to derive a file name from.
	genericFile.URI = ""
	if _, err := genericFile.PreservationStorageFileName(); err == nil {
		t.Errorf("PreservationStorageFileName() should have returned an error")
	}

	// With a valid preservation URL, we expect the final path segment.
	genericFile.URI = "https://s3.amazonaws.com/aptrust.test.preservation/a58a7c00-392f-11e4-916c-0800200c9a66"
	fileName, err := genericFile.PreservationStorageFileName()
	if err != nil {
		t.Errorf("PreservationStorageFileName() returned an error: %v", err)
		return
	}
	assert.Equal(t, "a58a7c00-392f-11e4-916c-0800200c9a66", fileName)
}
// TestOriginalPathWithBagName verifies that OriginalPathWithBagName strips
// the institution prefix but keeps the bag name, for tag files at the top
// level, payload files, and nested custom tag files.
func TestOriginalPathWithBagName(t *testing.T) {
	genericFile := models.GenericFile{}
	genericFile.IntellectualObjectIdentifier = "uc.edu/cin.675812"

	cases := []struct {
		identifier string // full GenericFile identifier
		expected   string // bag-relative path we expect back
	}{
		// Top-level custom tag file
		{"uc.edu/cin.675812/tagmanifest-sha256.txt", "cin.675812/tagmanifest-sha256.txt"},
		// Payload file
		{"uc.edu/cin.675812/data/object.properties", "cin.675812/data/object.properties"},
		// Nested custom tag file
		{"uc.edu/cin.675812/custom/tag/dir/special_info.xml", "cin.675812/custom/tag/dir/special_info.xml"},
	}
	for _, tc := range cases {
		genericFile.Identifier = tc.identifier
		origPath, err := genericFile.OriginalPathWithBagName()
		require.Nil(t, err)
		assert.Equal(t, tc.expected, origPath)
	}
}
Example #9
0
// saveFile stores a single GenericFile in long-term storage. If a prior
// version of the file exists in Pharos, the file keeps the existing
// record's ID and UUID (so we overwrite the stored object rather than
// orphaning it), and the save is skipped entirely when the sha256 digest
// is unchanged. Errors are recorded in the IngestManifest's StoreResult.
func (storer *APTStorer) saveFile(ingestState *models.IngestState, gf *models.GenericFile) {
	existingSha256, err := storer.getExistingSha256(gf.Identifier)
	if err != nil {
		storer.Context.MessageLog.Error(err.Error())
		ingestState.IngestManifest.StoreResult.AddError(err.Error())
		return
	}
	// Set this, for the record.
	if existingSha256 != nil {
		gf.IngestPreviousVersionExists = true
		gf.Id = existingSha256.GenericFileId

		uuid, err := storer.getUuidOfExistingFile(gf.Identifier)
		if err != nil {
			message := fmt.Sprintf("Cannot find existing UUID for %s: %v", gf.Identifier, err.Error())
			ingestState.IngestManifest.StoreResult.AddError(message)
			storer.Context.MessageLog.Error(message)
			// Probably not fatal, but treat it as such for now,
			// because we don't want leave orphan objects in S3,
			// or have the GenericFile.URL not match the actual
			// storage URL. This should only happen if a depositor
			// deletes the existing version of a GenericFile while
			// we are processing this ingest. The window for that
			// to happen is usually between a few seconds and a few
			// hours.
			ingestState.IngestManifest.StoreResult.ErrorIsFatal = true
			return
		}
		if uuid == "" {
			message := fmt.Sprintf("Cannot find existing UUID for %s.", gf.Identifier)
			ingestState.IngestManifest.StoreResult.AddError(message)
			storer.Context.MessageLog.Error(message)
			// Probably not fatal, but treat it as such for now.
			// Same note as in previous if statement above.
			ingestState.IngestManifest.StoreResult.ErrorIsFatal = true
			return
		} else {
			// OK. Set the GenericFile's UUID to match the existing file's
			// UUID, so that we overwrite the existing file, and so the
			// GenericFile record in Pharos still has the correct URL.
			message := fmt.Sprintf("Resetting UUID for '%s' to '%s' so we can overwrite "+
				"the currently stored version of the file.",
				gf.Identifier, uuid)
			storer.Context.MessageLog.Info(message)
			// TODO: Test this in integration post test.
			gf.IngestUUID = uuid
		}

		// Skip the save only when the digests MATCH. The original code
		// tested `!=` here, which skipped changed files and re-uploaded
		// unchanged ones — the inverse of the logged message.
		if existingSha256.Digest == gf.IngestSha256 {
			storer.Context.MessageLog.Info(
				"GenericFile %s has same sha256. Does not need save.", gf.Identifier)
			gf.IngestNeedsSave = false
		}
	}
	// Now copy to storage only if the file has changed.
	if gf.IngestNeedsSave {
		storer.Context.MessageLog.Info("File %s needs save", gf.Identifier)
		if gf.IngestStoredAt.IsZero() || gf.IngestStorageURL == "" {
			storer.copyToLongTermStorage(ingestState, gf, "s3")
		}
		if gf.IngestReplicatedAt.IsZero() || gf.IngestReplicationURL == "" {
			storer.copyToLongTermStorage(ingestState, gf, "glacier")
		}
	}
}