Example #1
0
// downloadFile fetches the tar file described by ingestState.IngestManifest
// (S3Bucket/S3Key) to Object.IngestTarFilePath, and updates the
// IngestManifest's Object with the size and md5 information of the download.
//
// It returns an error only when the download itself still fails after all
// retry attempts. An md5 mismatch is NOT returned as an error; it is recorded
// as a fatal error on IngestManifest.FetchResult and nil is returned.
func (fetcher *APTFetcher) downloadFile(ingestState *models.IngestState) error {
	// It's fairly common for very large bags to fail more than
	// once on transient network errors (e.g. "Connection reset by peer")
	// So we give this several tries.
	const maxAttempts = 10

	downloader := network.NewS3Download(
		constants.AWSVirginia,
		ingestState.IngestManifest.S3Bucket,
		ingestState.IngestManifest.S3Key,
		ingestState.IngestManifest.Object.IngestTarFilePath,
		true,  // calculate md5 checksum on the entire tar file
		false, // do not calculate sha256 checksum on the entire tar file
	)

	for attempt := 1; attempt <= maxAttempts; attempt++ {
		// Clear any error left over from a previous attempt, in case Fetch
		// does not reset it itself. Otherwise a successful retry would
		// still be treated as a failure below.
		downloader.ErrorMessage = ""
		downloader.Fetch()
		if downloader.ErrorMessage == "" {
			fetcher.Context.MessageLog.Info("Fetched %s/%s after %d attempts",
				ingestState.IngestManifest.S3Bucket,
				ingestState.IngestManifest.S3Key,
				attempt)
			break
		}
	}

	// Return now if we failed.
	if downloader.ErrorMessage != "" {
		return fmt.Errorf("Error fetching %s/%s: %v",
			ingestState.IngestManifest.S3Bucket,
			ingestState.IngestManifest.S3Key,
			downloader.ErrorMessage)
	}

	obj := ingestState.IngestManifest.Object
	obj.IngestSize = downloader.BytesCopied
	// ETag is a pointer; don't panic if the response came back without one.
	obj.IngestRemoteMd5 = ""
	if downloader.Response.ETag != nil {
		obj.IngestRemoteMd5 = *downloader.Response.ETag
	}
	obj.IngestLocalMd5 = downloader.Md5Digest

	// The ETag for S3 object uploaded via single-part upload is
	// the file's md5 digest. For objects uploaded via multi-part
	// upload, the ETag is calculated differently and includes a
	// dash near the end, followed by the number of parts in the
	// multipart upload. We can't use that kind of ETag to verify
	// the md5 checksum that we calculated.
	//
	// S3 wraps ETag values in double quotes, so strip them before
	// comparing. (Harmless if the downloader already removed them.)
	// The stored IngestRemoteMd5 is left exactly as S3 returned it.
	remoteMd5 := strings.Trim(obj.IngestRemoteMd5, `"`)
	obj.IngestMd5Verifiable = remoteMd5 != "" && !strings.Contains(remoteMd5, "-")
	if obj.IngestMd5Verifiable {
		obj.IngestMd5Verified = remoteMd5 == obj.IngestLocalMd5
	}

	// If we got a bad checksum, note the error in the WorkSummary.
	if obj.IngestMd5Verifiable && !obj.IngestMd5Verified {
		ingestState.IngestManifest.FetchResult.AddError("Our md5 '%s' does not match S3 md5 '%s'",
			obj.IngestLocalMd5, obj.IngestRemoteMd5)
		ingestState.IngestManifest.FetchResult.ErrorIsFatal = true
	}

	return nil
}
Example #2
0
// getS3DownloadObject returns an S3Download configured to fetch testFile
// from testBucket into a freshly created temp directory, with both md5 and
// sha256 calculation turned off.
//
// If the temp directory cannot be created, the error is reported on t and
// nil is returned, so callers must check the result before using it. The
// temp directory is not removed by this helper.
func getS3DownloadObject(t *testing.T) *network.S3Download {
	tmpDir, err := ioutil.TempDir("", "s3_download_test")
	if err != nil {
		// Use t.Error rather than t.Errorf(err.Error()): the error text is
		// not a format string, and any '%' in it would be misinterpreted.
		t.Error(err)
		return nil
	}
	return network.NewS3Download(
		constants.AWSVirginia,
		testBucket,
		testFile,
		filepath.Join(tmpDir, testFile), // local path for the downloaded file
		false,                           // no md5
		false,                           // no sha256
	)
}
Example #3
0
// ISSUE: See https://www.pivotaltracker.com/story/show/134540309
// TODO: Don't even try to solve the issue above without a thorough plan.
//
// fetchAllFiles downloads every GenericFile of manifest.IntellectualObject
// from the preservation bucket into manifest.LocalDir and checks each
// download's sha256 digest against the digest already recorded for the file.
//
// Processing stops at the first problem (missing sha256 checksum, unknown
// storage file name, download error, or digest mismatch). The problem is
// recorded in manifest.PackageSummary; nothing is returned. A final log line
// reports how many files were downloaded.
func (packager *DPNPackager) fetchAllFiles(manifest *models.DPNIngestManifest) {
	// A single downloader is reused for all files; its key, local path,
	// digest and error fields are reset or overwritten on each iteration.
	downloader := apt_network.NewS3Download(
		constants.AWSVirginia,
		packager.Context.Config.PreservationBucket,
		"",    // s3 key to fetch - to be set below
		"",    // local path at which to save the s3 file - set below
		false, // no need to calculate md5
		true)  // calculate sha256 for fixity verification
	packager.Context.MessageLog.Info("Object %s has %d saved files",
		manifest.IntellectualObject.Identifier,
		len(manifest.IntellectualObject.GenericFiles))
	downloaded := 0
	for _, gf := range manifest.IntellectualObject.GenericFiles {
		downloader.Sha256Digest = ""
		downloader.ErrorMessage = ""

		// We're going to want to confirm the sha256 digest of the download...
		existingSha256 := gf.GetChecksumByAlgorithm(constants.AlgSha256)
		if existingSha256 == nil {
			manifest.PackageSummary.AddError("Cannot find sha256 digest for file %s", gf.Identifier)
			break
		}
		// Figure out what the key name is for this file. It's a UUID.
		s3KeyName, err := gf.PreservationStorageFileName()
		if err != nil {
			manifest.PackageSummary.AddError("File %s: %v", gf.Identifier, err)
			break
		}

		// Tell the downloader what we're downloading, and where to put it.
		// Any files outside the data directory are tag files, and per the DPN
		// spec, tag files from APTrust bags have to go into a dir called
		// aptrust-tags. (Actually, <anything>-tags, but we're going with
		// aptrust-tags.) See the DPN bagging spec here:
		// https://wiki.duraspace.org/display/DPNC/BagIt+Specification
		downloader.KeyName = s3KeyName
		originalPath := gf.OriginalPath()
		targetPath := originalPath
		if !strings.HasPrefix(originalPath, "data/") {
			targetPath = filepath.Join("aptrust-tags", originalPath)
		}
		downloader.LocalPath = filepath.Join(manifest.LocalDir, targetPath)

		// Fetch is the expensive part, so we don't even want to get to this
		// point if we don't have the info above.
		packager.Context.MessageLog.Info("Downloading %s (%s) to %s", gf.Identifier,
			s3KeyName, downloader.LocalPath)
		downloader.Fetch()
		if downloader.ErrorMessage != "" {
			msg := fmt.Sprintf("Error fetching %s from S3: %s", gf.Identifier, downloader.ErrorMessage)
			// msg is already formatted; pass it as an argument, not as the
			// format string, so a '%' in an identifier or S3 error is safe.
			packager.Context.MessageLog.Error("%s", msg)
			manifest.PackageSummary.AddError("%s", msg)
			break
		}
		if downloader.Sha256Digest != existingSha256.Digest {
			// Report the recorded digest string, not the checksum object.
			msg := fmt.Sprintf("sha256 digest mismatch for file %s. "+
				"Our digest: %s. Digest of fetched file: %s",
				gf.Identifier, existingSha256.Digest, downloader.Sha256Digest)
			packager.Context.MessageLog.Error("%s", msg)
			manifest.PackageSummary.AddError("%s", msg)
			break
		}
		downloaded++
	}
	totalFileCount := len(manifest.IntellectualObject.GenericFiles)
	if downloaded == totalFileCount {
		packager.Context.MessageLog.Info("Downloaded all %d files for %s",
			downloaded, manifest.IntellectualObject.Identifier)
	} else {
		packager.Context.MessageLog.Error("Downloaded only %d of %d files for %s",
			downloaded, totalFileCount, manifest.IntellectualObject.Identifier)
	}
}