// Download the file, and update the IngestManifest while we're at it. func (fetcher *APTFetcher) downloadFile(ingestState *models.IngestState) error { downloader := network.NewS3Download( constants.AWSVirginia, ingestState.IngestManifest.S3Bucket, ingestState.IngestManifest.S3Key, ingestState.IngestManifest.Object.IngestTarFilePath, true, // calculate md5 checksum on the entire tar file false, // calculate sha256 checksum on the entire tar file ) // It's fairly common for very large bags to fail more than // once on transient network errors (e.g. "Connection reset by peer") // So we give this several tries. for i := 0; i < 10; i++ { downloader.Fetch() if downloader.ErrorMessage == "" { fetcher.Context.MessageLog.Info("Fetched %s/%s after %d attempts", ingestState.IngestManifest.S3Bucket, ingestState.IngestManifest.S3Key, i+1) break } } // Return now if we failed. if downloader.ErrorMessage != "" { return fmt.Errorf("Error fetching %s/%s: %v", ingestState.IngestManifest.S3Bucket, ingestState.IngestManifest.S3Key, downloader.ErrorMessage) } obj := ingestState.IngestManifest.Object obj.IngestSize = downloader.BytesCopied obj.IngestRemoteMd5 = *downloader.Response.ETag obj.IngestLocalMd5 = downloader.Md5Digest // The ETag for S3 object uploaded via single-part upload is // the file's md5 digest. For objects uploaded via multi-part // upload, the ETag is calculated differently and includes a // dash near the end, followed by the number of parts in the // multipart upload. We can't use that kind of ETag to verify // the md5 checksum that we calculated. obj.IngestMd5Verifiable = strings.Contains(downloader.Md5Digest, "-") if obj.IngestMd5Verifiable { obj.IngestMd5Verified = obj.IngestRemoteMd5 == obj.IngestLocalMd5 } // If we got a bad checksum, note the error in the WorkSummary. if obj.IngestMd5Verifiable && !obj.IngestMd5Verified { ingestState.IngestManifest.FetchResult.AddError("Our md5 '%s' does not match S3 md5 '%s'", obj.IngestLocalMd5, obj.IngestRemoteMd5) ingestState.IngestManifest.FetchResult.ErrorIsFatal = true } return nil }
func getS3DownloadObject(t *testing.T) *network.S3Download { tmpDir, err := ioutil.TempDir("", "s3_download_test") if err != nil { t.Errorf(err.Error()) return nil } tmpFilePath := filepath.Join(tmpDir, testFile) return network.NewS3Download( constants.AWSVirginia, testBucket, testFile, tmpFilePath, false, false, ) }
// ISSUE: See https://www.pivotaltracker.com/story/show/134540309 // TODO: Don't even try to solve the issue above without a thorough plan. func (packager *DPNPackager) fetchAllFiles(manifest *models.DPNIngestManifest) { downloader := apt_network.NewS3Download( constants.AWSVirginia, packager.Context.Config.PreservationBucket, "", // s3 key to fetch - to be set below "", // local path at which to save the s3 file - set below false, // no need to calculate md5 true) // calculate sha256 for fixity verification packager.Context.MessageLog.Info("Object %s has %d saved files", manifest.IntellectualObject.Identifier, len(manifest.IntellectualObject.GenericFiles)) downloaded := 0 for _, gf := range manifest.IntellectualObject.GenericFiles { downloader.Sha256Digest = "" downloader.ErrorMessage = "" // We're going to want to confirm the sha256 digest of the download... existingSha256 := gf.GetChecksumByAlgorithm(constants.AlgSha256) if existingSha256 == nil { manifest.PackageSummary.AddError("Cannot find sha256 digest for file %s", gf.Identifier) break } // Figure out what the key name is for this file. It's a UUID. s3KeyName, err := gf.PreservationStorageFileName() if err != nil { manifest.PackageSummary.AddError("File %s: %v", gf.Identifier, err) break } // Tell the downloader what we're downloading, and where to put it. // Any files outside the data directory are tag files, and per the DPN // spec, tag files from APTrust bags have to go into a dir called // aptrust-tags. (Actually, <anything>-tags, but we're going with // aptrust-tags.) See the DPN bagging spec here: // https://wiki.duraspace.org/display/DPNC/BagIt+Specification downloader.KeyName = s3KeyName targetPath := gf.OriginalPath() if !strings.HasPrefix(gf.OriginalPath(), "data/") { targetPath = filepath.Join("aptrust-tags", gf.OriginalPath()) } downloader.LocalPath = filepath.Join(manifest.LocalDir, targetPath) // Fetch is the expensive part, so we don't even want to get to this // point if we don't have the info above. packager.Context.MessageLog.Info("Downloading %s (%s) to %s", gf.Identifier, s3KeyName, downloader.LocalPath) downloader.Fetch() if downloader.ErrorMessage != "" { msg := fmt.Sprintf("Error fetching %s from S3: %s", gf.Identifier, downloader.ErrorMessage) packager.Context.MessageLog.Error(msg) manifest.PackageSummary.AddError(msg) break } if downloader.Sha256Digest != existingSha256.Digest { msg := fmt.Sprintf("sha256 digest mismatch for for file %s."+ "Our digest: %s. Digest of fetched file: %s", gf.Identifier, existingSha256, downloader.Sha256Digest) packager.Context.MessageLog.Error(msg) manifest.PackageSummary.AddError(msg) break } downloaded += 1 } totalFileCount := len(manifest.IntellectualObject.GenericFiles) if downloaded == totalFileCount { packager.Context.MessageLog.Info("Downloaded all %d files for %s", downloaded, manifest.IntellectualObject.Identifier) } else { packager.Context.MessageLog.Error("Downloaded only %d of %d files for %s", downloaded, totalFileCount, manifest.IntellectualObject.Identifier) } }