func TestValidate_BadFileNames(t *testing.T) { bagValidationConfig, err := getValidationConfig() if err != nil { assert.Fail(t, "Could not load BagValidationConfig: %s", err.Error()) } optionalFileSpec := validation.FileSpec{Presence: "OPTIONAL"} bagValidationConfig.FileSpecs["tagmanifest-md5.txt"] = optionalFileSpec _, filename, _, _ := runtime.Caller(0) dir := filepath.Dir(filename) pathToBag, err := filepath.Abs(path.Join(dir, "..", "testdata", "unit_test_bags", "example.edu.sample_bad_file_names.tar")) validator, err := validation.NewBagValidator(pathToBag, bagValidationConfig) if err != nil { assert.Fail(t, "NewBagValidator returned unexpected error: %s", err.Error()) } result := validator.Validate() assert.NotNil(t, result.IntellectualObject) assert.Equal(t, 9, len(result.IntellectualObject.GenericFiles)) assert.True(t, strings.Contains(result.IntellectualObject.IngestErrorMessage, "Filename 'data/-starts-with-dash'")) assert.True(t, strings.Contains(result.IntellectualObject.IngestErrorMessage, "Filename 'data/contains#hash'")) assert.True(t, strings.Contains(result.IntellectualObject.IngestErrorMessage, "Filename 'data/contains*star'")) assert.True(t, strings.Contains(result.IntellectualObject.IngestErrorMessage, "Filename 'data/contains+plus'")) assert.False(t, result.ParseSummary.HasErrors()) assert.NotNil(t, result.ValidationSummary) require.True(t, result.ValidationSummary.HasErrors()) }
func (packager *DPNPackager) validate() { for manifest := range packager.ValidationChannel { packager.Context.MessageLog.Info("Validating %s", manifest.LocalTarFile) manifest.NsqMessage.Touch() manifest.ValidateSummary.Attempted = true manifest.ValidateSummary.AttemptNumber += 1 manifest.ValidateSummary.Start() var validationResult *validation.ValidationResult validator, err := validation.NewBagValidator(manifest.LocalTarFile, packager.BagValidationConfig) if err != nil { manifest.PackageSummary.AddError(err.Error()) } else { // Validation can take a long time for large bags. validationResult = validator.Validate() } if validationResult == nil { // This should be impossible manifest.PackageSummary.AddError("Bag validator returned nil result!") } else if validationResult.ParseSummary.HasErrors() || validationResult.ValidationSummary.HasErrors() { for _, errMsg := range validationResult.ParseSummary.Errors { manifest.PackageSummary.AddError("Validator parse error: %s", errMsg) } for _, errMsg := range validationResult.ValidationSummary.Errors { manifest.PackageSummary.AddError("Validation error: %s", errMsg) } } manifest.ValidateSummary.Finish() manifest.NsqMessage.Touch() packager.PostProcessChannel <- manifest } }
func TestNewBagValidator_BadConfig(t *testing.T) { _, filename, _, _ := runtime.Caller(0) dir := filepath.Dir(filename) pathToBag, err := filepath.Abs(path.Join(dir, "..", "testdata", "unit_test_bags", "example.edu.tagsample_good.tar")) if err != nil { assert.Fail(t, "Can't figure out Abs path: %s", err.Error()) } bagValidationConfig, err := getValidationConfig() if err != nil { assert.Fail(t, "Could not load BagValidationConfig: %v", err) } badPathSpec := validation.TagSpec{ FilePath: "", Presence: "REQUIRED", EmptyOK: true, } badPresenceSpec := validation.TagSpec{ FilePath: "orangina", Presence: "orangina", EmptyOK: true, } bagValidationConfig.TagSpecs["bad_path_spec"] = badPathSpec bagValidationConfig.TagSpecs["bad_presence"] = badPresenceSpec _, err = validation.NewBagValidator(pathToBag, bagValidationConfig) require.NotNil(t, err) assert.True(t, strings.Contains(err.Error(), "TagSpec for file ''")) assert.True(t, strings.Contains(err.Error(), "TagSpec for file 'orangina'")) }
/*************************************************************************** Step 1 of 2: Validate the bag. ***************************************************************************/ func (validator *DPNValidator) validate() { for manifest := range validator.ValidationChannel { // Don't time us out, NSQ! manifest.NsqMessage.Touch() // Tell Pharos that we've started to validate item. note := "Validating bag" manifest.DPNWorkItem.Note = ¬e SaveDPNWorkItemState(validator.Context, manifest, manifest.ValidateSummary) // Set up a new validator to check this bag. bagValidator, err := validation.NewBagValidator(manifest.LocalPath, validator.BagValidationConfig) if err != nil { // Could not create a BagValidator. Should this be fatal? manifest.ValidateSummary.AddError(err.Error()) } else { // Validation can take hours for very large bags. validationResult := bagValidator.Validate() // The validator creates its own WorkSummary, complete with // Start/Finish timestamps, error messages and everything. // Just copy that into our IngestManifest. manifest.ValidateSummary = validationResult.ValidationSummary } manifest.NsqMessage.Touch() validator.PostProcessChannel <- manifest } }
func TestValidate_NoDataDir(t *testing.T) { bagValidationConfig, err := getValidationConfig() if err != nil { assert.Fail(t, "Could not load BagValidationConfig: %s", err.Error()) } optionalFileSpec := validation.FileSpec{Presence: "OPTIONAL"} bagValidationConfig.FileSpecs["tagmanifest-md5.txt"] = optionalFileSpec _, filename, _, _ := runtime.Caller(0) dir := filepath.Dir(filename) pathToBag, err := filepath.Abs(path.Join(dir, "..", "testdata", "unit_test_bags", "example.edu.sample_no_data_dir.tar")) validator, err := validation.NewBagValidator(pathToBag, bagValidationConfig) if err != nil { assert.Fail(t, "NewBagValidator returned unexpected error: %s", err.Error()) } result := validator.Validate() assert.NotNil(t, result.IntellectualObject) assert.Equal(t, 5, len(result.IntellectualObject.GenericFiles)) assert.True(t, strings.Contains(result.IntellectualObject.IngestErrorMessage, "File 'data/datastream-DC' in manifest 'manifest-md5.txt' is missing")) assert.True(t, strings.Contains(result.IntellectualObject.IngestErrorMessage, "File 'data/datastream-descMetadata' in manifest 'manifest-md5.txt' is missing")) assert.True(t, strings.Contains(result.IntellectualObject.IngestErrorMessage, "File 'data/datastream-MARC' in manifest 'manifest-md5.txt' is missing")) assert.True(t, strings.Contains(result.IntellectualObject.IngestErrorMessage, "File 'data/datastream-RELS-EXT' in manifest 'manifest-md5.txt' is missing")) assert.True(t, result.ParseSummary.HasErrors()) assert.NotNil(t, result.ValidationSummary) require.True(t, result.ValidationSummary.HasErrors()) }
// These good bags are from the old Bagman test suite. We have to make sure they // pass here, so we know validation is backwards-compatible. func TestValidate_GoodBags(t *testing.T) { goodBags := []string{ "example.edu.multipart.b01.of02.tar", "example.edu.multipart.b02.of02.tar", "example.edu.sample_good.tar", } bagValidationConfig, err := getValidationConfig() if err != nil { assert.Fail(t, "Could not load BagValidationConfig: %s", err.Error()) } optionalFileSpec := validation.FileSpec{Presence: "OPTIONAL"} bagValidationConfig.FileSpecs["tagmanifest-md5.txt"] = optionalFileSpec _, filename, _, _ := runtime.Caller(0) dir := filepath.Dir(filename) for _, goodBag := range goodBags { pathToBag, err := filepath.Abs(path.Join(dir, "..", "testdata", "unit_test_bags", goodBag)) validator, err := validation.NewBagValidator(pathToBag, bagValidationConfig) if err != nil { assert.Fail(t, "NewBagValidator returned unexpected error: %s", err.Error()) } result := validator.Validate() require.NotNil(t, result.IntellectualObject, goodBag) assert.NotEmpty(t, result.IntellectualObject.GenericFiles, goodBag) assert.Empty(t, result.IntellectualObject.IngestErrorMessage, goodBag) assert.False(t, result.ParseSummary.HasErrors(), goodBag) } }
// A valid bag should have no errors. func TestBagValidator_ValidBag(t *testing.T) { bagValidationConfig, err := getValidationConfig() if err != nil { assert.Fail(t, "Could not load BagValidationConfig: %s", err.Error()) } _, filename, _, _ := runtime.Caller(0) dir := filepath.Dir(filename) pathToBag, err := filepath.Abs(path.Join(dir, "..", "testdata", "unit_test_bags", "example.edu.tagsample_good.tar")) validator, err := validation.NewBagValidator(pathToBag, bagValidationConfig) if err != nil { assert.Fail(t, "NewBagValidator returned unexpected error: %s", err.Error()) } result := validator.Validate() assert.NotNil(t, result.IntellectualObject) assert.Equal(t, 16, len(result.IntellectualObject.GenericFiles)) assert.Empty(t, result.IntellectualObject.IngestErrorMessage) assert.False(t, result.ParseSummary.HasErrors()) assert.NotNil(t, result.ValidationSummary) require.False(t, result.ValidationSummary.HasErrors()) for _, gf := range result.IntellectualObject.GenericFiles { assert.NotEmpty(t, gf.IngestSha256VerifiedAt) assert.NotEmpty(t, gf.IngestMd5VerifiedAt) } }
// Validate a file that does not exist. func TestNewBagValidator_FileDoesNotExist(t *testing.T) { bagValidationConfig, err := getValidationConfig() if err != nil { assert.Fail(t, "Could not load BagValidationConfig: %v", err) } _, err = validation.NewBagValidator("/blah/blah/blah", bagValidationConfig) if err == nil { assert.Fail(t, "NewBagValidator should have raised error on non-existent file") } }
// ------------------------------------------------------------------------- // Step 2 of 4: Validate // // Make sure the tar file is a valid bag. // ------------------------------------------------------------------------- func (fetcher *APTFetcher) validate() { for ingestState := range fetcher.ValidationChannel { // Don't time us out, NSQ! ingestState.TouchNSQ() // Tell Pharos that we've started to validate item. // Let's NOT quit if there's an error here. In that case, Pharos // might not know that we're validating, but we can still proceed. // Restarting the whole fetch process would be expensive. MarkWorkItemStarted(ingestState, fetcher.Context, constants.StageValidate, "Validating bag.") // Set up a new validator to check this bag. var validationResult *validation.ValidationResult validator, err := validation.NewBagValidator( ingestState.IngestManifest.Object.IngestTarFilePath, fetcher.BagValidationConfig) if err != nil { // Could not create a BagValidator. Should this be fatal? ingestState.IngestManifest.ValidateResult.AddError(err.Error()) } else { // Here's where bag validation actually happens. There's a lot // going on in this call, which can take anywhere from 2 seconds // to several hours to complete, depending on the size of the bag. // The most time-consuming part of the validation process is // calculating md5 and sha256 checksums on every file in the bag. // If the bag is 100GB+ in size, that takes a long time. validationResult = validator.Validate() // The validator creates its own WorkSummary, complete with // Start/Finish timestamps, error messages and everything. // Just copy that into our IngestManifest. ingestState.IngestManifest.ValidateResult = validationResult.ValidationSummary // NOTE that we are OVERWRITING the IntellectualObject here // with the much more complete version returned by the validator, // but we have to reset some basic data that's only available // in the current context. ingestState.IngestManifest.Object = validationResult.IntellectualObject SetBasicObjectInfo(ingestState, fetcher.Context) // If the bag is invalid, that's a fatal error. We should not do // any further processing on it. if validationResult.HasErrors() { ingestState.IngestManifest.ValidateResult.ErrorIsFatal = true ingestState.IngestManifest.ValidateResult.Retry = false } } ingestState.TouchNSQ() fetcher.CleanupChannel <- ingestState } }
// Read from a file that is not a directory or a valid tar file. func TestValidate_BadFileFormat(t *testing.T) { _, thisfile, _, _ := runtime.Caller(0) bagValidationConfig, err := getValidationConfig() if err != nil { assert.Fail(t, "Could not load BagValidationConfig: %v", err) } validator, err := validation.NewBagValidator(thisfile, bagValidationConfig) if err != nil { assert.Fail(t, "NewBagValidator raised unexpected error: %s", err.Error()) } result := validator.Validate() assert.True(t, result.ParseSummary.HasErrors()) assert.NotEmpty(t, result.IntellectualObject.IngestErrorMessage) }
// Bad params should cause error, not panic. func TestNewBagValidatorWithBadParams(t *testing.T) { // Good BagValidationConfig, bad bag path bagValidationConfig, err := getValidationConfig() if err != nil { assert.Fail(t, "Could not load BagValidationConfig: %s", err.Error()) } pathToBag := "/path/does/not/exist.tar" _, err = validation.NewBagValidator(pathToBag, bagValidationConfig) if err == nil { assert.Fail(t, "NewBagValidator should have complained about bad bag path.") } // Good bag path, bad BagValidationConfig _, filename, _, _ := runtime.Caller(0) dir := filepath.Dir(filename) pathToBag, err = filepath.Abs(path.Join(dir, "..", "testdata", "unit_test_bags", "example.edu.tagsample_good.tar")) if err != nil { assert.Fail(t, "Can't figure out Abs path: %s", err.Error()) } _, err = validation.NewBagValidator(pathToBag, nil) if err == nil { assert.Fail(t, "NewBagValidator should have complained about nil BagValidationConfig.") } }
func TestNewBagValidator_BadRegex(t *testing.T) { _, filename, _, _ := runtime.Caller(0) dir := filepath.Dir(filename) pathToBag, err := filepath.Abs(path.Join(dir, "..", "testdata", "unit_test_bags", "example.edu.tagsample_good.tar")) if err != nil { assert.Fail(t, "Can't figure out Abs path: %s", err.Error()) } bagValidationConfig, err := getValidationConfig() if err != nil { assert.Fail(t, "Could not load BagValidationConfig: %v", err) } bagValidationConfig.FileNamePattern = "ThisPatternIsInvalid[-" _, err = validation.NewBagValidator(pathToBag, bagValidationConfig) require.NotNil(t, err) assert.True(t, strings.Contains(err.Error(), "Cannot compile regex")) }
// Read an invalid bag from a tar file. func TestValidate_FromTarFile_BagInvalid(t *testing.T) { bagValidationConfig, err := getValidationConfig() if err != nil { assert.Fail(t, "Could not load BagValidationConfig: %s", err.Error()) } _, filename, _, _ := runtime.Caller(0) dir := filepath.Dir(filename) pathToBag, err := filepath.Abs(path.Join(dir, "..", "testdata", "unit_test_bags", "example.edu.tagsample_bad.tar")) validator, err := validation.NewBagValidator(pathToBag, bagValidationConfig) if err != nil { assert.Fail(t, "NewBagValidator returned unexpected error: %s", err.Error()) } result := validator.Validate() assert.NotNil(t, result.IntellectualObject) assert.Equal(t, 16, len(result.IntellectualObject.GenericFiles)) assert.NotEmpty(t, result.IntellectualObject.IngestErrorMessage) assert.True(t, result.ParseSummary.HasErrors()) }
func TestNewBagValidator(t *testing.T) { _, filename, _, _ := runtime.Caller(0) dir := filepath.Dir(filename) pathToBag, err := filepath.Abs(path.Join(dir, "..", "testdata", "unit_test_bags", "example.edu.tagsample_good.tar")) if err != nil { assert.Fail(t, "Can't figure out Abs path: %s", err.Error()) } bagValidationConfig, err := getValidationConfig() if err != nil { assert.Fail(t, "Could not load BagValidationConfig: %v", err) } validator, err := validation.NewBagValidator(pathToBag, bagValidationConfig) if err != nil { assert.Fail(t, "Error creating BagValidator: %s", err.Error()) } assert.NotNil(t, validator) assert.Equal(t, pathToBag, validator.PathToBag) assert.NotNil(t, validator.BagValidationConfig) }
// Read an invalid bag from a directory func TestValidate_FromDirectory_BagInvalid(t *testing.T) { tempDir, bagPath, err := testhelper.UntarTestBag("example.edu.tagsample_bad.tar") if err != nil { assert.Fail(t, err.Error()) } if tempDir != "" { defer os.RemoveAll(tempDir) } bagValidationConfig, err := getValidationConfig() if err != nil { assert.Fail(t, "Could not load BagValidationConfig: %s", err.Error()) } validator, err := validation.NewBagValidator(bagPath, bagValidationConfig) if err != nil { assert.Fail(t, "NewBagValidator returned unexpected error: %s", err.Error()) } result := validator.Validate() assert.NotNil(t, result.IntellectualObject) assert.NotEmpty(t, result.IntellectualObject.IngestErrorMessage) assert.True(t, result.ParseSummary.HasErrors()) }
// Make sure we catch all errors in an invalid bag. // This is a more thorough version of TestValidate_FromTarFile_BagInvalid func TestValidate_InvalidBag(t *testing.T) { bagValidationConfig, err := getValidationConfig() if err != nil { assert.Fail(t, "Could not load BagValidationConfig: %s", err.Error()) } _, filename, _, _ := runtime.Caller(0) dir := filepath.Dir(filename) pathToBag, err := filepath.Abs(path.Join(dir, "..", "testdata", "unit_test_bags", "example.edu.tagsample_bad.tar")) validator, err := validation.NewBagValidator(pathToBag, bagValidationConfig) if err != nil { assert.Fail(t, "NewBagValidator returned unexpected error: %s", err.Error()) } result := validator.Validate() assert.NotNil(t, result.IntellectualObject) assert.Equal(t, 16, len(result.IntellectualObject.GenericFiles)) assert.NotEmpty(t, result.IntellectualObject.IngestErrorMessage) assert.True(t, result.ParseSummary.HasErrors()) assert.True(t, result.ValidationSummary.HasErrors()) assert.True(t, result.HasErrors()) err_0 := "File 'data/file-not-in-bag' in manifest 'manifest-sha256.txt' is missing from bag" err_1 := "File 'custom_tags/tag_file_xyz.pdf' in manifest 'tagmanifest-md5.txt' is missing from bag" err_2 := "File 'custom_tags/tag_file_xyz.pdf' in manifest 'tagmanifest-sha256.txt' is missing from bag" err_3 := "Value for tag 'Title' is missing." err_4 := "Tag 'Access' has illegal value 'acksess'." err_5 := "Bad sha256 digest for 'data/datastream-descMetadata': manifest says 'This-checksum-is-bad-on-purpose.-The-validator-should-catch-it!!', file digest is 'cf9cbce80062932e10ee9cd70ec05ebc24019deddfea4e54b8788decd28b4bc7'" err_6 := "Bad md5 digest for 'custom_tags/tracked_tag_file.txt': manifest says '00000000000000000000000000000000', file digest is 'dafbffffc3ed28ef18363394935a2651'" err_7 := "Bad sha256 digest for 'custom_tags/tracked_tag_file.txt': manifest says '0000000000000000000000000000000000000000000000000000000000000000', file digest is '3f2f50c5bde87b58d6132faee14d1a295d115338643c658df7fa147e2296ccdd'" assert.Equal(t, 8, len(result.ValidationSummary.Errors)) assert.True(t, util.StringListContains(result.ValidationSummary.Errors, err_0)) assert.True(t, util.StringListContains(result.ValidationSummary.Errors, err_1)) assert.True(t, util.StringListContains(result.ValidationSummary.Errors, err_2)) assert.True(t, util.StringListContains(result.ValidationSummary.Errors, err_3)) assert.True(t, util.StringListContains(result.ValidationSummary.Errors, err_4)) assert.True(t, util.StringListContains(result.ValidationSummary.Errors, err_5)) assert.True(t, util.StringListContains(result.ValidationSummary.Errors, err_6)) assert.True(t, util.StringListContains(result.ValidationSummary.Errors, err_7)) }
func TestValidate_BadAccess(t *testing.T) { bagValidationConfig, err := getValidationConfig() if err != nil { assert.Fail(t, "Could not load BagValidationConfig: %s", err.Error()) } optionalFileSpec := validation.FileSpec{Presence: "OPTIONAL"} bagValidationConfig.FileSpecs["tagmanifest-md5.txt"] = optionalFileSpec _, filename, _, _ := runtime.Caller(0) dir := filepath.Dir(filename) pathToBag, err := filepath.Abs(path.Join(dir, "..", "testdata", "unit_test_bags", "example.edu.sample_bad_access.tar")) validator, err := validation.NewBagValidator(pathToBag, bagValidationConfig) if err != nil { assert.Fail(t, "NewBagValidator returned unexpected error: %s", err.Error()) } result := validator.Validate() assert.NotNil(t, result.IntellectualObject) assert.Equal(t, 9, len(result.IntellectualObject.GenericFiles)) assert.Equal(t, "Tag 'Access' has illegal value 'hands off!'.", result.IntellectualObject.IngestErrorMessage) assert.False(t, result.ParseSummary.HasErrors()) assert.NotNil(t, result.ValidationSummary) require.True(t, result.ValidationSummary.HasErrors()) }
func TestValidate_NoMd5Manifest(t *testing.T) { bagValidationConfig, err := getValidationConfig() if err != nil { assert.Fail(t, "Could not load BagValidationConfig: %s", err.Error()) } optionalFileSpec := validation.FileSpec{Presence: "OPTIONAL"} bagValidationConfig.FileSpecs["tagmanifest-md5.txt"] = optionalFileSpec _, filename, _, _ := runtime.Caller(0) dir := filepath.Dir(filename) pathToBag, err := filepath.Abs(path.Join(dir, "..", "testdata", "unit_test_bags", "example.edu.sample_no_md5_manifest.tar")) validator, err := validation.NewBagValidator(pathToBag, bagValidationConfig) if err != nil { assert.Fail(t, "NewBagValidator returned unexpected error: %s", err.Error()) } result := validator.Validate() assert.NotNil(t, result.IntellectualObject) assert.Equal(t, 8, len(result.IntellectualObject.GenericFiles)) // First error is general: BagIt spec says all bags should contain // at least one payload manifest. assert.True(t, strings.Contains(result.IntellectualObject.IngestErrorMessage, "Bag has no payload manifest (manifest-<alg>.txt)")) // Second error is from our specific BagValidationConfig, which says // that the bag must have an md5 manifest. assert.True(t, strings.Contains(result.IntellectualObject.IngestErrorMessage, "Required file 'manifest-md5.txt' is missing.")) assert.True(t, strings.Contains(result.IntellectualObject.IngestErrorMessage, "File 'data/datastream-DC' does not appear in any payload manifest")) assert.True(t, strings.Contains(result.IntellectualObject.IngestErrorMessage, "File 'data/datastream-descMetadata' does not appear in any payload manifest")) assert.True(t, strings.Contains(result.IntellectualObject.IngestErrorMessage, "File 'data/datastream-MARC' does not appear in any payload manifest")) assert.True(t, strings.Contains(result.IntellectualObject.IngestErrorMessage, "File 'data/datastream-RELS-EXT' does not appear in any payload manifest")) assert.False(t, result.ParseSummary.HasErrors()) assert.NotNil(t, result.ValidationSummary) require.True(t, result.ValidationSummary.HasErrors()) }