// selectBlobs splits the list of all blobs randomly into two lists. A blob
// will be contained in the first one with probability p.
func selectBlobs(t *testing.T, repo restic.Repository, p float32) (list1, list2 restic.BlobSet) {
	done := make(chan struct{})
	defer close(done)

	list1 = restic.NewBlobSet()
	list2 = restic.NewBlobSet()

	blobs := restic.NewBlobSet()

	for id := range repo.List(restic.DataFile, done) {
		entries, _, err := repo.ListPack(id)
		if err != nil {
			t.Fatalf("error listing pack %v: %v", id, err)
		}

		for _, entry := range entries {
			h := restic.BlobHandle{ID: entry.ID, Type: entry.Type}
			if blobs.Has(h) {
				t.Errorf("ignoring duplicate blob %v", h)
				continue
			}
			blobs.Insert(h)

			if rand.Float32() <= p {
				list1.Insert(h)
			} else {
				list2.Insert(h)
			}
		}
	}

	return list1, list2
}
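// A minimal usage sketch for selectBlobs (a hypothetical test, assuming the
// repository.TestRepository and restic.TestCreateSnapshot helpers used
// elsewhere in these tests; the snapshot time and depth are arbitrary
// illustrative values): the two returned sets must partition the blobs, so
// no handle may appear in both.
func TestSelectBlobsDisjoint(t *testing.T) {
	repo, cleanup := repository.TestRepository(t)
	defer cleanup()

	restic.TestCreateSnapshot(t, repo, time.Now(), 2, 0)

	list1, list2 := selectBlobs(t, repo, 0.5)
	for h := range list1 {
		if list2.Has(h) {
			t.Errorf("blob %v is contained in both lists", h)
		}
	}
}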
func loadIDSet(t testing.TB, filename string) restic.BlobSet {
	f, err := os.Open(filename)
	if err != nil {
		t.Logf("unable to open golden file %v: %v", filename, err)
		return restic.NewBlobSet()
	}

	sc := bufio.NewScanner(f)

	blobs := restic.NewBlobSet()
	for sc.Scan() {
		var h restic.BlobHandle
		err := json.Unmarshal([]byte(sc.Text()), &h)
		if err != nil {
			t.Errorf("file %v contained invalid blob: %#v", filename, err)
			continue
		}

		blobs.Insert(h)
	}

	if err = f.Close(); err != nil {
		t.Errorf("closing file %v failed with error %v", filename, err)
	}

	return blobs
}
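// saveIDSet is referenced by TestFindUsedBlobs below but is not part of this
// excerpt. A minimal counterpart sketch to loadIDSet, writing one
// JSON-encoded restic.BlobHandle per line; sorting the handles (via the
// restic.BlobHandles slice type) is an assumption made here to keep golden
// files stable across runs.
func saveIDSet(t testing.TB, filename string, s restic.BlobSet) {
	f, err := os.OpenFile(filename, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644)
	if err != nil {
		t.Fatalf("unable to create golden file %v: %v", filename, err)
	}

	var hs restic.BlobHandles
	for h := range s {
		hs = append(hs, h)
	}
	sort.Sort(hs)

	enc := json.NewEncoder(f)
	for _, h := range hs {
		if err := enc.Encode(h); err != nil {
			t.Errorf("Encode() returned error: %v", err)
		}
	}

	if err := f.Close(); err != nil {
		t.Errorf("closing file %v failed with error %v", filename, err)
	}
}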
// DuplicateBlobs returns a list of blobs that are stored more than once in
// the repo.
func (idx *Index) DuplicateBlobs() (dups restic.BlobSet) {
	dups = restic.NewBlobSet()
	seen := restic.NewBlobSet()

	for _, p := range idx.Packs {
		for _, entry := range p.Entries {
			h := restic.BlobHandle{ID: entry.ID, Type: entry.Type}
			if seen.Has(h) {
				dups.Insert(h)
			}
			seen.Insert(h)
		}
	}

	return dups
}
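// A short usage sketch (hypothetical, assuming an *Index built with
// index.New as in runPrune below, and fmt imported): any handle reported
// here occurs at least twice across the index, either in different packs or
// twice within one pack.
func printDuplicateBlobs(idx *Index) {
	for h := range idx.DuplicateBlobs() {
		fmt.Printf("duplicate blob %v\n", h)
	}
}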
func BenchmarkFindUsedBlobs(b *testing.B) {
	repo, cleanup := repository.TestRepository(b)
	defer cleanup()

	sn := restic.TestCreateSnapshot(b, repo, findTestTime, findTestDepth, 0)

	b.ResetTimer()

	for i := 0; i < b.N; i++ {
		seen := restic.NewBlobSet()
		blobs := restic.NewBlobSet()
		err := restic.FindUsedBlobs(repo, *sn.Tree, blobs, seen)
		if err != nil {
			b.Error(err)
		}

		b.Logf("found %v blobs", len(blobs))
	}
}
func TestFindUsedBlobs(t *testing.T) {
	repo, cleanup := repository.TestRepository(t)
	defer cleanup()

	var snapshots []*restic.Snapshot
	for i := 0; i < findTestSnapshots; i++ {
		sn := restic.TestCreateSnapshot(t, repo, findTestTime.Add(time.Duration(i)*time.Second), findTestDepth, 0)
		t.Logf("snapshot %v saved, tree %v", sn.ID().Str(), sn.Tree.Str())
		snapshots = append(snapshots, sn)
	}

	for i, sn := range snapshots {
		usedBlobs := restic.NewBlobSet()
		err := restic.FindUsedBlobs(repo, *sn.Tree, usedBlobs, restic.NewBlobSet())
		if err != nil {
			t.Errorf("FindUsedBlobs returned error: %v", err)
			continue
		}

		if len(usedBlobs) == 0 {
			t.Errorf("FindUsedBlobs returned an empty set")
			continue
		}

		goldenFilename := filepath.Join("testdata", fmt.Sprintf("used_blobs_snapshot%d", i))
		want := loadIDSet(t, goldenFilename)

		if !want.Equals(usedBlobs) {
			t.Errorf("snapshot %d: wrong list of blobs returned:\n missing blobs: %v\n extra blobs: %v",
				i, want.Sub(usedBlobs), usedBlobs.Sub(want))
		}

		if *updateGoldenFiles {
			saveIDSet(t, goldenFilename, usedBlobs)
		}
	}
}
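// The find tests above reference a few package-level parameters that are not
// part of this excerpt. A sketch of matching declarations; the concrete
// values are illustrative assumptions, not the originals (flag assumed
// imported):
var (
	findTestTime      = time.Date(2016, 7, 31, 0, 0, 0, 0, time.UTC)
	findTestDepth     = 3
	findTestSnapshots = 3

	updateGoldenFiles = flag.Bool("update", false, "update golden files in testdata/")
)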
func runPrune(gopts GlobalOptions) error {
	repo, err := OpenRepository(gopts)
	if err != nil {
		return err
	}

	lock, err := lockRepoExclusive(repo)
	defer unlockRepo(lock)
	if err != nil {
		return err
	}

	err = repo.LoadIndex()
	if err != nil {
		return err
	}

	done := make(chan struct{})
	defer close(done)

	var stats struct {
		blobs     int
		packs     int
		snapshots int
		bytes     int64
	}

	Verbosef("counting files in repo\n")
	for range repo.List(restic.DataFile, done) {
		stats.packs++
	}

	Verbosef("building new index for repo\n")

	bar := newProgressMax(!gopts.Quiet, uint64(stats.packs), "packs")
	idx, err := index.New(repo, bar)
	if err != nil {
		return err
	}

	for _, pack := range idx.Packs {
		stats.bytes += pack.Size
	}
	Verbosef("repository contains %v packs (%v blobs) with %v bytes\n",
		len(idx.Packs), len(idx.Blobs), formatBytes(uint64(stats.bytes)))

	blobCount := make(map[restic.BlobHandle]int)
	duplicateBlobs := 0
	duplicateBytes := 0

	// find duplicate blobs
	for _, p := range idx.Packs {
		for _, entry := range p.Entries {
			stats.blobs++
			h := restic.BlobHandle{ID: entry.ID, Type: entry.Type}
			blobCount[h]++

			if blobCount[h] > 1 {
				duplicateBlobs++
				duplicateBytes += int(entry.Length)
			}
		}
	}

	Verbosef("processed %d blobs: %d duplicate blobs, %v duplicate data\n",
		stats.blobs, duplicateBlobs, formatBytes(uint64(duplicateBytes)))
	Verbosef("load all snapshots\n")

	// find referenced blobs
	snapshots, err := restic.LoadAllSnapshots(repo)
	if err != nil {
		return err
	}

	stats.snapshots = len(snapshots)

	Verbosef("find data that is still in use for %d snapshots\n", stats.snapshots)

	usedBlobs := restic.NewBlobSet()
	seenBlobs := restic.NewBlobSet()

	bar = newProgressMax(!gopts.Quiet, uint64(len(snapshots)), "snapshots")
	bar.Start()
	for _, sn := range snapshots {
		debug.Log("process snapshot %v", sn.ID().Str())

		err = restic.FindUsedBlobs(repo, *sn.Tree, usedBlobs, seenBlobs)
		if err != nil {
			return err
		}

		debug.Log("found %v blobs for snapshot %v", len(usedBlobs), sn.ID().Str())
		bar.Report(restic.Stat{Blobs: 1})
	}
	bar.Done()

	Verbosef("found %d of %d data blobs still in use, removing %d blobs\n",
		len(usedBlobs), stats.blobs, stats.blobs-len(usedBlobs))

	// find packs that need a rewrite
	rewritePacks := restic.NewIDSet()
	for h, blob := range idx.Blobs {
		if !usedBlobs.Has(h) {
			rewritePacks.Merge(blob.Packs)
			continue
		}

		if blobCount[h] > 1 {
			rewritePacks.Merge(blob.Packs)
		}
	}

	removeBytes := 0

	// find packs that are unneeded
	removePacks := restic.NewIDSet()
	for packID, p := range idx.Packs {
		hasActiveBlob := false
		for _, blob := range p.Entries {
			h := restic.BlobHandle{ID: blob.ID, Type: blob.Type}
			if usedBlobs.Has(h) {
				hasActiveBlob = true
				continue
			}

			removeBytes += int(blob.Length)
		}

		if hasActiveBlob {
			continue
		}

		removePacks.Insert(packID)

		if !rewritePacks.Has(packID) {
			return errors.Fatalf("pack %v is unneeded, but not contained in rewritePacks", packID.Str())
		}

		rewritePacks.Delete(packID)
	}

	Verbosef("will delete %d packs and rewrite %d packs, this frees %s\n",
		len(removePacks), len(rewritePacks), formatBytes(uint64(removeBytes)))

	err = repository.Repack(repo, rewritePacks, usedBlobs)
	if err != nil {
		return err
	}

	for packID := range removePacks {
		err = repo.Backend().Remove(restic.DataFile, packID.String())
		if err != nil {
			Warnf("unable to remove file %v from the repository\n", packID.Str())
		}
	}

	Verbosef("creating new index\n")

	stats.packs = 0
	for range repo.List(restic.DataFile, done) {
		stats.packs++
	}
	bar = newProgressMax(!gopts.Quiet, uint64(stats.packs), "packs")
	idx, err = index.New(repo, bar)
	if err != nil {
		return err
	}

	var supersedes restic.IDs
	for idxID := range repo.List(restic.IndexFile, done) {
		err := repo.Backend().Remove(restic.IndexFile, idxID.String())
		if err != nil {
			fmt.Fprintf(os.Stderr, "unable to remove index %v: %v\n", idxID.Str(), err)
		}

		supersedes = append(supersedes, idxID)
	}

	id, err := idx.Save(repo, supersedes)
	if err != nil {
		return err
	}
	Verbosef("saved new index as %v\n", id.Str())

	Verbosef("done\n")
	return nil
}
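// runPrune is registered as a subcommand elsewhere in the package. A minimal
// sketch of how that wiring might look with cobra; cmdRoot, globalOptions and
// the command wording are assumptions for illustration, not the original
// registration.
var cmdPrune = &cobra.Command{
	Use:   "prune",
	Short: "remove unneeded data from the repository",
	RunE: func(cmd *cobra.Command, args []string) error {
		return runPrune(globalOptions)
	},
}

func init() {
	cmdRoot.AddCommand(cmdPrune)
}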