func mimeScanBlobs(dbDir, blobDir string, workers int) error { fsck, err := db.NewRO(dbDir) if err != nil { return err } bs, err := dir.New(blobDir) if err != nil { return err } stats := fs.NewStats() defer stats.LogEvery(10 * time.Second).Stop() defer log.Print(stats) go func() { for _ = range time.Tick(10 * time.Second) { fmt.Println(time.Now(), stats) } }() blobCh := fsck.List("file") var wg sync.WaitGroup for i := 0; i < workers; i++ { wg.Add(1) go func() { defer wg.Done() for ref := range blobCh { s, err := schemaFromBlobRef(bs, ref) if err != nil { log.Printf("%s: previously indexed; now missing", ref) stats.Add("badschema") continue } file, err := s.NewFileReader(bs) if err != nil { log.Printf("%s: unreadable: %s", ref, err) stats.Add("unreadable") continue } mime, _ := magic.MIMETypeFromReader(file) file.Close() if mime != "" { if pos := strings.Index(mime, "; charset="); pos >= 0 { mime = mime[:pos] } if err := fsck.PlaceMIME(ref, mime); err != nil { log.Printf("%s: PlaceMIME(): %s", ref, mime) mime = "error" } } else { mime = "unknown" } stats.Add(mime) } }() } wg.Wait() return nil }
func scanBlobs(dbDir, blobDir string, restart bool) { fsck, err := db.New(dbDir) if err != nil { log.Fatal(err) } last := fsck.Last() if last != "" { if restart { fmt.Println("overwriting blob scan resume marker at", last) last = "" } else { fmt.Println("resuming blob scan at", last) } } blobCh := streamBlobs(blobDir, last) stats := fs.NewStats() defer stats.LogEvery(10 * time.Second).Stop() defer log.Print(stats) for b := range blobCh { if !b.ValidContents() { stats.Add("corrupt") continue } ref := b.Ref() body := b.Open() s, ok := parseSchema(ref, body) body.Close() if !ok { stats.Add("data") if err := fsck.Place(ref.String(), b.Token, "", nil); err != nil { log.Fatal(err) } continue } needs := indexSchemaBlob(fsck, s) t := s.Type() stats.Add(t) if err := fsck.Place(ref.String(), b.Token, t, needs); err != nil { log.Fatal(err) } } }
func main() { dbDir := flag.String("db_dir", "", "FSCK state database directory") blobDir := flag.String("blob_dir", "", "Camlistore blob directory") mimeType := flag.String("mime_type", "image/jpeg", "MIME type of files to scan") print := flag.Bool("print", false, "Print ref and camera model") workers := fsck.Parallel{Workers: 32} flag.Var(workers, "workers", "parallel worker goroutines") flag.Parse() fdb, err := db.New(*dbDir) if err != nil { log.Fatal(err) } bs, err := dir.New(*blobDir) if err != nil { log.Fatal(err) } stats := fsck.NewStats() defer stats.LogTopNEvery(10, 10*time.Second).Stop() defer log.Print(stats) files := fsck.NewFiles(bs) go func() { files.ReadRefs(fdb.ListMIME(*mimeType)) files.Close() }() go files.LogErrors() workers.Go(func() { for r := range files.Readers { ex, err := exif.Decode(r) if err != nil { stats.Add("error") continue } tag, err := ex.Get(exif.Model) if err != nil { stats.Add("missing") continue } stats.Add(tag.String()) if *print { id := "unknown" if tag, err := ex.Get(exif.ImageUniqueID); err == nil { id = tag.String() stats.Add("unique-id-exif") } else if thumb, err := ex.JpegThumbnail(); err == nil { hash := sha1.Sum(thumb) id = hex.EncodeToString(hash[:20]) stats.Add("unique-id-thumb") } else if r.PartsSize() < 1e7 { if _, err := r.Seek(0, 0); err == nil { hash := sha1.New() io.Copy(hash, r) id = hex.EncodeToString(hash.Sum(nil)) stats.Add("unique-id-sha1") } else { id = "read-error" stats.Add("unique-id-sha1-error") } } else { stats.Add("unique-id-too-big") } fmt.Printf("%s %s %q %q\n", r.BlobRef(), id, r.FileName(), tag) } } }) workers.Wait() }