func testRasterizer(t *testing.T, rasterizer Rasterizer, expectation string) { assert.True(t, rasterizer.Enabled(), "%s.Enabled() failed.", rasterizer.String()) testDataDir, err := testutils.TestDataDir() assert.Nil(t, err, "TestDataDir missing: %v", err) tempDir, err := ioutil.TempDir("", "pdf_test_") assert.Nil(t, err, "ioutil.TempDir failed") defer util.RemoveAll(tempDir) pdfSrcPath := path.Join(testDataDir, "minimal.pdf") assert.True(t, fileutil.FileExists(pdfSrcPath), "Path '%s' does not exist", pdfSrcPath) pdfInputPath := path.Join(tempDir, "minimal.pdf") err = os.Symlink(pdfSrcPath, pdfInputPath) assert.Nil(t, err, "Symlink failed") assert.True(t, fileutil.FileExists(pdfInputPath), "Path '%s' does not exist", pdfInputPath) outputFileName := path.Join(tempDir, "test.png") badPath := path.Join(tempDir, "this_file_should_really_not_exist.pdf") if err := rasterizer.Rasterize(badPath, outputFileName); err == nil { t.Errorf(": Got '%v' Want '%v'", err, nil) } if err := rasterizer.Rasterize(pdfInputPath, outputFileName); err != nil { t.Errorf(": Got '%v' Want '!nil'", err) } expectedOutput := path.Join(testDataDir, expectation) assert.True(t, filesEqual(outputFileName, expectedOutput), "png output not correct") }
// Rasterize assumes that filepath.Dir(pdfInputPath) is writable func (Pdfium) Rasterize(pdfInputPath, pngOutputPath string) error { if !(Pdfium{}).Enabled() { return fmt.Errorf("pdfium_test is missing") } // Check input if !fileutil.FileExists(pdfInputPath) { return fmt.Errorf("Path '%s' does not exist", pdfInputPath) } // Remove any files created by pdfiumExecutable defer func() { // Assume pdfInputPath has glob characters. matches, _ := filepath.Glob(fmt.Sprintf("%s.*.png", pdfInputPath)) for _, match := range matches { util.Remove(match) } }() command := exec.Command(pdfiumExecutable, "--png", pdfInputPath) if err := command.Start(); err != nil { return err } go func() { time.Sleep(5 * time.Second) _ = command.Process.Kill() }() if err := command.Wait(); err != nil { return err } firstPagePath := fmt.Sprintf("%s.0.png", pdfInputPath) if !fileutil.FileExists(firstPagePath) { return fmt.Errorf("First rasterized page (%s) not found.", firstPagePath) } if err := os.Rename(firstPagePath, pngOutputPath); err != nil { return err } return nil }
// See Source interface. func (f *FileSystemSource) Poll(startTime, endTime int64) ([]ResultFileLocation, error) { retval := []ResultFileLocation{} // although GetLatestGSDirs is in the "gs" package, there's nothing specific about // its operation that makes it not re-usable here. dirs := gs.GetLatestGSDirs(startTime, endTime, f.rootDir) for _, dir := range dirs { // Inject dir into a closure. func(dir string) { walkFn := func(path string, info os.FileInfo, err error) error { if err != nil { // We swallow the error to continue processing, but make sure it's // shows up in the logs. glog.Errorf("Error walking %s: %s", path, err) return nil } if info.IsDir() { return nil } updateTimestamp := info.ModTime().Unix() if updateTimestamp > startTime { rf, err := FileSystemResult(path) if err != nil { glog.Errorf("Unable to create file system result: %s", err) return nil } retval = append(retval, rf) } return nil } // Only walk the tree if the top directory exists. if fileutil.FileExists(dir) { if err := filepath.Walk(dir, walkFn); err != nil { glog.Infof("Unable to read the local dir %s: %s", dir, err) return } } }(dir) } return retval, nil }
// getDB returns a BoltDB if the instance exists in the internal map of // databases or on disk. Otherwise it will create the database on disk if // the 'create' parameter is true. If the database does not exist and create // is false, it will return (nil, nil). func (r *rpcServer) getDB(database string, create bool) (*bolt.DB, error) { r.dbsMutex.Lock() defer r.dbsMutex.Unlock() db, ok := r.databases[database] if ok { return db, nil } // Check if the database exists on disk. databaseFile := filepath.Join(r.dataDir, database+".db") if !create && !fileutil.FileExists(databaseFile) { return nil, fmt.Errorf("Database %s does not exist.", database) } db, err := bolt.Open(databaseFile, 0644, nil) if err != nil { return nil, err } r.databases[database] = db return db, nil }
func main() { defer common.LogPanic() common.Init() fileInfos, err := ioutil.ReadDir(".") if err != nil { glog.Fatalf("Unable to read directory.") } // Get the directory for _, info := range fileInfos { if info.IsDir() { continue } fileName := info.Name() outFileName := fileutil.TwoLevelRadixPath(fileName) if fileName == outFileName { glog.Infof("Excluding %s -> %s", fileName, outFileName) continue } if !fileutil.FileExists(outFileName) { // Create the path if it doesn't exist. targetDir, _ := filepath.Split(outFileName) if err = os.MkdirAll(targetDir, 0700); err != nil { glog.Errorf("Unable to run create path: %s", targetDir) } if err := os.Rename(fileName, outFileName); err != nil { glog.Errorf("Unable to run mv %s %s", fileName, outFileName) } } } }
func (p *pdfProcessor) rasterizeAndUpload(dmResultName string, dmResults *goldingestion.DMResults, pdfResults []*goldingestion.Result) error { processedResults := make([]*goldingestion.Result, 0, len(pdfResults)*len(p.rasterizers)) // Create a temporary directory to hold the rastered images. tempDir, err := ioutil.TempDir(p.pdfCacheDir, "pdfingestion") if err != nil { return err } defer util.RemoveAll(tempDir) // Go through all results that generated PDF files. for resultIdx, result := range pdfResults { // Fetch the PDF file if it's not in the cache. pdfFileName := fmt.Sprintf("%s.%s", result.Digest, PDF_EXT) pdfPath := filepath.Join(p.pdfCacheDir, pdfFileName) if !fileutil.FileExists(pdfPath) { if err = p.download(p.inImagesBucket, p.inImagesDir, pdfFileName, pdfPath); err != nil { glog.Errorf("Unable to retrieve image: %s. Error: %s", pdfFileName, err) continue } } // Generate an image for each rasterizer. for rasterIdx, rasterizer := range p.rasterizers { tempName := filepath.Join(tempDir, fmt.Sprintf("rastering_%d_%d.%s", resultIdx, rasterIdx, PNG_EXT)) err := rasterizer.Rasterize(pdfPath, tempName) if err != nil { glog.Errorf("Rasterizing %s with %s failed: %s", filepath.Base(pdfPath), rasterizer.String(), err) continue } // Open the generated image and calculate the MD5. file, err := os.Open(tempName) if err != nil { glog.Errorf("Unable to open generated image: %s", err) continue } var buf bytes.Buffer md5, err := util.MD5FromReader(file, &buf) if err != nil { glog.Errorf("Unable to calculate MD5 hash of file %s. Got error: %s", tempName, err) continue } digest := hex.EncodeToString(md5) uploadFileName := fmt.Sprintf("%s.%s", digest, PNG_EXT) if err := p.upload(p.outImagesBucket, p.outImagesDir, uploadFileName, bytes.NewBuffer(buf.Bytes())); err != nil { glog.Errorf("Unable to upload file %s. Error: %s", uploadFileName, err) continue } // Update the result and add it to the successfully processed results. result.Key["rasterizer"] = rasterizer.String() result.Digest = digest result.Options["ext"] = PNG_EXT processedResults = append(processedResults, result) } } // If we have no processed results we consider it an error. if len(processedResults) == 0 { return fmt.Errorf("No input image was processed successfully.") } // Replace the old results in the original result and write it to the cloud. dmResults.Results = processedResults jsonBytes, err := json.MarshalIndent(dmResults, "", " ") if err != nil { return fmt.Errorf("Unable to encode JSON: %s", err) } return p.upload(p.outJsonBucket, p.outJsonDir, dmResultName, bytes.NewBuffer(jsonBytes)) }