Beispiel #1
0
// testRasterizer exercises a Rasterizer implementation end to end.
//
// It checks that the rasterizer reports itself as enabled, that rasterizing
// a non-existent input returns an error, and that rasterizing minimal.pdf
// from the test data directory succeeds and produces output that filesEqual
// considers equal to the expectation file.
//
// expectation is the name of the expected PNG, relative to the test data
// directory.
func testRasterizer(t *testing.T, rasterizer Rasterizer, expectation string) {
	assert.True(t, rasterizer.Enabled(), "%s.Enabled() failed.", rasterizer.String())

	testDataDir, err := testutils.TestDataDir()
	assert.Nil(t, err, "TestDataDir missing: %v", err)

	tempDir, err := ioutil.TempDir("", "pdf_test_")
	assert.Nil(t, err, "ioutil.TempDir failed")
	defer util.RemoveAll(tempDir)

	pdfSrcPath := path.Join(testDataDir, "minimal.pdf")
	assert.True(t, fileutil.FileExists(pdfSrcPath), "Path '%s' does not exist", pdfSrcPath)
	pdfInputPath := path.Join(tempDir, "minimal.pdf")

	// Rasterize a symlink inside tempDir instead of the test data file itself;
	// presumably so that any scratch files a rasterizer writes next to its
	// input end up in tempDir and are removed with it.
	err = os.Symlink(pdfSrcPath, pdfInputPath)
	assert.Nil(t, err, "Symlink failed")
	assert.True(t, fileutil.FileExists(pdfInputPath), "Path '%s' does not exist", pdfInputPath)

	outputFileName := path.Join(tempDir, "test.png")

	badPath := path.Join(tempDir, "this_file_should_really_not_exist.pdf")

	// A missing input must be reported as an error.
	if err := rasterizer.Rasterize(badPath, outputFileName); err == nil {
		t.Errorf("Rasterize(%s): Got '%v' Want '!nil'", badPath, err)
	}

	// A valid input must rasterize without error.
	if err := rasterizer.Rasterize(pdfInputPath, outputFileName); err != nil {
		t.Errorf("Rasterize(%s): Got '%v' Want '%v'", pdfInputPath, err, nil)
	}

	expectedOutput := path.Join(testDataDir, expectation)
	assert.True(t, filesEqual(outputFileName, expectedOutput), "png output not correct")
}
Beispiel #2
0
// Rasterize runs pdfiumExecutable with the --png flag on pdfInputPath and
// moves the image found at "<pdfInputPath>.0.png" (the first page) to
// pngOutputPath.
//
// Rasterize assumes that filepath.Dir(pdfInputPath) is writable, since the
// page images are looked up next to the input file. Every file matching
// "<pdfInputPath>.*.png" is removed before returning, on success and on
// failure alike.
//
// An error is returned if Pdfium is not enabled, the input file does not
// exist, the command cannot be started or exits with an error (including
// being killed after the 5 second timeout), the first page image is missing,
// or the final rename fails.
func (Pdfium) Rasterize(pdfInputPath, pngOutputPath string) error {
	if !(Pdfium{}).Enabled() {
		return fmt.Errorf("pdfium_test is missing")
	}

	// Check input
	if !fileutil.FileExists(pdfInputPath) {
		return fmt.Errorf("Path '%s' does not exist", pdfInputPath)
	}

	// Remove any files created by pdfiumExecutable
	defer func() {
		// pdfInputPath is inserted into the pattern unescaped, so this assumes
		// it contains no glob metacharacters. Glob's error (bad pattern) is
		// ignored on purpose: cleanup is best effort.
		matches, _ := filepath.Glob(fmt.Sprintf("%s.*.png", pdfInputPath))
		for _, match := range matches {
			util.Remove(match)
		}
	}()

	command := exec.Command(pdfiumExecutable, "--png", pdfInputPath)
	if err := command.Start(); err != nil {
		return err
	}

	// Kill the process if it is still running after the timeout. Using a
	// timer that is stopped on return avoids leaving a goroutine sleeping for
	// the full timeout after every call.
	killTimer := time.AfterFunc(5*time.Second, func() {
		// Best effort: there is nothing useful to do if the kill fails.
		_ = command.Process.Kill()
	})
	defer killTimer.Stop()

	if err := command.Wait(); err != nil {
		return err
	}

	firstPagePath := fmt.Sprintf("%s.0.png", pdfInputPath)
	if !fileutil.FileExists(firstPagePath) {
		return fmt.Errorf("First rasterized page (%s) not found.", firstPagePath)
	}
	return os.Rename(firstPagePath, pngOutputPath)
}
Beispiel #3
0
// Poll walks the directories returned by gs.GetLatestGSDirs for the given
// time range under f.rootDir and collects a ResultFileLocation for every
// non-directory entry whose modification time (Unix seconds) is strictly
// greater than startTime. See Source interface.
//
// Errors encountered while walking or while building a result are logged and
// skipped; the returned error is always nil.
func (f *FileSystemSource) Poll(startTime, endTime int64) ([]ResultFileLocation, error) {
	found := []ResultFileLocation{}

	// visit is the filepath.Walk callback shared by all directories. It never
	// returns an error, so one bad entry does not abort the rest of the walk.
	visit := func(path string, info os.FileInfo, err error) error {
		switch {
		case err != nil:
			// Swallow the error to keep processing, but make sure it shows up
			// in the logs.
			glog.Errorf("Error walking %s: %s", path, err)
			return nil
		case info.IsDir():
			return nil
		case info.ModTime().Unix() <= startTime:
			// Not modified after the start of the polling window.
			return nil
		}

		location, err := FileSystemResult(path)
		if err != nil {
			glog.Errorf("Unable to create file system result: %s", err)
			return nil
		}
		found = append(found, location)
		return nil
	}

	// Although GetLatestGSDirs lives in the "gs" package, nothing about its
	// operation prevents re-using it for local directories.
	for _, dir := range gs.GetLatestGSDirs(startTime, endTime, f.rootDir) {
		// Only walk the tree if the top directory exists.
		if !fileutil.FileExists(dir) {
			continue
		}
		if err := filepath.Walk(dir, visit); err != nil {
			glog.Infof("Unable to read the local dir %s: %s", dir, err)
		}
	}

	return found, nil
}
Beispiel #4
0
// getDB returns the BoltDB registered under the given name, opening it from
// "<dataDir>/<database>.db" and caching it in r.databases on first use.
//
// If the database is not cached and its file does not exist on disk, the
// outcome depends on 'create': when true the file is opened via bolt.Open
// regardless, when false an error is returned. The whole lookup runs while
// holding r.dbsMutex.
func (r *rpcServer) getDB(database string, create bool) (*bolt.DB, error) {
	r.dbsMutex.Lock()
	defer r.dbsMutex.Unlock()

	// Fast path: the database was opened by an earlier call.
	if cached, found := r.databases[database]; found {
		return cached, nil
	}

	// Refuse to open a missing file unless the caller asked for creation.
	dbPath := filepath.Join(r.dataDir, database+".db")
	if !(create || fileutil.FileExists(dbPath)) {
		return nil, fmt.Errorf("Database %s does not exist.", database)
	}

	opened, err := bolt.Open(dbPath, 0644, nil)
	if err != nil {
		return nil, err
	}

	// Remember the handle so later calls reuse it.
	r.databases[database] = opened
	return opened, nil
}
Beispiel #5
0
// main moves every non-directory entry of the current working directory to
// the path returned by fileutil.TwoLevelRadixPath for its name, creating the
// target directory first. Entries whose name is returned unchanged, and
// entries whose target already exists, are left where they are.
//
// A failure to list the directory is fatal; failures for individual files
// are logged and the remaining files are still processed.
func main() {
	defer common.LogPanic()
	common.Init()

	fileInfos, err := ioutil.ReadDir(".")
	if err != nil {
		glog.Fatalf("Unable to read directory: %s", err)
	}

	for _, info := range fileInfos {
		if info.IsDir() {
			continue
		}

		fileName := info.Name()
		outFileName := fileutil.TwoLevelRadixPath(fileName)

		// The name maps onto itself, so there is nowhere to move the file.
		if fileName == outFileName {
			glog.Infof("Excluding %s -> %s", fileName, outFileName)
			continue
		}

		// Never overwrite a file that is already at the target location.
		if fileutil.FileExists(outFileName) {
			continue
		}

		// Create the path if it doesn't exist.
		targetDir, _ := filepath.Split(outFileName)
		if err := os.MkdirAll(targetDir, 0700); err != nil {
			glog.Errorf("Unable to run create path: %s: %s", targetDir, err)
			// Without the target directory the rename below cannot succeed.
			continue
		}

		if err := os.Rename(fileName, outFileName); err != nil {
			glog.Errorf("Unable to run mv %s %s: %s", fileName, outFileName, err)
		}
	}
}
// rasterizeAndUpload converts the PDFs referenced by pdfResults into PNG
// images with every configured rasterizer, uploads the images, and finally
// uploads dmResults (with its Results replaced by the processed ones) as
// indented JSON under the name dmResultName.
//
// For each result the PDF named "<digest>.<PDF_EXT>" is taken from
// p.pdfCacheDir, downloading it first if it is not there. Each generated
// image is uploaded as "<md5 hex>.<PNG_EXT>". Failures for an individual PDF
// or rasterizer are logged and skipped.
//
// An error is returned if the temporary directory cannot be created, if no
// image at all was processed successfully, if the JSON encoding fails, or if
// the final upload fails.
func (p *pdfProcessor) rasterizeAndUpload(dmResultName string, dmResults *goldingestion.DMResults, pdfResults []*goldingestion.Result) error {
	processedResults := make([]*goldingestion.Result, 0, len(pdfResults)*len(p.rasterizers))

	// Create a temporary directory to hold the rastered images.
	tempDir, err := ioutil.TempDir(p.pdfCacheDir, "pdfingestion")
	if err != nil {
		return err
	}
	defer util.RemoveAll(tempDir)

	// Go through all results that generated PDF files.
	for resultIdx, result := range pdfResults {
		// Fetch the PDF file if it's not in the cache.
		pdfFileName := fmt.Sprintf("%s.%s", result.Digest, PDF_EXT)
		pdfPath := filepath.Join(p.pdfCacheDir, pdfFileName)
		if !fileutil.FileExists(pdfPath) {
			if err = p.download(p.inImagesBucket, p.inImagesDir, pdfFileName, pdfPath); err != nil {
				glog.Errorf("Unable to retrieve image: %s. Error: %s", pdfFileName, err)
				continue
			}
		}

		// Generate an image for each rasterizer.
		for rasterIdx, rasterizer := range p.rasterizers {
			tempName := filepath.Join(tempDir, fmt.Sprintf("rastering_%d_%d.%s", resultIdx, rasterIdx, PNG_EXT))
			err := rasterizer.Rasterize(pdfPath, tempName)
			if err != nil {
				glog.Errorf("Rasterizing %s with %s failed: %s", filepath.Base(pdfPath), rasterizer.String(), err)
				continue
			}

			// Open the generated image and calculate the MD5.
			file, err := os.Open(tempName)
			if err != nil {
				glog.Errorf("Unable to open generated image: %s", err)
				continue
			}

			// buf is handed to MD5FromReader and uploaded below; presumably it
			// receives a copy of the bytes read from file.
			var buf bytes.Buffer
			md5, err := util.MD5FromReader(file, &buf)
			// Close right here instead of deferring: a defer inside the loop
			// would keep every generated image open until the function
			// returns.
			if closeErr := file.Close(); closeErr != nil {
				glog.Errorf("Unable to close generated image %s: %s", tempName, closeErr)
			}
			if err != nil {
				glog.Errorf("Unable to calculate MD5 hash of file %s. Got error: %s", tempName, err)
				continue
			}
			digest := hex.EncodeToString(md5)
			uploadFileName := fmt.Sprintf("%s.%s", digest, PNG_EXT)
			if err := p.upload(p.outImagesBucket, p.outImagesDir, uploadFileName, bytes.NewBuffer(buf.Bytes())); err != nil {
				glog.Errorf("Unable to upload file %s. Error: %s", uploadFileName, err)
				continue
			}

			// Update the result and add it to the successfully processed results.
			// NOTE(review): result is a pointer shared by all iterations of
			// this inner loop, so with more than one rasterizer the same
			// object is appended repeatedly and ends up carrying only the
			// values of the last successful rasterizer. A per-rasterizer copy
			// (including the Key and Options maps) is probably intended;
			// confirm against the Result type before changing.
			result.Key["rasterizer"] = rasterizer.String()
			result.Digest = digest
			result.Options["ext"] = PNG_EXT
			processedResults = append(processedResults, result)
		}
	}

	// If we have no processed results we consider it an error.
	if len(processedResults) == 0 {
		return fmt.Errorf("No input image was processed successfully.")
	}

	// Replace the old results in the original result and write it to the cloud.
	dmResults.Results = processedResults
	jsonBytes, err := json.MarshalIndent(dmResults, "", "    ")
	if err != nil {
		return fmt.Errorf("Unable to encode JSON: %s", err)
	}

	return p.upload(p.outJsonBucket, p.outJsonDir, dmResultName, bytes.NewBuffer(jsonBytes))
}