Ejemplo n.º 1
0
// FileSystemResult returns a ResultFileLocation for the file at path.
//
// path is the location of the target file and rootDir is the directory that
// result paths are expressed relative to. Both may be relative; they are made
// absolute before the root prefix is stripped. If path does not lie under
// rootDir, the stored path is the unmodified absolute path.
//
// The file content is buffered in memory and its MD5 digest is stored as a
// hex-encoded string. An error is returned if the file cannot be opened or
// hashed, or if either path cannot be made absolute.
func FileSystemResult(path, rootDir string) (ResultFileLocation, error) {
	// Read file into buffer and calculate the md5 in the process.
	file, err := os.Open(path)
	if err != nil {
		return nil, err
	}
	defer util.Close(file)

	var buf bytes.Buffer
	digest, err := util.MD5FromReader(file, &buf)
	if err != nil {
		return nil, fmt.Errorf("Unable to get MD5 hash of %s: %s", path, err)
	}

	absRootDir, err := filepath.Abs(rootDir)
	if err != nil {
		return nil, err
	}

	absPath, err := filepath.Abs(path)
	if err != nil {
		return nil, err
	}

	// Build the prefix to strip using the platform separator instead of a
	// hard-coded "/". filepath.Abs returns a cleaned path, which only ends in
	// a separator when it names a filesystem root, so avoid doubling the
	// separator in that case (otherwise the prefix would never match).
	sep := string(filepath.Separator)
	prefix := absRootDir
	if !strings.HasSuffix(prefix, sep) {
		prefix += sep
	}

	return &fsResultFileLocation{
		path: strings.TrimPrefix(absPath, prefix),
		buf:  buf.Bytes(),
		md5:  hex.EncodeToString(digest),
	}, nil
}
Ejemplo n.º 2
0
// rasterizeAndUpload converts every PDF referenced by pdfResults into PNG
// images, one per configured rasterizer, and uploads each image under a name
// derived from the hex-encoded MD5 of its content. Finally dmResults, with its
// Results replaced by the successfully processed results, is uploaded as
// indented JSON under the name dmResultName.
//
// Failures for an individual PDF or rasterizer are logged and skipped. An
// error is returned only if the temporary directory cannot be created, if no
// image was processed successfully, or if encoding or uploading the JSON
// fails.
func (p *pdfProcessor) rasterizeAndUpload(dmResultName string, dmResults *goldingestion.DMResults, pdfResults []*goldingestion.Result) error {
	processedResults := make([]*goldingestion.Result, 0, len(pdfResults)*len(p.rasterizers))

	// Create a temporary directory to hold the rastered images.
	tempDir, err := ioutil.TempDir(p.pdfCacheDir, "pdfingestion")
	if err != nil {
		return err
	}
	defer util.RemoveAll(tempDir)

	// Go through all results that generated PDF files.
	for resultIdx, result := range pdfResults {
		// Fetch the PDF file if it's not in the cache.
		pdfFileName := fmt.Sprintf("%s.%s", result.Digest, PDF_EXT)
		pdfPath := filepath.Join(p.pdfCacheDir, pdfFileName)
		if !fileutil.FileExists(pdfPath) {
			if err = p.download(p.inImagesBucket, p.inImagesDir, pdfFileName, pdfPath); err != nil {
				glog.Errorf("Unable to retrieve image: %s. Error: %s", pdfFileName, err)
				continue
			}
		}

		// Generate an image for each rasterizer.
		for rasterIdx, rasterizer := range p.rasterizers {
			// The indices keep the temporary file names unique within tempDir.
			tempName := filepath.Join(tempDir, fmt.Sprintf("rastering_%d_%d.%s", resultIdx, rasterIdx, PNG_EXT))
			err := rasterizer.Rasterize(pdfPath, tempName)
			if err != nil {
				glog.Errorf("Rasterizing %s with %s failed: %s", filepath.Base(pdfPath), rasterizer.String(), err)
				continue
			}

			// Open the generated image and calculate the MD5.
			file, err := os.Open(tempName)
			if err != nil {
				glog.Errorf("Unable to open generated image: %s", err)
				continue
			}

			var buf bytes.Buffer
			md5, err := util.MD5FromReader(file, &buf)
			// The file is no longer needed once its content has been read
			// into buf. Close it here, before any 'continue', so that no file
			// descriptor is leaked per image; a defer would keep every file
			// open until the function returns.
			if closeErr := file.Close(); closeErr != nil {
				glog.Errorf("Unable to close generated image %s: %s", tempName, closeErr)
			}
			if err != nil {
				glog.Errorf("Unable to calculate MD5 hash of file %s. Got error: %s", tempName, err)
				continue
			}
			digest := hex.EncodeToString(md5)
			uploadFileName := fmt.Sprintf("%s.%s", digest, PNG_EXT)
			if err := p.upload(p.outImagesBucket, p.outImagesDir, uploadFileName, bytes.NewBuffer(buf.Bytes())); err != nil {
				glog.Errorf("Unable to upload file %s. Error: %s", uploadFileName, err)
				continue
			}

			// Update the result and add it to the successfully processed results.
			// NOTE(review): result is a pointer shared by every rasterizer
			// iteration, so with more than one rasterizer the same *Result is
			// appended repeatedly and each pass overwrites the previous
			// rasterizer/digest values. Confirm whether a per-rasterizer copy
			// of the result is intended.
			result.Key["rasterizer"] = rasterizer.String()
			result.Digest = digest
			result.Options["ext"] = PNG_EXT
			processedResults = append(processedResults, result)
		}
	}

	// If we have no processed results we consider it an error.
	if len(processedResults) == 0 {
		return fmt.Errorf("No input image was processed successfully.")
	}

	// Replace the old results in the original result and write it to the cloud.
	dmResults.Results = processedResults
	jsonBytes, err := json.MarshalIndent(dmResults, "", "    ")
	if err != nil {
		return fmt.Errorf("Unable to encode JSON: %s", err)
	}

	return p.upload(p.outJsonBucket, p.outJsonDir, dmResultName, bytes.NewBuffer(jsonBytes))
}