// FileSystemResult returns a ResultFileLocation for files. path is the path // where the target file resides and rootDir is the root of all paths. func FileSystemResult(path, rootDir string) (ResultFileLocation, error) { // Read file into buffer and calculate the md5 in the process. file, err := os.Open(path) if err != nil { return nil, err } defer util.Close(file) var buf bytes.Buffer md5, err := util.MD5FromReader(file, &buf) if err != nil { return nil, fmt.Errorf("Unable to get MD5 hash of %s: %s", path, err) } absRootDir, err := filepath.Abs(rootDir) if err != nil { return nil, err } absPath, err := filepath.Abs(path) if err != nil { return nil, err } return &fsResultFileLocation{ path: strings.TrimPrefix(absPath, absRootDir+"/"), buf: buf.Bytes(), md5: hex.EncodeToString(md5), }, nil }
func (p *pdfProcessor) rasterizeAndUpload(dmResultName string, dmResults *goldingestion.DMResults, pdfResults []*goldingestion.Result) error { processedResults := make([]*goldingestion.Result, 0, len(pdfResults)*len(p.rasterizers)) // Create a temporary directory to hold the rastered images. tempDir, err := ioutil.TempDir(p.pdfCacheDir, "pdfingestion") if err != nil { return err } defer util.RemoveAll(tempDir) // Go through all results that generated PDF files. for resultIdx, result := range pdfResults { // Fetch the PDF file if it's not in the cache. pdfFileName := fmt.Sprintf("%s.%s", result.Digest, PDF_EXT) pdfPath := filepath.Join(p.pdfCacheDir, pdfFileName) if !fileutil.FileExists(pdfPath) { if err = p.download(p.inImagesBucket, p.inImagesDir, pdfFileName, pdfPath); err != nil { glog.Errorf("Unable to retrieve image: %s. Error: %s", pdfFileName, err) continue } } // Generate an image for each rasterizer. for rasterIdx, rasterizer := range p.rasterizers { tempName := filepath.Join(tempDir, fmt.Sprintf("rastering_%d_%d.%s", resultIdx, rasterIdx, PNG_EXT)) err := rasterizer.Rasterize(pdfPath, tempName) if err != nil { glog.Errorf("Rasterizing %s with %s failed: %s", filepath.Base(pdfPath), rasterizer.String(), err) continue } // Open the generated image and calculate the MD5. file, err := os.Open(tempName) if err != nil { glog.Errorf("Unable to open generated image: %s", err) continue } var buf bytes.Buffer md5, err := util.MD5FromReader(file, &buf) if err != nil { glog.Errorf("Unable to calculate MD5 hash of file %s. Got error: %s", tempName, err) continue } digest := hex.EncodeToString(md5) uploadFileName := fmt.Sprintf("%s.%s", digest, PNG_EXT) if err := p.upload(p.outImagesBucket, p.outImagesDir, uploadFileName, bytes.NewBuffer(buf.Bytes())); err != nil { glog.Errorf("Unable to upload file %s. Error: %s", uploadFileName, err) continue } // Update the result and add it to the successfully processed results. result.Key["rasterizer"] = rasterizer.String() result.Digest = digest result.Options["ext"] = PNG_EXT processedResults = append(processedResults, result) } } // If we have no processed results we consider it an error. if len(processedResults) == 0 { return fmt.Errorf("No input image was processed successfully.") } // Replace the old results in the original result and write it to the cloud. dmResults.Results = processedResults jsonBytes, err := json.MarshalIndent(dmResults, "", " ") if err != nil { return fmt.Errorf("Unable to encode JSON: %s", err) } return p.upload(p.outJsonBucket, p.outJsonDir, dmResultName, bytes.NewBuffer(jsonBytes)) }