繁体   English   中英

PDFBox 2.0.8-从一个文档中提取图像并在另一个文档中使用

[英]PDFBox 2.0.8 - Extracting an image from one document and using it in another

我正在编写一个Java应用程序以用作模板读取器和写入器。 我在处理文字方面取得了成功,但在图片处理上却有些困难...

获取图像是简单的部分-使用扩展PDFStreamEngine的类

package readingPdf;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.pdfbox.contentstream.PDFStreamEngine;
import org.apache.pdfbox.contentstream.operator.DrawObject;
import org.apache.pdfbox.contentstream.operator.Operator;
import org.apache.pdfbox.contentstream.operator.state.Concatenate;
import org.apache.pdfbox.contentstream.operator.state.Restore;
import org.apache.pdfbox.contentstream.operator.state.Save;
import org.apache.pdfbox.contentstream.operator.state.SetGraphicsStateParameters;
import org.apache.pdfbox.contentstream.operator.state.SetMatrix;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.graphics.PDXObject;
import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.apache.pdfbox.util.Matrix;

public class ImageStripper extends PDFStreamEngine {

    ArrayList<Object  []> imagesData = null;
    public ImageStripper() throws IOException {
        // preparing PDFStreamEngine
        addOperator(new Concatenate());
        addOperator(new DrawObject());
        addOperator(new SetGraphicsStateParameters());
        addOperator(new Save());
        addOperator(new Restore());
        addOperator(new SetMatrix());
        imagesData = new ArrayList<Object[]>();
    }

    @Override
    protected void processOperator(Operator operator, List<COSBase> operands) throws IOException {
        String operation = operator.getName();
        if ("Do".equals(operation)) {
            COSName objectName = (COSName) operands.get(0);
            // get the PDF object
            PDXObject xobject = getResources().getXObject(objectName);
            // check if the object is an image object
            if (xobject instanceof PDImageXObject) {
                Object[] imageData = new Object[3];
                PDImageXObject image = (PDImageXObject) xobject;

                Matrix ctmNew = getGraphicsState().getCurrentTransformationMatrix();

                // position of image in the pdf in terms of user space units
                System.out.println("position in PDF = " + ctmNew.getTranslateX() + ", " + ctmNew.getTranslateY()
                        + " in user space units");

                imageData[0] = ctmNew.getTranslateX();// xPos
                imageData[1] = ctmNew.getTranslateY();// yPos

                imageData[2] = image;//Image

                imagesData.add(imageData);

            } else if (xobject instanceof PDFormXObject) {
                PDFormXObject form = (PDFormXObject) xobject;
                showForm(form);
            }
        } else {
            super.processOperator(operator, operands);
        }
    }

    public ArrayList<Object[]> getImagesList(){
        return imagesData;
    }
}

接下来是其实现

public class PDFManager{

    private PDFParser parser;
    private PDDocument pdDoc;
    private PDDocument retDoc;
    private COSDocument cosDoc;
    private PDPage page;
    private String filePath;
    private File file; 

    public PDDocument transferImage() throws IOException {
        this.pdDoc = null;
        this.cosDoc = null;

        file = new File(filePath);
        parser = new PDFParser(new RandomAccessFile(file, "r"));
        parser.parse();
        cosDoc = parser.getDocument();
        pdDoc = new PDDocument(cosDoc);

        //Get Image Data
        ImageStripper imageStripper = new ImageStripper();
        imageStripper.processPage(pdDoc.getPage(0));
        ArrayList<Object []> imageList = imageStripper.getImagesList();

        //Close Doc
        pdDoc.close();
        cosDoc.close();

        //Create new PDF Doc
        retDoc = new PDDocument();
        page = new PDPage(new PDRectangle(PDRectangle.A4.getHeight(), PDRectangle.A4.getWidth())); 
        retDoc.addPage(page);

        PDPageContentStream cs = new PDPageContentStream(retDoc, page, AppendMode.OVERWRITE, true);

        for(int pos = 0; pos < imageList.size() ; pos++) {
            Object [] imageData = imageList.get(pos);

            float xPos = (float)imageData[0];
            float yPos = (float)imageData[1];
            PDImageXObject image = (PDImageXObject)imageData[2];
            cs.drawImage(image, xPos, yPos);
        }

        cs.close();
        return retDoc;
    }

    public static void main(String[] args) throws IOException {

        PDFManager pdfManager = new PDFManager();

        PDDocument doc =pdfManager.ToText("c:\\test\\test.pdf"); 

        doc.save("c:\\test\\test2.pdf");
        doc.close();
    }
}

现在问题出在我正在写的地方,叫cs.drawImage 除了尝试保存新文件时,所有代码执行都没有任何问题。我得到了异常, COSStream has been closed and cannot be read. Perhaps its enclosing PDDocument has been closed? COSStream has been closed and cannot be read. Perhaps its enclosing PDDocument has been closed?

我怀疑仍然存在元数据,将图像链接到调用PDImageXobject.createFromFile("c:\\\\test\\\\testImage.png", doc)从中提取的原始文档,该文档将完美写入PDImageXObject 当我将PDDocument写入的PDImageXObject传递到PDImageXObject我怀疑它会以某种方式链接。

我无法将图像保存到临时位置,因为这只是测试POC。

任何援助将不胜感激

@蒂尔曼·豪舍尔

感谢您的解决方案

我将原始文档的结尾移到了一个单独的方法中,该方法在写入文件后调用

public void closeFiles(){
    pdDoc.close();
    cosDoc.close();
}

暂无
暂无

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM