简体   繁体   中英

Java pdf to Excel Conversion

I'm extracting data from a PDF to Excel. The PDF also contains tables. I used iText to convert the PDF to text and Apache POI to convert that text to Excel, but I'm not able to retrieve the data in a form that I can store in the database. I also tried PDFBox and Aspose and got the same result. If anyone knows how to solve this issue, please help.

Here is my code

// pdf to text using itext

            // Step 1: extract the text of every page with iText.
            // The per-page text is accumulated; assigning it straight to `line`
            // inside the loop would keep only the last page of a multi-page PDF.
            PdfReader reader = new PdfReader(
                    "C:\\Users\\mohmeds\\Desktop\\BOI_SCFS banking.pdf_page_1.pdf");
            StringBuilder extracted = new StringBuilder();
            try {
                PdfReaderContentParser parser = new PdfReaderContentParser(
                        reader);
                TextExtractionStrategy strategy;
                for (int i = 1; i <= reader.getNumberOfPages(); i++) {
                    strategy = parser.processContent(i,
                            new SimpleTextExtractionStrategy());
                    if (extracted.length() > 0) {
                        // Keep the last line of one page separate from the
                        // first line of the next.
                        extracted.append('\n');
                    }
                    extracted.append(strategy.getResultantText());
                }
            } finally {
                // Release the reader even if extraction fails part-way.
                reader.close();
            }
            // Never null: an empty document yields an empty string.
            String line = extracted.toString();

            // Step 2: write the extracted text to an .xls workbook with
            // Apache POI, one sheet row per text line.

            org.apache.poi.ss.usermodel.Workbook wb = new HSSFWorkbook();
            CreationHelper helper = wb.getCreationHelper();
            Sheet sheet = wb.createSheet("new sheet");
            System.out.println("link------->" + line);
            List<String> lines = IOUtils.readLines(new StringReader(line));

            for (int i = 0; i < lines.size(); i++) {
                // Each comma-separated token becomes one cell.
                // NOTE(review): text extracted from a PDF table is usually not
                // comma-separated, so most rows will end up in a single cell —
                // confirm the delimiter against the actual extracted text.
                String[] cells = lines.get(i).split(",");
                // Row index is passed as int; a (short) cast would wrap to a
                // negative index once the sheet exceeds 32767 rows.
                Row row = sheet.createRow(i);
                for (int j = 0; j < cells.length; j++) {
                    row.createCell(j).setCellValue(
                            helper.createRichTextString(cells[j]));
                }
            }

            // try-with-resources closes the stream even if write() throws.
            try (FileOutputStream fileOut = new FileOutputStream(
                    "C:\\Users\\mohmeds\\Desktop\\someName1.xls")) {
                wb.write(fileOut);
            }

Your question is a little vague, but if you are looking to store data from the PDF into a database, it's likely you'll want to extract the data as CSV rather than Excel. Also, the code here eliminates the middle step of converting PDF to Text, then Text to Excel. When defining the format, choose 'csv':

package com.pdftables.examples;

import java.io.File;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;

import org.apache.commons.io.FileUtils;
import org.apache.http.HttpEntity;
import org.apache.http.client.config.CookieSpecs;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.entity.mime.MultipartEntityBuilder;
import org.apache.http.entity.mime.content.FileBody;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;

/**
 * Command-line example that uploads a PDF to the pdftables.com API and saves
 * the converted response next to the input file.
 *
 * <p>Usage: {@code <API_KEY> <FORMAT> <PDF filename>}, where the format is one
 * of {@code csv}, {@code xml}, {@code xlsx-single} or {@code xlsx-multiple}.
 */
public class ConvertToFile {
    /** Output formats accepted on the command line. */
    private static final List<String> FORMATS =
            Arrays.asList("csv", "xml", "xlsx-single", "xlsx-multiple");

    /**
     * Validates the arguments, posts the PDF as a multipart upload and writes
     * the response body to the output file. Exits with status 1 on bad
     * arguments, an unreadable input file, a non-200 response or a response
     * without a body.
     *
     * @param args API key, output format and PDF file name, in that order
     * @throws Exception if the request or writing the output file fails
     */
    public static void main(String[] args) throws Exception {
        if (args.length != 3) {
            System.out.println("Command line: <API_KEY> <FORMAT> <PDF filename>");
            System.exit(1);
        }

        final String apiKey = args[0];
        // Locale.ROOT keeps the lower-casing independent of the default locale;
        // under a Turkish locale "XLSX-SINGLE" would otherwise become
        // "xlsx-sıngle" and be rejected as an invalid format.
        final String format = args[1].toLowerCase(Locale.ROOT);
        final String pdfFilename = args[2];

        if (!FORMATS.contains(format)) {
            System.out.println("Invalid output format: \"" + format + "\"");
            System.exit(1);
        }

        // Avoid cookie warning with default cookie configuration
        RequestConfig globalConfig = RequestConfig.custom().setCookieSpec(CookieSpecs.STANDARD).build();

        File inputFile = new File(pdfFilename);

        if (!inputFile.canRead()) {
            System.out.println("Can't read input PDF file: \"" + pdfFilename + "\"");
            System.exit(1);
        }

        // The format is restricted to the whitelist above; the key is
        // user-supplied, so encode it before placing it in the query string.
        final String encodedKey = URLEncoder.encode(apiKey, StandardCharsets.UTF_8.name());

        try (CloseableHttpClient httpclient = HttpClients.custom().setDefaultRequestConfig(globalConfig).build()) {
            HttpPost httppost = new HttpPost("https://pdftables.com/api?format=" + format + "&key=" + encodedKey);
            FileBody fileBody = new FileBody(inputFile);

            // The PDF is sent as the multipart form part named "f".
            HttpEntity requestBody = MultipartEntityBuilder.create().addPart("f", fileBody).build();
            httppost.setEntity(requestBody);

            System.out.println("Sending request");

            try (CloseableHttpResponse response = httpclient.execute(httppost)) {
                if (response.getStatusLine().getStatusCode() != 200) {
                    System.out.println(response.getStatusLine());
                    System.exit(1);
                }
                HttpEntity resEntity = response.getEntity();
                if (resEntity != null) {
                    // "xlsx-single"/"xlsx-multiple" both produce an ".xlsx" file:
                    // strip everything from the first '-' to get the extension.
                    final String outputFilename = getOutputFilename(pdfFilename, format.replaceFirst("-.*$", ""));
                    System.out.println("Writing output to " + outputFilename);

                    final File outputFile = new File(outputFilename);
                    FileUtils.copyToFile(resEntity.getContent(), outputFile);
                } else {
                    System.out.println("Error: file missing from response");
                    System.exit(1);
                }
            }
        }
    }

    /**
     * Derives the output file name from the input file name.
     *
     * @param pdfFilename name of the input file
     * @param suffix extension of the output file, without the leading dot
     * @return the input name with a trailing ".pdf" (any case) replaced by
     *         {@code "." + suffix}; if the name is shorter than five characters
     *         or does not end in ".pdf", the suffix is appended instead
     */
    private static String getOutputFilename(String pdfFilename, String suffix) {
        if (pdfFilename.length() >= 5 && pdfFilename.toLowerCase(Locale.ROOT).endsWith(".pdf")) {
            return pdfFilename.substring(0, pdfFilename.length() - 4) + "." + suffix;
        } else {
            return pdfFilename + "." + suffix;
        }
    }
}

https://github.com/pdftables/java-pdftables-api/blob/master/pdftables.java

The technical posts on this site are licensed under CC BY-SA 4.0. If you reprint them, please cite the site URL or the original address. For any questions, please contact: yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM