Java pdf to Excel Conversion

Question

I'm extracting data from a PDF to Excel. The PDF also contains a table. I used iText to convert the PDF to text and then, with the help of Apache POI, converted the text to Excel, but I'm not able to retrieve the data in a form I can store in the database. I also tried PDFBox and Aspose and got the same result. If anyone knows how, please help me solve this issue.

Here is my code

// pdf to text using itext

            // Step 1: pull the plain text out of every page of the PDF with iText.
            PdfReader reader = new PdfReader(
                    "C:\\Users\\mohmeds\\Desktop\\BOI_SCFS banking.pdf_page_1.pdf");
            // Accumulate the text of ALL pages; assigning to a single String inside
            // the loop would keep only the last page and stay null for an empty PDF.
            StringBuilder extracted = new StringBuilder();
            try {
                PdfReaderContentParser parser = new PdfReaderContentParser(reader);
                for (int page = 1; page <= reader.getNumberOfPages(); page++) {
                    TextExtractionStrategy strategy = parser.processContent(page,
                            new SimpleTextExtractionStrategy());
                    // Keep pages on separate lines without inserting a blank line
                    // when the previous page already ended with a newline.
                    if (extracted.length() > 0
                            && extracted.charAt(extracted.length() - 1) != '\n') {
                        extracted.append('\n');
                    }
                    extracted.append(strategy.getResultantText());
                }
            } finally {
                // Release the reader even if extraction fails part-way through.
                reader.close();
            }
            String line = extracted.toString();

            // Step 2: write the extracted text into an .xls workbook with Apache POI,
            // one sheet row per text line and one cell per comma-separated field.

            org.apache.poi.ss.usermodel.Workbook wb = new HSSFWorkbook();
            CreationHelper helper = wb.getCreationHelper();
            Sheet sheet = wb.createSheet("new sheet");
            System.out.println("link------->" + line);
            List<String> lines = IOUtils.readLines(new StringReader(line));

            for (int rowIndex = 0; rowIndex < lines.size(); rowIndex++) {
                // NOTE(review): text extracted from a PDF is presumably not
                // comma-delimited, so each line may end up in a single cell --
                // confirm the real column separator for this document.
                String[] fields = lines.get(rowIndex).split(",");
                // createRow takes an int; a (short) cast would turn row indexes
                // above 32767 negative and make POI reject them.
                Row row = sheet.createRow(rowIndex);
                for (int col = 0; col < fields.length; col++) {
                    row.createCell(col).setCellValue(
                            helper.createRichTextString(fields[col]));
                }
            }

            // try-with-resources closes the output file even if wb.write() throws.
            try (FileOutputStream fileOut = new FileOutputStream(
                    "C:\\Users\\mohmeds\\Desktop\\someName1.xls")) {
                wb.write(fileOut);
            }

Answer 1

Your question is a little vague, but if you are looking to store data from the PDF into a database, it's likely you'll want to extract the data as CSV rather than Excel. Also, the code here eliminates the middle step of converting PDF to Text, then Text to Excel. When defining the format, choose 'csv':

package com.pdftables.examples;

import java.io.File;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;

import org.apache.commons.io.FileUtils;
import org.apache.http.HttpEntity;
import org.apache.http.client.config.CookieSpecs;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.entity.mime.MultipartEntityBuilder;
import org.apache.http.entity.mime.content.FileBody;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;

/**
 * Command-line example that uploads a PDF to the PDFTables web API and saves the
 * converted result next to the input file.
 *
 * <p>Usage: {@code <API_KEY> <FORMAT> <PDF filename>}, where {@code FORMAT} is one of
 * {@code csv}, {@code xml}, {@code xlsx-single} or {@code xlsx-multiple}. The process
 * exits with status 1 on any usage, input-file or HTTP error.
 */
public class ConvertToFile {
    /** Output formats accepted on the command line (compared in lower case). */
    private static final List<String> FORMATS =
            Arrays.asList("csv", "xml", "xlsx-single", "xlsx-multiple");

    /**
     * Validates the arguments, posts the PDF as a multipart upload and writes the
     * response body to the output file.
     *
     * @param args API key, output format and path of the PDF to convert, in that order
     * @throws Exception if the HTTP exchange or writing the output file fails
     */
    public static void main(String[] args) throws Exception {
        if (args.length != 3) {
            System.out.println("Command line: <API_KEY> <FORMAT> <PDF filename>");
            System.exit(1);
        }

        final String apiKey = args[0];
        // Locale.ROOT keeps the lower-casing locale-independent; with a Turkish
        // default locale "XLSX-SINGLE" would otherwise become "xlsx-sıngle" and
        // fail the validation below.
        final String format = args[1].toLowerCase(Locale.ROOT);
        final String pdfFilename = args[2];

        if (!FORMATS.contains(format)) {
            System.out.println("Invalid output format: \"" + format + "\"");
            System.exit(1);
        }

        // Avoid cookie warning with default cookie configuration
        RequestConfig globalConfig = RequestConfig.custom().setCookieSpec(CookieSpecs.STANDARD).build();

        File inputFile = new File(pdfFilename);

        if (!inputFile.canRead()) {
            System.out.println("Can't read input PDF file: \"" + pdfFilename + "\"");
            System.exit(1);
        }

        try (CloseableHttpClient httpclient = HttpClients.custom().setDefaultRequestConfig(globalConfig).build()) {
            HttpPost httppost = new HttpPost("https://pdftables.com/api?format=" + format + "&key=" + apiKey);
            FileBody fileBody = new FileBody(inputFile);

            // The PDF is sent as the multipart form part named "f".
            HttpEntity requestBody = MultipartEntityBuilder.create().addPart("f", fileBody).build();
            httppost.setEntity(requestBody);

            System.out.println("Sending request");

            try (CloseableHttpResponse response = httpclient.execute(httppost)) {
                if (response.getStatusLine().getStatusCode() != 200) {
                    System.out.println(response.getStatusLine());
                    System.exit(1);
                }
                HttpEntity resEntity = response.getEntity();
                if (resEntity != null) {
                    // Strip the "-single"/"-multiple" qualifier so both xlsx
                    // variants produce a plain ".xlsx" file extension.
                    final String outputFilename = getOutputFilename(pdfFilename, format.replaceFirst("-.*$", ""));
                    System.out.println("Writing output to " + outputFilename);

                    final File outputFile = new File(outputFilename);
                    FileUtils.copyToFile(resEntity.getContent(), outputFile);
                } else {
                    System.out.println("Error: file missing from response");
                    System.exit(1);
                }
            }
        }
    }

    /**
     * Derives the output file name from the input file name.
     *
     * @param pdfFilename path of the input PDF
     * @param suffix extension for the output file, without the leading dot
     * @return {@code pdfFilename} with a trailing ".pdf" (any case) replaced by
     *     {@code "." + suffix}, or with {@code "." + suffix} appended when the name
     *     does not end in ".pdf" or consists of the extension alone
     */
    private static String getOutputFilename(String pdfFilename, String suffix) {
        if (pdfFilename.length() >= 5 && pdfFilename.toLowerCase(Locale.ROOT).endsWith(".pdf")) {
            return pdfFilename.substring(0, pdfFilename.length() - 4) + "." + suffix;
        } else {
            return pdfFilename + "." + suffix;
        }
    }
}

https://github.com/pdftables/java-pdftables-api/blob/master/pdftables.java

Java pdf to Excel Conversion

Question

1 answers

solution1
0 2019-03-29 07:09:12

Java pdf to Excel Conversion

Question

1 answers

solution1 0 2019-03-29 07:09:12

solution1
0 2019-03-29 07:09:12