简体   繁体   中英

How to convert pdf file to excel in c#

I want to extract some data, such as email addresses, from tables in a PDF file, and then use the extracted addresses to send email to those people.

What I have found so far through searching the web:

  1. I have to convert the PDF file to Excel so that I can read the data easily and use it as I want.

  2. I found some free DLLs, such as iTextSharp and PDFsharp.

But I didn't find any code snippet that shows how to do this in C#. Is there a solution?

You absolutely do not have to convert the PDF to Excel. First of all, determine whether your PDF contains textual data or is a scanned image. If it contains textual data, then you are right about using "some free dll". I recommend iTextSharp, as it is popular and easy to use.

Now the controversial part. If you don't need a rock-solid solution, the easiest approach is to read the whole PDF into a string and then retrieve the emails using a regular expression.
Here is an example (not perfect) of reading a PDF with iTextSharp and extracting emails:

/// <summary>
/// Reads the text of every page of a PDF file and returns it as a single string.
/// </summary>
/// <param name="fileName">Path of the PDF file to read.</param>
/// <returns>The extracted text of all pages, concatenated in page order.</returns>
public string PdfToString(string fileName)
{
    var sb = new StringBuilder();
    var reader = new PdfReader(fileName);
    try
    {
        // Pages are addressed starting at 1.
        for (int page = 1; page <= reader.NumberOfPages; page++)
        {
            var strategy = new SimpleTextExtractionStrategy();

            // The extracted text is already a .NET string. Round-tripping it through
            // Encoding.Default can only replace characters missing from the ANSI code
            // page with '?', so the text is appended unchanged.
            sb.Append(PdfTextExtractor.GetTextFromPage(reader, page, strategy));
        }
    }
    finally
    {
        // Close the reader even when extraction throws, so the file is not left open.
        reader.Close();
    }
    return sb.ToString();
}
// The pattern is tied to the layout of the source document; adjust expression as needed.
Regex emailRegex = new Regex("Email Address (?<email>.+?) Passport No");

/// <summary>
/// Lazily yields the "email" capture of every match of <c>emailRegex</c> in the given text.
/// </summary>
/// <param name="content">Text to search, e.g. the text extracted from a PDF.</param>
/// <returns>The captured values, in the order they occur in <paramref name="content"/>.</returns>
public IEnumerable<string> ExtractEmails(string content)
{
    // Walk the matches one at a time instead of enumerating a MatchCollection.
    for (Match hit = emailRegex.Match(content); hit.Success; hit = hit.NextMatch())
    {
        yield return hit.Groups["email"].Value;
    }
}

Using the Bytescout PDF Extractor SDK, we can extract each page's content to CSV as shown below.

// Saves every table that TableDetector finds in C:\sample.pdf to its own CSV file,
// named page-<page>-table-<table>.csv with 1-based numbers.
CSVExtractor extractor = new CSVExtractor();
extractor.RegistrationName = "demo";
extractor.RegistrationKey = "demo";

TableDetector tdetector = new TableDetector();
tdetector.RegistrationName = "demo";
tdetector.RegistrationKey = "demo";

try
{
    // Load the document into both the extractor and the detector
    extractor.LoadDocumentFromFile("C:\\sample.pdf");
    tdetector.LoadDocumentFromFile("C:\\sample.pdf");

    int pageCount = tdetector.GetPageCount();

    // NOTE(review): the vendor samples address pages with a zero-based index — confirm
    // against the SDK version in use.
    for (int i = 0; i < pageCount; i++)
    {
        // FindNextTable() continues a search that FindTable(page) has to start;
        // pages without any table are skipped.
        if (!tdetector.FindTable(i))
        {
            continue;
        }

        int j = 1;
        do
        {
            // Restrict extraction to the rectangle of the table just found. Using the
            // whole page rectangle here would write the same content for every table.
            extractor.SetExtractionArea(
                tdetector.GetFoundTableRectangle_Left(),
                tdetector.GetFoundTableRectangle_Top(),
                tdetector.GetFoundTableRectangle_Width(),
                tdetector.GetFoundTableRectangle_Height()
            );

            // and finally save the table into CSV file (file names stay 1-based)
            extractor.SavePageCSVToFile(i, "C:\\page-" + (i + 1) + "-table-" + j + ".csv");
            j++;
        } while (tdetector.FindNextTable()); // search next table on the same page
    }
}
finally
{
    // Release the loaded document even when extraction fails.
    extractor.Dispose();
    tdetector.Dispose();
}
/// <summary>
/// Builds one Excel workbook (<c>&lt;name&gt;_rez.xls</c>, next to the input file) from a PDF.
/// </summary>
/// <remarks>
/// Expects per-page PDF files named <c>&lt;name&gt;_1&lt;ext&gt;</c>, <c>&lt;name&gt;_2&lt;ext&gt;</c>, ...
/// to exist next to the input file. Each of them is exported to CSV into a temporary
/// <c>&lt;name&gt;-CSVs</c> folder, every CSV becomes one worksheet, and the temporary folder is
/// deleted at the end.
/// </remarks>
/// <param name="fileNames">Path of the original PDF file.</param>
public void Convert(string fileNames)
{
    // iTextSharp is only needed for the page count, so release the reader right away.
    int pageCount;
    iTextSharp.text.pdf.PdfReader reader = new iTextSharp.text.pdf.PdfReader(fileNames);
    try
    {
        pageCount = reader.NumberOfPages;
    }
    finally
    {
        reader.Close();
    }

    string ext = System.IO.Path.GetExtension(fileNames);
    string directory = System.IO.Path.GetDirectoryName(fileNames) ?? string.Empty;
    // Same base-name rule as before (strip a literal ".pdf"), but applied to the file name
    // only, so a directory that happens to contain the file name is never rewritten.
    string baseName = System.IO.Path.GetFileName(fileNames).Replace(".pdf", "");
    string outfilePDFExcel1 = System.IO.Path.Combine(directory, baseName + "_rez" + ".xls");

    // The CSV folder lives next to the input file. It used to be created under a
    // hard-coded user folder while the CSV files were written next to the input file,
    // so the two only matched when the PDF happened to be in that folder.
    string pathString = System.IO.Path.Combine(directory, baseName + "-CSVs");
    System.IO.Directory.CreateDirectory(pathString);

    CSVExtractor extractor = new CSVExtractor();
    try
    {
        extractor.RegistrationName = "demo";
        extractor.RegistrationKey = "demo";

        for (int i = 0; i < pageCount; i++)
        {
            string pageNumber = (i + 1).ToString();
            string outfilePDF = System.IO.Path.Combine(directory, baseName + "_" + pageNumber + ext);
            extractor.LoadDocumentFromFile(outfilePDF);

            string outfile = System.IO.Path.Combine(pathString, "Sheet_" + pageNumber + ".csv");
            extractor.SaveCSVToFile(outfile);
        }
    }
    finally
    {
        extractor.Dispose();
    }

    Excel.Application xlApp = new Microsoft.Office.Interop.Excel.Application();

    if (xlApp == null)
    {
        Console.WriteLine("Excel is not properly installed!!");
        return;
    }

    object misValue = System.Reflection.Missing.Value;
    Excel.Workbook xlWorkBook = null;
    bool saved = false;
    try
    {
        xlWorkBook = xlApp.Workbooks.Add(misValue);

        string[] cvsFiles = Directory.GetFiles(pathString);
        // Natural sort so that Sheet_10 comes after Sheet_9.
        Array.Sort(cvsFiles, new AlphanumComparatorFast());

        for (int i = 0; i < cvsFiles.Length; i++)
        {
            Microsoft.Office.Interop.Excel.Worksheet xlWorkSheet = xlWorkBook.Sheets[i + 1];
            try
            {
                // Make sure the sheet for the next CSV exists, right after the current one.
                if (i < cvsFiles.Length - 1)
                {
                    xlWorkBook.Worksheets.Add(Type.Missing, xlWorkSheet, Type.Missing, Type.Missing);
                }

                FillSheetFromCsv(xlWorkSheet, cvsFiles[i]);
            }
            finally
            {
                releaseObject(xlWorkSheet);
            }
        }

        xlWorkBook.SaveAs(outfilePDFExcel1, Microsoft.Office.Interop.Excel.XlFileFormat.xlWorkbookNormal,
            misValue, misValue, misValue, misValue, Microsoft.Office.Interop.Excel.XlSaveAsAccessMode.xlExclusive,
            misValue, misValue, misValue, misValue, misValue);
        xlWorkBook.Close(true, misValue, misValue);
        saved = true;
    }
    finally
    {
        if (xlWorkBook != null)
        {
            if (!saved)
            {
                // Discard the half-built workbook so Quit() does not wait on a save prompt.
                xlWorkBook.Close(false, misValue, misValue);
            }
            releaseObject(xlWorkBook);
        }

        // Always shut the Excel process down, even when filling or saving failed.
        xlApp.Quit();
        releaseObject(xlApp);
    }

    // Remove the temporary CSV folder.
    var dir = new DirectoryInfo(pathString);
    dir.Attributes = dir.Attributes & ~FileAttributes.ReadOnly;
    dir.Delete(true);
}

// Copies the quoted fields of one CSV file into the worksheet, one CSV line per row,
// then removes the empty columns left between the fields.
private void FillSheetFromCsv(Microsoft.Office.Interop.Excel.Worksheet xlWorkSheet, string csvPath)
{
    int sheetRow = 1;
    int ColumLength = 0;

    using (StreamReader readerd = new StreamReader(File.OpenRead(csvPath)))
    {
        while (!readerd.EndOfStream)
        {
            string line = readerd.ReadLine();
            Console.WriteLine(line);
            try
            {
                // Splitting on '"' puts the content of quoted fields at the odd indexes;
                // even indexes hold the separators between them.
                // NOTE(review): a field containing an escaped quote ("") is split apart — confirm the input never has one.
                string[] columns = line.Split('"');
                if (ColumLength < columns.Length)
                {
                    ColumLength = columns.Length;
                }

                for (int col = 1; col < columns.Length; col += 2)
                {
                    if (columns[col] != "")
                    {
                        xlWorkSheet.Cells[sheetRow, col + 1] = columns[col];
                    }
                }
                sheetRow++;
            }
            catch (Exception e)
            {
                // Best effort: report the failing line and carry on with the next one.
                Console.Error.WriteLine("Skipping line of " + csvPath + ": " + e.Message);
            }
        }
    }

    // Values were written to columns 2, 4, 6, ... Deleting column s for s = 1, 2, 3, ...
    // removes exactly the empty columns, because each deletion shifts the rest left by one.
    for (int s = 1; s <= ColumLength; s++)
    {
        xlWorkSheet.Columns[s].Delete();
    }
}

Probably the best approach would be to use a third-party DLL:

namespace ConsoleApp2
{
    internal class Program
    {
        /// <summary>
        /// Converts <c>D:\abc\abc.pdf</c> to an <c>.xls</c> workbook with SautinSoft PdfFocus
        /// and opens the workbook when the conversion returns 0.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            string pathToPdf = @"D:\abc\abc.pdf";
            // Fully qualified so the snippet compiles without a "using System.IO;" directive.
            string pathToExcel = System.IO.Path.ChangeExtension(pathToPdf, ".xls");

            SautinSoft.PdfFocus f = new SautinSoft.PdfFocus();

            // NOTE(review): presumably 'false' keeps only tabular data in the workbook — confirm in the PdfFocus docs.
            f.ExcelOptions.ConvertNonTabularDataToSpreadsheet = false;

            // 'true'  = Preserve original page layout.
            // 'false' = Place tables before text.
            f.ExcelOptions.PreservePageLayout = true;

            // Start from en-US, but use ',' as the decimal separator and '.' as the
            // group separator for numbers.
            System.Globalization.CultureInfo ci = new System.Globalization.CultureInfo("en-US");
            ci.NumberFormat.NumberDecimalSeparator = ",";
            ci.NumberFormat.NumberGroupSeparator = ".";
            f.ExcelOptions.CultureInfo = ci;

            f.OpenPdf(pathToPdf);

            if (f.PageCount > 0)
            {
                int result = f.ToExcel(pathToExcel);

                // Open the resulting Excel workbook; 0 is treated as success.
                if (result == 0)
                {
                    // Opening a document (rather than an executable) needs the shell.
                    // UseShellExecute defaults to false on .NET Core / .NET 5+, where
                    // Process.Start(path) would throw for an .xls file.
                    System.Diagnostics.Process.Start(
                        new System.Diagnostics.ProcessStartInfo(pathToExcel) { UseShellExecute = true });
                }
            }
        }
    }
}

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM