简体   繁体   中英

Excel VBA to Search for Text in PDF and Extract and Name Pages

I have the following code, which looks at each cell in column A of my spreadsheet, searches for the text it finds there in the specified PDF and then extracts the page where it finds the text as a PDF, naming it with the value in the cell of the spreadsheet. The code works but is rather slow, I may need to search for as many as 200 words in a PDF which could be as long as 600 pages. Is there a way to make the code faster? Currently it loops through each cell searches through each page looping through each word until it finds the word in the cell.

    Sub test_with_PDF()

    Dim objApp As Object
    Dim objPDDoc As Object
    Dim objjso As Object
    Dim wordsCount As Long
    Dim page As Long
    Dim i As Long
    Dim strData As String
    Dim strFileName As String
    Dim lastrow As Long, c As Range
    Dim PageNos As Integer
    Dim newPDF As Acrobat.CAcroPDDoc
    Dim NewName As String
    Dim Folder As String
    lastrow = Sheets("Sheet1").Cells(Rows.Count, "A").End(xlUp).Row

    strFileName = selectFile()
    Folder = GetFolder()

    Set objApp = CreateObject("AcroExch.App")
    Set objPDDoc = CreateObject("AcroExch.PDDoc")
    'AD.1 open file, if =false file is damage
    If objPDDoc.Open(strFileName) Then
        Set objjso = objPDDoc.GetJSObject

 PageNos = 0
 For Each c In Sheets("Sheet1").Range("A2:A" & lastrow)

        For page = 0 To objPDDoc.GetNumPages - 1
            wordsCount = objjso.GetPageNumWords(page)
            For i = 0 To wordsCount

                If InStr(1, c.Value, ", ") = 0 Then

                    If objjso.getPageNthWord(page, i) = c.Value Then
                        PageNos = PageNos + 1
                        If FileExist(Folder & "\" & c.Offset(0, 4) & ".pdf") Then

                                Set newPDF = CreateObject("AcroExch.pdDoc")
                                NewName = Folder & "\" & c.Offset(0, 4) & ".pdf"
                                newPDF.Open (NewName)
                                newPDF.InsertPages lngPages, objPDDoc, page, 1, 0
                                newPDF.Save 1, NewName
                                newPDF.Close
                                Set newPDF = Nothing
                                Exit For
                         Else
                                Set newPDF = CreateObject("AcroExch.PDDoc")
                                newPDF.Create
                                NewName = Folder & "\" & c.Offset(0, 4) & ".pdf"
                                newPDF.InsertPages -1, objPDDoc, page, 1, 0
                                newPDF.Save 1, NewName
                                newPDF.Close
                                Set newPDF = Nothing
                                Exit For

                        End If
                    End If
                Else

                If objjso.getPageNthWord(page, i) = c.Offset(0, 1).Value Then
                    If objjso.getPageNthWord(page, i + 1) = c.Offset(0, 2).Value Then
                        PageNos = PageNos + 1
                         If FileExist(Folder & "\" & c.Offset(0, 4) & ".pdf") Then

                                Set newPDF = CreateObject("AcroExch.pdDoc")
                                NewName = Folder & "\" & c.Offset(0, 4) & ".pdf"
                                newPDF.Open (NewName)
                                newPDF.InsertPages lngPages, objPDDoc, page, 1, 0
                                newPDF.Save 1, NewName
                                newPDF.Close
                                Set newPDF = Nothing
                                Exit For
                         Else
                                Set newPDF = CreateObject("AcroExch.PDDoc")
                                newPDF.Create
                                NewName = Folder & "\" & c.Offset(0, 4) & ".pdf"
                                newPDF.InsertPages -1, objPDDoc, page, 1, 0
                                newPDF.Save 1, NewName
                                newPDF.Close
                                Set newPDF = Nothing
                                Exit For

                        End If
                        Exit For
                    End If
                End If
            End If
            Next i
        Next page
        c.Offset(0, 3).Value = PageNos
        PageNos = 0
    Next c
    MsgBox "Done"
    Else
        MsgBox "error!"
    End If
End Sub

Function FileExist(path As String) As Boolean
    If Dir(path) <> vbNullString Then FileExist = True
End Function
Function selectFile()
Dim fd As FileDialog, fileName As String

On Error GoTo ErrorHandler

Set fd = Application.FileDialog(msoFileDialogFilePicker)

fd.AllowMultiSelect = False

If fd.Show = True Then
    If fd.SelectedItems(1) <> vbNullString Then
        fileName = fd.SelectedItems(1)
    End If
Else
    'Exit code if no file is selected
    End
End If

'Return Selected FileName
selectFile = fileName

Set fd = Nothing

Exit Function

ErrorHandler:
Set fd = Nothing
MsgBox "Error " & Err & ": " & Error(Err)

End Function
Function GetFolder() As String
    Dim fldr As FileDialog
    Dim sItem As String
    Set fldr = Application.FileDialog(msoFileDialogFolderPicker)
    With fldr
        .Title = "Select the Folder where you want you new PDFs to go"
        .AllowMultiSelect = False
        .InitialFileName = Application.DefaultFilePath
        If .Show <> -1 Then GoTo NextCode
        sItem = .SelectedItems(1)
    End With
NextCode:
    GetFolder = sItem
    Set fldr = Nothing
End Function

Many thanks in advance.

Loops are definitely excellent for some things, but can tie down processing with these higher queries. Recently, a colleague and I were doing a similar task (not pdf-related though), and we had much success with using a range.find method instead of a loop executing instr on each cell.

Some points of interest: -To mimic the “loop cells” functionality when using the .find method, we ended our range statement with .cells, as seen below:

activesheet.usedrange.cells.find( )

Where the desired string goes within the ( ).

-The return value: “A Range object that represents the first cell where that information is found.”

Once the .find method returns a range, a subsequent subroutine can extract the page number and document name.

-If you need to find the nth instance of an occurrence, “You can use the FindNext andFindPrevious methods to repeat the search.” (Microsoft)

Microsoft overview of range.find: https://msdn.microsoft.com/en-us/vba/excel-vba/articles/range-find-method-excel

So with this approach, the user can use a loop based on a count of cells in your list to execute the .find method for each string.

Downside is (I assume) that this must be done on text within the excel application; also, I've not tested it to determine if the string has to inhabit the cell by itself (I don't think this is a concern).

'===================

Another suggestion that might be beneficial is to first bulk-rip all text from the .pdf with as little looping as possible (direct actions at the document object level). Then your find/return approach can be applied to the bulk text.

I did a similar activity when creating study notes from a professor's PowerPoints; I grabbed all the text into a .txt file, then returned every sentence containing the instance of a list of strings.

'=====================

A few caveats: I admit that I have not executed parsing at the sheer size of your project, so my suggestions might not be advantageous in practice.

Also, I have not done much work parsing .pdf documents, as I try to opt for anything that is .txt/excel app first, and engage it instead.

Good luck in your endeavors; I hope I was able to at least provide food for thought!

Sorry to post a quick, incomplete answer, but I think I can point you in a good direction.

Instead of making the system look up the two terms hundreds of billions of times, then make hundreds of billions of comparisons, put your search terms into an array, and the text of each page into a long string.Then it only has to do one look up and 200 comparisons per page.

'Dim your Clipboard functions
Public Declare PtrSafe Function OpenClipboard Lib "user32" (ByVal hwnd As Long) As Long
Public Declare PtrSafe Function EmptyClipboard Lib "user32" () As Long
Public Declare PtrSafe Function CloseClipboard Lib "user32" () As Long

'...

Dim objData As New MSForms.DataObject
Dim arrSearch() As String
Dim strTxt As String

'...

'Create array of search terms
For i = 2 To lastrow
    arrSearch(i - 2) = Sheets("Sheet1").Cells(1, i)
Next i

For page = 0 To objPDDoc.GetNumPages - 1

    '[Move each page into a new document. You already have that code]

    'Clear clipboard
    OpenClipboard (0&)
    EmptyClipboard
    CloseClipboard

    'Copy page to clipboard
    objApp.MenuItemExecute ("SelectAll")
    objApp.MenuItemExecute ("Copy")
    'You can also do this with the JavaScript object: objjso.ExecMenuItem("Item Name")
    'You may have to insert a waiting function like sleep() here to wait for the action to complete

    'Put data from clipboard into a string.
    objData.GetFromClipboard
    strTxt = objData.GetText 'Now you can search the entire content of the page at once, within memory

    'Compare each element of the array to the string
    For i = LBound(arrSearch) To UBound(arrSearch)
        If InStr(1, strTxt, arrSearch(i)) > 0 Then
            '[You found a match. Your code here]
        End If
    Next i

Next page

This is still cumbersome because you have to open each page in a new document. If there is a good way to determine which page you're on purely by text (such as the page number at the bottom of page a, followed immediately by the header at the top of page b) then you might look at copying the entire text of the document into one string, then using the clues from the text to decide which page to extract once you find a match. That would be a lot faster I believe.

Sub BatchRenameCS()

Dim objApp As Object
Dim objPDDoc As Object
Dim objjso As Object
Dim newPDF As Acrobat.CAcroPDDoc
Dim lastrow2 As Long
Dim strFileName As String
Dim Folder As String
Dim Page As Long
Dim Cell As Long
Dim PDFCharacterCount() As Long
Dim CharacterCount As Long
Dim i As Integer
Dim c As Integer
Dim x As Integer
Dim strSource As String
Dim strResult As String
Dim PDFCharacters As String
Dim PDFCharacters2 As String
Dim PDFPasteData() As String
Dim PasteDataPage As Integer
Dim LastRow As Long
Dim NewName As String
Dim NewNamePageNum As Integer
Dim Check()

Sheets("Sheet1").Range("C:D").ClearContents

strFileName = selectFile()
Folder = GetFolder()

'create array with pdf word count
Set objApp = CreateObject("AcroExch.App")
Set objPDDoc = CreateObject("AcroExch.PDDoc")
'AD.1 open file, if =false file is damage
    If objPDDoc.Open(strFileName) Then
        Set objjso = objPDDoc.GetJSObject

ReDim PDFCharacterCount(1 To objPDDoc.GetNumPages) As Long

For Page = 1 To objPDDoc.GetNumPages
PDFCharacters = ""
PDFCharacters2 = ""
    For c = 0 To objjso.GetPageNumWords(Page - 1)
    PDFCharacters = PDFCharacters & objjso.getPageNthWord(Page - 1, c)
    Next c
    For i = 1 To Len(PDFCharacters)
        Select Case Asc(Mid(PDFCharacters, i, 1))
            Case 48 To 57, 65 To 90, 97 To 122:
            PDFCharacters2 = PDFCharacters2 & (Mid(PDFCharacters, i, 1))
            Case Else
            PDFCharacters2 = PDFCharacters2 & ""
        End Select
    Next
    PDFCharacterCount(Page) = Len(PDFCharacters2)

Next Page

lastrow2 = Sheets("Sheet2").Cells(Rows.Count, "A").End(xlUp).Row
Page = 1
ReDim PDFPasteData(1 To objPDDoc.GetNumPages) As String
For Cell = 1 To lastrow2
    strResult = ""
    strSource = Sheets("Sheet2").Cells(Cell, 1).Text
    PDFPasteData(Page) = PDFPasteData(Page) & " " & strSource
    For i = 1 To Len(strSource)
        Select Case Asc(Mid(strSource, i, 1))
            Case 48 To 57, 65 To 90, 97 To 122:
            strResult = strResult & (Mid(strSource, i, 1))
            Case Else
            strResult = strResult & ""
        End Select
    Next

CharacterCount = CharacterCount + Len(strResult)

If CharacterCount = PDFCharacterCount(Page) Then
CharacterCount = 0
Page = Page + 1
End If

Next Cell
ReDim Check(2, objPDDoc.GetNumPages)
LastRow = Sheets("Sheet1").Cells(Rows.Count, "A").End(xlUp).Row
For Each LookUpCell In Worksheets("Sheet1").Range("A2:A" & LastRow)
    For PasteDataPage = 1 To objPDDoc.GetNumPages
        If InStr(PDFPasteData(PasteDataPage), LookUpCell.Value) Then
        Check(1, PasteDataPage) = Check(1, PasteDataPage) + 1
        Check(2, PasteDataPage) = Check(2, PasteDataPage) & LookUpCell.Offset(0, 1).Value & Chr(10)
                                If FileExist(Folder & "\" & LookUpCell.Offset(0, 1) & ".pdf") Then

                                        Set newPDF = CreateObject("AcroExch.pdDoc")
                                        NewName = Folder & "\" & LookUpCell.Offset(0, 1) & ".pdf"
                                        newPDF.Open (NewName)
                                        newPDF.InsertPages newPDF.GetNumPages - 1, objPDDoc, PasteDataPage - 1, 1, 0
                                        newPDF.Save 1, NewName
                                        newPDF.Close
                                        Set newPDF = Nothing
                                 Else
                                        Set newPDF = CreateObject("AcroExch.PDDoc")
                                        newPDF.Create
                                        NewName = Folder & "\" & LookUpCell.Offset(0, 1) & ".pdf"
                                        newPDF.InsertPages -1, objPDDoc, PasteDataPage - 1, 1, 0
                                        newPDF.Save 1, NewName
                                        newPDF.Close
                                        Set newPDF = Nothing
                                End If
        End If
    Next PasteDataPage
Next LookUpCell
x = 1
For PasteDataPage = 1 To objPDDoc.GetNumPages
    If Check(1, PasteDataPage) <> 1 Then
    Sheets("Sheet1").Cells(x, 3) = PasteDataPage
    Sheets("Sheet1").Cells(x, 4) = Check(2, PasteDataPage)
    x = x + 1
    End If
Next PasteDataPage
End If
MsgBox "Done"
End Sub
Function FileExist(path As String) As Boolean
    If Dir(path) <> vbNullString Then FileExist = True
End Function
Function selectFile()
Dim fd As FileDialog, fileName As String
On Error GoTo ErrorHandler
Set fd = Application.FileDialog(msoFileDialogFilePicker)
fd.AllowMultiSelect = False
If fd.Show = True Then
    If fd.SelectedItems(1) <> vbNullString Then
        fileName = fd.SelectedItems(1)
    End If
Else
    'Exit code if no file is selected
    End
End If
'Return Selected FileName
selectFile = fileName
Set fd = Nothing
Exit Function
ErrorHandler:
Set fd = Nothing
MsgBox "Error " & Err & ": " & Error(Err)
End Function
Function GetFolder() As String
    Dim fldr As FileDialog
    Dim sItem As String
    Set fldr = Application.FileDialog(msoFileDialogFolderPicker)
    With fldr
        .Title = "Select the Folder where you want you new PDFs to go"
        .AllowMultiSelect = False
        .InitialFileName = Application.DefaultFilePath
        If .Show <> -1 Then GoTo NextCode
        sItem = .SelectedItems(1)
    End With
NextCode:
    GetFolder = sItem
    Set fldr = Nothing

End Function

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM