使用 Tesseract OCR 從存放掃描 pdf 的文件夾中提取文本

[英]Use Tesseract OCR to extract text from a folder of scanned PDFs

我有使用 Tesseract OCR 從掃描的 pdf 文件/普通 pdf 文件中提取/轉換文本的代碼。但我想讓我的代碼轉換一個 pdf 文件夾而不是單個 pdf 文件，然後提取的文本文件將存儲在我指定的文件夾中。


filePath = '/Users/CodingStark/scanned/scanned-file.pdf'
# Render every page of the PDF to an image; 500 is the rendering DPI.
pages = convert_from_path(filePath, 500)

# Save each page as page_1.jpg, page_2.jpg, ... in the current directory.
# filelimit ends up holding the total number of pages (0 for an empty PDF).
filelimit = 0
for filelimit, page in enumerate(pages, start=1):
    filename = "page_" + str(filelimit) + ".jpg"
    page.save(filename, 'JPEG')

# Creating a text file to write the output
outfile = "scanned-file.txt"

# Mode "a" appends, so re-running the script adds to an existing output file.
with open(outfile, "a") as f:
    # Iterate from 1 to total number of pages
    for i in range(1, filelimit + 1):
        filename = "page_" + str(i) + ".jpg"
        # Recognize the text as string in image using pytesseract
        text = str(pytesseract.image_to_string(Image.open(filename)))
        # Drop "hyphen + newline" pairs so words split across lines are re-joined.
        text = text.replace('-\n', '')
        # Write this page's text; without this the output file stays empty.
        f.write(text)
# The with-block closes the file after all the text has been written.

我想自動化我的代碼，以便它會轉換掃描文件夾中的所有 pdf 文件，而提取出的文本文件將存放在我指定的文件夾中。另外，請問有什麼方法可以在代碼運行後刪除所有 jpg 文件嗎？因為它們佔用了大量的存儲空間。太感謝了！！


def tesseractOCR_pdf(pdf):
    """Extract the text of a (scanned) PDF with Tesseract OCR.

    Every page is rendered to an image at 500 DPI and handed to pytesseract
    directly, in memory. No intermediate ``page_N.jpg`` files are written, so
    there is nothing to clean up afterwards and unrelated ``*.jpg`` files in
    the working directory are never touched.

    Args:
        pdf: Path of the PDF file to convert.

    Returns:
        The recognized text of all pages, concatenated in page order, with
        every "hyphen followed by newline" pair removed so that words
        hyphenated across a line break are joined again.
    """
    # convert_from_path yields one image per PDF page, in page order.
    pages = convert_from_path(pdf, 500)

    # image_to_string accepts the page images as-is, so OCR them in memory.
    page_texts = [str(pytesseract.image_to_string(page)) for page in pages]

    # Join once and clean up once instead of re-scanning the growing string
    # after every page.
    return "".join(page_texts).replace('-\n', '')

def tesseractOCR_img(img):
    """Run Tesseract OCR on a single image and return the recognized text.

    Args:
        img: The image to recognize; it is passed unchanged to
            ``pytesseract.image_to_string``.

    Returns:
        The recognized text with every "hyphen followed by newline" pair
        removed.
    """
    # English language data, Tesseract page segmentation mode 6.
    recognized = pytesseract.image_to_string(img, lang='eng', config='--psm 6')
    return str(recognized).replace('-\n', '')

def Tesseract_ALL(docDir, txtDir):
    """OCR every supported document in ``docDir`` into text files in ``txtDir``.

    PDF files are converted with ``tesseractOCR_pdf`` and ``.tif``/``.tiff``/
    ``.jpg`` images with ``tesseractOCR_img``. For a document ``name.ext`` the
    text is written to ``name.txt`` inside ``txtDir``. Files with any other
    extension (or none) are skipped. A failure on one document is reported on
    stdout and does not stop the remaining documents from being processed.

    Args:
        docDir: Directory containing the documents. An empty string means the
            current working directory. A trailing path separator is optional.
        txtDir: Directory that receives the ``.txt`` files; it is created if
            it does not exist. A trailing path separator is optional.
    """
    if docDir == "":
        docDir = os.getcwd()  # if no docDir passed in
    if txtDir:
        os.makedirs(txtDir, exist_ok=True)

    for doc in os.listdir(docDir):  # iterate through docs in doc directory
        baseName, extension = os.path.splitext(doc)
        extension = extension.lower()
        # os.path.join works whether or not docDir ends with a separator.
        docPath = os.path.join(docDir, doc)
        try:
            if extension == ".pdf":
                text = tesseractOCR_pdf(docPath)  # text content of the pdf
            elif extension in (".tif", ".tiff", ".jpg"):
                text = tesseractOCR_img(docPath)  # text content of the image
            else:
                continue
            # Write straight to "<name>.txt"; no "<name>.pdf.txt" file that
            # has to be renamed afterwards.
            textFilename = os.path.join(txtDir, baseName + ".txt")
            with open(textFilename, "w", encoding="utf-8") as textFile:
                textFile.write(text)
        except Exception as exc:
            # Best-effort batch run: report the failing document and go on.
            print("Error in file: " + str(doc) + " - " + str(exc))

# Below is the code that runs the functions.
# Tell the function where the documents are located and where the txt files
# should be written. The trailing separator keeps the resulting paths valid
# even when a directory and a filename are joined by plain string
# concatenation.
docDir = "pdf_folder/"
txtDir = "text_folder/"

Tesseract_ALL(docDir, txtDir)


import glob,os
import os, subprocess

pdf_dir = "dir"
# Visit every file in pdf_dir whose name matches "*.PDF".
for pdf_file in glob.glob(os.path.join(pdf_dir, "*.PDF")):
    # Put here what you want to do for each pdf file.
    pass


