使用 Tesseract OCR 從一個文件夾的掃描 pdf 文件中提取文本

Question

我有使用 Tesseract OCR 從掃描的 pdf 文件/普通 pdf 文件中提取/轉換文本的代碼。 但我想讓我的代碼轉換一個 pdf 文件夾而不是單個 pdf 文件，然后提取的文本文件將存儲在我想要的文件夾中。

請參閱下面的代碼：

# Path of the single scanned PDF that this script converts.
filePath = '/Users/CodingStark/scanned/scanned-file.pdf'
# NOTE(review): 500 is presumably the rendering DPI expected by
# convert_from_path -- confirm against the library that provides it.
pages = convert_from_path(filePath, 500)

# Step 1: write every page to page_<n>.jpg in the current working directory,
# numbering from 1. filelimit ends up holding the number of pages written
# and stays 0 when there are no pages at all.
filelimit = 0
for filelimit, page in enumerate(pages, start=1):
    page.save(f"page_{filelimit}.jpg", 'JPEG')

# Step 2: OCR the saved images in page order and append the recognised text
# to the output file.
outfile = "scanned-file.txt"

with open(outfile, "a") as f:
    for page_number in range(1, filelimit + 1):
        page_image = Image.open(f"page_{page_number}.jpg")

        # Recognise the text in the image as a string using pytesseract.
        text = str(pytesseract.image_to_string(page_image))

        # Drop every hyphen that is directly followed by a line break so
        # that words split across two lines are joined again.
        f.write(text.replace('-\n', ''))
# Leaving the with-block closes the file after all the text is written.

我想自動化我的代碼，讓它轉換掃描文件夾中的所有 pdf 文件，并把提取出的文本文件保存到我指定的文件夾中。 另外，請問有什么方法可以在代碼運行結束后刪除所有生成的 jpg 文件嗎？ 因為它們占用了大量的存儲空間。 太感謝了！！

更新為答案

def tesseractOCR_pdf(pdf):
    """Return the OCR text of every page of the PDF file at path *pdf*.

    Each page is saved as a JPEG image, recognised with pytesseract, and the
    per-page results are concatenated in page order. Every hyphen that is
    directly followed by a line break is then removed, together with that
    line break, so that words split across two lines are joined again.

    The page images live in a private temporary directory that is deleted
    before the function returns. Nothing is written to the current working
    directory, and no pre-existing ``*.jpg`` file is overwritten or removed.

    Args:
        pdf: Path of the PDF file to convert.

    Returns:
        The recognised text of the whole document as a single string (an
        empty string when the PDF yields no pages).
    """
    # Local import so the function does not depend on a module-level import
    # of tempfile being present in the file.
    import tempfile

    # NOTE(review): 500 is presumably the rendering DPI expected by
    # convert_from_path -- confirm against the library that provides it.
    pages = convert_from_path(pdf, 500)

    page_texts = []
    with tempfile.TemporaryDirectory() as work_dir:
        for page_number, page in enumerate(pages, start=1):
            # PDF page n -> <work_dir>/page_n.jpg
            image_path = os.path.join(
                work_dir, "page_" + str(page_number) + ".jpg")
            page.save(image_path, 'JPEG')

            # Close the image as soon as it has been recognised so the
            # temporary directory can always be removed afterwards.
            with Image.open(image_path) as page_image:
                page_texts.append(
                    str(pytesseract.image_to_string(page_image)))

    # Join once and de-hyphenate once over the whole document; this also
    # covers a hyphen/line-break pair that straddles two pages.
    return "".join(page_texts).replace('-\n', '')

def tesseractOCR_img(img):
    """Return the OCR text of the image *img*.

    The image is handed to ``pytesseract.image_to_string`` with
    ``lang='eng'`` and ``config='--psm 6'``. Every hyphen that is directly
    followed by a line break is removed from the result, together with that
    line break.

    Args:
        img: The image to recognise, passed to pytesseract unchanged.

    Returns:
        The recognised text as a string.
    """
    recognised = pytesseract.image_to_string(img, lang='eng', config='--psm 6')
    return str(recognised).replace('-\n', '')

def _text_filename(doc):
    """Return the name of the text file produced for source file *doc*."""
    stem, extension = os.path.splitext(doc)
    # Only these source extensions are replaced by ".txt"; any other file
    # keeps its full name and simply gets ".txt" appended.
    if extension.lower() in (".pdf", ".tif", ".tiff", ".jpg"):
        return stem + ".txt"
    return doc + ".txt"


def Tesseract_ALL(docDir, txtDir):
    """OCR every file in *docDir* and write one text file each to *txtDir*.

    Files whose extension is ``.pdf`` (in any letter case) are processed
    with ``tesseractOCR_pdf``; every other regular file is treated as an
    image and processed with ``tesseractOCR_img``. Sub-directories are
    skipped. A file that cannot be processed is reported on standard output
    and does not stop the remaining files from being handled.

    For sources ending in .pdf, .tif, .tiff or .jpg the output is named
    ``<name without extension>.txt``; for anything else it is
    ``<full name>.txt``. Output is written as UTF-8 so that recognised
    non-ASCII characters never fail to encode. Files that already exist in
    *txtDir* are left alone unless an output has the same name.

    Args:
        docDir: Directory holding the documents. An empty string means the
            current working directory. A trailing path separator is
            accepted but not required.
        txtDir: Directory receiving the text files; it is created when
            missing. An empty string means the current working directory.

    Returns:
        None.
    """
    if docDir == "":
        docDir = os.getcwd()  # if no docDir passed in
    if txtDir:
        os.makedirs(txtDir, exist_ok=True)

    # sorted() only makes the processing order deterministic.
    for doc in sorted(os.listdir(docDir)):
        source_path = os.path.join(docDir, doc)
        if not os.path.isfile(source_path):
            continue
        try:
            if os.path.splitext(doc)[1].lower() == ".pdf":
                # get string of text content of pdf
                text = tesseractOCR_pdf(source_path)
            else:
                # get string of text content of img
                text = tesseractOCR_img(source_path)

            target_path = os.path.join(txtDir, _text_filename(doc))
            with open(target_path, "w", encoding="utf-8") as textFile:
                textFile.write(text)
        except Exception as err:
            # Best effort: report the failure and carry on with the next
            # file. KeyboardInterrupt and SystemExit are not swallowed.
            print("Error in file: " + str(doc) + " (" + str(err) + ")")

# Below is the code that runs the functions.
# It tells Tesseract_ALL where the documents are located and where the txt
# files should be written.
# os.path.join(name, "") appends the platform's path separator, so the two
# directories stay valid even where a file name is appended to them by plain
# string concatenation.
docDir = os.path.join("pdf_folder", "")
txtDir = os.path.join("text_folder", "")

Tesseract_ALL(docDir, txtDir)

Answer 1

下面是遍歷指定路徑下所有 pdf 文件的循環：

import glob,os
import os, subprocess

# Resolve the folder to an absolute path before changing into it; with a
# relative path the pattern below would be looked up inside the folder
# itself after os.chdir and would match nothing.
pdf_dir = os.path.abspath("dir")
os.chdir(pdf_dir)
# The character classes match the extension in any letter case
# (.pdf, .PDF, .Pdf, ...), also on case-sensitive file systems.
for pdf_file in glob.glob(os.path.join(pdf_dir, "*.[pP][dD][fF]")):
    # put here what you want to do for each pdf file
    pass

使用 Tesseract OCR 從一個文件夾的掃描 pdf 文件中提取文本

問題描述

更新為答案

1 個解決方案

解決方案1
3 已采納 2020-11-05 14:24:25

使用 Tesseract OCR 從一個文件夾的掃描 pdf 文件中提取文本

問題描述

更新為答案

1 個解決方案

解決方案1 3 已采納 2020-11-05 14:24:25

解決方案1
3 已采納 2020-11-05 14:24:25