
Extract text from a scanned pdf with images?

I've tried to extract text from a PDF created on a computer and that worked, but I wasn't able to extract text from a scanned PDF (which you can find here) that contains images and several pages, such as this one:

[image: a page from the scanned PDF]

Here is the code I used:

# libraries
## split
from PyPDF2 import PdfFileWriter, PdfFileReader
## read 
import sys
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
from pdfminer.layout import LAParams
import io
# remove files
import os

# split the PDF in case there are several pages
def pdfspliter(filename):
    inputpdf = PdfFileReader(open(filename, "rb"))

    for i in range(inputpdf.numPages):
        output = PdfFileWriter()
        output.addPage(inputpdf.getPage(i))
        with open("document-page%s.pdf" % i, "wb") as outputStream:
            output.write(outputStream)
        pdfparser("document-page%s.pdf" % i)
        os.remove("document-page%s.pdf" % i)

# read a given page
def pdfparser(data):

    fp = open(data, 'rb')
    rsrcmgr = PDFResourceManager()
    retstr = io.StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    # Create a PDF interpreter object.
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # Process each page contained in the document.

    for page in PDFPage.get_pages(fp):
        interpreter.process_page(page)
        data =  retstr.getvalue()

    print(data)

if __name__ == '__main__':
    filename = sys.argv[1]
    pdfspliter(filename)

Can you help me extract text from this kind of file?
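For reference, even pdfminer.six's high-level helper returns essentially nothing for these pages, which suggests there is no text layer at all (the file name below is just a placeholder):

from pdfminer.high_level import extract_text

text = extract_text("scanned-document.pdf")  # placeholder path
print(repr(text))  # essentially empty for an image-only, scanned PDF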

Update with Tesseract OCR

I made an attempt with Tesseract OCR in Python. It extracts the text of some pages of the PDF, but it is really slow and seems to stall at some point:

# import the necessary packages
from PIL import Image
import pytesseract
import argparse
import cv2
import os
## split
from PyPDF2 import PdfFileWriter, PdfFileReader
# command-line arguments
import sys
# convert pdf pages to images
from pdf2image import convert_from_path
# import all files with a name
import glob

# functions
def pdfspliterimager(filename):
    inputpdf = PdfFileReader(open(filename, "rb"))
    for i in range(inputpdf.numPages):
        output = PdfFileWriter()
        output.addPage(inputpdf.getPage(i))
        with open("document-page%s.pdf" % i, "wb") as outputStream:
            output.write(outputStream)
        pages = convert_from_path("document-page%s.pdf" % i, 500)
        for page in pages:
            page.save('out%s.jpg'%i, 'JPEG')

        os.remove("document-page%s.pdf" % i)

# construct the argument parse and parse the arguments
ap = argparse.ArgumentParser()
ap.add_argument("-i", "--image", required=True,
    help="path to input image to be OCR'd")
ap.add_argument("-p", "--preprocess", type=str, default="thresh",
    help="type of preprocessing to be done")
args = vars(ap.parse_args())

# we test if it is a pdf
image_path = args["image"]
# if it is a pdf we convert it to an image
if image_path.endswith('.pdf'):
    pdfspliterimager(image_path)

# for all files with out in their name
file_names = glob.glob("out*")
for file_name in file_names:
    # load the image and convert it to grayscale
    image = cv2.imread(file_name)
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # check to see if we should apply thresholding to preprocess the
    # image
    if args["preprocess"] == "thresh":
        gray = cv2.threshold(gray, 0, 255,
            cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]

    # make a check to see if median blurring should be done to remove
    # noise
    elif args["preprocess"] == "blur":
        gray = cv2.medianBlur(gray, 3)

    # write the grayscale image to disk as a temporary file so we can
    # apply OCR to it
    filename = "{}.png".format(os.getpid())
    cv2.imwrite(filename, gray)

    # load the image as a PIL/Pillow image, apply OCR, and then delete
    # the temporary file
    text = pytesseract.image_to_string(Image.open(filename))
    os.remove(filename)
    print(text)

    # show the output images
    cv2.imshow("Image", image)
    cv2.imshow("Output", gray)
    cv2.waitKey(0)
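Most of the time seems to go into the 500 DPI conversion and the write-to-disk / read-back round trip for every page. Here is a leaner sketch of the same pipeline (the path is a placeholder) that converts at 300 DPI and feeds the PIL pages straight to pytesseract:

from pdf2image import convert_from_path
import pytesseract

def ocr_pdf(pdf_path, dpi=300):
    # convert each PDF page to a PIL image and OCR it directly, no temp files
    pages = convert_from_path(pdf_path, dpi=dpi)
    return [pytesseract.image_to_string(page) for page in pages]

for page_text in ocr_pdf("scanned-document.pdf"):  # placeholder path
    print(page_text)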

OCR on PDF files using Python

import os
import io
from PIL import Image
import pytesseract
from wand.image import Image as wi
import gc

def Get_text_from_image(pdf_path):
    pdf=wi(filename=pdf_path,resolution=300)
    pdfImg=pdf.convert('jpeg')
    imgBlobs=[]
    extracted_text=[]
    for img in pdfImg.sequence:
        page=wi(image=img)
        imgBlobs.append(page.make_blob('jpeg'))
    for imgBlob in imgBlobs:
        im=Image.open(io.BytesIO(imgBlob))
        text=pytesseract.image_to_string(im,lang='eng')
        extracted_text.append(text)
    return ([i.replace("\n","") for i in extracted_text])   
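Called like this (the path is just an example), it returns one string per page. Note that wand needs ImageMagick installed, with Ghostscript available for PDF support:

pages = Get_text_from_image("scanned-document.pdf")  # example path
for page_number, page_text in enumerate(pages, start=1):
    print(page_number, page_text[:100])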

I made a minor modification.

The code below converts all pages of the PDF to images in sequence; at the end of the code I destroy the image sequence, because it takes a huge amount of memory to process.

def Get_text_from_image(pdf_path):
    """Extract text content from the page images of a PDF."""
    import io, gc
    import pytesseract
    from PIL import Image
    from wand.image import Image as wi

    pdf = wi(filename=pdf_path, resolution=300)
    pdfImg = pdf.convert('jpeg')
    imgBlobs = []
    extracted_text = []
    try:
        # render every page to a JPEG blob
        for img in pdfImg.sequence:
            page = wi(image=img)
            imgBlobs.append(page.make_blob('jpeg'))
            gc.collect()  # free intermediate image data as we go

        # OCR each page blob
        for imgBlob in imgBlobs:
            im = Image.open(io.BytesIO(imgBlob))
            text = pytesseract.image_to_string(im, lang='eng')
            extracted_text.append(text.replace("\n", " "))
            gc.collect()

        return ''.join(extracted_text)
    finally:
        # destroy the wand image sequence, which otherwise holds a lot of memory
        pdfImg.destroy()
        pdf.destroy()
        gc.collect()
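The gc.collect() calls are mostly a workaround; it is the imgBlobs list that keeps every rendered page in memory at once. A minimal sketch of the same idea (same libraries, hypothetical function name) that OCRs each page as soon as it is rendered and frees it immediately:

import io
import pytesseract
from PIL import Image
from wand.image import Image as wi

def get_text_per_page(pdf_path, resolution=300):
    # OCR a scanned PDF page by page without keeping all page blobs in memory
    extracted_text = []
    with wi(filename=pdf_path, resolution=resolution) as pdf:
        for img in pdf.sequence:
            with wi(image=img) as page:
                blob = page.make_blob('jpeg')
            with Image.open(io.BytesIO(blob)) as im:
                extracted_text.append(pytesseract.image_to_string(im, lang='eng'))
    return extracted_text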

Why would you OCR the parts that are already text based instead of only the parts that are images? Check whether the element is text; if so, get the text; if not, get the LTImage from the LTFigure element and OCR the image. My question is: is there something other than pytesseract for OCR?
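A rough sketch of that idea with pdfminer.six and pytesseract (function names are just for illustration); it assumes the embedded images are JPEG-encoded (DCTDecode), so PIL can open the raw stream directly, and other encodings would need extra decoding:

import io
from PIL import Image
import pytesseract
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer, LTFigure, LTImage

def ocr_image_element(element):
    # only works when the image stream is JPEG (DCTDecode)
    raw = element.stream.get_rawdata()
    try:
        with Image.open(io.BytesIO(raw)) as im:
            return pytesseract.image_to_string(im)
    except OSError:
        return ""  # unsupported image encoding, skip

def extract_mixed(pdf_path):
    chunks = []
    for page_layout in extract_pages(pdf_path):
        for element in page_layout:
            if isinstance(element, LTTextContainer):
                chunks.append(element.get_text())  # already text, no OCR needed
            elif isinstance(element, LTFigure):
                for child in element:
                    if isinstance(child, LTImage):
                        chunks.append(ocr_image_element(child))  # OCR only the images
    return "\n".join(chunks)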
