I am running the Python script shown below. It uses pytesseract to convert the text in images rendered from a PDF into a JSON file that contains the text as strings, along with the page number, etc. Every time I run this script, after a while my disk runs out of space, and the space is only freed after I restart my computer. For example, my computer has 20 GB free right now, but after the script has been running for some time the disk is full, and I have no idea why this is happening. I have tried using `del` to release whatever the local variables are holding, and `gc.collect()` to force that space to be freed, but nothing works. What am I doing wrong, and how can I improve it?
import io
from PIL import Image
import pytesseract
from wand.image import Image as wi
import gc
import json
import uuid
import gc
def generate_id(code):
    """Return *code* followed by seven digits taken from a random UUID4.

    The digits are characters 5 through 11 of the decimal representation
    of the UUID's integer value.
    """
    uuid_digits = str(uuid.uuid4().int)
    return code + uuid_digits[5:12]
def pdf_to_json(pdf_path, resolution=300,
                output_dir='/Users/rishabh/Desktop/CyberBoxer/hawaii_pdf/'):
    """OCR every page of a PDF and write the result to a JSON file.

    The JSON object has three keys:

    * ``company`` -- the last component of ``pdf_path`` (the file name).
    * ``id``      -- ``generate_id`` applied to the part of the parent
      directory name that precedes the first underscore.
    * ``Pages``   -- a list with the OCR text of each page, in page order.

    Args:
        pdf_path: '/'-separated path to the PDF; it must contain at least
            a parent directory and a file name.
        resolution: resolution passed to Wand when rasterizing the PDF.
        output_dir: directory that receives ``<id>.json``.

    Raises:
        IndexError: if ``pdf_path`` has no parent directory component.
    """
    # Function-scope import: the file's import block does not provide `os`.
    import os

    path_parts = pdf_path.split('/')
    data = {}
    data['company'] = path_parts[-1]
    countrycode = path_parts[-2].split('_')[0]
    data['id'] = generate_id(countrycode)

    # Wand images wrap ImageMagick resources that must be released
    # explicitly; `del` and gc.collect() were not enough here.  Using the
    # images as context managers closes them even when an error occurs.
    # Rasterize everything first so those resources are released before
    # the slow OCR pass starts.
    page_blobs = []
    with wi(filename=pdf_path, resolution=resolution) as pdf:
        with pdf.convert('jpeg') as pdf_img:
            for img in pdf_img.sequence:
                with wi(image=img) as page:
                    page_blobs.append(page.make_blob('jpeg'))

    pages = []
    for blob in page_blobs:
        # The PIL image is closed on exit from the `with` block.
        with Image.open(io.BytesIO(blob)) as im:
            pages.append(pytesseract.image_to_string(im, lang='eng'))
    data['Pages'] = pages

    out_path = os.path.join(output_dir, data['id'] + '.json')
    with open(out_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)
# `time` is used for the per-file timing below but was never imported.
import time
from os import listdir

# Directory scanned for input PDFs.
PDF_DIR = '/Users/rishabh/Desktop/CyberBoxer/iowa_pdf/'

# Select by extension rather than by substring, so that names such as
# "report.pdf.bak" are not handed to the converter.
pdf_names = [name for name in listdir(PDF_DIR)
             if name.lower().endswith('.pdf')]

# Convert each PDF, printing a running count and the seconds it took.
for count, name in enumerate(pdf_names, start=1):
    start = time.time()
    pdf_to_json(PDF_DIR + name)
    print(count)
    print(time.time() - start)
I figured out why this was happening: it was caused by the wand Image module. The objects it creates have to be destroyed explicitly, which `del` or `gc.collect()` does not do, because a wand image has its own `destroy()` method.
Here is the updated function:
def pdf_to_json(pdf_path, resolution=150,
                output_dir='/Users/rishabh/Desktop/CyberBoxer/iowa_pdf/'):
    """OCR every page of a PDF and write the result to a JSON file.

    The JSON object has three keys:

    * ``company`` -- the last component of ``pdf_path`` (the file name).
    * ``id``      -- ``generate_id`` applied to the part of the parent
      directory name that precedes the first underscore.
    * ``Pages``   -- a list with the OCR text of each page, in page order.

    Args:
        pdf_path: '/'-separated path to the PDF; it must contain at least
            a parent directory and a file name.
        resolution: resolution passed to Wand when rasterizing the PDF.
        output_dir: directory that receives ``<id>.json``.

    Raises:
        IndexError: if ``pdf_path`` has no parent directory component.
    """
    # Function-scope import: the file's import block does not provide `os`.
    import os

    path_parts = pdf_path.split('/')
    data = {}
    data['company'] = path_parts[-1]
    countrycode = path_parts[-2].split('_')[0]
    data['id'] = generate_id(countrycode)

    # Every Wand image is used as a context manager so that it is closed
    # even if rendering fails part-way; a trailing destroy() call would be
    # skipped when an exception is raised.  The class is imported by this
    # file under the name `wi`.
    req_image = []
    with wi(filename=pdf_path, resolution=resolution) as pdf_image:
        pdf_image.compression_quality = 99
        with pdf_image.convert('jpeg') as image_jpeg:
            for img in image_jpeg.sequence:
                with wi(image=img) as img_page:
                    req_image.append(img_page.make_blob('jpeg'))

    pages = []
    for img_blob in req_image:
        # The PIL image is closed on exit from the `with` block.
        with Image.open(io.BytesIO(img_blob)) as im:
            pages.append(pytesseract.image_to_string(im, lang='eng'))
    data['Pages'] = pages

    out_path = os.path.join(output_dir, data['id'] + '.json')
    with open(out_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.