简体   繁体   中英

Why is running this python script taking all my disk space?

I am running a python script which you can see below for reference. This script uses pytesseract to convert text in an image obtained from pdf to json file containing the text as string along with page number etc. But every time I run this script, after a while, my disk storage runs to no space left and it is only freed after I restart my computer. To give you an example, my computer has 20GB left as of right now, but after running the script for some time, the disk gets full I have no idea why this is happening. I have tried to use 'del' to free up space if local variables are using it and also gc.collect() to force free up that space but nothing is working. What am I doing wrong and how can I improve it ?

import io
from PIL import Image
import pytesseract
from wand.image import Image as wi
import gc
import json
import uuid
import gc

def generate_id(code):
    increment_no = str(uuid.uuid4().int)[5:12]
    _id = code + increment_no
    return _id

def pdf_to_json(pdf_path):
    """This function takes in the path of pdf to generate a json object with the following attributes"""
    """Company (Name of company), id (Unique Id), Page_*No. (Example Page_1, Page_2 etc.) with each page containing text in that speicifc pdf page"""
    data = {}
    pdf=wi(filename=pdf_path,resolution=300)
    data['company'] = str(pdf_path.split('/')[-1:][0])
    countrycode = str(pdf_path.split('/')[-2:-1][0].split('_')[0:1][0])
    data['id'] = generate_id(countrycode)
    pdfImg=pdf.convert('jpeg')
    del pdf
    gc.collect()
    imgBlobs=[]
    for img in pdfImg.sequence:
        page=wi(image=img)
        gc.collect()
        imgBlobs.append(page.make_blob('jpeg'))
        del page
        gc.collect()
    del pdfImg
    gc.collect()
    i=1
    Pages = []
    for imgBlob in imgBlobs:
        im=Image.open(io.BytesIO(imgBlob))
        text=pytesseract.image_to_string(im,lang='eng')
        Pages.append(text)
        del text
        gc.collect()
        im.close()
        del im
        gc.collect()
    del imgBlobs
    gc.collect()
    data['Pages'] = Pages
    with open('/Users/rishabh/Desktop/CyberBoxer/hawaii_pdf/'+data['id']+'.json', 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)
    del data
    gc.collect()
    del Pages
    gc.collect()

from os import listdir
onlyfiles = [f for f in listdir('/Users/rishabh/Desktop/CyberBoxer/iowa_pdf/')]

j=1
for i in onlyfiles:
    if '.pdf' in i:
        start = time.time()
        pdf_path = '/Users/rishabh/Desktop/CyberBoxer/iowa_pdf/'+i
        pdf_to_json(pdf_path)
        print(j)
        j+=1
        end = time.time()
        print(end-start)
        gc.collect()```

I figured why it was happening, it was because of wand Image module in python, I had to destroy the objects obtained which by 'del' or gc.collect() wouldn't happen since wand image has it's own destroy method.

Here's the updated function for the same :

    """This function takes in the path of pdf to generate a json object with the following attributes"""
    """Company (Name of company), id (Unique Id), Page_*No. (Example Page_1, Page_2 etc.) with each page containing text in that speicifc pdf page"""
    data = {}
    #pdf=wi(filename=pdf_path,resolution=300)
    data['company'] = str(pdf_path.split('/')[-1:][0])
    countrycode = str(pdf_path.split('/')[-2:-1][0].split('_')[0:1][0])
    data['id'] = generate_id(countrycode)
    #pdfImg=pdf.convert('jpeg')
    #del pdf
    #gc.collect()
    #imgBlobs=[]
    #for img in pdfImg.sequence:
    #    page=wi(image=img)
    #    gc.collect()
    #    imgBlobs.append(page.make_blob('jpeg'))
    #    del page
    #    gc.collect()    
    req_image = []
    with WI(filename=pdf_path, resolution=150) as image_jpeg:
        image_jpeg.compression_quality = 99
        image_jpeg = image_jpeg.convert('jpeg')

        for img in image_jpeg.sequence:
            with WI(image=img) as img_page:
                req_image.append(img_page.make_blob('jpeg'))
    image_jpeg.destroy()
    i=1
    Pages = []
    for imgBlob in req_image:
        im=Image.open(io.BytesIO(imgBlob))
        text=pytesseract.image_to_string(im,lang='eng')
        Pages.append(text)
        im.close()
        del im
    data['Pages'] = Pages
    with open('/Users/rishabh/Desktop/CyberBoxer/iowa_pdf/'+data['id']+'.json', 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)```

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM