What is the most efficient way to read an hdf5 file containing an image stored as a numpy array?

Question

I'm converting image files to hdf5 files as follows:

import h5py
import io
import os
import cv2
import numpy as np
from PIL import Image

def convertJpgtoH5(input_dir, filename, output_dir):
    """Store the raw (still encoded) bytes of an image file in an HDF5 file.

    The image is not decoded: its bytes are read as-is, wrapped in a NumPy
    array and written to a dataset named 'image' in
    ``<output_dir>/<stem>.hdf5``, where ``<stem>`` is ``filename`` without
    its extension. The sizes of the input and output files are printed.

    Args:
        input_dir: Directory that contains the image.
        filename: Name of the image file inside ``input_dir``.
        output_dir: Directory that receives the ``.hdf5`` file.
    """
    filepath = input_dir + '/' + filename
    print('image size: %d bytes'%os.path.getsize(filepath))
    # Context manager so the handle is released even if the read fails.
    with open(filepath, 'rb') as img_f:
        binary_data = img_f.read()
    binary_data_np = np.asarray(binary_data)
    # splitext copes with extensions of any length (".jpeg", ".tiff"); a
    # fixed [:-4] slice would leave a stray "." in the output name for those.
    stem = os.path.splitext(filename)[0]
    new_filepath = output_dir + '/' + stem + '.hdf5'
    # The HDF5 file is closed on exit from the block, also on error.
    with h5py.File(new_filepath, 'w') as f:
        f.create_dataset('image', data=binary_data_np)
    print('hdf5 file size: %d bytes'%os.path.getsize(new_filepath))

# Source folder, destination folder and the file suffixes treated as images.
pathImg = '/path/to/images'
pathH5 = '/path/to/hdf5/files'
ext = [".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif"]

# Convert every directory entry whose name ends with one of the suffixes.
for entry in os.listdir(pathImg):
    if not entry.endswith(tuple(ext)):
        continue
    convertJpgtoH5(pathImg, entry, pathH5)

I later read these hdf5 files as follows:

# Read each per-image HDF5 file back and decode the stored image bytes.
for h5name in os.listdir(pathH5):
    if h5name.endswith(".hdf5"):
        # `with` closes the file even if reading raises; a separate name for
        # the file object avoids rebinding the loop variable.
        with h5py.File(f"{pathH5}/{h5name}", "r") as hf:
            key = list(hf.keys())[0]
            # np.array() copies the dataset into memory, so the data stays
            # usable after the file is closed.
            data = np.array(hf[key])
        img = Image.open(io.BytesIO(data))
        image = cv2.cvtColor(np.float32(img), cv2.COLOR_BGR2RGB)

Is there a more efficient way to read the hdf5 files than converting the data to a numpy array and opening it with Pillow before using it with OpenCV?

Answer 1

Ideally this should be closed as a duplicate because most of what you want to do is explained in the answers I referenced in my comments above. I am including those links here:

There is one difference: my examples load all the image data into 1 HDF5 file, and you are creating 1 HDF5 file for each image. Frankly, I don't think there is much value doing that. You wind up with twice as many files and there's nothing gained. If you are still interested in doing that, here are 2 more answers that might help (and I updated your code at the end):

In the interest of addressing your specific question, I modified your code to use cv2 only (no need for PIL). I resized the images and saved them as 1 dataset in 1 file. If you are using the images for training and testing a CNN model, you need to do this anyway (it needs arrays of a consistent size/shape). Also, I think you can save the data as uint8 -- no need for floats. See below.

import h5py
import glob
import os
import cv2
import numpy as np

def convertImagetoH5(imgfilename):
    """Load an image, convert it to RGB and resize it to the common size.

    Prints the size of the image file before reading it.

    Args:
        imgfilename: Path of the image file to read.

    Returns:
        The array returned by ``cv2.resize`` for the target size
        ``(IMG_WIDTH, IMG_HEIGHT)``; both are module-level constants.

    Raises:
        ValueError: If OpenCV could not read the file.
    """
    print('image size: %d bytes'%os.path.getsize(imgfilename))
    # The second argument of imread is an IMREAD_* read mode, not a colour
    # conversion code: passing COLOR_BGR2RGB there does not swap channels.
    # Read as 3-channel BGR and convert explicitly instead.
    img = cv2.imread(imgfilename, cv2.IMREAD_COLOR)
    if img is None:
        # imread signals failure by returning None rather than raising.
        raise ValueError('could not read image: %s' % imgfilename)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img_resize = cv2.resize(img, (IMG_WIDTH, IMG_HEIGHT))
    return img_resize


# Source folder, destination folder, accepted suffixes and the common size
# every image is resized to.
pathImg = '/path/to/images'
pathH5 = '/path/to/hdf5file'
ext_list = [".ppm", ".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif"]
IMG_WIDTH = 120
IMG_HEIGHT = 120

# Get the list of all images and the number of images.
# NOTE: recursive=True only matters for patterns containing "**"; with this
# pattern only files directly inside pathImg are matched.
all_images = []
for ext in ext_list:
    all_images.extend(glob.glob(pathImg+"/*"+ext, recursive=True))
n_images = len(all_images)

# cv2.resize takes its target size as (width, height) but returns an array
# laid out as (rows, columns, channels), i.e. (height, width, 3). The buffer
# must use that order or non-square sizes fail on assignment.
ds_img_arr = np.zeros((n_images, IMG_HEIGHT, IMG_WIDTH, 3), dtype=np.uint8)

# Fill one slot per image.
for cnt, img in enumerate(all_images):
    img_arr = convertImagetoH5(img)
    ds_img_arr[cnt] = img_arr[:]

# Write all images as a single dataset in a single HDF5 file.
h5_filepath = pathH5 + '/all_image_data.hdf5'
with h5py.File(h5_filepath, 'w') as h5f:
    dset = h5f.create_dataset('images', data=ds_img_arr)

print('hdf5 file size: %d bytes'%os.path.getsize(h5_filepath))

# Reopen the file and report the name, shape and dtype of its first dataset.
with h5py.File(h5_filepath, "r") as h5r:
    key = list(h5r.keys())[0]
    print (key, h5r[key].shape, h5r[key].dtype)

If you really want 1 HDF5 for each image, the code from your question is updated below. Again, only cv2 is used -- no need for PIL. Images are not resized. This is for completeness only (to demonstrate the process). It's not how you should manage your image data.

import h5py
import os
import cv2
import numpy as np

def convertImagetoH5(input_dir, filename, output_dir):
    """Decode one image and store its pixel array in its own HDF5 file.

    The image is read with OpenCV, converted to RGB and written, without
    resizing, to a dataset named 'image' in ``<output_dir>/<stem>.hdf5``,
    where ``<stem>`` is ``filename`` without its extension. The sizes of the
    input and output files are printed.

    Args:
        input_dir: Directory that contains the image.
        filename: Name of the image file inside ``input_dir``.
        output_dir: Directory that receives the ``.hdf5`` file.

    Raises:
        ValueError: If OpenCV could not read the file.
    """
    filepath = input_dir + '/' + filename
    print('image size: %d bytes'%os.path.getsize(filepath))
    # The second argument of imread is an IMREAD_* read mode, not a colour
    # conversion code: passing COLOR_BGR2RGB there does not swap channels.
    # Read as 3-channel BGR and convert explicitly instead.
    img = cv2.imread(filepath, cv2.IMREAD_COLOR)
    if img is None:
        # imread signals failure by returning None rather than raising.
        raise ValueError('could not read image: %s' % filepath)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    # splitext copes with extensions of any length (".jpeg", ".tiff"); a
    # fixed [:-4] slice would leave a stray "." in the output name for those.
    stem = os.path.splitext(filename)[0]
    new_filepath = output_dir + '/' + stem + '.hdf5'
    with h5py.File(new_filepath, 'w') as h5f:
        h5f.create_dataset('image', data=img)
    print('hdf5 file size: %d bytes'%os.path.getsize(new_filepath))

# Source folder, destination folder and the file suffixes treated as images.
pathImg = '/path/to/images'
pathH5 = '/path/to/hdf5file'
ext = [".ppm", ".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif"]

# Walk the image folder and write a matching HDF5 file for each image.
for entry in (name for name in os.listdir(pathImg) if name.endswith(tuple(ext))):
    convertImagetoH5(pathImg, entry, pathH5)

# Loop through the HDF5 files and read each image dataset (as an array).
for h5name in os.listdir(pathH5):
    if h5name.endswith(".hdf5"):
        # A `with` statement binds its target via `as`; the original
        # `with h5f = ... as h5f:` form is a syntax error.
        with h5py.File(f"{pathH5}/{h5name}", "r") as h5f:
            key = list(h5f.keys())[0]
            # [:] reads the whole dataset into an in-memory NumPy array.
            image = h5f[key][:]
            print(f'{h5name}: {image.shape}, {image.dtype}')

What is the most efficient way to read an hdf5 file containing an image stored as a numpy array?

Question

1 answer

solution1
1 2021-04-20 20:11:32

What is the most efficient way to read an hdf5 file containing an image stored as a numpy array?

Question

1 answer

solution1 1 2021-04-20 20:11:32

solution1
1 2021-04-20 20:11:32