
Extract rectangular text boxes from an image

I have an image of an application form from which I want to extract only the name, DOB, signature and tick boxes, which are enclosed in rectangular boxes, but I am getting those regions along with other unexpected results.

Input image:


Expected result:


My result:


I have tried the code below:

import numpy as np
from PIL import Image
import tensorflow as tf
import os
import pytesseract 
import sys 
import re

#from pdf2image import convert_from_path  # needs the Poppler Windows distribution
import cv2
#from pdf2image.exceptions import PDFInfoNotInstalledError,PDFPageCountError,PDFSyntaxError

pdftoppm_path = r"C:\Program Files (x86)\Poppler\poppler-0.68.0\bin\pdftoppm.exe"

# path to the Tesseract installation
pytesseract.pytesseract.tesseract_cmd = 'C:\\Program Files\\Tesseract-OCR\\tesseract.exe'

def sort_contours(cnts, method="left-to-right"):
    # initialize the reverse flag and sort index
    reverse = False
    i = 0

    # handle if we need to sort in reverse
    if method == "right-to-left" or method == "bottom-to-top":
        reverse = True

    # handle if we are sorting against the y-coordinate rather than
    # the x-coordinate of the bounding box
    if method == "top-to-bottom" or method == "bottom-to-top":
        i = 1

    # construct the list of bounding boxes and sort them along the
    # chosen axis
    boundingBoxes = [cv2.boundingRect(c) for c in cnts]
    (cnts, boundingBoxes) = zip(*sorted(zip(cnts, boundingBoxes),
                                        key=lambda b: b[1][i], reverse=reverse))

    # return the list of sorted contours and bounding boxes
    return (cnts, boundingBoxes)

def box_extraction(img_for_box_extraction_path, cropped_dir_path):

    img1 = cv2.imread(img_for_box_extraction_path, 0)  # Read the image
    img = cv2.resize(img1, (800, 800))
    (thresh, img_bin) = cv2.threshold(img, 120, 255,
                                      cv2.THRESH_BINARY | cv2.THRESH_OTSU)  # Thresholding the image
    img_bin = 255-img_bin  # Invert the image

    ##cv2.imshow("Image_bin.jpg",img_bin)

    # Defining a kernel length
    kernel_length = np.array(img).shape[1]//150

    # A vertical kernel of (1 x kernel_length), which will detect all the vertical lines in the image.
    verticle_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, kernel_length))
    # A horizontal kernel of (kernel_length x 1), which will help to detect all the horizontal lines in the image.
    hori_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (kernel_length, 1))
    # A small (2 x 2) kernel.
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))

    # Morphological operation to detect vertical lines in the image
    img_temp1 = cv2.erode(img_bin, verticle_kernel, iterations=3)
    verticle_lines_img = cv2.dilate(img_temp1, verticle_kernel, iterations=3)
    #cv2.imwrite("verticle_lines.jpg",verticle_lines_img)

    # Morphological operation to detect horizontal lines from an image
    img_temp2 = cv2.erode(img_bin, hori_kernel, iterations=3)
    horizontal_lines_img = cv2.dilate(img_temp2, hori_kernel, iterations=3)
    #cv2.imwrite("horizontal_lines.jpg",horizontal_lines_img)

    # Weighting parameters; these decide how much of each image contributes to the combined image.
    alpha = 0.8
    beta = 5.0 - alpha
    # cv2.addWeighted adds the two images with the given weights to produce a combined image.
    img_final_bin = cv2.addWeighted(verticle_lines_img, alpha, horizontal_lines_img, beta, 0.0)
    img_final_bin = cv2.erode(~img_final_bin, kernel, iterations=2)
    (thresh, img_final_bin) = cv2.threshold(img_final_bin, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)

    # For Debugging
    # Enable this line to see the vertical and horizontal lines in the image, which are used to find the boxes
    ##cv2.imshow("img_final_bin.jpg",img_final_bin)
    # Find contours for image, which will detect all the boxes
    contours, hierarchy = cv2.findContours(
        img_final_bin, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
    # Sort all the contours (left-to-right by default).
    (contours, boundingBoxes) = sort_contours(contours)#, method="top-to-bottom")

    idx = 0
    for c in contours:
        # Returns the location and width,height for every contour
        x, y, w, h = cv2.boundingRect(c)
        #print(x,y,w,h)

        # Only save the crop to the "cropped/" folder if the box width is < 400 and the height is < 80.
        if (w < 400 and h < 80): # and h < 6*w:
            idx += 1
            new_img = img[y:y+h+10, x:x+w+10]
            contours1, hierarchy1 = cv2.findContours(img_final_bin, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
            # Sort all the contours by top to bottom.
            (contours1, boundingBoxes) = sort_contours(contours1, method="top-to-bottom")
            # get the thresholded crop
            retval, thresh_crop = cv2.threshold(new_img, thresh=200, maxval=255, type=cv2.THRESH_BINARY)
            cv2.imwrite(cropped_dir_path+str(idx) + '.png', thresh_crop)

    # For Debugging
    # Enable this line to see all contours.
    # cv2.drawContours(img, contours, -1, (0, 0, 255), 3)
    # cv2.imwrite("./Temp/img_contour.jpg", img)


box_extraction(r"X:\PDF2IMG\TEST.jpeg", r"X:\PDF2IMG\cropped")

#cv2.waitKey(0)
#cv2.destroyAllWindows()

How can I get only the expected result?


To extract the desired regions, we can use the property that rectangular boxes can be isolated using contour approximation and contour area. Here's an approach:

  • Convert image to grayscale, blur, and threshold
  • Perform morphological operations to smooth image and remove noise
  • Find contours
    • Filter using contour approximation and contour area
    • Extract and save ROI using Numpy slicing

Here are the detected rectangular text boxes, highlighted in green.


Since we have the bounding boxes, we simply extract the ROIs:


import cv2

# Load the form, convert to grayscale, blur, and apply Otsu's threshold
image = cv2.imread('1.jpg')
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
blur = cv2.GaussianBlur(gray, (5,5), 0)
thresh = cv2.threshold(blur, 0, 255, cv2.THRESH_OTSU + cv2.THRESH_BINARY_INV)[1]

# Morphological opening with a small kernel to smooth the image and remove noise
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3,3))
opening = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel, iterations=1)

# Find contours (handles both OpenCV 3.x and 4.x return signatures)
cnts = cv2.findContours(opening, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
cnts = cnts[0] if len(cnts) == 2 else cnts[1]

ROI_number = 0
for c in cnts:
    # Filter using contour approximation and contour area
    area = cv2.contourArea(c)
    peri = cv2.arcLength(c, True)
    approx = cv2.approxPolyDP(c, 0.02 * peri, True)
    x,y,w,h = cv2.boundingRect(approx)
    if len(approx) == 4 and (area > 1000) and (area < 80000):
        # Extract and save the ROI using Numpy slicing
        ROI = image[y:y+h, x:x+w]
        cv2.imwrite('ROI_{}.png'.format(ROI_number), ROI)
        ROI_number += 1

cv2.imshow('thresh', thresh)
cv2.imshow('opening', opening)
cv2.waitKey()
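
The four-vertex check from approxPolyDP together with the area bounds (1000 and 80000) is what keeps only the small rectangular fields and drops both noise and the large outer frame; those area thresholds depend on the image resolution, so adjust them if your scan size differs.

To reproduce the green highlights shown above, here is a minimal sketch that reuses the same filtering and draws each kept bounding box on a copy of the image with cv2.rectangle; the green color value and line thickness are arbitrary choices:

import cv2

image = cv2.imread('1.jpg')
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
blur = cv2.GaussianBlur(gray, (5,5), 0)
thresh = cv2.threshold(blur, 0, 255, cv2.THRESH_OTSU + cv2.THRESH_BINARY_INV)[1]

kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3,3))
opening = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel, iterations=1)

cnts = cv2.findContours(opening, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
cnts = cnts[0] if len(cnts) == 2 else cnts[1]

# Draw the boxes that pass the same approximation/area filter in green
boxes = image.copy()
for c in cnts:
    area = cv2.contourArea(c)
    approx = cv2.approxPolyDP(c, 0.02 * cv2.arcLength(c, True), True)
    if len(approx) == 4 and 1000 < area < 80000:
        x, y, w, h = cv2.boundingRect(approx)
        cv2.rectangle(boxes, (x, y), (x + w, y + h), (36, 255, 12), 2)

cv2.imwrite('boxes.png', boxes)
cv2.imshow('boxes', boxes)
cv2.waitKey()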
