Remove the background noise for OCR with opencv

Question

I'm trying to do OCR with Tesseract. To get a better result, I'd like to remove the background noise before sending the image to Tesseract.

I already know that the text has a fixed color, so I use cv2.inRange to remove the background noise. The problem is that the background noise has a color similar to the text color, so I'm stuck.

Here is my image for processing: original test

what I tried:

Use cv2.inRange to filter for the inner text color (color code #d7d4cf: close to white, but slightly grey). The result still has a lot of background noise. Result image: use white color

Use cv2.inRange to filter for the black-like color (#171510). It looks better, but is still not as good as I want. Result image: use black color

I also tried to use bitwise_and to merge the white and black results together, but got a similar result, which is not good either. Can someone help me or recommend an approach? Thank you in advance.


    from PIL import Image
    from pytesseract import *
    import cv2
    import numpy as np

    def img_hsv_mask_white(img):
        """Keep only the light, low-saturation pixels of an image.

        The image is converted from BGR to HSV, thresholded with
        ``cv2.inRange`` against a fixed "almost white" range, and the
        resulting mask is box-blurred with a 3x3 kernel before being applied
        to the input with ``cv2.bitwise_and``.

        Args:
            img: image array handed to ``cv2.cvtColor`` with
                ``cv2.COLOR_BGR2HSV``.

        Returns:
            The result of AND-ing ``img`` with itself under the blurred mask.
        """
        # OpenCV HSV ranges: H 0-179, S 0-255, V 0-255.
        white_low = np.array([0, 0, 185])
        white_high = np.array([179, 17, 235])

        in_range = cv2.inRange(
            cv2.cvtColor(img, cv2.COLOR_BGR2HSV), white_low, white_high
        )
        # The blurred mask is non-zero in a 1-pixel border around every
        # in-range pixel, so the selection is slightly widened.
        widened_mask = cv2.blur(in_range, (3, 3))
        return cv2.bitwise_and(img, img, mask=widened_mask)

    def img_hsv_mask_black(img):
        """Keep only the dark pixels of an image.

        The image is converted from BGR to HSV, thresholded with
        ``cv2.inRange`` against a fixed "almost black" range, and the
        resulting mask is box-blurred with an 8x8 kernel before being applied
        to the input with ``cv2.bitwise_and``.

        Args:
            img: image array handed to ``cv2.cvtColor`` with
                ``cv2.COLOR_BGR2HSV``.

        Returns:
            The result of AND-ing ``img`` with itself under the blurred mask.
        """
        # OpenCV HSV ranges: H 0-179, S 0-255, V 0-255.
        dark_low = np.array([0, 0, 0])
        dark_high = np.array([60, 80, 70])

        in_range = cv2.inRange(
            cv2.cvtColor(img, cv2.COLOR_BGR2HSV), dark_low, dark_high
        )
        # The blurred mask is non-zero for several pixels around every
        # in-range pixel, so the selection is widened noticeably.
        widened_mask = cv2.blur(in_range, (8, 8))
        return cv2.bitwise_and(img, img, mask=widened_mask)

    def immerge(img1, img2):
        """Combine two images with an element-wise bitwise AND.

        Args:
            img1: first image array.
            img2: second image array, passed together with ``img1`` to
                ``cv2.bitwise_and``.

        Returns:
            The array returned by ``cv2.bitwise_and(img1, img2)``.
        """
        return cv2.bitwise_and(img1, img2)

    #require module: numpy, opencv-python, Pillow, pytesseract

    if __name__ == "__main__":
        # Batch driver: for images 0711/1.png .. 0711/8.png, build the white
        # and black masked images, AND them together, OCR the result with
        # Tesseract and show it in a window until a key is pressed.
        pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract'
        for x in range(1, 9):
            file = str.format("0711/{0}.png", x)
            srcimg = cv2.imread(file, cv2.IMREAD_UNCHANGED)
            if srcimg is None:
                # cv2.imread signals a missing or unreadable file by returning
                # None instead of raising; without this guard the failure only
                # surfaces later as an opaque error inside cv2.cvtColor.
                print("could not read image: " + file)
                continue
            white = img_hsv_mask_white(srcimg)
            black = img_hsv_mask_black(srcimg)
            merged = immerge(white, black)
            code = pytesseract.image_to_string(merged, lang='eng')
            print(code)
            # One window per file, named after the file; blocks until a key
            # press before moving on to the next image.
            cv2.imshow(file, merged)
            cv2.waitKey(0)

Answer 1

Starting with your first result you could remove noise that is:

too large or too small to be letters
not vertically centered with the rest of the text

import cv2 as cv
import numpy as np

# Load the input, reduce it to grayscale and binarize it at mid-gray.
im = cv.imread('ocr.png')
imgray = cv.cvtColor(im, cv.COLOR_BGR2GRAY)
# THRESH_BINARY (numeric value 0): pixels above 127 become 255, the rest 0.
ret, thresh = cv.threshold(imgray, 127, 255, cv.THRESH_BINARY)

def size_threshold(bw, minimum, maximum):
    """Drop connected components whose pixel area is outside a range.

    Components are found with ``cv.connectedComponentsWithStats``. A
    component is removed when its area is strictly below ``minimum`` or
    strictly above ``maximum``; areas equal to either bound are kept.

    Args:
        bw: binary image handed to ``cv.connectedComponentsWithStats``.
        minimum: smallest component area (in pixels) that is kept.
        maximum: largest component area (in pixels) that is kept.

    Returns:
        A ``uint8`` array shaped like the label map, 255 on pixels that
        belong to a kept component and 0 everywhere else.
    """
    _, labels, stats, _ = cv.connectedComponentsWithStats(bw)
    areas = stats[:, cv.CC_STAT_AREA]

    # Per-label keep/drop table, built once instead of rewriting the whole
    # label image for every rejected component.
    keep = ~((areas < minimum) | (areas > maximum))
    # Label 0 is the background and must never appear in the output.
    keep[:1] = False

    # Indexing the table with the label map yields the per-pixel decision.
    return keep[labels].astype(np.uint8) * 255

def y_centroid_threshold(bw, minimum, maximum):
    """Drop connected components whose centroid row is outside a range.

    Components are found with ``cv.connectedComponentsWithStats``. A
    component is removed when the y coordinate of its centroid is strictly
    below ``minimum`` or strictly above ``maximum``; values equal to either
    bound are kept.

    Args:
        bw: binary image handed to ``cv.connectedComponentsWithStats``.
        minimum: smallest centroid y coordinate that is kept.
        maximum: largest centroid y coordinate that is kept.

    Returns:
        A ``uint8`` array shaped like the label map, 255 on pixels that
        belong to a kept component and 0 everywhere else.
    """
    _, labels, _, centroids = cv.connectedComponentsWithStats(bw)
    # Column 1 of the centroid table is the y (row) coordinate.
    centroid_y = centroids[:, 1]

    # Per-label keep/drop table, built once instead of rewriting the whole
    # label image for every rejected component.
    keep = ~((centroid_y < minimum) | (centroid_y > maximum))
    # Label 0 is the background and must never appear in the output.
    keep[:1] = False

    # Indexing the table with the label map yields the per-pixel decision.
    return keep[labels].astype(np.uint8) * 255

# Keep components with an area of 60-300 pixels, then of those keep the
# ones whose centroid lies between rows 40 and 63, and save the result.
sized = size_threshold(thresh, minimum=60, maximum=300)
centered = y_centroid_threshold(sized, minimum=40, maximum=63)
cv.imwrite('ocr_out.png', centered)

Remove the background noise for OCR with opencv

Question

1 answers

solution1
0 ACCEPTED 2019-07-12 04:12:01

Remove the background noise for OCR with opencv

Question

1 answers

solution1 0 ACCEPTED 2019-07-12 04:12:01

solution1
0 ACCEPTED 2019-07-12 04:12:01