简体   繁体   中英

OCR in OpenCv Python

Please help me!

The code below was organized for a Jupyter notebook; please rewrite it so that it runs as a plain script in PyCharm. Also, can you change it so that it does not look for specific words but simply scans whatever text is there? In the table, some cells have border lines and some don't; despite that, every floating-point number or word should end up in its own Excel cell.

Or you can create a new code which converts this input data to csv file .Most thanks!

Input image: michael_jordan_stats

Code:

from sklearn.cluster import AgglomerativeClustering
from pytesseract import Output
from tabulate import tabulate
import pandas as pd
import numpy as np
import pytesseract
import argparse
import imutils
import cv2
# Point pytesseract at the local Tesseract binary.
# NOTE(review): this path is machine-specific -- adjust it for your install.
pytesseract.pytesseract.tesseract_cmd = 'C:\\Users\\AppData\\Local\\Programs\\Tesseract-OCR\\tesseract.exe'


# construct the argument parser and parse the arguments
ap = argparse.ArgumentParser()
ap.add_argument("-i", "--image", required=True,
    help="path to input image to be OCR'd")
ap.add_argument("-o", "--output", required=True,
    help="path to output CSV file")
ap.add_argument("-c", "--min-conf", type=int, default=0,
    help="minimum confidence value to filter weak text detection")
ap.add_argument("-d", "--dist-thresh", type=float, default=25.0,
    help="distance threshold cutoff for clustering")
ap.add_argument("-s", "--min-size", type=int, default=2,
    help="minimum cluster size (i.e., # of entries in column)")
args = vars(ap.parse_args())

# set a seed for our random number generator so the per-column box colors
# drawn below are reproducible across runs
np.random.seed(42)

# load the input image and convert it to grayscale
image = cv2.imread(args["image"])
if image is None:
    raise SystemExit("[ERROR] could not read image: {}".format(args["image"]))
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

# initialize a rectangular kernel that is ~5x wider than it is tall,
# then smooth the image using a 3x3 Gaussian blur and then apply a
# blackhat morphological operator to find dark regions on a light
# background
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (51, 11))
gray = cv2.GaussianBlur(gray, (3, 3), 0)
blackhat = cv2.morphologyEx(gray, cv2.MORPH_BLACKHAT, kernel)

# compute the Scharr gradient of the blackhat image and scale the
# result into the range [0, 255]
grad = cv2.Sobel(blackhat, ddepth=cv2.CV_32F, dx=1, dy=0, ksize=-1)
grad = np.absolute(grad)
(minVal, maxVal) = (np.min(grad), np.max(grad))
grad = (grad - minVal) / (maxVal - minVal)
grad = (grad * 255).astype("uint8")

# apply a closing operation using the rectangular kernel to close
# gaps in between characters, apply Otsu's thresholding method, and
# finally a dilation operation to enlarge foreground regions
grad = cv2.morphologyEx(grad, cv2.MORPH_CLOSE, kernel)
thresh = cv2.threshold(grad, 0, 255,
    cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
thresh = cv2.dilate(thresh, None, iterations=3)
cv2.imshow("Thresh", thresh)

# find contours in the thresholded image and grab the largest one,
# which we will assume is the stats table
cnts = cv2.findContours(thresh.copy(), cv2.RETR_EXTERNAL,
    cv2.CHAIN_APPROX_SIMPLE)
cnts = imutils.grab_contours(cnts)
if not cnts:
    raise SystemExit("[ERROR] no contours found -- is the image mostly blank?")
tableCnt = max(cnts, key=cv2.contourArea)

# compute the bounding box coordinates of the stats table and extract
# the table from the input image
(x, y, w, h) = cv2.boundingRect(tableCnt)
table = image[y:y + h, x:x + w]

# show the original input image and extracted table to our screen
cv2.imshow("Input", image)
cv2.imshow("Table", table)

# set the PSM mode to detect uniform blocks of text, and then localize
# text in the table
options = "--psm 6"
results = pytesseract.image_to_data(
    cv2.cvtColor(table, cv2.COLOR_BGR2RGB),
    config=options,
    output_type=Output.DICT)

# initialize a list to store the (x, y)-coordinates of the detected
# text along with the OCR'd text itself
coords = []
ocrText = []

# loop over each of the individual text localizations
for i in range(0, len(results["text"])):
    # extract the bounding box coordinates of the text region from
    # the current result
    x = results["left"][i]
    y = results["top"][i]
    w = results["width"][i]
    h = results["height"][i]

    # extract the OCR text itself along with the confidence of the
    # text localization; newer pytesseract versions report confidences
    # as float strings (e.g. '96.3'), so go through float() first
    text = results["text"][i]
    conf = int(float(results["conf"][i]))

    # filter out weak confidence text localizations
    if conf > args["min_conf"]:
        # update our text bounding box coordinates and OCR'd text,
        # respectively
        coords.append((x, y, w, h))
        ocrText.append(text)

# clustering on an empty sample set raises an opaque sklearn error,
# so fail early with a useful message instead
if not coords:
    raise SystemExit("[ERROR] no text survived the confidence filter; "
                     "try lowering --min-conf")

# extract all x-coordinates from the text bounding boxes, setting the
# y-coordinate value to zero (we only want to cluster by column)
xCoords = [(c[0], 0) for c in coords]

# apply hierarchical agglomerative clustering to the coordinates;
# NOTE(review): scikit-learn >= 1.2 renamed `affinity` to `metric` --
# change this keyword if you run a recent sklearn
clustering = AgglomerativeClustering(
    n_clusters=None,
    affinity="manhattan",
    linkage="complete",
    distance_threshold=args["dist_thresh"])
clustering.fit(xCoords)

# initialize our list of sorted clusters
sortedClusters = []

# loop over all clusters
for l in np.unique(clustering.labels_):
    # extract the indexes for the coordinates belonging to the
    # current cluster
    idxs = np.where(clustering.labels_ == l)[0]

    # verify that the cluster is sufficiently large
    if len(idxs) > args["min_size"]:
        # compute the average x-coordinate value of the cluster and
        # update our clusters list with the current label and the
        # average x-coordinate
        avg = np.average([coords[i][0] for i in idxs])
        sortedClusters.append((l, avg))

# sort the clusters by their average x-coordinate (left-to-right column
# order) and initialize our data frame
sortedClusters.sort(key=lambda x: x[1])
df = pd.DataFrame()

# loop over the clusters again, this time in sorted order
for (l, _) in sortedClusters:
    # extract the indexes for the coordinates belonging to the
    # current cluster
    idxs = np.where(clustering.labels_ == l)[0]

    # extract the y-coordinates from the elements in the current
    # cluster, then sort them from top-to-bottom
    yCoords = [coords[i][1] for i in idxs]
    sortedIdxs = idxs[np.argsort(yCoords)]

    # generate a random color for the cluster
    color = np.random.randint(0, 255, size=(3,), dtype="int")
    color = [int(c) for c in color]

    # loop over the sorted indexes
    for i in sortedIdxs:
        # extract the text bounding box coordinates and draw the
        # bounding box surrounding the current element
        (x, y, w, h) = coords[i]
        cv2.rectangle(table, (x, y), (x + w, y + h), color, 2)

    # extract the OCR'd text for the current column, then construct
    # a data frame for the data where the first entry in our column
    # serves as the header
    cols = [ocrText[i].strip() for i in sortedIdxs]
    currentDF = pd.DataFrame({cols[0]: cols[1:]})

    # concatenate *original* data frame with the *current* data
    # frame (we do this to handle columns that may have a varying
    # number of rows)
    df = pd.concat([df, currentDF], axis=1)

# replace NaN values with an empty string and then show a nicely
# formatted version of our multi-column OCR'd text
df.fillna("", inplace=True)
print(tabulate(df, headers="keys", tablefmt="psql"))

# write our table to disk as a CSV file
print("[INFO] saving CSV file to disk...")
df.to_csv(args["output"], index=False)

# show the output image after performing multi-column OCR
# (fix: the original post had `cv2.waitKey(0` -- a SyntaxError)
cv2.imshow("Output", image)
cv2.waitKey(0)

I have tried your scenario with easyocr. You can install easyocr with `pip install easyocr`.

import openpyxl
import easyocr

## Easyocr Extraction:

# Initialize the EasyOCR reader for English; gpu=False forces CPU inference
# (the first run downloads the recognition models).
reader = easyocr.Reader(['en'], gpu=False)
# Run OCR on the input image; readtext returns a list of
# ([4 corner points], text, confidence) tuples, reading order top-to-bottom.
result = reader.readtext('4.png')
print(result)

## Grouping the detections into lines: detections whose bounding boxes fall within a 20-pixel vertical band are treated as one table row. Change C for a different image and check the result.

# Group the easyocr detections into table rows. Two detections belong to the
# same row when the y-coordinate of their top-left corner falls inside the
# same C-pixel-high horizontal band.
#
# Fixes over the original double loop:
#   * the original tested `y in range(flag, flag + C)`, which is always False
#     when easyocr returns float coordinates (a float is never a member of a
#     `range` object) -- integer floor-division by C handles both types;
#   * the original was O(n^2) (it rescanned every detection for every band);
#   * the original only scanned bands up to len(result) * C pixels, silently
#     dropping rows near the bottom of tall images.
# Band boundaries, band order (top to bottom) and within-band order (easyocr
# reading order) are all identical to the original.
C = 20  # band height in pixels -- change according to your image
rows = {}
for detection in result:
    # detection[0][0][1] is the y-coordinate of the box's top-left corner
    band = int(detection[0][0][1]) // C
    rows.setdefault(band, []).append(detection[1])
# Main_list is a list of rows, each row a list of cell texts, top to bottom.
Main_list = [rows[band] for band in sorted(rows)]

## Store the list of groups to each row of excel
# Write the grouped rows into an Excel workbook: one sheet row per entry of
# Main_list, one cell per OCR'd text. enumerate(..., start=1) replaces the
# original's hand-maintained p/k counters (openpyxl rows/columns are 1-based).
wb = openpyxl.Workbook()
sheet = wb.active
for row_idx, row in enumerate(Main_list, start=1):
    for col_idx, value in enumerate(row, start=1):
        sheet.cell(row=row_idx, column=col_idx, value=str(value))
# save data in excel
wb.save("demo1.xlsx")

The result for detection is,

[([[158, 16], [526, 16], [526, 64], [158, 64]], 'Michael Jordan', 0.9538746996511557), ([[46, 68], [128, 68], [128, 92], [46, 92]], 'HT: 6\'6"', 0.9022663242510451), ([[142, 66], [224, 66], [224, 92], [142, 92]], 'WT: 195', 0.9794856399441807), ([[504, 64], [636, 64], [636, 90], [504, 90]], 'BATS: RIGHT', 0.9180541692221128), ([[46, 92], [216, 92], [216, 118], [46, 118]], 'THROWS: RIGHT', 0.9988370326395578), ([[336, 90], [638, 90], [638, 116], [336, 116]], 'BORN: 2/17/63 BROOKLYN, NY', 0.5608722283116995), ([[212, 128], [480, 128], [480, 154], [212, 154]], 'COMPLETE NBA RECORD', 0.994466992975932), ([[45, 165], [87, 165], [87, 183], [45, 183]], 'Year', 0.9999420642852783), ([[119, 161], [173, 161], [173, 181], [119, 181]], 'CLUB', 0.9996228218078613), ([[255, 161], [305, 161], [305, 181], [255, 181]], 'FG %', 0.770339752426771), ([[319, 161], [359, 161], [359, 181], [319, 181]], 'REB', 0.9977237369863976), ([[376, 158], [418, 158], [418, 182], [376, 182]], 'AST', 0.9908741040902402), ([[438, 158], [478, 158], [478, 182], [438, 182]], 'STL', 0.9958523521229901), ([[489, 159], [531, 159], [531, 179], [489, 179]], 'BLK', 0.9996744624119349), ([[551, 159], [591, 159], [591, 179], [551, 179]], 'PTS', 0.9975498700305881), ([[605, 159], [645, 159], [645, 179], [605, 179]], 'AVG', 0.9946003929625006), ([[42, 182], [108, 182], [108, 206], [42, 206]], '1984-85', 0.9697204690556599), ([[118, 182], [202, 182], [202, 206], [118, 206]], 'CHICAGO', 0.827501337214855), ([[221, 183], [245, 183], [245, 203], [221, 203]], '82', 0.9999991570631338), ([[258, 182], [294, 182], [294, 206], [258, 206]], '515', 0.9999973846308725), ([[327, 183], [359, 183], [359, 203], [327, 203]], '534', 0.9997685551643372), ([[381, 183], [413, 183], [413, 203], [381, 203]], '481', 0.9999993117448777), ([[438, 180], [474, 180], [474, 204], [438, 204]], '196', 0.999649204592322), ([[499, 183], [521, 183], [521, 201], [499, 201]], '69', 0.9999957010241584), ([[550, 180], [596, 180], [596, 204], [550, 
204]], '2313', 0.9999939224281105), ([[607, 181], [645, 181], [645, 201], [607, 201]], '28.2', 0.9215702034707423), ([[42, 206], [110, 206], [110, 230], [42, 230]], '1985-86', 0.9988607125395502), ([[118, 204], [202, 204], [202, 228], [118, 228]], 'CHICAGO', 0.7866959856545104), ([[221, 207], [247, 207], [247, 227], [221, 227]], '18', 0.9998248421909477), ([[259, 207], [293, 207], [293, 227], [259, 227]], '457', 0.9999995870469189), ([[337, 207], [359, 207], [359, 225], [337, 225]], '64', 0.9999973026027609), ([[389, 205], [413, 205], [413, 225], [389, 225]], '53', 0.9999902219451473), ([[447, 205], [469, 205], [469, 225], [447, 225]], '37', 0.5204940438270569), ([[497, 205], [519, 205], [519, 225], [497, 225]], '21', 0.9999997471189183), ([[560, 202], [596, 202], [596, 226], [560, 226]], '408', 0.9999997246979434), ([[607, 205], [643, 205], [643, 225], [607, 225]], '22.7', 0.7145749726719521), ([[42, 228], [108, 228], [108, 252], [42, 252]], '1986-87', 0.9372055783002436), ([[119, 229], [201, 229], [201, 249], [119, 249]], 'CHICAGO', 0.961156637843396), ([[223, 229], [245, 229], [245, 249], [223, 249]], '82', 0.9999998314126102), ([[261, 229], [293, 229], [293, 249], [261, 249]], '482', 0.9615980386734009), ([[327, 229], [361, 229], [361, 249], [327, 249]], '430', 0.9999977975838318), ([[380, 226], [414, 226], [414, 250], [380, 250]], '377', 0.7901569788649613), ([[439, 227], [473, 227], [473, 247], [439, 247]], '236', 0.518980085849762), ([[489, 227], [523, 227], [523, 247], [489, 247]], '125', 0.9999940121825609), ([[551, 227], [593, 227], [593, 247], [551, 247]], '3041', 0.6635605989714807), ([[607, 227], [643, 227], [643, 247], [607, 247]], '37.1', 0.811654016689427), ([[43, 253], [109, 253], [109, 273], [43, 273]], '1987-88', 0.952293038138347), ([[118, 250], [202, 250], [202, 274], [118, 274]], 'CHICAGO', 0.779227282246309), ([[223, 251], [245, 251], [245, 271], [223, 271]], '82', 1.0), ([[261, 251], [293, 251], [293, 271], [261, 271]], '535', 
0.9999807288805623), ([[327, 251], [361, 251], [361, 271], [327, 271]], '449', 0.9997013211250305), ([[381, 251], [415, 251], [415, 271], [381, 271]], '485', 0.9999998623489704), ([[439, 251], [473, 251], [473, 271], [439, 271]], '259', 0.9999940810080215), ([[488, 248], [522, 248], [522, 272], [488, 272]], '131', 0.9253582954406738), ([[551, 249], [595, 249], [595, 269], [551, 269]], '2868', 0.999986469745636), ([[609, 249], [645, 249], [645, 269], [609, 269]], '35,0', 0.9788451194763184), ([[43, 275], [109, 275], [109, 295], [43, 295]], '1988-89', 0.9981049585781775), ([[118, 272], [202, 272], [202, 296], [118, 296]], 'CHICAGO', 0.9991157677251202), ([[221, 275], [245, 275], [245, 295], [221, 295]], '81', 0.9999858386882693), ([[261, 275], [293, 275], [293, 295], [261, 295]], '538', 0.9448465704917908), ([[327, 273], [361, 273], [361, 293], [327, 293]], '652', 0.8911423683166504), ([[381, 273], [415, 273], [415, 293], [381, 293]], '650', 0.5588765144348145), ([[439, 273], [471, 273], [471, 293], [439, 293]], '234', 0.9999980040603199), ([[499, 273], [523, 273], [523, 293], [499, 293]], '65', 0.9999974711900259), ([[553, 273], [595, 273], [595, 293], [553, 293]], '2633', 0.5814210505851025), ([[608, 270], [646, 270], [646, 294], [608, 294]], '32.5', 0.6914864778518677), ([[43, 297], [109, 297], [109, 317], [43, 317]], '1989-90', 0.9567499652890459), ([[119, 297], [201, 297], [201, 317], [119, 317]], 'CHICAGO', 0.9615609797995148), ([[223, 297], [245, 297], [245, 317], [223, 317]], '82', 0.9999991570631338), ([[261, 297], [295, 297], [295, 317], [261, 317]], '526', 0.9999985546643163), ([[327, 297], [361, 297], [361, 317], [327, 317]], '565', 0.9999848584019675), ([[380, 294], [416, 294], [416, 318], [380, 318]], '519', 0.9999924291971442), ([[439, 295], [471, 295], [471, 315], [439, 315]], '227', 0.9999969005584717), ([[499, 295], [521, 295], [521, 315], [499, 315]], '54', 1.0), ([[551, 295], [595, 295], [595, 315], [551, 315]], '2753', 0.6430389920874392), 
([[608, 292], [646, 292], [646, 316], [608, 316]], '33.6', 0.8408543199226736), ([[42, 318], [112, 318], [112, 342], [42, 342]], 'TOTALS', 0.9998710792792682), ([[215, 319], [245, 319], [245, 339], [215, 339]], '427', 0.9999999311744849), ([[261, 319], [295, 319], [295, 339], [261, 339]], '516', 0.9999960081211735), ([[319, 319], [361, 319], [361, 339], [319, 339]], '2694', 0.9999968409538269), ([[372, 316], [416, 316], [416, 340], [372, 340]], '2565', 0.9997212886810303), ([[431, 317], [473, 317], [473, 337], [431, 337]], '1189', 0.9861575365066528), ([[491, 317], [523, 317], [523, 337], [491, 337]], '465', 0.9942613244056702), ([[543, 317], [595, 317], [595, 337], [543, 337]], '14016', 0.9999915233853749), ([[609, 317], [645, 317], [645, 337], [609, 337]], '32.8', 0.5459674409727212), ([[72, 342], [639, 342], [639, 372], [72, 372]], 'If Michaels Statistics were converted into Baseball Stats they', 0.925877969382897), ([[48, 372], [268, 372], [268, 396], [48, 396]], 'would read as follows:', 0.7505235858520506), ([[128, 402], [568, 402], [568, 428], [128, 428]], 'YEARLY AVERAGE-APPROXIMATE FIGURES', 0.9073742277577199), ([[99, 437], [127, 437], [127, 457], [99, 457]], 'AB', 0.9999450409518938), ([[255, 435], [281, 435], [281, 455], [255, 455]], '2B', 0.9677621401524785), ([[301, 435], [329, 435], [329, 455], [301, 455]], '3B', 0.489314228431593), ([[351, 435], [381, 435], [381, 455], [351, 455]], 'HR', 0.9587694372074819), ([[403, 435], [439, 435], [439, 455], [403, 455]], 'RBI', 0.9990161352988146), ([[458, 432], [490, 432], [490, 456], [458, 456]], 'SB', 0.9993483033492261), ([[509, 433], [539, 433], [539, 453], [509, 453]], 'BB', 0.9998220606427959), ([[561, 433], [591, 433], [591, 453], [561, 453]], 'SO', 0.9755136775502071), ([[611, 433], [651, 433], [651, 453], [611, 453]], 'AVG', 0.9102216313054892), ([[45, 465], [77, 465], [77, 485], [45, 485]], '185', 0.9999960081211735), ([[95, 465], [129, 465], [129, 485], [95, 485]], '615', 0.9999984858388146), ([[148, 
462], [184, 462], [184, 486], [148, 486]], '115', 0.9994537518764426), ([[201, 463], [235, 463], [235, 483], [201, 483]], '180', 0.9999273894344267), ([[257, 463], [281, 463], [281, 483], [257, 483]], '18', 0.9999039065377323), ([[353, 463], [379, 463], [379, 483], [353, 483]], '29', 0.9984502751536507), ([[409, 463], [431, 463], [431, 483], [409, 483]], '91', 0.9996258407840374), ([[462, 460], [488, 460], [488, 484], [462, 484]], '26', 0.9079724095674649), ([[513, 461], [537, 461], [537, 481], [513, 481]], '26', 0.5143618995869372), ([[565, 461], [589, 461], [589, 481], [565, 481]], '32', 0.9999695791841804), ([[617, 461], [651, 461], [651, 481], [617, 481]], '302', 0.9999946316117284)]

The data stored in Excel is shown in the screenshot below.

Hope this will help you.

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM