简体   繁体   中英

use python open-cv for segmenting newspaper article

I'm using the code below for segmenting the articles from an image of newspaper.

def segmenter(image_received):
    # Process 1: Lines Detection

    img = image_received
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)  # convert to binary gray image
    edges = cv2.Canny(gray, 75, 150)  # determine contours
    lines = cv2.HoughLinesP(edges, 0.017, np.pi / 180, 60, minLineLength=100, maxLineGap=0.1)  # houghlines generation

    # drawing houghlines
    for line in lines:
        x1, y1, x2, y2 = line[0]
        cv2.line(img, (x1, y1), (x2, y2), (0, 0, 128), 12)  # the houghlines of color (0,0,128) is drawn

    # Drawing brown border
    bold = cv2.copyMakeBorder(
        img,  # image source
        5,  # top width
        5,  # bottomm width
        5,  # left width
        5,  # right width
        cv2.BORDER_CONSTANT,
        value=(0, 0, 128)  # brown color value
    )

    image = bold
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]

    horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 1))
    detected_lines = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, horizontal_kernel, iterations=2)
    cnts = cv2.findContours(detected_lines, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
    cnts = cnts[0] if len(cnts) == 2 else cnts[1]
    for c in cnts:
        if int(len(c) >= 10):
            cv2.drawContours(image, [c], 0, (0, 17, 255), 1)

    vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 1))
    detected_lines = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, vertical_kernel, iterations=2)
    cnts = cv2.findContours(detected_lines, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
    cnts = cnts[0] if len(cnts) == 2 else cnts[1]
    for c in cnts:
        if int(len(c) >= 10):
            cv2.drawContours(image, [c], 0, (0, 17, 255), 1)

    cv2.imwrite(f'tmp/{str(str(uuid.uuid4()))}.jpg', image)

for instance the input image is在此处输入图片说明 and the output image is : 在此处输入图片说明

There are three problems:

  1. the output rectangles aren't complete in all cases.
  2. Images also are segmented inside articles as part of articles. But what I need is to segment only the text of the newspaper and crop all the other things out. Something like this one: 在此处输入图片说明
  3. Consider the following image: 在此处输入图片说明 The article indicated by borders is not rectangular and is much more complicated. How can I achieve the correct borders using python open-cv or other image processing libraries?

(the question has an answer here for matlab . But I need a python code.

here my pipeline. I think can be optimized.

Initialization

%matplotlib inline
import numpy as np
import cv2
from matplotlib import pyplot as plt

Load image

image_file_name = 'paper.jpg' 
image = cv2.imread(image_file_name)

# gray convertion
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

The first important thing is to remove the lines. So I search the lines.

grad_x = cv2.Sobel(gray, cv2.CV_64F, 1, 0, ksize=3)
grad_y = cv2.Sobel(gray, cv2.CV_64F, 0, 1, ksize=3)

abs_grad_x = cv2.convertScaleAbs(grad_x)
abs_grad_y = cv2.convertScaleAbs(grad_y)

# threshold
thresh_x = cv2.threshold(abs_grad_x, 0, 255,  cv2.THRESH_OTSU)[1]
thresh_y = cv2.threshold(abs_grad_y, 0, 255,  cv2.THRESH_OTSU)[1]

# bluring 
kernel_size = 3
blur_thresh_x = cv2.GaussianBlur(thresh_x,(kernel_size, kernel_size),0)
blur_thresh_y = cv2.GaussianBlur(thresh_y,(kernel_size, kernel_size),0)

# Run Hough on edge detected image

rho = 1  # distance resolution in pixels of the Hough grid   
theta = np.pi / 180  # angular resolution in radians of the Hough grid
threshold = 15  # minimum number of votes (intersections in Hough grid cell)  
min_line_length = 200  # minimum number of pixels making up a line   
max_line_gap = 1  # maximum gap in pixels between connectable line segments   
line_image = np.copy(gray) * 0  # creating a blank to draw lines on

# Vertical lines
vertical_lines = cv2.HoughLinesP(blur_thresh_x, rho, theta, threshold, np.array([]), min_line_length, max_line_gap)

if vertical_lines is not None:
    for line in vertical_lines:
        for x1,y1,x2,y2 in line:
            # here it's possible to add a selection of only vertical lines
            if np.abs(y1-y2)> 0.1 * np.abs(x1-x2):
                cv2.line(line_image,(x1,y1),(x2,y2),255,5)

# Horizontal lines
horizontal_lines = cv2.HoughLinesP(blur_thresh_y, rho, theta, threshold, np.array([]), min_line_length, max_line_gap)

if horizontal_lines is not None:
    for line in horizontal_lines:
        for x1,y1,x2,y2 in line:
            # here it's possible to add a selection of only horizontal lines
            if np.abs(x1-x2)> 0.1 * np.abs(y1-y2):
                cv2.line(line_image,(x1,y1),(x2,y2),255,5)   

线图像

After I remove the lines from the threshold

# threshold
thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]

# remove lines
clean_thresh = cv2.subtract(thresh, line_image)

干净的门槛

Then I search the phrases

# search the phrases
dilatation_type = cv2.MORPH_RECT
horizontal_dilatation = 20 #This is the gap. 20 for the first image, 10 for the second image
vertical_dilatation = 1
element = cv2.getStructuringElement(dilatation_type, (2*horizontal_dilatation + 1, 2*vertical_dilatation+1), (horizontal_dilatation, vertical_dilatation))
dilatation_thresh = cv2.dilate(clean_thresh, element)

# Fill
filled_tresh = dilatation_thresh.copy()
contours, hierarchy = cv2.findContours(dilatation_thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)

for cnt in contours:
    cv2.drawContours(filled_tresh, [cnt], -1, 255, cv2.FILLED)

填充阈值

Now I detect the bounding boxes

# Draw bounding boxes
bounding_box1 = filled_tresh.copy()
contours, hierarchy = cv2.findContours(bounding_box1, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)

for cnt in contours:
    x,y,w,h = cv2.boundingRect(cnt)
    cv2.rectangle(bounding_box1,(x,y),(x+w,y+h),255,cv2.FILLED)

边界框 1

# REPEAT Draw bounding boxes and Find the mean text width
mean_bb_width = 0 # mean bounding box width

bounding_box2 = bounding_box1.copy()

contours, hierarchy = cv2.findContours(bounding_box2, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)

num_cnt=0
for cnt in contours:
    x,y,w,h = cv2.boundingRect(cnt)
    cv2.rectangle(bounding_box2,(x,y),(x+w,y+h),255,cv2.FILLED)
    mean_bb_width = mean_bb_width+w
    num_cnt=num_cnt+1
    
mean_bb_width=mean_bb_width/num_cnt

边界框 2

Now I separate the titles from the text

# define title what has width bigger than 1.5* mean_width 
min_title_width = 1.5 * mean_bb_width

raw_title = np.copy(gray) * 0  
raw_text = np.copy(gray) * 0  

# separate titles from phrases
contours, hierarchy = cv2.findContours(bounding_box2, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)

for cnt in contours:
    x,y,w,h = cv2.boundingRect(cnt)
    if w >=min_title_width :
        cv2.drawContours(raw_title, [cnt], -1, 255, cv2.FILLED)
    else :
        cv2.drawContours(raw_text, [cnt], -1, 255, cv2.FILLED)

分离标题文本

and then the final processing

image_out = image.copy()

# Closing parameters
horizontal_closing = 1 
vertical_closing = 20
kernel = cv2.getStructuringElement(cv2.MORPH_RECT,(horizontal_closing,vertical_closing))

# Processing titles
# Closing
closing_title = cv2.morphologyEx(raw_title, cv2.MORPH_CLOSE, kernel)
# Find contours
contours, hierarchy = cv2.findContours(closing_title, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
# Draw bounding boxes
bounding_title = closing_title.copy()
for cnt in contours:
    x,y,w,h = cv2.boundingRect(cnt)
    cv2.rectangle(image_out,(x,y),(x+w,y+h),(255,0,0),2)

# Processing text
# Closing
closing_text = cv2.morphologyEx(raw_text, cv2.MORPH_CLOSE, kernel)
# Find contours
contours, hierarchy = cv2.findContours(closing_text , cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
# Draw bounding boxes
bounding_text = closing_text.copy()
for cnt in contours:
    x,y,w,h = cv2.boundingRect(cnt)
    cv2.rectangle(image_out,(x,y),(x+w,y+h),(0,255,0),2)

The result is

图像输出

Changing the parameter horizontal_dilatation from 20 to 10, I obtain for the second image (where I remove the red border that you added) the following result

图像输出 2

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM