简体   繁体   中英

writing scanned image to word document by preserving original layout

I am working on a Python project where I need to scan an image and write its text to a Word file while preserving the original layout of the input image. To scan the image I use pytesseract and draw contours on the original image. I then sort the contours from left to right using their x, y coordinates and write the text to the Word document. However, the results are poor: if two contours are horizontally adjacent to each other, my method prints them one below the other. Is there a better method for doing this? Here is my code:

    # Convert a scanned page image into a .docx file: find text/table regions
    # as contours, OCR each region, and emit paragraphs or tables in the order
    # produced by get_contour_precedence.
    document = Document()
    path = "C:/xampp/htdocs/implementation/"
    image = 'detecttable.jpg'
    img = cv2.imread(path + image, 0)  # flag 0 -> load as grayscale
    if img is None:
        # cv2.imread reports a missing/unreadable file by returning None
        # instead of raising, which would otherwise fail later with an
        # unrelated-looking error.
        raise FileNotFoundError("Could not read image: " + path + image)
    entire_tesseract_response = image_to_string(img)

    entireText = nlp(entire_tesseract_response)
    # done pre-processing and results are assigned here
    dilation, enhanceimage = preprocessing(img)

    rect_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (20, 3))

    contours, hierarchy = cv2.findContours(dilation, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
    # sorted() instead of .sort(): depending on the OpenCV version,
    # findContours returns a tuple, which has no in-place sort.
    contours = sorted(contours, key=lambda c: get_contour_precedence(c, img.shape[1]))
    bgr = np.ones((img.shape[0], img.shape[1]), dtype='uint8') * 255
    # Was `cnts`, a name that is never defined in this snippet.
    cv2.drawContours(bgr, contours, -1, (0, 0, 255), 1)
    areaThr = 61000  # any contour with a larger area is treated as a table
    i = 0
    number_of_tables = 0
    data1 = []
    ypre = 0
    xpre = 0
    datapre = ""
    # Current paragraph; always assigned by add_paragraph below before the
    # first add_run, because ypre/datapre are only set after a paragraph exists.
    p = None

    for cnt in contours:
        x, y, width, height = cv2.boundingRect(cnt)
        area = cv2.contourArea(cnt)
        if area > areaThr:
            number_of_tables += 1
            i += 1
            # NOTE(review): the "-1" drops the last row/column of the bounding
            # box; kept as in the original - confirm whether this is intended.
            table_img = img[y:y + height - 1, x:x + width - 1]
            data, vertical = table_processing(table_img)
            cols = vertical
            rows1 = int(data / vertical)
            data_images = imageReader(data)
            table = document.add_table(0, cols)
            table.style = 'TableGrid'
            itera = 0
            # Dedicated loop variable: the original reused `i` here and
            # overwrote the table counter above.
            for _row in range(rows1):
                tableimg = 1
                row_cells = table.add_row().cells
                for j in range(cols):
                    cv2.imwrite(str(itera) + ".png", data_images[itera])
                    # Upscale 4x before OCR.
                    roi = cv2.resize(data_images[itera], None, fx=4, fy=4, interpolation=cv2.INTER_CUBIC)
                    text = image_to_string(roi)
                    row_cells[j].text = text
                    data1.append(text)
                    itera += 1
        else:
            region = img[y:y + height - 1, x:x + width - 1]
            roi = cv2.resize(region, None, fx=4, fy=4, interpolation=cv2.INTER_CUBIC)
            text = image_to_string(roi)

            if not text:
                # NOTE(review): the segmentation result is currently unused;
                # the fallback OCR on `segment` was disabled by the author.
                segment = charsegment(roi)

            data1.append(text)
            if ypre != 0 and y == ypre:
                # Same baseline as the previous paragraph start: append to the
                # current paragraph, padding with roughly one space per 7 px
                # of horizontal offset from that paragraph's x position.
                # NOTE(review): exact equality on y means contours that differ
                # by even one pixel start a new paragraph - likely the cause
                # of side-by-side regions being stacked vertically.
                text = " " * max(0, int((x - xpre) / 7)) + text
                p.add_run(text)
            else:
                if x < 100:
                    # Left-margin text: start a new paragraph unless the
                    # previous chunk was long (> 70 chars), in which case this
                    # is treated as a wrapped continuation of it. The
                    # original's innermost `else` was unreachable and has
                    # been removed.
                    if len(datapre) <= 70:
                        p = document.add_paragraph(text)
                    else:
                        p.add_run(text)
                elif x <= 300:
                    p = document.add_paragraph(text)
                    p.alignment = 1  # presumably WD_ALIGN_PARAGRAPH.CENTER
                else:
                    p = document.add_paragraph(text)
                    p.alignment = 2  # presumably WD_ALIGN_PARAGRAPH.RIGHT
                ypre = y
                xpre = x
                datapre = text

    # The original indexed filePath[2] after splitting the bare file name on
    # "/", which raises IndexError for 'detecttable.jpg'. Derive the stem
    # directly and write under <path>/wordFiles/ instead.
    # NOTE(review): the output directory is assumed; it must already exist.
    stem = image.rsplit("/", 1)[-1].rsplit(".", 1)[0]
    fileName = path + "wordFiles/" + stem + ".docx"
    # Reuse the full-page OCR result from above rather than running
    # Tesseract on the whole image a second time.
    entireText = entire_tesseract_response
    document.save(fileName)
    doc = docx.Document(fileName)

Use the hOCR output format of Tesseract OCR for this purpose.

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address. If you have any questions, please contact: yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM