Extract a Word table from multiple docx files using python docx

Question

I have quite a few Word files that share the same table structure. I need to extract those tables and save them to CSV/Excel, with a separate sheet (in an .xls file) for each Word .docx file.

The code below only extracts the first table and doesn't loop through the whole document. Is there a way to loop through the entire .docx and through all the files in the folder?

import os
from docx import Document
import pandas as pd
# Folder that holds the .docx files to process.
folder = 'C:/Users/trans/downloads/test'
# Keep only the directory entries ending in ".docx", then turn each bare
# file name into a full path under `folder`.
file_names = [f for f in os.listdir(folder) if f.endswith(".docx") ]
file_names = [os.path.join(folder, file) for file in file_names]
print(file_names)
# DataFrames collected across all files.
tables = []
for file in file_names:
    document = Document(file)
    for table in document.tables:
     # Pre-fill a rows x columns grid of empty strings for this table.
     df = [['' for i in range(len(table.columns))] for j in range(len(table.rows))]
     for i, row in enumerate(table.rows):
         for j, cell in enumerate(row.cells):
            # Only non-empty cell text overwrites the '' placeholder.
            if cell.text:
                df[i][j] = cell.text
    # NOTE(review): this append is at the file-loop level, after the
    # `for table` loop has finished, so only the grid of the last table
    # visited in each file is added to `tables`.
    tables.append(pd.DataFrame(df))
    print(df)
    # NOTE(review): this export loop is also inside the file loop, so every
    # table_<n>.csv collected so far is rewritten once per file.
    for nr, i in enumerate(tables):
        i.to_csv('C:/Users/trans/downloads/test/'"table_" + str(nr) + ".csv")

Answer 1

All you need to do is install the "docx2txt" library, import it, and follow the instructions below. Go to this link

Answer 2

import glob
from docx import Document
import pandas as pd
# Folder that holds the .docx files; the CSV output is written here too.
folder = 'C:/Users/trans/downloads/test'
# Sort the matches: glob returns paths in arbitrary order, and the CSV
# numbering below follows the order in which tables are collected.
file_names = sorted(glob.glob(folder + '/*.docx'))

# One DataFrame per table, across every document in the folder.
tables = []
for file in file_names:
    document = Document(file)
    for table in document.tables:
        # Start from a rows x columns grid of empty strings so that cells
        # without text stay '' in the output.
        n_rows = len(table.rows)
        n_cols = len(table.columns)
        df = [[''] * n_cols for _ in range(n_rows)]
        for i, row in enumerate(table.rows):
            for j, cell in enumerate(row.cells):
                df[i][j] = cell.text
        # Appending inside the table loop keeps every table of every file.
        tables.append(pd.DataFrame(df))

# Write the CSVs once, after all documents have been read, as
# table_0.csv, table_1.csv, ... inside `folder`.
for index, table in enumerate(tables):
    table.to_csv(f"{folder}/table_{index}.csv")

Extract a Word table from multiple docx files using python docx

Question

1 answers

solution1
0 2023-01-12 15:15:02

solution2
0 2023-01-23 20:31:14

Extract a Word table from multiple docx files using python docx

Question

1 answers

solution1 0 2023-01-12 15:15:02

solution2 0 2023-01-23 20:31:14

solution1
0 2023-01-12 15:15:02

solution2
0 2023-01-23 20:31:14