I have quite a few Word files that share the same table structure. I need to extract the tables and save them to CSV/Excel, with a separate sheet (in an .xls workbook) for each Word .docx file.
The code below only extracts the first table and doesn't loop through the whole .docx. Is there a way to loop through the entire document and through all the files in the folder?
import os
from docx import Document
import pandas as pd
# Export every table from every .docx file in `folder` to its own CSV file
# (table_0.csv, table_1.csv, ...) written back into the same folder.
folder = 'C:/Users/trans/downloads/test'

# Collect the Word documents to process.
# - Names starting with "~$" are the lock files Word leaves next to a document
#   that is currently open; they end in ".docx" but cannot be opened, so they
#   are skipped.
# - os.listdir() returns entries in arbitrary order; sorting keeps the
#   table_N numbering stable from run to run.
file_names = sorted(
    os.path.join(folder, name)
    for name in os.listdir(folder)
    if name.endswith(".docx") and not name.startswith("~$")
)
print(file_names)

# One DataFrame per table, accumulated across all documents.
tables = []
for file in file_names:
    document = Document(file)
    for table in document.tables:
        # Pre-fill a rows x columns grid so cells without text stay ''.
        grid = [[''] * len(table.columns) for _ in table.rows]
        for i, row in enumerate(table.rows):
            for j, cell in enumerate(row.cells):
                if cell.text:
                    grid[i][j] = cell.text
        # This append has to sit inside the `for table` loop (and after the
        # row loop): one level shallower and only one table per document is
        # kept, one level deeper and partially filled grids are appended.
        tables.append(pd.DataFrame(grid))

# Write each collected table to <folder>/table_<n>.csv.
for nr, frame in enumerate(tables):
    frame.to_csv(os.path.join(folder, "table_" + str(nr) + ".csv"))
All you need to do is install the "docx2txt" library, import it, and follow the instructions below. Go to this link
import glob
from docx import Document
import pandas as pd
# Read every table from every .docx file in `folder` and save each table as
# <folder>/table_<index>.csv.
folder = 'C:/Users/trans/downloads/test'

# glob.glob() gives no ordering guarantee; sort so that the table_<index>
# numbering is reproducible between runs.
file_names = sorted(glob.glob(folder + '/*.docx'))

tables = []
for file in file_names:
    document = Document(file)
    # NOTE: python-docx documents `Document.tables` as the top-level tables
    # only; a table nested inside another table's cell is not visited here.
    for table in document.tables:
        column_count = len(table.columns)
        rows = []
        for row in table.rows:
            # Start from a full-width row of empty strings so every row has
            # the same number of columns even if it yields fewer cells.
            values = [''] * column_count
            for j, cell in enumerate(row.cells):
                values[j] = cell.text
            rows.append(values)
        # Appended once per table, inside the table loop, so that all tables
        # of all documents are collected.
        tables.append(pd.DataFrame(rows))

for index, table in enumerate(tables):
    table.to_csv(folder + '/table_' + str(index) + '.csv')
The technical posts on this site are published under the CC BY-SA 4.0 license. If you reprint them, please cite the site URL or the original address. For any questions, please contact: yoyou2525@163.com.