My work demands I use pymupdf
to extract tables from pdf files and export to csv format.
You are out of luck, I'm afraid. The PDF format has no internal representation of a table structure, which makes it difficult to extract tables for analysis. You have to infer the existence of a table by seeing where the columns of data have been lined up.
There are modules that will do this for you: one is Excalibur. But pymupdf
is about extracting text as text and that will leave you having to do the parsing and inferencing yourself. That is a fairly ambitious project.
Excalibur is the GUI version of camelot
Installation https://camelot-py.readthedocs.io/en/master/user/install.html
Tutorial https://camelot-py.readthedocs.io/en/master/
This should do the job for you. There is direct export to csv. The default output is a dataframe, which can be exported to excel or csv.
I needed to work with pymupdf so I made a custom solution.
If it works for at least one person, I would be happy.
Please remember I did it for a specific problem, and the code may break for you.
I did it in an open way so you will be able to upgrade this code to your specific purpose.
If you make a better, more general solution, I would gladly use it; don't hesitate to post it. (This one took me 3 hours...)
import fitz # this is pymupd, pip3 install PyMuPDF
# WARNING: this is bad code; please use it knowing it may break easily
# Author: nah, I'm joking, nobody wants to own this shit XD
def get_page_bloc_tuple_2list(pdf_path):
    """Return the word tuples of every page of a PDF.

    Args:
        pdf_path: Path of the PDF file, passed to ``fitz.open``.

    Returns:
        A list with one entry per page; each entry is whatever
        ``page.get_text_words()`` returns for that page.
    """
    # Open the document as a context manager so it is closed even if the
    # extraction raises; the original left the document open.
    with fitz.open(pdf_path) as doc:
        # NOTE: outdated PyMuPDF releases spell this ``page.getTextWords()``.
        return [page.get_text_words() for page in doc]
def get_line_dict_list(pdf_path):
    """Group the words of a PDF into text lines.

    Consecutive words of a page belong to the same line as long as their
    ``y0`` coordinate is exactly equal; a different ``y0`` starts a new line.

    Args:
        pdf_path: Path of the PDF file, forwarded to
            ``get_page_bloc_tuple_2list``.

    Returns:
        A flat list (all pages concatenated) of line dicts of the form
        ``{"y": (y0, y1), "word_dict_list": [{"word": str, "x": (x0, x1)}]}``
        where ``y0``/``y1`` come from the first word of the line.
    """
    line_dict_list = []
    for page_block_list in get_page_bloc_tuple_2list(pdf_path):
        if not page_block_list:
            continue  # page without any word

        word_dict_list = []
        # The first word of the page defines the vertical span of line one.
        y0_temp = page_block_list[0][1]
        y1_temp = page_block_list[0][3]

        # Each word tuple starts with x0, y0, x1, y1, word; the trailing
        # fields are not needed here.
        for x0, y0, x1, y1, word, *_ in page_block_list:
            if y0 != y0_temp:
                # A new y0 means the previous line is complete: store it and
                # start collecting the next one.
                line_dict_list.append({
                    "y": (y0_temp, y1_temp),
                    "word_dict_list": word_dict_list,
                })
                word_dict_list = []
                y0_temp, y1_temp = y0, y1
            word_dict_list.append({"word": word, "x": (x0, x1)})

        # Flush the last line of the page. Without this the final line of
        # every page was silently dropped, because a line was only stored
        # when a following word with a different y0 was encountered.
        line_dict_list.append({
            "y": (y0_temp, y1_temp),
            "word_dict_list": word_dict_list,
        })
    return line_dict_list
def get_word_list(line_dict):
    """Return the text of every word stored in a line dict, in order.

    Args:
        line_dict: Mapping with a ``'word_dict_list'`` entry whose items
            each carry a ``'word'`` key.

    Returns:
        A new list holding the ``'word'`` value of each item.
    """
    words = []
    for entry in line_dict['word_dict_list']:
        words.append(entry['word'])
    return words
def is_title_line(line_dict, title_word_list):
    """Tell whether a line contains every word of the expected table header.

    Args:
        line_dict: Line dict as accepted by ``get_word_list``.
        title_word_list: Header titles; each title is split on whitespace
            and every resulting piece must appear on the line as a word.

    Returns:
        ``True`` when all pieces of all titles are among the line's words
        (also when ``title_word_list`` is empty), ``False`` otherwise.
    """
    words_on_line = get_word_list(line_dict)
    # Titles may be multi-word, so compare their whitespace-separated pieces.
    return all(
        piece in words_on_line
        for title in title_word_list
        for piece in title.split()
    )
def get_title_line(line_dict_list, title_word_list):
    """Find the first line that qualifies as the table header.

    Args:
        line_dict_list: Line dicts to scan, in order.
        title_word_list: Header titles forwarded to ``is_title_line``.

    Returns:
        The first line dict for which ``is_title_line`` is true, or ``None``
        when no line matches.
    """
    matching_lines = (
        candidate
        for candidate in line_dict_list
        if is_title_line(candidate, title_word_list)
    )
    return next(matching_lines, None)
def get_word(title_x, line_dict):
    """Pick the word of a line that starts inside a header column's x-span.

    Args:
        title_x: ``(x0, x1)`` horizontal span of a header word.
        line_dict: Mapping with a ``'word_dict_list'`` entry; each item has
            a ``'word'`` and an ``'x'`` pair whose first element is the
            word's left edge.

    Returns:
        The first word whose left edge lies within ``[x0, x1]`` (inclusive),
        or ``None`` when no word on the line does.
    """
    left, right = title_x
    word_starts = (
        (entry['word'], entry['x'][0])
        for entry in line_dict['word_dict_list']
    )
    return next(
        (text for text, start in word_starts if left <= start <= right),
        None,
    )
def get_row_list(pdf_path, title_word_list):
    """Extract a table from a PDF as a list of rows.

    The header is the first line containing every word of
    ``title_word_list``. Every later line becomes one row: for each header
    word, the cell is the word of that line whose left edge falls inside the
    header word's horizontal span (see ``get_word``).

    Args:
        pdf_path: Path of the PDF file.
        title_word_list: Titles identifying the header line.

    Returns:
        A list of rows. The first row is the list of words of the header
        line; each following row has one cell per header word, ``None``
        where no word matched the column. An empty list is returned when no
        header line is found.
    """
    line_dict_list = get_line_dict_list(pdf_path)
    title_line = get_title_line(line_dict_list, title_word_list)
    if title_line is None:
        # No header in the document: report "no table" instead of failing
        # with an opaque TypeError when indexing None below.
        return []

    header_word_dicts = title_line['word_dict_list']
    row_list = [get_word_list(title_line)]

    # Only the lines after the header can be data rows.
    title_index = line_dict_list.index(title_line)
    for line_dict in line_dict_list[title_index + 1:]:
        row_list.append([
            get_word(header_word_dict['x'], line_dict)
            for header_word_dict in header_word_dicts
        ])
    return row_list
if __name__ == "__main__":
    # Local imports: only the script entry point needs them.
    import csv
    import sys

    # Example inputs: adapt the file name and header titles to your PDF.
    pdf_path = "my_filename.pdf"
    title_word_list = ["name", "surname", "whatever"]
    row_list = get_row_list(pdf_path, title_word_list)

    # Emit the extracted table as CSV on stdout instead of discarding it;
    # cells with no matching word (None) are written as empty fields.
    csv.writer(sys.stdout, lineterminator="\n").writerows(row_list)
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.