I am trying to loop through some Excel spreadsheets and pull information from certain tabs using regex. I have written the following code so far:
import pandas as pd
import os
import re
root = r"my_dir"
agg_df = pd.DataFrame()
for directory, subdirectory, files in os.walk(root):
for file in files:
if file.endswith('.xlsm'):
filepath = os.path.join(directory, file)
xls = pd.ExcelFile(filepath)
for i in xls.sheet_names:
if re.search(r'Apples', i):
df_temp = pd.read_excel(filepath, sheet_names=i)
df_temp['Filepath'] = filepath
df_temp['Sheet_Name'] = i
agg_df = agg_df.append(df_temp)
elif re.search(r'Oranges', i):
df_temp = pd.read_excel(filepath, sheet_names=i)
df_temp['Filepath'] = filepath
df_temp['Sheet_Name'] = i
agg_df = agg_df.append(df_temp)
elif re.search('Grapes', i):
df_temp = pd.read_excel(filepath, sheet_names=i)
df_temp['Filepath'] = filepath
df_temp['Sheet_Name'] = i
agg_df = agg_df.append(df_temp)
elif re.search(r'Tomatoes', i):
df_temp = pd.read_excel(filepath, sheet_names=i)
df_temp['Filepath'] = filepath
df_temp['Sheet_Name'] = i
agg_df = agg_df.append(df_temp)
elif re.search(r'Peaches', i):
df_temp = pd.read_excel(filepath, sheet_names=i)
df_temp['Filepath'] = filepath
df_temp['Sheet_Name'] = i
agg_df = agg_df.append(df_temp)
elif re.search(r'Pears', i):
df_temp = pd.read_excel(filepath, sheet_names=i)
df_temp['Filepath'] = filepath
df_temp['Sheet_Name'] = i
agg_df = agg_df.append(df_temp)
elif re.search(r'Bananas', i):
df_temp = pd.read_excel(filepath, sheet_names=i)
df_temp['Filepath'] = filepath
df_temp['Sheet_Name'] = i
agg_df = agg_df.append(df_temp)
elif re.search(r'Mangos', i):
df_temp = pd.read_excel(filepath, sheet_names=i)
df_temp['Filepath'] = filepath
df_temp['Sheet_Name'] = i
agg_df = agg_df.append(df_temp)
However, this is pulling the first tab and not the tabs I'm trying to specify.
You are passing parameter sheet_names , but the pandas documentation is mentioning the parameter is sheet_name . See read_excel() function.
I did some changes in your code, try execute:
import os
import pandas as pd
def parse_excel(sheet_name: str, abs_file_path: str):
df = pd.read_excel(abs_file_path, sheet_name=sheet_name)
df['Filepath'] = abs_file_path
df['Sheet_Name'] = sheet_name
return df
root = "my_dir"
agg_df = pd.DataFrame()
for directory, subdirectory, files in os.walk(root):
for file in files:
if file.endswith('.xlsm'):
file_path = os.path.join(directory, file)
xls = pd.ExcelFile(file_path)
sheet_names = ['Apples', 'Oranges', 'Grapes', 'Tomatoes', 'Peaches',
'Pears', 'Bananas', 'Mangos']
available_sheets = [sheet_name for sheet_name in xls.sheet_names if sheet_name in sheet_names]
for available_sheet in available_sheets:
agg_df = agg_df.append(parse_excel(available_sheet, file_path))
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.