[英]Loop through folders and extract Excel Columns with Python
您好，我的数据保存在多个 Excel 工作表中，这些文件分布在不同的子文件夹里。到目前为止，我已经写出了一段代码，可以提取所需的列并把它们保存到字典中，代码如下：
import os
import pandas as pd
# Root folder whose entries are listed by listDir() below.
# Written as a raw string so the Windows backslashes are kept literally.
FOLDER_PATH = r'C:\Users\Sarah\Desktop\test'
def listDir(dir):
    """Print the name and absolute path of every entry directly inside *dir*.

    Only the immediate children of *dir* are listed; subfolders are not
    descended into.

    Args:
        dir: Path of the folder to list. The name shadows the builtin
            ``dir`` but is kept so existing keyword callers keep working.

    Raises:
        OSError: Propagated from ``os.listdir`` when *dir* cannot be listed
            (for example, it does not exist or is not a directory).
    """
    for filename in os.listdir(dir):
        print('File Name:' + filename)
        # Join before abspath so the result points at the entry itself,
        # not at a same-named entry in the current working directory.
        print('folder Path:' + os.path.abspath(os.path.join(dir, filename)))
# Show what is directly inside the root folder.
listDir(FOLDER_PATH)

# Widen pandas' console output so wide frames are not wrapped.
pd.set_option('display.width', 300)

# One workbook per month.
mosul_file = r'C:\Users\Sarah\Desktop\test\Months\March.xlsx'
mosul_file2 = r'C:\Users\Sarah\Desktop\test\Months\April.xlsx'
mosul_file3 = r'C:\Users\Sarah\Desktop\test\Months\May.xlsx'
mosul_file7 = r'C:\Users\Sarah\Desktop\test\Months\July.xlsx'

# Open each workbook once; the handles are reused for every read below.
xl = pd.ExcelFile(mosul_file)
xl2 = pd.ExcelFile(mosul_file2)
xl3 = pd.ExcelFile(mosul_file3)
xl7 = pd.ExcelFile(mosul_file7)

# Parse with the first three columns as a MultiIndex to inspect the headers.
# March takes its header from the second row, the others from the first.
mosul_df = xl.parse(0, header=[1], index_col=[0, 1, 2])
mosul_df2 = xl2.parse(0, header=[0], index_col=[0, 1, 2])
mosul_df3 = xl3.parse(0, header=[0], index_col=[0, 1, 2])
# NOTE(review): July is parsed from sheet index 1 here but read from sheet 0
# below -- confirm which sheet is intended.
mosul_df7 = xl7.parse(1, header=[0], index_col=[0, 1, 2])

# Read only columns C, F and G of the first sheet.
# The keyword is `index_col`; the former `index_clo` spelling made
# read_excel raise TypeError. From here on the mosul_file* names hold
# DataFrames instead of paths, as in the original script.
mosul_file = pd.read_excel(xl, sheet_name=0,
                           index_col=None, na_values=['NA'], usecols="C , F ,G")
mosul_file2 = pd.read_excel(xl2, sheet_name=0,
                            index_col=None, na_values=['NA'], usecols="C , F , G")
mosul_file3 = pd.read_excel(xl3, sheet_name=0,
                            index_col=None, na_values=['NA'], usecols="C , F , G")
mosul_file7 = pd.read_excel(xl7, sheet_name=0,
                            index_col=None, na_values=['NA'], usecols="C, F, G")

# Coerce every cell to a number (non-numeric cells become NaN), then drop the
# rows that contain NaN. The two steps are chained: previously the coerced
# frame was overwritten by dropna() on the raw frame, so the coercion was
# lost, and July was coerced from May's frame by mistake.
data_mosul_df = mosul_file.apply(pd.to_numeric, errors='coerce').dropna()
data_mosul_df2 = mosul_file2.apply(pd.to_numeric, errors='coerce').dropna()
data_mosul_df3 = mosul_file3.apply(pd.to_numeric, errors='coerce').dropna()
data_mosul_df7 = mosul_file7.apply(pd.to_numeric, errors='coerce').dropna()

# Convert each cleaned frame to a {column: {row_index: value}} dictionary.
datamosul1 = data_mosul_df.to_dict()
datamosul2 = data_mosul_df2.to_dict()
datamosul3 = data_mosul_df3.to_dict()
datamosul7 = data_mosul_df7.to_dict()
如何让它自动循环遍历所有文件夹和子文件夹? 谢谢
import os
from os import walk

import pandas as pd

# Root folder to search; set this to your own top-level folder.
path = './Results'

# Collect the path of every Excel workbook under `path`, subfolders included.
# Each name is joined with its `dirpath`: a bare file name cannot be opened
# unless the file happens to sit in the current working directory.
my_files = []
for (dirpath, dirnames, filenames) in walk(path):
    my_files.extend(
        os.path.join(dirpath, filename)
        for filename in filenames
        # Skip anything that is not a workbook so read_excel does not fail.
        if filename.lower().endswith(('.xlsx', '.xls'))
    )
print(my_files)

# One dictionary per workbook, in the order the files were found.
all_dicts_list = []
for file_name in my_files:
    # Read only columns C, F and G of the first sheet.
    # The keyword is `index_col`; `index_clo` raised TypeError.
    mosul_file = pd.read_excel(file_name, sheet_name=0,
                               index_col=None, na_values=['NA'], usecols="C , F ,G")
    # Coerce to numbers (non-numeric cells become NaN), then drop NaN rows.
    # Chained so that the coerced frame is the one that gets cleaned.
    data_mosul_df = mosul_file.apply(pd.to_numeric, errors='coerce').dropna()
    # Save to dictionary
    datamosul1 = data_mosul_df.to_dict()
    all_dicts_list.append(datamosul1)
# all dictionaries will be in all_dicts_list
如果我理解正确,您想从文件夹和子文件夹中获取所有文件名。 我希望以下代码对您有用,请将路径设置为您的根文件夹。
import os
from os import walk

# Root folder to search; set this to your own top-level folder.
path = './test'

# Gather every file under `path`, subfolders included. Each name is joined
# with the folder it was found in, so files with the same name in different
# subfolders stay distinguishable and every entry can be opened directly.
my_files = []
for (dirpath, dirnames, filenames) in walk(path):
    my_files.extend(os.path.join(dirpath, filename) for filename in filenames)
print(my_files)
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.