How to read & combine specific rows from multiple excel files into a single file using python?

Question

I have multiple excel files which are in the below format:

How can we merge row 2 where column A has value Orange from each excel file into a single excel with below format:

I am trying the below code but it fails, could you please suggest how to generate the summary:

# Walks PATH, collects every '.xlsx' file whose name contains 'flatReport',
# reads each file's 'Summary' sheet and tries to accumulate the data in
# df_report, which is finally printed and written to result_file_path.
# NOTE(review): the body below has lost its indentation in the paste; as shown
# it is not inside the function. PATH, result_file_path, os and pd are not
# defined in this snippet and must come from the surrounding module.
def excel_summary_gen():
# this is the extension you want to detect
extension = '.xlsx'
# full paths of every matching workbook found so far
file_list = []
# number of matching workbooks processed so far
i = 0
for root, dirs_list, files_list in os.walk(PATH):
    for file_name in files_list:
        # keep only .xlsx files whose name contains 'flatReport'
        if os.path.splitext(file_name)[-1] == extension and 'flatReport' in file_name:
            file_name_path = os.path.join(root, file_name)
            # print(file_name_path)  # This is the full path of the filter file
            file_list.append(file_name_path)
            # name of the directory that directly contains the workbook
            file_dir = os.path.dirname(file_name_path)
            folder_name = os.path.basename(file_dir)
            print(folder_name)
            if i < 1:
                # first matching workbook: only the first data row of 'Summary' is read
                df_report = pd.read_excel(file_name_path, sheet_name='Summary', nrows=1)
                i = i + 1
            else:
                # later workbooks: the whole 'Summary' sheet is read
                df_r = pd.read_excel(file_name_path, sheet_name='Summary')
                # overwrite the first cell of the first data row with the folder name
                df_r.iloc[0, 0] = folder_name
                # renaming the column by index
                # df.columns.values[0:1] = [folder_name]
                # print(df_r.iloc[0, 0])
                # NOTE(review): likely the failing line - df_r[2] looks up a
                # COLUMN labelled 2, not row 2, and the result is assigned as a
                # new column of df_report rather than appended as a row.
                df_report[i] = df_r[2]
                i = i + 1
# NOTE(review): df_report is unbound here if no matching workbook was found
print(df_report)

# saving the dataframe
df_report.to_excel(result_file_path, index=True)

Answer 1

You don't say what is failing; however, I assume it's selecting/copying the required row(s).
While pandas can be useful for merging datasets, in this case I don't see that it's necessary; you can achieve what you want with openpyxl alone.
Below is skeleton code for opening an Excel file, searching for the required row and copying the cells from that row to the summary workbook.
If it's what you want, you can incorporate it into your function at the point where a source workbook is opened.
Code operation:
Both the source and destination workbooks are opened as wb1 & wb2 respectively with source and destination sheets as ws1 & ws2.

import openpyxl as op


# Copy every row of the source sheet whose column-3 cell contains 'orange'
# (case-insensitive) into the next free row of the summary workbook.

# Source workbook/sheet that is searched for matching rows
wb1 = op.load_workbook('Book1.xlsx')
ws1 = wb1['Sheet1']

# Destination (summary) workbook/sheet; load_workbook requires the file to
# already exist and to contain a sheet named 'Sheet1'
result_file_path = 'xl_out.xlsx'
wb2 = op.load_workbook(result_file_path)
ws2 = wb2['Sheet1']

# max_row is 1 both for an empty sheet and for a sheet with exactly one
# populated row, so look at the values in row 1 to tell the two apart.
# Start at row 1 only when the sheet is really empty; otherwise append after
# the last used row so an existing first row is never overwritten.
first_row_values = next(
    ws2.iter_rows(min_row=1, max_row=1, values_only=True), ()
)
if ws2.max_row == 1 and all(value is None for value in first_row_values):
    new_row = 1
else:
    new_row = ws2.max_row + 1

# iter_rows yields one tuple per row; limited to a single column, each tuple
# holds exactly one cell
for cell in ws1.iter_rows(min_col=3, max_col=3):
    search_value = cell[0].value
    print(search_value)
    # Empty cells hold None and numeric cells hold numbers; neither has
    # .lower(), so only text cells are tested for 'orange' (case-insensitive)
    if isinstance(search_value, str) and 'orange' in search_value.lower():
        print("Match, copying cells to new workbook")
        # Add 'File #' to first cell in destination row using row number as #
        ws2.cell(row=new_row, column=1).value = 'File {0}'.format(new_row)
        for x in range(2, ws1.max_column + 1):
            # Read each cell from col 2 to last used col (destination col 1
            # is reserved for the 'File #' label)
            cell_value = ws1.cell(row=cell[0].row, column=x)

            # Write last read cell to the same column of the destination row
            ws2.cell(row=new_row, column=x).value = cell_value.value

        # Increment to next unused row
        new_row += 1

wb2.save(result_file_path)

How to read & combine specific rows from multiple excel files into a single file using python?

Question

1 answer

solution1
1 ACCEPTED 2022-07-30 02:05:07

How to read & combine specific rows from multiple excel files into a single file using python?

Question

1 answer

solution1 1 ACCEPTED 2022-07-30 02:05:07

solution1
1 ACCEPTED 2022-07-30 02:05:07