
Upload multiple Excel workbooks and concatenate - dcc.Upload, Plotly Dash

I'm developing an interactive dashboard using Plotly Dash that takes an Excel workbook as input, formats the data into a pandas dataframe, and displays it as a bar graph.

It works well with a single workbook, but when I add a variable to allow multiple workbooks to be loaded, concatenated into one long dataframe, and visualized, I run into a persistence issue: the data is kept after the browser is refreshed, even though storage_type is set to memory per the documentation.

I suspect this is because I have declared the list variable dfmeans = [] outside of the callback functions, but that's the only place I have been able to get it to work. When I place it inside the parse_contents() function, the data is replaced each time I add a new workbook.

Has anyone out there successfully implemented the Dash upload component dcc.Upload taking multiple workbooks/Excel files as input? The documentation on uploading more than one file is really sparse from what I can find. Full code here:

import base64
import datetime
import io
import re

import dash
from dash.dependencies import Input, Output, State
import dash_core_components as dcc
import dash_html_components as html
import dash_table
import plotly.express as px

import pandas as pd
from read_workbook import *

import pdb

external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css']

app = dash.Dash(__name__, external_stylesheets=external_stylesheets,
                suppress_callback_exceptions=True)

dfmeans = []  # module-level list that parse_contents() appends to

app.layout = html.Div([ # this code section taken from Dash docs https://dash.plotly.com/dash-core-components/upload
    dcc.Store(id='stored-data', storage_type='memory'),
    dcc.Upload(
        id='upload-data',
        children=html.Div([
            'Drag and Drop or ',
            html.A('Select Files')
        ]),
        style={
            'width': '100%',
            'height': '60px',
            'lineHeight': '60px',
            'borderWidth': '1px',
            'borderStyle': 'dashed',
            'borderRadius': '5px',
            'textAlign': 'center',
            'margin': '10px'
        },
        # Allow multiple files to be uploaded
        multiple=True
    ),
    html.Div(id='output-div'),
    html.Div(id='output-datatable'),
])

def parse_contents(contents, filename, date):
    content_type, content_string = contents.split(',')
    
    decoded = base64.b64decode(content_string)
    try:
        workbook_xl = pd.ExcelFile(io.BytesIO(decoded))
        # print(workbook_xl)
        
        # aggregates all months' data into a single dataframe
        def get_all_months(workbook_xl):
            months = ['July', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'June']
            xl_file = workbook_xl  # already a pd.ExcelFile, no need to re-open it
            
            months_data = []
            for month in months:
                months_data.append(get_month_dataframe(xl_file, month))
                print(months_data)
            return pd.concat(months_data)
        
        #run get all months function and produce behavior dataframe 
        df = get_all_months(workbook_xl)

        #convert episode values to float and aggregate mean per shift 
        df['value'] = df['value'].astype(float)
        dfmean = df.groupby(['Date', 'variable'], sort=False)['value'].mean().round(2).reset_index()
        dfmeans.append(dfmean)  # appends to the module-level list, so data accumulates across uploads
        dfmean = pd.concat(dfmeans)

    
    except Exception as e:
        print(e)
        return html.Div([
            'There was an error processing this file.'
        ])

    return html.Div([
        html.H5(filename),
        # html.H6(datetime.datetime.fromtimestamp(date)),
        
        dash_table.DataTable(
            data=dfmean.to_dict('records'),
            columns=[{'name': i, 'id': i} for i in dfmean.columns],
            page_size=15
        ),
        dcc.Store(id='stored-data', data=dfmean.to_dict('records')),
        
        html.Hr(),  # horizontal line

        # For debugging, display the raw contents provided by the web browser
        html.Div('Raw Content'),
        html.Pre(contents[0:200] + '...', style={
            'whiteSpace': 'pre-wrap',
            'wordBreak': 'break-all'
        })
    ])

@app.callback(Output('output-datatable', 'children'),
              Input('upload-data', 'contents'),
              State('upload-data', 'filename'),
              State('upload-data', 'last_modified'))
def update_output(list_of_contents, list_of_names, list_of_dates):
    
    if list_of_contents is not None:
        children = [
            parse_contents(c, n, d) for c, n, d in
            zip(list_of_contents, list_of_names, list_of_dates)]
        return children


@app.callback(Output('output-div', 'children'),
              Input('stored-data', 'data'))
def make_graphs(data):
    
    df_agg = pd.DataFrame(data)
    
    # df_agg['Date'] = pd.to_datetime(df_agg['Date'])
    
    if df_agg.empty:
        print("Dataframe empty")
    else:
        bar_fig = px.bar(df_agg, x='Date', y='value', color='variable', barmode='group')
        return dcc.Graph(figure=bar_fig)
    
if __name__ == '__main__':
    app.run_server(debug=True)

Defining dfmeans outside the scope of your callbacks will definitely make the data persist until you kill the server, because it is treated as a global variable. According to the Dash documentation:

One of the core Dash principles explained in the Getting Started Guide on Callbacks is that Dash Callbacks must never modify variables outside of their scope. It is not safe to modify any global variables. This chapter explains why and provides some alternative patterns for sharing state between callbacks.

One alternative would be to create a global dcc.Store component to hold dfmeans and pass its state to update_output, so that it gets appended to every time a new file is uploaded:

@app.callback(Output('output-datatable', 'children'),
              Output('global-stored-data', 'data'),
              Input('upload-data', 'contents'),
              State('upload-data', 'filename'),
              State('upload-data', 'last_modified'),
              State('global-stored-data', 'data'))
def update_output(list_of_contents, list_of_names, list_of_dates, global_stored_data):
    # rebuild the accumulated dataframes from the store (it holds plain lists of records)
    dfmeans = [pd.DataFrame(data) for data in (global_stored_data or [])]
    if list_of_contents is not None:
        children = [
            parse_contents(c, n, d, dfmeans) for c, n, d in
            zip(list_of_contents, list_of_names, list_of_dates)]
        global_stored_data = [df.to_dict('records') for df in dfmeans]
        return children, global_stored_data
    else:
        return dash.no_update, dash.no_update

The global store should be created with storage_type='memory', so that its contents do not persist when you refresh the page.
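
For this to work, the layout needs a matching store component. A minimal sketch, with the upload styling omitted and the single global store replacing the original stored-data store (see the note below on why one store is enough); data=[] gives the first upload an empty list to extend:

app.layout = html.Div([
    # memory storage is cleared when the page is refreshed
    dcc.Store(id='global-stored-data', storage_type='memory', data=[]),
    dcc.Upload(
        id='upload-data',
        children=html.Div(['Drag and Drop or ', html.A('Select Files')]),
        multiple=True  # allow multiple files to be uploaded
    ),
    html.Div(id='output-div'),
    html.Div(id='output-datatable'),
])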


That being said, I noticed that children, the output of update_output, is a list of html.Div(), each returned by parse_contents. However, each Div contains dcc.Store(id='stored-data', data=dfmean.to_dict('records')), so multiple dcc.Store instances with the same id stored-data end up in the layout at once; doesn't that generate an error? Unless I misunderstood your layout, you have only one graph (with the contents of multiple data files overlaid in it), so I think you should revise that part of the code to use only one dcc.Store for the concatenated data, as suggested above and sketched below.
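
To illustrate, here is a sketch of how parse_contents could be revised: it takes the accumulated list as a parameter instead of relying on a global, and no longer emits a per-file dcc.Store (this assumes get_all_months has been moved out to module level):

def parse_contents(contents, filename, date, dfmeans):
    content_type, content_string = contents.split(',')
    decoded = base64.b64decode(content_string)
    try:
        workbook_xl = pd.ExcelFile(io.BytesIO(decoded))
        df = get_all_months(workbook_xl)
        df['value'] = df['value'].astype(float)
        dfmean = df.groupby(['Date', 'variable'], sort=False)['value'].mean().round(2).reset_index()
        dfmeans.append(dfmean)  # mutates the caller's list; update_output writes it back to the store
        combined = pd.concat(dfmeans)
    except Exception as e:
        print(e)
        return html.Div(['There was an error processing this file.'])

    return html.Div([
        html.H5(filename),
        dash_table.DataTable(
            data=combined.to_dict('records'),
            columns=[{'name': i, 'id': i} for i in combined.columns],
            page_size=15
        ),
        html.Hr(),  # no dcc.Store here -- the concatenated data lives in global-stored-data
    ])

make_graphs would then listen to Input('global-stored-data', 'data') instead of 'stored-data', and since the store holds one list of records per file, it can rebuild the full frame with pd.concat([pd.DataFrame(d) for d in data]) before plotting.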
