Python: Dataframe with Start Date and End Date, Decompress to 1 Date Field

Question

I have a dataframe like this *EDITED

StartDate EndDate Company Location
2019-01-15  2019-01-31  1.0 121.0
2019-02-01  2020-03-10  1.0 136.0
2006-10-02  2020-03-10  2.0 136.0
2003-07-31  2020-03-10  2.0 321.0
2010-11-03  2020-03-10  3.0 322.0
2013-02-01  2017-02-07  4.0 375.0
2017-02-08  2019-01-14  4.0 375.0
2019-01-15  2019-04-29  4.0 375.0
2019-04-30  2020-03-10  4.0 375.0

As mentioned in this link: Pandas: decompress date range to individual dates, I want to decompress the data so that there is only one date field. I followed the solution there step by step. However, when I try to group by and resample, I get this error: ValueError: cannot reindex a non-unique index with a method or limit. What is the reason this occurs?

To be more clear, this is my code (the index of the original dataframe is just the normal index 1, 2, 3, ...):

# Load the company/location intervals from a parquet file.
# NOTE(review): `read_parquet` is called unqualified here while the rest of the
# snippet uses the `pd.` prefix -- presumably it was imported from pandas; confirm.
df=read_parquet('company_location.parquet')
# Keep only the four columns used below.
df=df[['COMPANY','STARTDATE','ENDDATE','LOCATION']]
# Parse both interval boundaries into datetimes.
df['STARTDATE']=pd.to_datetime(df['STARTDATE'])
df['ENDDATE']=pd.to_datetime(df['ENDDATE'])
# Drop every row that has a missing value in any of the kept columns.
df=df.dropna(axis=0,how='any')
# Give each remaining row a sequential id (0..len-1) so that its start record
# and end record can be paired up again after the concat below.
df['rows']=range(len(df))
# One frame of start records and one of end records, both exposing the
# boundary under the common column name DATE.
starts=df[['COMPANY','STARTDATE','LOCATION','rows']].rename(columns={'STARTDATE':'DATE'})
ends=df[['COMPANY','ENDDATE','LOCATION','rows']].rename(columns={'ENDDATE':'DATE'})
# Stack them: every original row now appears twice, with the same index label
# and the same `rows` value.
df_decomp=pd.concat([starts,ends])
# Append `rows` to the existing index, giving two index levels:
# (original index, rows).
df_decomp=df_decomp.set_index('rows', append=True)
# NOTE: sort_index() returns a new object by default and the result is not
# assigned here, so df_decomp itself is left in its original order.
df_decomp.sort_index()

Everything is fine until here.

Then, when I run this line, I get an error:

# Group by both index levels (original index, rows), so each group holds the
# start record and the end record of one original row; then index the group by
# DATE and upsample to daily frequency, forward-filling the gaps.
# NOTE(review): likely cause of the ValueError shown below -- when a row has
# STARTDATE == ENDDATE, both records in its group carry the same DATE, so the
# DATE index is non-unique and the reindex performed by the upsampling step
# refuses to fill with a method. Confirm against the real data.
df_decomp=df_decomp.groupby(level=[0,1]).apply(lambda x: x.set_index('DATE').resample('D').fillna(method='pad'))

The error is: (this is Jupyter Notebook)

ValueError                                Traceback (most recent call last)
/usr/local/share/anaconda3/lib/python3.7/site-packages/pandas/core/groupby/groupby.py in apply(self, func, *args, **kwargs)
    688             try:
--> 689                 result = self._python_apply_general(f)
    690             except Exception:

/usr/local/share/anaconda3/lib/python3.7/site-packages/pandas/core/groupby/groupby.py in _python_apply_general(self, f)
    706         keys, values, mutated = self.grouper.apply(f, self._selected_obj,
--> 707                                                    self.axis)
    708 

/usr/local/share/anaconda3/lib/python3.7/site-packages/pandas/core/groupby/ops.py in apply(self, f, data, axis)
    189             group_axes = _get_axes(group)
--> 190             res = f(group)
    191             if not _is_indexed_like(res, group_axes):

<ipython-input-29-e5d0ce53cd1c> in <lambda>(x)
----> 1 rep_movement_decomp=rep_movement_decomp.groupby(level=[0,1]).apply(lambda x: x.set_index('DATE').resample('D').fillna(method='pad'))

/usr/local/share/anaconda3/lib/python3.7/site-packages/pandas/core/resample.py in fillna(self, method, limit)
    759         """
--> 760         return self._upsample(method, limit=limit)
    761 

/usr/local/share/anaconda3/lib/python3.7/site-packages/pandas/core/resample.py in _upsample(self, method, limit, fill_value)
   1072             result = obj.reindex(res_index, method=method,
-> 1073                                  limit=limit, fill_value=fill_value)
   1074 

/usr/local/share/anaconda3/lib/python3.7/site-packages/pandas/util/_decorators.py in wrapper(*args, **kwargs)
    196         def wrapper(*args, **kwargs):
--> 197             return func(*args, **kwargs)
    198 

/usr/local/share/anaconda3/lib/python3.7/site-packages/pandas/core/frame.py in reindex(self, *args, **kwargs)
   3808         kwargs.pop('labels', None)
-> 3809         return super(DataFrame, self).reindex(**kwargs)
   3810 

/usr/local/share/anaconda3/lib/python3.7/site-packages/pandas/core/generic.py in reindex(self, *args, **kwargs)
   4355         return self._reindex_axes(axes, level, limit, tolerance, method,
-> 4356                                   fill_value, copy).__finalize__(self)
   4357 

/usr/local/share/anaconda3/lib/python3.7/site-packages/pandas/core/frame.py in _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy)
   3740             frame = frame._reindex_index(index, method, copy, level,
-> 3741                                          fill_value, limit, tolerance)
   3742 

/usr/local/share/anaconda3/lib/python3.7/site-packages/pandas/core/frame.py in _reindex_index(self, new_index, method, copy, level, fill_value, limit, tolerance)
   3748                                                 level=level, limit=limit,
-> 3749                                                 tolerance=tolerance)
   3750         return self._reindex_with_indexers({0: [new_index, indexer]},

/usr/local/share/anaconda3/lib/python3.7/site-packages/pandas/core/indexes/base.py in reindex(self, target, method, level, limit, tolerance)
   3137                     if method is not None or limit is not None:
-> 3138                         raise ValueError("cannot reindex a non-unique index "
   3139                                          "with a method or limit")

ValueError: cannot reindex a non-unique index with a method or limit

During handling of the above exception, another exception occurred:

ValueError                                Traceback (most recent call last)
<ipython-input-29-e5d0ce53cd1c> in <module>()
----> 1 rep_movement_decomp=rep_movement_decomp.groupby(level=[0,1]).apply(lambda x: x.set_index('DATE').resample('D').fillna(method='pad'))

/usr/local/share/anaconda3/lib/python3.7/site-packages/pandas/core/groupby/groupby.py in apply(self, func, *args, **kwargs)
    699 
    700                 with _group_selection_context(self):
--> 701                     return self._python_apply_general(f)
    702 
    703         return result

/usr/local/share/anaconda3/lib/python3.7/site-packages/pandas/core/groupby/groupby.py in _python_apply_general(self, f)
    705     def _python_apply_general(self, f):
    706         keys, values, mutated = self.grouper.apply(f, self._selected_obj,
--> 707                                                    self.axis)
    708 
    709         return self._wrap_applied_output(

/usr/local/share/anaconda3/lib/python3.7/site-packages/pandas/core/groupby/ops.py in apply(self, f, data, axis)
    188             # group might be modified
    189             group_axes = _get_axes(group)
--> 190             res = f(group)
    191             if not _is_indexed_like(res, group_axes):
    192                 mutated = True

<ipython-input-29-e5d0ce53cd1c> in <lambda>(x)
----> 1 rep_movement_decomp=rep_movement_decomp.groupby(level=[0,1]).apply(lambda x: x.set_index('DATE').resample('D').fillna(method='pad'))

/usr/local/share/anaconda3/lib/python3.7/site-packages/pandas/core/resample.py in fillna(self, method, limit)
    758         2018-01-01 02:00:00  6.0  5
    759         """
--> 760         return self._upsample(method, limit=limit)
    761 
    762     @Appender(_shared_docs['interpolate'] % _shared_docs_kwargs)

/usr/local/share/anaconda3/lib/python3.7/site-packages/pandas/core/resample.py in _upsample(self, method, limit, fill_value)
   1071         else:
   1072             result = obj.reindex(res_index, method=method,
-> 1073                                  limit=limit, fill_value=fill_value)
   1074 
   1075         result = self._apply_loffset(result)

/usr/local/share/anaconda3/lib/python3.7/site-packages/pandas/util/_decorators.py in wrapper(*args, **kwargs)
    195         @wraps(func)
    196         def wrapper(*args, **kwargs):
--> 197             return func(*args, **kwargs)
    198 
    199         if not PY2:

/usr/local/share/anaconda3/lib/python3.7/site-packages/pandas/core/frame.py in reindex(self, *args, **kwargs)
   3807         kwargs.pop('axis', None)
   3808         kwargs.pop('labels', None)
-> 3809         return super(DataFrame, self).reindex(**kwargs)
   3810 
   3811     @Appender(_shared_docs['reindex_axis'] % _shared_doc_kwargs)

/usr/local/share/anaconda3/lib/python3.7/site-packages/pandas/core/generic.py in reindex(self, *args, **kwargs)
   4354         # perform the reindex on the axes
   4355         return self._reindex_axes(axes, level, limit, tolerance, method,
-> 4356                                   fill_value, copy).__finalize__(self)
   4357 
   4358     def _reindex_axes(self, axes, level, limit, tolerance, method, fill_value,

/usr/local/share/anaconda3/lib/python3.7/site-packages/pandas/core/frame.py in _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy)
   3739         if index is not None:
   3740             frame = frame._reindex_index(index, method, copy, level,
-> 3741                                          fill_value, limit, tolerance)
   3742 
   3743         return frame

/usr/local/share/anaconda3/lib/python3.7/site-packages/pandas/core/frame.py in _reindex_index(self, new_index, method, copy, level, fill_value, limit, tolerance)
   3747         new_index, indexer = self.index.reindex(new_index, method=method,
   3748                                                 level=level, limit=limit,
-> 3749                                                 tolerance=tolerance)
   3750         return self._reindex_with_indexers({0: [new_index, indexer]},
   3751                                            copy=copy, fill_value=fill_value,

/usr/local/share/anaconda3/lib/python3.7/site-packages/pandas/core/indexes/base.py in reindex(self, target, method, level, limit, tolerance)
   3136                 else:
   3137                     if method is not None or limit is not None:
-> 3138                         raise ValueError("cannot reindex a non-unique index "
   3139                                          "with a method or limit")
   3140                     indexer, missing = self.get_indexer_non_unique(target)

ValueError: cannot reindex a non-unique index with a method or limit

Answer 1

I was able to follow Pandas: decompress date range to individual dates and I did not get any errors on your data set. See the code below.

import pandas as pd

# Sample data: one row per (company, location) interval, with inclusive
# start and end dates.
df = pd.DataFrame([['2019-01-15','2019-01-31','A',121.0],
                   ['2019-02-01','2020-03-10','A',136.0],
                   ['2006-10-02','2020-03-10','B',136.0],
                   ['2003-07-31','2020-03-10','B',321.0],
                   ['2010-11-03','2020-03-10','C',322.0],
                   ['2013-02-01','2017-02-07','D',375.0],
                   ['2017-02-08','2019-01-14','D',375.0],
                   ['2019-01-15','2019-04-29','D',375.0],
                   ['2019-04-30','2020-03-10','D',375.0]],
                  columns=['StartDate','EndDate','Company','Location'])
# Parse the interval boundaries so they can later form a DatetimeIndex.
df['StartDate'] = pd.to_datetime(df['StartDate'])
df['EndDate'] = pd.to_datetime(df['EndDate'])
df = df.set_index('Company')
# Unique id per interval; it keeps intervals of the same company apart when
# grouping below.
df['row'] = range(len(df))
print(df)

# Split every interval into a start record and an end record that share the
# column name 'Date'.
starts = df[['StartDate', 'Location', 'row']].rename(columns={'StartDate': 'Date'})
ends = df[['EndDate', 'Location', 'row']].rename(columns={'EndDate': 'Date'})
# A single-day interval (StartDate == EndDate) produces two identical records.
# Dropping the duplicate keeps the per-interval 'Date' index unique; otherwise
# the upsampling below raises
# "cannot reindex a non-unique index with a method or limit".
df_decomp = pd.concat([starts, ends]).drop_duplicates()
df_decomp = df_decomp.set_index('row', append=True)
# sort_index() returns a new frame, so the result has to be assigned.
df_decomp = df_decomp.sort_index()
print(df_decomp)

# For each (Company, row) interval: index by date, upsample to one row per day
# and forward-fill the start record's values up to the end date.
# Resampler.ffill() replaces the deprecated Resampler.fillna(method='pad').
df_decomp = df_decomp.groupby(level=[0, 1]).apply(
    lambda x: x.set_index('Date').resample('D').ffill())
# The helper 'row' level is no longer needed; keep (Company, Date).
df_decomp = df_decomp.reset_index(level=1, drop=True)
print(df_decomp.loc['D'])

Answer 2

    StartDate   EndDate Company Location
0   1/15/2019   1/31/2019   A   121
1   2/1/2019    3/10/2020   A   136
2   10/2/2006   3/10/2020   B   136
3   7/31/2003   3/10/2020   B   321
4   11/3/2010   3/10/2020   C   322
5   2/7/2017    2/7/2017    D   375
6   2/8/2017    1/14/2019   D   375
7   1/15/2019   4/29/2019   D   375
8   4/30/2019   3/10/2020   D   375

Not sure what's going wrong, but copy the above and run the below:

import pandas as pd
# Build the frame from the table text currently on the system clipboard
# (copy the table shown above first).
df = pd.read_clipboard()

Then add the following two lines to the code from the post you linked:

# Convert both boundary columns to datetimes so that they can serve as the
# datetime index that the daily resample step operates on.
df['StartDate'] = pd.to_datetime(df['StartDate'])
df['EndDate'] = pd.to_datetime(df['EndDate'])

Run the below, and it should work:

# Parse the interval boundaries so they can later form a DatetimeIndex.
df['StartDate'] = pd.to_datetime(df['StartDate'])
df['EndDate'] = pd.to_datetime(df['EndDate'])
# Unique id per interval, used to pair each start record with its end record.
df['row'] = range(len(df))
# Split every interval into a start record and an end record that share the
# column name 'date'.
starts = df[['StartDate', 'Location', 'Company', 'row']].rename(columns={'StartDate': 'date'})
ends = df[['EndDate', 'Location', 'Company', 'row']].rename(columns={'EndDate': 'date'})
# An interval whose start and end fall on the same day yields two identical
# records; drop_duplicates() removes one so the per-interval 'date' index stays
# unique (a non-unique index makes the upsampling below raise
# "cannot reindex a non-unique index with a method or limit").
df_decomp = pd.concat([starts, ends]).drop_duplicates()
df_decomp = df_decomp.set_index('row', append=True)
# sort_index() returns a new frame, so the result has to be assigned.
df_decomp = df_decomp.sort_index()
# For each (original index, row) interval: index by date, upsample to one row
# per day and forward-fill. Resampler.ffill() replaces the deprecated
# Resampler.fillna(method='pad').
df_decomp = df_decomp.groupby(level=[0, 1]).apply(
    lambda x: x.set_index('date').resample('D').ffill())
# Drop the helper 'row' level from the resulting index.
df_decomp = df_decomp.reset_index(level=1, drop=True)
df_decomp

Python: Dataframe with Start Date and End Date, Decompress to 1 Date Field

Question

2 answers

solution1
2 2020-03-11 10:28:43

solution2
1 ACCEPTED 2020-03-11 10:21:41

Python: Dataframe with Start Date and End Date, Decompress to 1 Date Field

Question

2 answers

solution1 2 2020-03-11 10:28:43

solution2 1 ACCEPTED 2020-03-11 10:21:41

solution1
2 2020-03-11 10:28:43

solution2
1 ACCEPTED 2020-03-11 10:21:41