I have been attempting to turn 0 into 1 with pandas wherever the column name matches a year in each row's list of years. The last if
statement gives an error.
df
+-----------+------------+------+------+------+------+------+------+------+------+------+------+
| start | end | 2012 | 2013 | 2014 | 2015 | 2016 | 2017 | 2018 | 2019 | 2020 | 2021 |
+-----------+------------+------+------+------+------+------+------+------+------+------+------+
| 2017/1/29 | 2019/9/10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2012/1/30 | 2015/9/11 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2019/1/31 | 2021/05/08 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2016/2/1 | 2017/9/13 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2017/2/2 | 2019/9/14 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2017/2/3 | 2021/05/08 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2020/2/4 | 2020/9/16 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+-----------+------------+------+------+------+------+------+------+------+------+------+------+
# For each row, build the list of years from the start year to the end year
# (inclusive), then try to set every matching year column to 1.
for index, row in df.iterrows():
# 'start' and 'end' are sliced as strings here: the first four characters are the year
s_year = int(row['start'][:4])
e_year = int(row['end'][:4])
l_years = []
l_years.append(s_year)
while s_year < e_year:
s_year += 1
l_years.append(s_year)
for i in l_years:
for column_name in df.columns.values:
if i == column_name:
# This is the line that raises the KeyError shown below: df[index] looks up
# a *column* labelled `index` (0 on the first row), not a row
df[index][column_name] = 1
Error
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
/usr/local/lib/python3.7/dist-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
2897 try:
-> 2898 return self._engine.get_loc(casted_key)
2899 except KeyError as err:
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 0
The above exception was the direct cause of the following exception:
KeyError Traceback (most recent call last)
2 frames
/usr/local/lib/python3.7/dist-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
2898 return self._engine.get_loc(casted_key)
2899 except KeyError as err:
-> 2900 raise KeyError(key) from err
2901
2902 if tolerance is not None:
KeyError: 0
Let's try broadcasting year columns across rows to create a mask, then replace with 1 where mask is True:
# Parse the date strings so the .dt accessor is available
df['start'] = pd.to_datetime(df['start'])
df['end'] = pd.to_datetime(df['end'])

# Everything after 'start' and 'end' is a year column
year_columns = df.columns[2:]
# Year labels as a 1-D integer array (one entry per year column)
c = year_columns.to_numpy().astype(int)

# Start/end years reshaped to column vectors with [:, None], so comparing them
# against the 1-D year array broadcasts to one boolean per (row, year column)
start_years = df['start'].dt.year.to_numpy()[:, None]
end_years = df['end'].dt.year.to_numpy()[:, None]
m = (start_years <= c) & (c <= end_years)

# Write 1 wherever the mask is True; other cells keep their current value
df[year_columns] = df[year_columns].mask(m, 1)
print(df.to_string())
df:
start end 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021
0 2017-01-29 2019-09-10 0 0 0 0 0 1 1 1 0 0
1 2012-01-30 2015-09-11 1 1 1 1 0 0 0 0 0 0
2 2019-01-31 2021-05-08 0 0 0 0 0 0 0 1 1 1
3 2016-02-01 2017-09-13 0 0 0 0 1 1 0 0 0 0
4 2017-02-02 2019-09-14 0 0 0 0 0 1 1 1 0 0
5 2017-02-03 2021-05-08 0 0 0 0 0 1 1 1 1 1
6 2020-02-04 2020-09-16 0 0 0 0 0 0 0 0 1 0
In terms of correcting your implementation, try iterating over only year columns (excluding 'start' and 'end') as ints instead:
# Ensure Types are Correct
df['start'] = pd.to_datetime(df['start'])
df['end'] = pd.to_datetime(df['end'])

# Year column labels as ints, computed once instead of on every inner-loop pass.
# Position p in this array is column p + 2 of df, because 'start' and 'end'
# come first and were excluded.
year_labels = df.columns[2:].values.astype(int)

# iterrows() yields index *labels*, while .iloc needs integer *positions*;
# enumerate supplies the position so this also works when the index is not 0..n-1.
for row_pos, (_, row) in enumerate(df.iterrows()):
    # Get years from timestamps
    s_year = row['start'].year
    e_year = row['end'].year
    # Iterate Over Date Columns Only (Exclude Start and End)
    for col_idx, column_name in enumerate(year_labels):
        # Inclusive range check; a row whose start is after its end matches nothing
        if s_year <= column_name <= e_year:
            # Replace at row_pos, col_idx + 2 because 'start' and 'end'
            # were excluded so col_idx is off by 2
            df.iloc[row_pos, col_idx + 2] = 1
Or much more simply:
# Parse both date columns so each cell is a Timestamp
df['start'] = pd.to_datetime(df['start'])
df['end'] = pd.to_datetime(df['end'])

for idx, rec in df.iterrows():
    # String labels for every year from the start year to the end year, inclusive
    span = [str(year) for year in range(rec['start'].year, rec['end'].year + 1)]
    # Label-based assignment: set all of this row's covered year columns at once
    df.loc[idx, span] = 1
Or you can try:
# Parse 'start' and 'end' into datetimes so each value exposes a .year attribute
df['start'] = pd.to_datetime(df['start'])
df['end'] = pd.to_datetime(df['end'])
def change(x):
    """Return a copy of row *x* with every covered year column set to 1.

    Parameters
    ----------
    x : pandas.Series
        One row of the frame. ``x['start']`` and ``x['end']`` must expose a
        ``.year`` attribute, and every label from position 2 onward must be
        convertible with ``int()`` (the year columns).

    Returns
    -------
    pandas.Series
        A new row in which each year column whose year lies between the start
        year and the end year (inclusive) holds 1; all other values are
        unchanged. The input row is not modified.
    """
    # Write to a copy: DataFrame.apply does not support functions that mutate
    # the object they are handed.
    out = x.copy()
    k = range(x['start'].year, x['end'].year + 1)
    # Skip the first two labels ('start' and 'end'); the rest are year columns
    for col in x.index[2:]:
        if int(col) in k:
            out[col] = 1
    return out
# Run change on every row (second argument 1 = axis 1, i.e. row-wise) and keep the result
df = df.apply(change,1)
The technical posts on this site are licensed under CC BY-SA 4.0. If you reprint them, please cite the site URL or the original address. For any questions, please contact: yoyou2525@163.com.