I'm trying to create a summary of call logs. There are 4 cases
What I tried is looping on the groups. But it is terribly slow when dealing with huge amount of data. I think I need to use pandas methods instead of looping. How to use pandas methods to achieve the same, with multiple conditions. Thanks.
import pandas as pd
def get_summarized_call_logs_df(df):
data_list = []
phone_groups = df.groupby('phone')
unique_phones = df.phone.unique()
for ph in unique_phones:
row_data = {"phone": ph}
group = phone_groups.get_group(ph)
group_len = len(group)
if True in group['outcome'].to_list():
outcome = group.loc[group['outcome'] == True]
row_data.update({"has_outcome": True})
if outcome.phone.count() == 1:
# Cases where there is outcome for single calls
row_data.update({"status": outcome.status.iloc[0],
"duration": outcome.duration.iloc[0],
"outcome_record": outcome.id.iloc[0]})
else:
# Cases where there is outcome for multiple calls
# We choose the status and duration of outcome record with maximum duration
out_rec = outcome.loc[outcome['duration'] == outcome['duration'].max()]
row_data.update({"status": out_rec.status.iloc[0],
"duration": out_rec.duration.iloc[0],
"outcome_record": out_rec.id.iloc[0]})
else:
row_data.update({"has_outcome": False, "outcome_record": None})
if group_len == 1:
# Cases where there is no outcome for single calls
row_data.update({"status": group.status.iloc[0], "duration": group.duration.iloc[0]})
else:
# Cases where there is no outcome for multiple calls
# We choose the status and duration of the record with maximum duration
row_data.update({"status": group.loc[group['duration'] == group['duration'].max()].status.iloc[0],
"duration": group.loc[group['duration'] == group['duration'].max()].duration.iloc[0]})
data_list.append(row_data)
new_df = pd.DataFrame(data_list)
return new_df
if __name__ == "__main__":
data = [
{"id": 1, "phone": "123", "outcome": True, "status": "sale", "duration": 1550},
{"id": 2, "phone": "123", "outcome": False, "status": "failed", "duration": 3},
{"id": 3, "phone": "123", "outcome": False, "status": "no_ring", "duration": 5},
{"id": 4, "phone": "456", "outcome": True, "status": "call_back", "duration": 550},
{"id": 5, "phone": "456", "outcome": True, "status": "sale", "duration": 2500},
{"id": 6, "phone": "456", "outcome": False, "status": "no_ring", "duration": 5},
{"id": 7, "phone": "789", "outcome": False, "status": "no_pick", "duration": 4},
{"id": 8, "phone": "741", "outcome": False, "status": "try_again", "duration": 25},
{"id": 9, "phone": "741", "outcome": False, "status": "try_again", "duration": 10},
{"id": 10, "phone": "741", "outcome": False, "status": "no_ring", "duration": 5},
]
df = pd.DataFrame(data)
new_df = get_summarized_call_logs_df(df)
print(new_df)
It should produce an output
phone has_outcome status duration outcome_record
0 123 True sale 1550 1.0
1 456 True sale 2500 5.0
2 789 False no_pick 4 NaN
3 741 False try_again 25 NaN
I think you can simplify the logic. If you sort your values mainly by 'outcome' and 'duration', you just have to drop duplicates and keep the last row of each sorted groups like this:
cols = ['phone', 'outcome', 'duration']
new_df = df.sort_values(cols).drop_duplicates('phone', keep='last')
print(new_df)
# Output:
id phone outcome status duration
0 1 123 True sale 1550
4 5 456 True sale 2500
7 8 741 False try_again 25
6 7 789 False no_pick 4
Just to give an alternative option, based on convtools :
from convtools import conversion as c
# fmt: off
data = [
{"id": 1, "phone": "123", "outcome": True, "status": "sale", "duration": 1550},
{"id": 2, "phone": "123", "outcome": False, "status": "failed", "duration": 3},
{"id": 3, "phone": "123", "outcome": False, "status": "no_ring", "duration": 5},
{"id": 4, "phone": "456", "outcome": True, "status": "call_back", "duration": 550},
{"id": 5, "phone": "456", "outcome": True, "status": "sale", "duration": 2500},
{"id": 6, "phone": "456", "outcome": False, "status": "no_ring", "duration": 5},
{"id": 7, "phone": "789", "outcome": False, "status": "no_pick", "duration": 4},
{"id": 8, "phone": "741", "outcome": False, "status": "try_again", "duration": 25},
{"id": 9, "phone": "741", "outcome": False, "status": "try_again", "duration": 10},
{"id": 10, "phone": "741", "outcome": False, "status": "no_ring", "duration": 5},
]
# fmt: on
# you are interested in rows with max duration
max_duration_call_log = c.ReduceFuncs.MaxRow(c.item("duration"))
# you need to know whether there's been an outcome
has_outcome = c.ReduceFuncs.Count(where=c.item("outcome")) > 0
converter = (
c.group_by(c.item("phone"))
.aggregate(
{
"phone": c.item("phone"),
"has_outcome": has_outcome,
"status": max_duration_call_log.item("status"),
"duration": max_duration_call_log.item("duration"),
"outcome_record": c.if_(
has_outcome,
max_duration_call_log.item("id"),
None,
),
}
)
# this step generates and compiles ad hoc function
.gen_converter()
)
# fmt: off
assert converter(data) == [
{'phone': '123', 'has_outcome': True, 'status': 'sale', 'duration': 1550, 'outcome_record': 1},
{'phone': '456', 'has_outcome': True, 'status': 'sale', 'duration': 2500, 'outcome_record': 5},
{'phone': '789', 'has_outcome': False, 'status': 'no_pick', 'duration': 4, 'outcome_record': None},
{'phone': '741', 'has_outcome': False, 'status': 'try_again', 'duration': 25, 'outcome_record': None},
]
# fmt: on
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.