[英]Pandas groupby with multiple conditions
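我正在嘗試為通話記錄創建摘要,每個電話號碼輸出一行。共有 4 種情況:(1) 只有一通電話且有結果(outcome);(2) 有多通電話且有結果,此時取持續時間最長的那條有結果的記錄;(3) 只有一通電話且沒有結果;(4) 有多通電話且都沒有結果,此時取持續時間最長的那條記錄。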
我目前的做法是逐一循環處理每個分組,但在處理大量數據時速度非常慢。我想應該使用 pandas 的內建方法而不是循環。如何用 pandas 方法在多個條件下得到相同的結果?謝謝。
import pandas as pd
def get_summarized_call_logs_df(df):
    """Summarize a call log into one row per phone number.

    For every distinct ``phone`` one representative call is selected:

    * if at least one call of that phone has ``outcome == True``, the
      outcome call with the longest ``duration``;
    * otherwise, the call with the longest ``duration`` overall.

    When several candidate calls share the longest duration, the one that
    appears first in ``df`` is used.

    Args:
        df: Call log with the columns ``id``, ``phone``, ``outcome``,
            ``status`` and ``duration``. Its index is ignored.

    Returns:
        A new DataFrame with the columns ``phone``, ``has_outcome``,
        ``status``, ``duration`` and ``outcome_record`` (always in this
        order), one row per phone in order of first appearance.
        ``outcome_record`` is the ``id`` of the selected outcome call and is
        missing (NaN) for phones without any outcome. An empty ``df`` yields
        an empty frame that still carries these columns.
    """
    columns = ["phone", "has_outcome", "status", "duration", "outcome_record"]
    if df.empty:
        return pd.DataFrame(columns=columns)

    # Use a fresh positional index so the labels returned by idxmax() are
    # unique even if the caller's index contains duplicates.
    calls = df.reset_index(drop=True)

    is_outcome = calls["outcome"].eq(True)
    # Row-wise flag: does the phone of this row have any outcome call at all?
    has_outcome = calls["phone"].isin(calls.loc[is_outcome, "phone"])

    # Only outcome calls compete for phones that have one; every call
    # competes for phones that have none.
    candidates = calls[is_outcome | ~has_outcome]
    # idxmax() reports the first row holding the maximum, which keeps the
    # "first row wins" tie-breaking rule.
    best_label = candidates.groupby("phone", sort=False)["duration"].idxmax()

    # Filtering may have changed which phone shows up first, so restore the
    # order in which phones first appear in the input.
    phone_order = calls["phone"].drop_duplicates().to_numpy()
    chosen = calls.loc[best_label.reindex(phone_order).to_numpy()]
    chosen_has_outcome = has_outcome.loc[chosen.index]

    return pd.DataFrame(
        {
            "phone": chosen["phone"].to_numpy(),
            "has_outcome": chosen_has_outcome.to_numpy(),
            "status": chosen["status"].to_numpy(),
            "duration": chosen["duration"].to_numpy(),
            # Blank out the id where the selected call is not an outcome call.
            "outcome_record": chosen["id"].where(chosen_has_outcome).to_numpy(),
        },
        columns=columns,
    )
if __name__ == "__main__":
    # Demo input: ten calls spread over four phone numbers, covering phones
    # with a single outcome, several outcomes, and no outcome at all.
    fields = ("id", "phone", "outcome", "status", "duration")
    rows = (
        (1, "123", True, "sale", 1550),
        (2, "123", False, "failed", 3),
        (3, "123", False, "no_ring", 5),
        (4, "456", True, "call_back", 550),
        (5, "456", True, "sale", 2500),
        (6, "456", False, "no_ring", 5),
        (7, "789", False, "no_pick", 4),
        (8, "741", False, "try_again", 25),
        (9, "741", False, "try_again", 10),
        (10, "741", False, "no_ring", 5),
    )
    # One dict per call, keyed by field name.
    data = [dict(zip(fields, row)) for row in rows]
    df = pd.DataFrame(data)
    # Collapse the log to one summary row per phone and show it.
    new_df = get_summarized_call_logs_df(df)
    print(new_df)
它應該產生如下輸出:
phone has_outcome status duration outcome_record
0 123 True sale 1550 1.0
1 456 True sale 2500 5.0
2 789 False no_pick 4 NaN
3 741 False try_again 25 NaN
我認為您可以簡化邏輯。如果先按 'phone'、'outcome' 和 'duration' 這三列對數據進行排序,那麼每個電話號碼的最後一行就是所需的記錄(有結果的優先,其次持續時間最長),因此只需按 'phone' 刪除重復項並保留每組的最後一行即可,如下所示:
# Ascending sort by phone, then outcome (False before True), then duration:
# the last row of each phone is an outcome call if one exists, and the
# longest call among the rows that tie on outcome.
cols = ['phone', 'outcome', 'duration']
ranked = df.sort_values(by=cols)
# Keep only that last row for every phone.
new_df = ranked.drop_duplicates(subset='phone', keep='last')
print(new_df)
# Output:
id phone outcome status duration
0 1 123 True sale 1550
4 5 456 True sale 2500
7 8 741 False try_again 25
6 7 789 False no_pick 4
這裡再給出一個基於 convtools 的替代方案:
from convtools import conversion as c

# fmt: off
data = [
    {"id": 1, "phone": "123", "outcome": True, "status": "sale", "duration": 1550},
    {"id": 2, "phone": "123", "outcome": False, "status": "failed", "duration": 3},
    {"id": 3, "phone": "123", "outcome": False, "status": "no_ring", "duration": 5},
    {"id": 4, "phone": "456", "outcome": True, "status": "call_back", "duration": 550},
    {"id": 5, "phone": "456", "outcome": True, "status": "sale", "duration": 2500},
    {"id": 6, "phone": "456", "outcome": False, "status": "no_ring", "duration": 5},
    {"id": 7, "phone": "789", "outcome": False, "status": "no_pick", "duration": 4},
    {"id": 8, "phone": "741", "outcome": False, "status": "try_again", "duration": 25},
    {"id": 9, "phone": "741", "outcome": False, "status": "try_again", "duration": 10},
    {"id": 10, "phone": "741", "outcome": False, "status": "no_ring", "duration": 5},
]
# fmt: on

# Longest call among the calls that produced an outcome. Restricting the
# reducer with ``where`` keeps a longer call WITHOUT an outcome from being
# reported as the outcome record.
outcome_call_log = c.ReduceFuncs.MaxRow(
    c.item("duration"), where=c.item("outcome")
)

# Longest call overall; only used for phones that have no outcome call.
max_duration_call_log = c.ReduceFuncs.MaxRow(c.item("duration"))

# Whether the phone has at least one outcome call.
has_outcome = c.ReduceFuncs.Count(where=c.item("outcome")) > 0

# The call whose status/duration represent the phone.
chosen_call_log = c.if_(has_outcome, outcome_call_log, max_duration_call_log)

converter = (
    c.group_by(c.item("phone"))
    .aggregate(
        {
            "phone": c.item("phone"),
            "has_outcome": has_outcome,
            "status": chosen_call_log.item("status"),
            "duration": chosen_call_log.item("duration"),
            "outcome_record": c.if_(
                has_outcome,
                outcome_call_log.item("id"),
                None,
            ),
        }
    )
    # this step generates and compiles an ad hoc function
    .gen_converter()
)

# fmt: off
assert converter(data) == [
    {'phone': '123', 'has_outcome': True, 'status': 'sale', 'duration': 1550, 'outcome_record': 1},
    {'phone': '456', 'has_outcome': True, 'status': 'sale', 'duration': 2500, 'outcome_record': 5},
    {'phone': '789', 'has_outcome': False, 'status': 'no_pick', 'duration': 4, 'outcome_record': None},
    {'phone': '741', 'has_outcome': False, 'status': 'try_again', 'duration': 25, 'outcome_record': None},
]
# fmt: on
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.