Pandas groupby 多條件

Question

我正在嘗試創建通話記錄摘要。 有4個案例

一部電話只有一個通話記錄記錄並且它有結果，我們選擇它的持續時間、狀態和結果記錄值
同一部電話的多個通話記錄有結果，我們選擇通話記錄的摘要，持續時間和結果記錄，最長持續時間
一部電話只有一個通話記錄記錄並且沒有結果，我們選擇其持續時間和狀態的值。 結果記錄將為無
同一部電話的多個通話記錄沒有結果，我們選擇通話記錄的摘要和持續時間，最長持續時間。 結果記錄將為無

我嘗試的是循環組。 但是在處理大量數據時速度非常慢。 我想我需要使用 pandas 方法而不是循環。 如何使用 pandas 方法來實現相同的，具有多個條件。 謝謝。

import pandas as pd
def get_summarized_call_logs_df(df):
    data_list = []
    phone_groups = df.groupby('phone')
    unique_phones = df.phone.unique()
    for ph in unique_phones:
        row_data = {"phone": ph}
        group = phone_groups.get_group(ph)
        group_len = len(group)
        if True in group['outcome'].to_list():
            outcome = group.loc[group['outcome'] == True]
            row_data.update({"has_outcome": True})
            if outcome.phone.count() == 1:
                # Cases where there is outcome for single calls
                row_data.update({"status": outcome.status.iloc[0],
                                 "duration": outcome.duration.iloc[0],
                                 "outcome_record": outcome.id.iloc[0]})
            else:
                # Cases where there is outcome for multiple calls
                # We choose the status and duration of outcome record with maximum duration
                out_rec = outcome.loc[outcome['duration'] == outcome['duration'].max()]
                row_data.update({"status": out_rec.status.iloc[0],
                                 "duration": out_rec.duration.iloc[0],
                                 "outcome_record": out_rec.id.iloc[0]})
        else:
            row_data.update({"has_outcome": False, "outcome_record": None})
            if group_len == 1:
                # Cases where there is no outcome for single calls
                row_data.update({"status": group.status.iloc[0], "duration": group.duration.iloc[0]})
            else:
                # Cases where there is no outcome for multiple calls
                # We choose the status and duration of the record with maximum duration
                row_data.update({"status": group.loc[group['duration'] == group['duration'].max()].status.iloc[0],
                                "duration": group.loc[group['duration'] == group['duration'].max()].duration.iloc[0]})
        data_list.append(row_data)
    new_df = pd.DataFrame(data_list)
    return new_df

if __name__ == "__main__":
    data = [
    {"id": 1, "phone": "123", "outcome": True, "status": "sale", "duration": 1550},
    {"id": 2, "phone": "123", "outcome": False, "status": "failed", "duration": 3},
    {"id": 3, "phone": "123", "outcome": False, "status": "no_ring", "duration": 5},
    {"id": 4, "phone": "456", "outcome": True, "status": "call_back", "duration": 550},
    {"id": 5, "phone": "456", "outcome": True, "status": "sale", "duration": 2500},
    {"id": 6, "phone": "456", "outcome": False, "status": "no_ring", "duration": 5},
    {"id": 7, "phone": "789", "outcome": False, "status": "no_pick", "duration": 4},
    {"id": 8, "phone": "741", "outcome": False, "status": "try_again", "duration": 25},
    {"id": 9, "phone": "741", "outcome": False, "status": "try_again", "duration": 10},
    {"id": 10, "phone": "741", "outcome": False, "status": "no_ring", "duration": 5},
    ]
    df = pd.DataFrame(data)
    new_df = get_summarized_call_logs_df(df)
    print(new_df)

它應該產生一個 output

  phone  has_outcome     status  duration  outcome_record
0   123         True       sale      1550             1.0
1   456         True       sale      2500             5.0
2   789        False    no_pick         4             NaN
3   741        False  try_again        25             NaN

Answer 1

我認為您可以簡化邏輯。 如果您主要按“結果”和“持續時間”對值進行排序，則只需刪除重復項並保留每個排序組的最后一行，如下所示：

cols = ['phone', 'outcome', 'duration']
new_df = df.sort_values(cols).drop_duplicates('phone', keep='last')
print(new_df)

# Output:
   id phone  outcome     status  duration
0   1   123     True       sale      1550
4   5   456     True       sale      2500
7   8   741    False  try_again        25
6   7   789    False    no_pick         4

Answer 2

只是給出一個替代選項，基於convtools ：

from convtools import conversion as c

# fmt: off
data = [
    {"id": 1, "phone": "123", "outcome": True, "status": "sale", "duration": 1550},
    {"id": 2, "phone": "123", "outcome": False, "status": "failed", "duration": 3},
    {"id": 3, "phone": "123", "outcome": False, "status": "no_ring", "duration": 5},
    {"id": 4, "phone": "456", "outcome": True, "status": "call_back", "duration": 550},
    {"id": 5, "phone": "456", "outcome": True, "status": "sale", "duration": 2500},
    {"id": 6, "phone": "456", "outcome": False, "status": "no_ring", "duration": 5},
    {"id": 7, "phone": "789", "outcome": False, "status": "no_pick", "duration": 4},
    {"id": 8, "phone": "741", "outcome": False, "status": "try_again", "duration": 25},
    {"id": 9, "phone": "741", "outcome": False, "status": "try_again", "duration": 10},
    {"id": 10, "phone": "741", "outcome": False, "status": "no_ring", "duration": 5},
]
# fmt: on

# you are interested in rows with max duration
max_duration_call_log = c.ReduceFuncs.MaxRow(c.item("duration"))

# you need to know whether there's been an outcome
has_outcome = c.ReduceFuncs.Count(where=c.item("outcome")) > 0

converter = (
    c.group_by(c.item("phone"))
    .aggregate(
        {
            "phone": c.item("phone"),
            "has_outcome": has_outcome,
            "status": max_duration_call_log.item("status"),
            "duration": max_duration_call_log.item("duration"),
            "outcome_record": c.if_(
                has_outcome,
                max_duration_call_log.item("id"),
                None,
            ),
        }
    )
    # this step generates and compiles ad hoc function
    .gen_converter()
)

# fmt: off
assert converter(data) == [
    {'phone': '123', 'has_outcome': True, 'status': 'sale', 'duration': 1550, 'outcome_record': 1},
    {'phone': '456', 'has_outcome': True, 'status': 'sale', 'duration': 2500, 'outcome_record': 5},
    {'phone': '789', 'has_outcome': False, 'status': 'no_pick', 'duration': 4, 'outcome_record': None},
    {'phone': '741', 'has_outcome': False, 'status': 'try_again', 'duration': 25, 'outcome_record': None},
]
# fmt: on

Pandas groupby 多條件

問題描述

2 個解決方案

解決方案1
0 2021-12-22 15:14:44

解決方案2
0 2021-12-22 15:59:12

Pandas groupby 多條件

問題描述

2 個解決方案

解決方案1 0 2021-12-22 15:14:44

解決方案2 0 2021-12-22 15:59:12

解決方案1
0 2021-12-22 15:14:44

解決方案2
0 2021-12-22 15:59:12