简体   繁体   中英

Expanding Pandas Column to Separate Rows based on calculation

I have a DataFrame that looks like this.

df1 = pd.DataFrame(columns=['ID', 'Divide', 'Object', 'List'], data=[ ['A, B', 2, 20, [0, 5]], ['C, D', 2, 40, [10, 15, 35]], ['E, F', 2, 20, [11, 15]], ['G', 1, 10, [1, 5]], ['H', 1, 10, ''], ['I, J', 2, 20, ''] ])

|    | ID   |   Divide |   Object | List         |
|---:|:-----|---------:|---------:|:-------------|
|  0 | A, B |        2 |       20 | [0, 5]       |
|  1 | C, D |        2 |       40 | [10, 15, 35] |
|  2 | E, F |        2 |       20 | [11, 15]     |
|  3 | G    |        1 |       10 | [1, 5]       |
|  4 | H    |        1 |       10 |              |
|  5 | I, J |        2 |       20 |              |

Each ID needs to have its own row. However, the List column holds data belonging to each ID. The logic is the following:

  1. If there is a single ID in the column (no , ), no change is needed.
  2. If there are two IDs ( ID contains , )
  3. Then, the first ID gets the items from List in the range 0 to Object/Divide - 1
  4. The second ID gets the items from List in the range Object/Divide to Object - 1

So, the final table looks like this:

|    | ID   |   Divide |   Object | List   |
|---:|:-----|---------:|---------:|:-------|
|  0 | A    |        2 |       20 | 0, 5   |
|  1 | B    |        2 |       20 |        |
|  2 | C    |        2 |       40 | 10, 15 |
|  3 | D    |        2 |       40 | 35     |
|  4 | E    |        2 |       20 |        |
|  5 | F    |        2 |       20 | 11, 15 |
|  6 | G    |        1 |       10 | 1, 5   |
|  7 | H    |        1 |       10 |        |
|  8 | I    |        2 |       20 |        |
|  9 | J    |        2 |       20 |        |

If it was lists, then explode could be used to flatten out the list. But I don't know how to apply the calculation logic within the DataFrame to parse out the details. Thanks

You can try this:

import pandas as pd
df = pd.DataFrame(columns=['ID', 'Divide', 'Object', 'List'], data=[ ['A, B', 2, 20, [0, 5]], ['C, D', 2, 40, [10, 15, 35]], ['E, F', 2, 20, [11, 15]], ['G', 1, 10, [1, 5]], ['H', 1, 10, ''], ['I, J', 2, 20, ''] ])

def split_list(lst, limit):
    """Partition the items of *lst* around *limit*, preserving order.

    The first ID owns the values ``0 .. limit - 1`` and the second ID owns
    the values from ``limit`` upward, so an item equal to *limit* belongs to
    the second part.

    Args:
        lst: Iterable of values comparable with *limit*. An empty iterable
            (including the empty string used as a "no list" placeholder)
            produces two empty lists.
        limit: Exclusive upper bound of the first part.

    Returns:
        A tuple ``(below, rest)``: items strictly less than *limit*, and all
        remaining items.
    """
    below, rest = [], []
    for item in lst:
        # Strict comparison: the boundary value itself goes to the second part.
        (below if item < limit else rest).append(item)
    return below, rest

# Turn the comma-separated ID string into a list of IDs so it can be exploded.
df['ID'] = df['ID'].str.split(', ')
# Threshold separating the first ID's items from the second ID's items.
df['Limit'] = df['Object'] / df['Divide']
# Pair each ID with its part of the list. zip() stops at the shorter input, so
# a single-ID row is paired only with the first part returned by split_list.
df['List'] = df.apply(lambda row: dict(zip(row['ID'], split_list(row['List'], row['Limit']))), axis=1)
# One row per ID; the other columns (including the dict in 'List') are repeated.
df = df.explode('ID')
# Replace the per-row dict with the list that belongs to this row's ID.
df['List'] = df.apply(lambda row: row['List'].get(row['ID']), axis=1)

print(df)


# Out[192]:
#   ID  Divide  Object      List  Limit
# 0  A       2      20    [0, 5]   10.0
# 0  B       2      20        []   10.0
# 1  C       2      40  [10, 15]   20.0
# 1  D       2      40      [35]   20.0
# 2  E       2      20        []   10.0
# 2  F       2      20  [11, 15]   10.0
# 3  G       1      10    [1, 5]   10.0
# 4  H       1      10        []   10.0
# 5  I       2      20        []   10.0
# 5  J       2      20        []   10.0
  • Most code is getting your sample data in right shape....
  • Assumption: an individual ID is a single character, see df.ID_r==df["ID"].str.strip().str[:1]
  • logic is then as stated
import io
import json

import numpy as np
import pandas as pd

# Rebuild the sample frame from the markdown table in the question.
df = (pd.read_csv(io.StringIO("""|    | ID   |   Divide |   Object | List         |
|  0 | A, B |        2 |       20 | [0, 5]       |
|  1 | C, D |        2 |       40 | [10, 15, 35] |
|  2 | E, F |        2 |       20 | [11, 15]     |
|  3 | G    |        1 |       10 | [1, 5]       |
|  4 | H    |        1 |       10 |              |
|  5 | I, J |        2 |       20 |              |"""), sep="|")
      # Column names carry the table's padding; strip it.
      .pipe(lambda d: d.rename(columns={c:c.strip() for c in d.columns}))
      # Drop the columns produced by the leading/trailing "|" and the index column.
      .pipe(lambda d: d.drop(columns=[c for c in d.columns if "Unnamed" in c or c==""]))
      # Parse the textual list; blank cells become an empty list.
      .assign(List=lambda d: d["List"].apply(lambda l: json.loads(l) if "[" in l else []))
     )
### end make sample data work... NB List is a list and empty if no list..
# explode ID column: one row per individual ID, kept in the new column ID_r
df = df.join(df["ID"].apply(lambda ids: [t.strip() for t in ids.split(",")]).explode(), rsuffix="_r")
# real logic, take first two list items if first, else ...
# NOTE: the fixed [:2] / [2:] slices do not use Object/Divide, so a row whose
# items all belong to the second ID (e.g. "E, F" with [11, 15]) has them
# assigned to the first ID instead.
df["List"] = np.where(df.ID_r==df["ID"].str.strip().str[:1], df["List"].apply(lambda l: l[:2]),df["List"].apply(lambda l: l[2:]))
# Keep the result: without the assignment the reshaped frame is discarded
# when this runs as a script.
df = df.reset_index(drop=True).drop(columns=["ID"]).rename(columns={"ID_r":"ID"})
print(df)

output

Divide Object List ID
0 2 20 [0, 5] A
1 2 20 [] B
2 2 40 [10, 15] C
3 2 40 [35] D
4 2 20 [11, 15] E
5 2 20 [] F
6 1 10 [1, 5] G
7 1 10 [] H
8 2 20 [] I
9 2 20 [] J

Try exploding both ID and List, then conditionally filtering based on which order the IDs came in.

import pandas as pd

df1 = pd.DataFrame(columns=['ID', 'Divide', 'Object', 'List'],
                   data=[['A, B', 2, 20, [0, 5]],
                         ['C, D', 2, 40, [10, 15, 35]],
                         ['E, F', 2, 20, [11, 15]],
                         ['G', 1, 10, [1, 5]],
                         ['H', 1, 10, ''],
                         ['I, J', 2, 20, '']])

# Split the comma-separated ID string into a list of IDs
df1['ID'] = df1['ID'].str.split(', ')

# Explode to one row per ID, then reset the index inside each original row's
# group so that First and Second IDs are tracked by the new index (0 = first,
# 1 = second); the original row label is kept in the 'index' column.
df1 = df1.explode('ID') \
    .groupby(level=0) \
    .apply(lambda x: x.reset_index()) \
    .droplevel(0)
# Calculate Cap For Later: the largest value that belongs to the first ID
df1['cap'] = df1['Object'] // df1['Divide'] - 1


def split_lists(g):
    """Blank out the exploded ``List`` values that do not belong to this ID.

    Args:
        g: Rows of one ID after ``List`` has been exploded. Must contain the
            columns ``List`` (one item per row), ``cap`` (largest value owned
            by the first ID) and ``level_0`` (0 for the first ID of the
            original row, anything else for a later ID).

    Returns:
        The same frame, modified in place: every ``List`` value that is not
        owned by this ID -- and every non-numeric placeholder such as ``''``
        -- is replaced by NaN, so a later ``dropna()`` removes it.
    """
    if g.empty:
        return g
    # Coerce to numbers so '' / NaN placeholders become NaN instead of raising
    # on comparison; this also lets single-item lists be filtered.
    values = pd.to_numeric(g['List'], errors='coerce')
    # Check if is the First ID
    if g['level_0'].iloc[0] == 0:
        # Keep Less Than Equal To Cap
        keep = values <= g['cap']
    else:
        # Keep Greater Than Cap
        keep = values > g['cap']
    # Comparisons against NaN are False, so placeholders are blanked as well.
    g['List'] = g['List'].where(keep)
    return g


# Explode Lists Group By ID filter using function
# Regroup and convert back to lists
# reset_index() moves the per-row ID position into a column; because 'index'
# is already a column it is named 'level_0', which split_lists reads.
# Items filtered out by split_lists come back as NaN and are removed by
# dropna() before each ID's remaining items are collected into a list.
df2 = df1 \
    .explode('List') \
    .reset_index() \
    .groupby('ID') \
    .apply(split_lists) \
    .groupby('ID')['List'] \
    .apply(lambda x: x.dropna().tolist())

# Drop Extra Columns from df1 and merge back
# df2 is indexed by ID, so the per-ID lists are joined onto df1's ID column.
out = df1.drop(columns=['List', 'index', 'cap']) \
    .merge(df2, left_on='ID', right_index=True, how='left') \
    .reset_index(drop=True)

print(out)

Out:

  ID  Divide  Object      List
0  A       2      20    [0, 5]
1  B       2      20        []
2  C       2      40  [10, 15]
3  D       2      40      [35]
4  E       2      20        []
5  F       2      20  [11, 15]
6  G       1      10    [1, 5]
7  H       1      10        []
8  I       2      20        []
9  J       2      20        []

DF1 with additional columns

   index ID  Divide  Object          List  cap
0      0  A       2      20        [0, 5]    9
1      0  B       2      20        [0, 5]    9
0      1  C       2      40  [10, 15, 35]   19
1      1  D       2      40  [10, 15, 35]   19
0      2  E       2      20      [11, 15]    9
1      2  F       2      20      [11, 15]    9
0      3  G       1      10        [1, 5]    9
0      4  H       1      10                  9
0      5  I       2      20                  9
1      5  J       2      20                  9

DF2 after filter and regroup

ID
A      [0, 5]
B          []
C    [10, 15]
D        [35]
E          []
F    [11, 15]
G      [1, 5]
H          []
I          []
J          []
Name: List, dtype: object

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM