简体   繁体   中英

Python Split multiple values in a cell into multiple rows

What i'm trying to get is divide multiple values of cells into multiple rows and then get only the row who has the larger number value in fruit_weight column.

I've the following format:

fruit_type;fruit_color;fruit_weight
Apple|Banana;Red|Yellow;2|1
Orange;Orange;4
Pineapple|Grape|Watermelon;Brown|Purple|Green;12|1|15

The desired result output:

fruit_type;fruit_color;fruit_weight
Apple;Red;2
Orange;Orange;4
Watermelon;Green;15

What i was thinking is divide the cells into rows and then parse the values to get the correct one but i don't know how to start.

Some help will be appreciated.

EDIT 1:

#!/bin/env python
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
fileData = pd.read_csv('articles.csv',delimiter=';')
fileData.replace('', np.nan, inplace=True)
fileData.dropna(subset=['fruit_type','fruit_color','fruit_weight'], inplace=True)
fileData = fileData.applymap(lambda x: x.split('|'))
idx = fileData.index.repeat(fileData.fruit_weight.str.len())
fileData =  fileData.apply(lambda x: pd.Series(np.concatenate(x.tolist())), 0)
print fileData
fileData.assign(idx=idx).groupby('idx', group_keys=False).apply(lambda x: x.sort_values('fruit_weight', ascending=False).head(1))
import pandas as pd
import numpy as np
import io

text = '''fruit_type;fruit_color;fruit_weight
Apple|Banana;Red|Yellow;2|1
Orange;Orange;4
Pineapple|Grape|Watermelon;Brown|Purple|Green;12|1|15'''

buf = io.StringIO(text)

df = pd.read_csv(buf, sep=';')   # replace "buf" with your CSV filename
df = df.applymap(lambda x: x.split('|'))

df

                       fruit_type             fruit_color fruit_weight
0                 [Apple, Banana]           [Red, Yellow]       [2, 1]
1                        [Orange]                [Orange]          [4]
2  [Pineapple, Grape, Watermelon]  [Brown, Purple, Green]  [12, 1, 15]

After loading and setup, flatten your dataframe using apply + pd.Series + np.concatenate . At the same time, create an index that makes grouping easy in the next step.

idx = df.index.repeat(df.fruit_weight.str.len())

idx
Int64Index([0, 0, 1, 2, 2, 2], dtype='int64')

df =  df.apply(lambda x: pd.Series(np.concatenate(x.tolist())), 0)
df

   fruit_type fruit_color fruit_weight
0       Apple         Red            2
1      Banana      Yellow            1
2      Orange      Orange            4
3   Pineapple       Brown           12
4       Grape      Purple            1
5  Watermelon       Green           15

Now, call groupby + apply and extract one row from each group with the highest value of weight.

df.assign(idx=idx).groupby('idx', group_keys=False)\
       .apply(lambda x: x.sort_values('fruit_weight', ascending=False).head(1))

   fruit_type fruit_color fruit_weight  idx
0       Apple         Red            2    0
2      Orange      Orange            4    1
5  Watermelon       Green           15    2

A very simple way to do it would be like this:

#!/bin/env python

# hardcoding inputs for testing
inputs = [
    "Apple|Banana;Red|Yellow;2|1",
    "Orange;Orange;4",
    "Pineapple|Grape|Watermelon;Brown|Purple|Green;12|1|15"]

# iterate over the hardcoded inputs
for input in inputs:
    # split the input string into fruit properties
    [ fruit_types, fruit_colors, fruit_weights ] = input.split(";")

    # if there is more than one fruit, split the string into a list of fruit_types
    if "|" in fruit_types:
        # assuming that there is one color and one weight for each fruit type,
        # split the other fruit properties as well
        fruit_types = fruit_types.split("|")
        fruit_colors = fruit_colors.split("|")
        fruit_weights = fruit_weights.split("|")

        # get highest value
        max_weight = max(fruit_weights)
        # get index of highest values in fruit_weights list
        i = fruit_weights.index(max_weight)

        print("{};{};{}").format(fruit_types[i], fruit_colors[i], fruit_weights[i])

    # if there is no more than one fruit
    else:
        print("{};{};{}").format(fruit_types, fruit_colors, fruit_weights)

Use:

#create dataframe
df = pd.read_csv(filename, sep=';') 

#split all values
df = df.applymap(lambda x: x.split('|'))
print (df)
                       fruit_type             fruit_color fruit_weight
0                 [Apple, Banana]           [Red, Yellow]       [2, 1]
1                        [Orange]                [Orange]          [4]
2  [Pineapple, Grape, Watermelon]  [Brown, Purple, Green]  [12, 1, 15]

#get position of max weight
a = pd.DataFrame(df['fruit_weight'].values.tolist()).astype(float).idxmax(1).tolist()
print (a)
[0, 0, 2]

#convert df to dictionary
b = df.to_dict('list')
print (b)
{'fruit_weight': [['2', '1'], ['4'], ['12', '1', '15']], 
'fruit_color': [['Red', 'Yellow'], ['Orange'], ['Brown', 'Purple', 'Green']], 
 'fruit_type': [['Apple', 'Banana'], ['Orange'], ['Pineapple', 'Grape', 'Watermelon']]}

#extract values by position
a = {k: [k1[v1] for k1,v1 in zip(v, a)] for k, v in b.items()}
print (a)
{'fruit_weight': ['2', '4', '15'], 
'fruit_color': ['Red', 'Orange', 'Green'], 
'fruit_type': ['Apple', 'Orange', 'Watermelon']}

#create DataFrame
df = pd.DataFrame(a)
print (df)
  fruit_color  fruit_type fruit_weight
0         Red       Apple            2
1      Orange      Orange            4
2       Green  Watermelon           15

Timings :

df = pd.concat([df]*1000).reset_index(drop=True)


def col(df):
    df = df.applymap(lambda x: x.split('|'))
    idx = df.index.repeat(df.fruit_weight.str.len())
    df =  df.apply(lambda x: pd.Series(np.concatenate(x.tolist())), 0)
    return df.assign(idx=idx).groupby('idx', group_keys=False).apply(lambda x: x.sort_values('fruit_weight', ascending=False).head(1))

def jez(df):
    df = df.applymap(lambda x: x.split('|'))
    a = pd.DataFrame(df['fruit_weight'].values.tolist()).astype(float).idxmax(1).tolist()
    b = df.to_dict('list')
    a = {k: [k1[v1] for k1,v1 in zip(v, a)] for k, v in b.items()}
    return pd.DataFrame(a)

print (col(df))
print (jez(df))


In [229]: %timeit (col(df))
1 loop, best of 3: 1.58 s per loop

In [230]: %timeit (jez(df))
100 loops, best of 3: 19.3 ms per loop

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM