简体   繁体   中英

Transform pandas Series with variable length comma separated values to Dataframe

I have a pandas-Series 'A' containing comma separated values like this :

index    A

1        null
2        5,6
3        3
4        null
5        5,18,22
...      ...

I need a dataframe like this :

index    A_5    A_6    A_18    A_20

1        0      0      0       ...
2        1      1      0       ...
3        0      0      0       ...
4        0      0      0       ...
5        1      0      1       ...
...      ...    ...    ...     ...

Values that don't occur at least MIN_OBS times should be ignored and not get an own column, because there are so many distinct values that the df would become too big if this threshold isn't applied.

I designed the solution below. It works, but is way too slow (due to iterating over rows I suppose). Could anyone suggest a faster approach ?

# Pass 1: count how often each comma-separated token occurs in the Series A.
temp_dict = defaultdict(int)
for k, v in A.items():
    for item in v.split(','):
        temp_dict[item] += 1

# Only tokens seen more than MIN_OBS times get their own column.
cols_to_make = []
for k, v in temp_dict.items():
    if v > MIN_OBS:
        cols_to_make.append('A_' + k)

# Pass 2: start from an all-zero frame and set a 1 wherever a row contains
# one of the kept tokens.
result_df = pd.DataFrame(0, index=A.index, columns=cols_to_make)
for k, v in A.items():
    for item in v.split(','):
        if ('A_' + item) in cols_to_make:
            # Single-cell write; chained result_df[col][k] = 1 may assign
            # into a temporary copy instead of result_df.
            result_df.at[k, 'A_' + item] = 1

You can use get_dummies to create the indicator variables, then convert the column labels to numbers with to_numeric, and finally filter the columns using the variable TRESH and label-based column selection:

print df
             A
index         
1         null
2          5,6
3            3
4         null
5      5,18,22

# Split every string in column A on "," and build one 0/1 indicator column
# per distinct token (the literal 'null' is treated as a token too).
df = df.A.str.get_dummies(sep=",")
print df
       18  22  3  5  6  null
index                       
1       0   0  0  0  0     1
2       0   0  0  1  1     0
3       0   0  1  0  0     0
4       0   0  0  0  0     1
5       1   1  0  1  0     0

# Convert the string column labels to numbers; errors='coerce' turns labels
# that are not numeric (here 'null') into NaN instead of raising.
df.columns = pd.to_numeric(df.columns, errors='coerce')
# Order the columns by their (now numeric) label.
df = df.sort_index(axis=1)

TRESH = 5
# NOTE(review): this keeps columns whose numeric *label* is greater than
# TRESH; it does not count how often a value occurs. The question asks for a
# minimum number of occurrences -- confirm this filter is what is wanted.
# A NaN label never compares greater than TRESH, so it is not selected.
cols = [col for col in df.columns if col > TRESH]
print cols
[6.0, 18.0, 22.0]
# .ix was deprecated in pandas 0.20 and removed in 1.0; .loc performs the same
# label-based column selection.
df = df.loc[:, cols]
print(df)
       6   18  22
index            
1       0   0   0
2       1   0   0
3       0   0   0
4       0   0   0
5       0   1   1

# The labels are floats at this point (e.g. 6.0); cast to int before adding
# the prefix so the names read "A_6" rather than "A_6.0".
df.columns = ["A_" + str(int(col)) for col in df.columns]
print df
       A_6  A_18  A_22
index                 
1        0     0     0
2        1     0     0
3        0     0     0
4        0     0     0
5        0     1     1

EDIT:

I modified unutbu's excellent original answer: I changed how the stacked Series is created, removed the 'null' entries from the value counts, and added the prefix parameter to get_dummies:

import numpy as np
import pandas as pd

s = pd.Series(['null', '5,6', '3', 'null', '5,18,22', '3,4'])
print(s)

# One row per input string and one column per token position; stack() then
# flattens this into a Series indexed by (row label, token position).
# This replaces the slower s.str.split(',').apply(pd.Series).stack().
# index=s.index keeps the row labels of s so the final reindex lines up.
result = pd.DataFrame([x.split(',') for x in s], index=s.index).stack()
count = result.value_counts()

min_obs = 2

# Keep tokens seen at least min_obs times, excluding the 'null' placeholder.
count = count[(count >= min_obs) & ~(count.index.isin(['null']))]

result = result.loc[result.isin(count.index)]
# prefix="A" names the indicator columns A_<token>.
result = pd.get_dummies(result, prefix="A")

result.index = result.index.droplevel(1)
# A row that holds several kept tokens now has several entries under the same
# label; merge them into one row so the reindex below cannot fail on
# duplicate labels.
result = result.groupby(level=0).max()
# Rows without any kept token are absent so far; reindex adds them back as NaN.
result = result.reindex(s.index)

print(result)
   A_3  A_5
0  NaN  NaN
1    0    1
2    1    0
3  NaN  NaN
4    0    1
5    1    0

Timings:

In [143]: %timeit pd.DataFrame([ x.split(',') for x in s ]).stack()
1000 loops, best of 3: 866 µs per loop

In [144]: %timeit s.str.split(',').apply(pd.Series).stack()
100 loops, best of 3: 2.46 ms per loop

Since memory is an issue, we have to be careful not to build large intermediate data structures if possible.

Let's start with the OP's posted code that works:

def orig(A, MIN_OBS):
    """Build 0/1 indicator columns for the frequent tokens in A, row by row.

    This is the straightforward (slow) baseline: it counts the tokens, then
    fills the result one cell at a time.

    Parameters
    ----------
    A : pandas.Series of str
        Each element is a comma-separated list of tokens, e.g. ``'5,18,22'``.
    MIN_OBS : int
        A token gets a column only if it occurs more than ``MIN_OBS`` times.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``A``, with one ``'A_<token>'`` column per kept token.
        A cell is 1 if that row contains the token and 0 otherwise.
    """
    temp_dict = collections.Counter()
    for v in A:
        temp_dict.update(v.split(','))
    cols_to_make = ['A_' + k for k, v in temp_dict.items() if v > MIN_OBS]
    # Set lookup keeps the membership test below constant-time per token.
    wanted = set(cols_to_make)

    result_df = pd.DataFrame(0, index=A.index, columns=cols_to_make)
    for k, v in A.items():
        for item in v.split(','):
            col = 'A_' + item
            if col in wanted:
                # Single-cell write; chained result_df[col][k] = 1 may assign
                # into a temporary copy instead of result_df.
                result_df.at[k, col] = 1
    return result_df

and extract the first loop into its own function:

def count(A, MIN_OBS):
    """Count the comma-separated tokens in A and keep the frequent ones.

    Parameters
    ----------
    A : pandas.Series of str
        Each element is a comma-separated list of tokens.
    MIN_OBS : int
        Only tokens occurring more than ``MIN_OBS`` times are kept.

    Returns
    -------
    dict
        Maps each kept token to its number of occurrences.
    """
    tallies = collections.Counter()
    for v in A:
        tallies.update(v.split(','))
    return {k: n for k, n in tallies.items() if n > MIN_OBS}

From experimentation in an interactive session, we can see this is not the bottleneck; even for "large" DataFrames, count(A, MIN_OBS) completes fairly quickly.

The slowness of orig comes from the double for-loop at the end of orig, which modifies cells in the DataFrame one value at a time (a separate scalar assignment for every matching item).

We could replace that double-for loop with a single for-loop over the columns of the DataFrame, using the vectorized string method, A.str.contains to search for values in the strings. Since we never split the original strings into Python lists of strings (or Pandas DataFrames holding the string fragments), we save some memory. Since orig and alt use similar data structures, their memory footprint is about the same.

def alt(A, MIN_OBS):
    """Build 0/1 indicator columns for the frequent tokens in A, column by column.

    Parameters
    ----------
    A : pandas.Series of str
        Each element is a comma-separated list of tokens.
    MIN_OBS : int
        Passed to ``count``; only tokens it returns get a column.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``A``, with one ``'A_<token>'`` column per kept token.
        A cell is 1 if that row contains the token and 0 otherwise.
    """
    import re  # local import: the snippet is shown without an import block

    temp_dict = count(A, MIN_OBS)
    df = pd.DataFrame(0, index=A.index, columns=list(temp_dict))
    for col in df:
        # The token must be delimited on BOTH sides (start/end of string or a
        # comma). Without the right-hand boundary '5' would also match '55,6'.
        # re.escape keeps tokens containing regex metacharacters literal.
        pattern = r'(?:^|,){v}(?:,|$)'.format(v=re.escape(col))
        df[col] = A.str.contains(pattern).astype(int)
    df.columns = ['A_{}'.format(col) for col in df]
    return df

Here is an example, on a 200K row DataFrame with 40K different possible values:

import numpy as np
import pandas as pd
import collections

# Fixed seed so the benchmark input is reproducible.
np.random.seed(2016)
ncols = 5
nrows = 200000
nvals = 40000
MIN_OBS = 200

# Smaller settings for a quick sanity check:
# nrows = 20
# nvals = 4
# MIN_OBS = 2

# Each row gets between 0 and ncols-1 values; the running total of those row
# lengths gives the offsets at which the flat value array is cut into rows.
idx = np.random.randint(ncols, size=nrows).cumsum()
data = np.random.choice(np.arange(nvals), size=idx[-1])
data = np.array_split(data, idx[:-1])
data = [','.join([str(val) for val in arr]) for arr in data]
A = pd.Series(data)
# Rows that received no values are stored as the literal 'null'.
A[A == ''] = 'null'

def orig(A, MIN_OBS):
    """Build 0/1 indicator columns for the frequent tokens in A, row by row.

    This is the straightforward (slow) baseline: it counts the tokens, then
    fills the result one cell at a time.

    Parameters
    ----------
    A : pandas.Series of str
        Each element is a comma-separated list of tokens, e.g. ``'5,18,22'``.
    MIN_OBS : int
        A token gets a column only if it occurs more than ``MIN_OBS`` times.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``A``, with one ``'A_<token>'`` column per kept token.
        A cell is 1 if that row contains the token and 0 otherwise.
    """
    temp_dict = collections.Counter()
    for v in A:
        temp_dict.update(v.split(','))
    cols_to_make = ['A_' + k for k, v in temp_dict.items() if v > MIN_OBS]
    # Set lookup keeps the membership test below constant-time per token.
    wanted = set(cols_to_make)

    result_df = pd.DataFrame(0, index=A.index, columns=cols_to_make)
    for k, v in A.items():
        for item in v.split(','):
            col = 'A_' + item
            if col in wanted:
                # Single-cell write; chained result_df[col][k] = 1 may assign
                # into a temporary copy instead of result_df.
                result_df.at[k, col] = 1
    return result_df

def count(A, MIN_OBS):
    """Count the comma-separated tokens in A and keep the frequent ones.

    Parameters
    ----------
    A : pandas.Series of str
        Each element is a comma-separated list of tokens.
    MIN_OBS : int
        Only tokens occurring more than ``MIN_OBS`` times are kept.

    Returns
    -------
    dict
        Maps each kept token to its number of occurrences.
    """
    tallies = collections.Counter()
    for v in A:
        tallies.update(v.split(','))
    return {k: n for k, n in tallies.items() if n > MIN_OBS}

def alt(A, MIN_OBS):
    """Build 0/1 indicator columns for the frequent tokens in A, column by column.

    Parameters
    ----------
    A : pandas.Series of str
        Each element is a comma-separated list of tokens.
    MIN_OBS : int
        Passed to ``count``; only tokens it returns get a column.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``A``, with one ``'A_<token>'`` column per kept token.
        A cell is 1 if that row contains the token and 0 otherwise.
    """
    import re  # local import: the script's import block does not include re

    temp_dict = count(A, MIN_OBS)
    df = pd.DataFrame(0, index=A.index, columns=list(temp_dict))
    for col in df:
        # The token must be delimited on BOTH sides (start/end of string or a
        # comma). Without the right-hand boundary '5' would also match '55,6'.
        # re.escape keeps tokens containing regex metacharacters literal.
        pattern = r'(?:^|,){v}(?:,|$)'.format(v=re.escape(col))
        df[col] = A.str.contains(pattern).astype(int)
    df.columns = ['A_{}'.format(col) for col in df]
    return df

Here is a benchmark:

In [48]: %timeit expected = orig(A, MIN_OBS)
1 loops, best of 3: 3.03 s per loop

In [49]: %timeit expected = alt(A, MIN_OBS)
1 loops, best of 3: 483 ms per loop

Note that the majority of the time required for alt to complete is spent in count :

In [60]: %timeit count(A, MIN_OBS)
1 loops, best of 3: 304 ms per loop

Would something like this work or could it be modified to fit your need?

# Sample input: one string column 'A' holding comma-separated values.
df = pd.DataFrame({'A': ['null', '5,6', '3', 'null', '5,18,22']})

         A
0     null
1      5,6
2        3
3     null
4  5,18,22

Then use get_dummies()

# Split on ',' and build one indicator column per distinct token across the
# whole string. Expanding the split into positional columns first would create
# dummies per position, so a token appearing in two different positions would
# end up with two identically named columns.
df['A'].str.get_dummies(sep=',').add_prefix(df.columns[0] + '_')

Result:

       A_3  A_5  A_null  A_18  A_6  A_22
index                                   
1        0    0       1     0    0     0
2        0    1       0     0    1     0
3        1    0       0     0    0     0
4        0    0       1     0    0     0
5        0    1       0     1    0     1

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM