简体   繁体   中英

Get sample of Pandas dataframe but keep all unique values

I have the following function in Pandas:

def get_sample(X_test, y_true, frac_sample = True, sample_n = 10, sample_perc = 100):
    """Draw the same random rows from X_test and y_true.

    The two inputs are joined column-wise, sampled together so the rows stay
    paired, and split apart again. The returned objects have a fresh 0..k-1
    index.

    Args:
        X_test: feature data.
        y_true: labels; must end up as the last column of the joined frame.
        frac_sample: if true, sample ``sample_perc`` percent of the rows,
            otherwise sample exactly ``sample_n`` rows.
        sample_n: number of rows to draw when ``frac_sample`` is false.
        sample_perc: percentage of rows to draw when ``frac_sample`` is true.

    Returns:
        Tuple of the sampled feature columns and the sampled label column.
    """
    combined = pd.concat([X_test, y_true], axis=1)

    # Choose the sampling mode once; everything after it is shared.
    if frac_sample:
        sampling = {"frac": sample_perc / 100}
    else:
        sampling = {"n": sample_n}

    picked = combined.sample(**sampling).reset_index(drop=True)
    return picked.iloc[:, :-1], picked.iloc[:, -1]

I have two dataframes that I want to sample together, either by a specific number of rows or by a percentage. The y_true dataframe has a single column.

However, what I try to do is:

  • Make sure that the sample will have at least 1 row of each unique value from y_true.
  • If sample size is not big enough to get 1 of each unique value, then get the smallest sample that can do it.

Here is my take on your interesting question.

You can define the following helper functions:

def get_sample(X_test, y_true, **args):
    """Get a sample dataframe from X_test and y_true.

    The inputs are joined column-wise and sampled with ``DataFrame.sample``.
    Sampling is repeated until the sample holds as many distinct label values
    as it possibly can: every distinct label when the sample has at least
    that many rows, otherwise one distinct label per sampled row.

    Args:
        X_test: first input dataframe (or series).
        y_true: second input dataframe (single column) or series; it becomes
            the last column of the joined frame and is used as the label.
        **args: keyword arguments forwarded to ``DataFrame.sample``
            (``n``, ``frac``, ``replace``, ``random_state``, ...).

    Returns:
        Sample dataframe with a fresh 0..k-1 index.

    """
    import numpy as np  # local import: the surrounding snippet only imports pandas

    # An integer seed would make every retry draw exactly the same rows, so
    # the loop below could never make progress. Turn it into a RandomState
    # once: the first draw is unchanged, later draws advance the generator.
    seed = args.get("random_state")
    if isinstance(seed, (int, np.integer)) and not isinstance(seed, bool):
        args["random_state"] = np.random.RandomState(seed)

    df = pd.concat([X_test, y_true], axis=1)

    # Count labels on the joined frame and treat NaN as a value of its own,
    # exactly as in the loop condition, so both sides of the comparison
    # always agree (``unique()`` counts NaN while plain ``nunique()`` does
    # not, which made the target unreachable).
    n_labels = df.iloc[:, -1].nunique(dropna=False)

    sample_df = df.sample(**args).reset_index(drop=True)
    target = min(n_labels, sample_df.shape[0])
    while sample_df.iloc[:, -1].nunique(dropna=False) != target:
        sample_df = df.sample(**args).reset_index(drop=True)
    return sample_df


def get_x_test_and_y_true(df):
    """Separate the last column of a dataframe from the others.

    Args:
        df: input dataframe whose last column holds y_true.

    Returns:
        Tuple ``(X_test, y_true)``: all columns except the last one, and the
        last column on its own.

    """
    features = df.iloc[:, :-1]
    labels = df.iloc[:, -1]
    return features, labels

And so, with this toy dataframe, whose y_true column has 10 unique values (0 to 9):

import random

import pandas as pd

# Toy data: one row per calendar day of 2021. date_range already yields
# datetimes, so no extra conversion is needed.
df = pd.DataFrame(
    {"date_time": pd.date_range(start="1/1/2021", end="12/31/2021", freq="D")}
)
# Random integer features in 0..99 and random integer labels in 0..9.
df["X_test"] = [int(random.random() * 100) for _ in range(df.shape[0])]
df["y_true"] = [int(random.random() * 10) for _ in range(df.shape[0])]

# DataFrame.info() prints the summary itself and returns None; wrapping it in
# print() would add a stray "None" line to the output.
df.info()
# Output
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 365 entries, 0 to 364
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   date_time  365 non-null    datetime64[ns]
 1   X_test     365 non-null    int64
 2   y_true     365 non-null    int64
dtypes: datetime64[ns](1), int64(2)

You can use the previously defined helper functions like this:

# Sampling options to try, in order. The trailing comment on each entry gives
# the two numbers printed for it: the sample's row count, then the number of
# distinct y_true values in the sample.
sampling_cases = (
    {"n": 8},  # 8, 8
    {"n": 12},  # 12, 10
    {"frac": 0.01},  # 4, 4
    {"frac": 0.5},  # 182, 10
    {"frac": 1.1, "replace": True},  # 402, 10
)

for sampling in sampling_cases:
    sample_df = get_sample(df["X_test"], df["y_true"], **sampling)
    X_test, y_true = get_x_test_and_y_true(sample_df)
    print(y_true.shape[0])
    print(y_true.nunique())

I would first shuffle the df to pick a minimal set of rows (with at least one row per value of y_true). Then I'd complement that with as many additional samples from the remaining rows as we need:

def get_sample(df, ycol, n=None, frac=None):
    """Sample rows of df while keeping at least one row per value of ycol.

    A random row is kept for every distinct value of ``ycol``. If the
    requested size is larger than that minimal set, the remainder is drawn at
    random from the rows that were not kept. If it is smaller, the minimal
    set is returned anyway.

    Args:
        df: dataframe to sample from; its index may contain duplicate labels.
        ycol: column label (or list of labels) whose values must all appear.
        n: requested number of rows; takes precedence over ``frac``.
        frac: requested fraction of ``len(df)``, used when ``n`` is None.

    Returns:
        Dataframe with the minimal covering rows (in their original order)
        followed by the randomly drawn extra rows.

    Raises:
        TypeError: if neither ``n`` nor ``frac`` is given.
    """
    if n is None:
        if frac is None:
            raise TypeError("get_sample() requires either 'n' or 'frac'")
        n = round(frac * len(df))

    # Shuffle on a positional index so the duplicate flags can be mapped back
    # to df by position; aligning by label breaks when df's index is not
    # unique.
    shuffled = df.reset_index(drop=True).sample(frac=1)
    # After shuffling, the first occurrence of each ycol value is a random
    # representative; every later occurrence is flagged as a repeat.
    is_repeat = shuffled.duplicated(subset=ycol).sort_index().to_numpy()

    representatives = df[~is_repeat]
    # Never ask for a negative count or for more rows than are left.
    n_extra = max(0, min(int(is_repeat.sum()), n - len(representatives)))
    extras = df[is_repeat].sample(n=n_extra)
    return pd.concat([representatives, extras], axis=0)

Example:

# Fixed seed so the example frame is reproducible.
np.random.seed(0)

# Three uniform feature columns are drawn first, then the integer labels
# (0, 1 or 2); the order of the two draws determines the generated values.
df = pd.DataFrame(np.random.uniform(size=(20, 3)), columns=["a", "b", "c"])
df["y_true"] = np.random.randint(0, 3, 20)

>>> get_sample(df, 'y_true', n=0)
          a         b         c  y_true
4  0.568045  0.925597  0.071036       1
7  0.799159  0.461479  0.780529       0
9  0.944669  0.521848  0.414662       2

>>> get_sample(df, 'y_true', n=4)
           a         b         c  y_true
2   0.437587  0.891773  0.963663       1
12  0.612096  0.616934  0.943748       2
15  0.670638  0.210383  0.128926       0
19  0.253292  0.466311  0.244426       1

>>> print(get_sample(df, 'y_true', frac=.3))
           a         b         c  y_true
0   0.548814  0.715189  0.602763       1
9   0.944669  0.521848  0.414662       2
13  0.681820  0.359508  0.437032       0
12  0.612096  0.616934  0.943748       2
4   0.568045  0.925597  0.071036       1
8   0.118274  0.639921  0.143353       1

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM