I have the following function in Pandas:
def get_sample(X_test, y_true, frac_sample=True, sample_n=10, sample_perc=100,
               random_state=None):
    """Draw the same random rows from ``X_test`` and ``y_true``.

    The inputs are joined column-wise, sampled as one frame so that every
    feature row stays paired with its label, and then split apart again.

    Args:
        X_test: features (DataFrame or Series).
        y_true: labels; it must contribute exactly one column, because the
            last column of the joined frame is returned as the labels.
        frac_sample: if true, draw ``sample_perc`` percent of the rows;
            otherwise draw exactly ``sample_n`` rows.
        sample_n: number of rows to draw when ``frac_sample`` is false.
        sample_perc: percentage of rows to draw when ``frac_sample`` is true.
        random_state: optional seed or random generator forwarded to
            ``DataFrame.sample`` to make the draw reproducible.

    Returns:
        Tuple ``(X_test_temp, y_true_temp)``: every column but the last, and
        the last column as a Series. Both carry a fresh ``0..k-1`` index.
    """
    # Only the size argument differs between the two sampling modes.
    size = {"frac": sample_perc / 100} if frac_sample else {"n": sample_n}
    test_joined = (
        pd.concat([X_test, y_true], axis=1)
        .sample(random_state=random_state, **size)
        .reset_index(drop=True)
    )
    return test_joined.iloc[:, :-1], test_joined.iloc[:, -1]
I have two dataframes that I want to sample together, either by a specific number of rows or by a percentage. The y_true
dataframe has a single column.
However, what I am trying to do concerns `y_true`: the sample should still contain its distinct values.
Here is my take on your interesting question.
You can define the following helper functions:
def get_sample(X_test, y_true, *, max_tries=10_000, **args):
    """Get a sample dataframe from X_test and y_true.

    The inputs are joined column-wise and sampled repeatedly until the last
    column of the sample holds as many distinct labels as possible: every
    label when the sample is at least that large, otherwise one distinct
    label per sampled row.

    Args:
        X_test: first input dataframe (or Series).
        y_true: second input, a Series or one-column dataframe; it ends up
            as the last column of the joined frame.
        max_tries: maximum number of draws before giving up.
        **args: keyword arguments forwarded to ``DataFrame.sample``
            (``n``, ``frac``, ``replace``, ``random_state``, ...).

    Returns:
        Sample dataframe with a fresh ``0..k-1`` index.

    Raises:
        RuntimeError: if no draw met the label requirement within
            ``max_tries`` attempts.
    """
    import numpy as np

    df = pd.concat([X_test, y_true], axis=1)
    # Count NaN as a label here and below, so the target stays reachable
    # when the labels contain missing values.
    n_labels = df.iloc[:, -1].nunique(dropna=False)

    # A plain seed would make every retry redraw exactly the same rows.
    # Wrapping it in a stateful generator keeps the first draw unchanged
    # while letting later draws differ.
    seed = args.get("random_state")
    if seed is not None and not isinstance(
        seed, (np.random.RandomState, np.random.Generator)
    ):
        args = {**args, "random_state": np.random.RandomState(seed)}

    for _ in range(max_tries):
        sample_df = df.sample(**args).reset_index(drop=True)
        target = min(n_labels, sample_df.shape[0])
        if sample_df.iloc[:, -1].nunique(dropna=False) == target:
            return sample_df
    raise RuntimeError(
        f"no sample with the required label coverage after {max_tries} tries"
    )
def get_x_test_and_y_true(df):
    """Separate the last column of a dataframe from the columns before it.

    Args:
        df: input dataframe whose last column holds the labels.

    Returns:
        Tuple ``(X_test, y_true)``: a dataframe with every column except the
        last, and the last column as a Series.
    """
    features = df.iloc[:, :-1]
    labels = df.iloc[:, -1]
    return features, labels
And so, with this toy dataframe, whose `y_true` column has 10 unique values (0 to 9):
import random
import pandas as pd
# Toy frame: one row per calendar day of 2021.
# date_range already yields datetimes, so no further conversion is needed.
df = pd.DataFrame(
    {"date_time": pd.date_range(start="1/1/2021", end="12/31/2021", freq="D")}
)
# Random integer features in 0..99 and random integer labels in 0..9.
df["X_test"] = [random.randrange(100) for _ in range(len(df))]
df["y_true"] = [random.randrange(10) for _ in range(len(df))]
# info() prints its report itself and returns None, so it is not wrapped in
# print(), which would add a stray "None" line to the output.
df.info()
# Output
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 365 entries, 0 to 364
Data columns (total 3 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 date_time 365 non-null datetime64[ns]
1 X_test 365 non-null int64
2 y_true 365 non-null int64
dtypes: datetime64[ns](1), int64(2)
You can use the previously defined helper functions like this:
# Each entry is one set of keyword arguments forwarded to get_sample().
# The trailing comment gives the two values printed for that entry in the
# original example: number of sampled rows, then number of distinct labels.
for sample_kwargs in (
    {"n": 8},  # 8, 8
    {"n": 12},  # 12, 10
    {"frac": 0.01},  # 4, 4
    {"frac": 0.5},  # 182, 10
    {"frac": 1.1, "replace": True},  # 402, 10
):
    X_test, y_true = get_x_test_and_y_true(
        get_sample(df["X_test"], df["y_true"], **sample_kwargs)
    )
    print(y_true.shape[0])
    print(y_true.nunique())
I would first shuffle the `df` to pick a minimal set of rows (with at least one row per value of `y_true`). Then I'd complement that with however many more samples we need from the rest:
def get_sample(df, ycol, n=None, frac=None, random_state=None):
    """Sample rows of ``df`` while keeping every distinct value of ``ycol``.

    One randomly chosen row per distinct ``ycol`` value is always kept. The
    result is then topped up with rows drawn at random from the remaining
    ones until it holds ``n`` rows. Because the per-value rows are always
    kept, the result has more than ``n`` rows when ``n`` is smaller than the
    number of distinct values, and never more rows than ``df``.

    Args:
        df: input dataframe; its index may contain duplicate labels.
        ycol: column label (or list of labels) whose values must all appear.
        n: requested number of rows; takes precedence over ``frac``.
        frac: requested fraction of ``len(df)``, used when ``n`` is None.
        random_state: optional seed or random generator for reproducibility.

    Returns:
        Dataframe with the per-value rows first, in their original order,
        followed by the extra rows.

    Raises:
        ValueError: if neither ``n`` nor ``frac`` is given.
    """
    import numpy as np

    if n is None:
        if frac is None:
            raise ValueError("get_sample() needs either `n` or `frac`")
        n = round(frac * len(df))

    # Use one stateful generator for both draws, so that a plain seed does
    # not drive the shuffle and the top-up with the same random stream.
    if random_state is not None and not isinstance(
        random_state, (np.random.RandomState, np.random.Generator)
    ):
        random_state = np.random.RandomState(random_state)

    # Shuffle row positions rather than index labels, so the masks below
    # also work when the index has duplicate labels.
    order = (
        pd.Series(range(len(df)))
        .sample(frac=1, random_state=random_state)
        .to_numpy()
    )
    # In shuffled order, the first occurrence of each value is the keeper.
    repeated = df.iloc[order].duplicated(subset=ycol).to_numpy()
    # Map the flags back onto the original row positions.
    is_extra = pd.Series(repeated, index=order).sort_index().to_numpy()

    representatives = df.loc[~is_extra]
    pool = df.loc[is_extra]
    n_extra = max(0, min(len(pool), n - len(representatives)))
    extras = pool.sample(n=n_extra, random_state=random_state)
    return pd.concat([representatives, extras], axis=0)
Example:
# Seed NumPy's global generator so the example frame is reproducible.
np.random.seed(0)
# Three uniform feature columns are drawn first, then the integer labels
# (0, 1 or 2); this is the same draw order as a chained .assign() call.
df = pd.DataFrame(np.random.uniform(size=(20, 3)), columns=["a", "b", "c"])
df["y_true"] = np.random.randint(0, 3, 20)
>>> get_sample(df, 'y_true', n=0)
a b c y_true
4 0.568045 0.925597 0.071036 1
7 0.799159 0.461479 0.780529 0
9 0.944669 0.521848 0.414662 2
>>> get_sample(df, 'y_true', n=4)
a b c y_true
2 0.437587 0.891773 0.963663 1
12 0.612096 0.616934 0.943748 2
15 0.670638 0.210383 0.128926 0
19 0.253292 0.466311 0.244426 1
>>> print(get_sample(df, 'y_true', frac=.3))
a b c y_true
0 0.548814 0.715189 0.602763 1
9 0.944669 0.521848 0.414662 2
13 0.681820 0.359508 0.437032 0
12 0.612096 0.616934 0.943748 2
4 0.568045 0.925597 0.071036 1
8 0.118274 0.639921 0.143353 1
The technical posts on this site are licensed under CC BY-SA 4.0. If you reprint them, please indicate the site URL or the original address. For any questions, please contact: yoyou2525@163.com.