
Removing duplicates in a Python pandas DataFrame based on a column value condition

I want to remove duplicates based on a TIMESTEP condition. For every 500000 TIMESTEPs, I want to remove duplicates based on the 'id' and 'y' column values (i.e. I want to remove those duplicate ids which have negative 'y' values). My data has up to 10 million timestep values. I have tried the code below. Is there a better way to do this? The image below shows my df_initial_conv dataset.

[image: df_initial_conv]

df1 = df_initial_conv.loc[(df_initial_conv['TIMESTEP'] >= 0) & (df_initial_conv['TIMESTEP'] < 500000)]
df2 = df_initial_conv.loc[(df_initial_conv['TIMESTEP'] >= 500000) & (df_initial_conv['TIMESTEP'] < 1000000)]
df3 = df_initial_conv.loc[(df_initial_conv['TIMESTEP'] >= 1000000) & (df_initial_conv['TIMESTEP'] < 1500000)]
df4 = df_initial_conv.loc[(df_initial_conv['TIMESTEP'] >= 1500000) & (df_initial_conv['TIMESTEP'] < 2000000)]
df5 = df_initial_conv.loc[(df_initial_conv['TIMESTEP'] >= 2000000) & (df_initial_conv['TIMESTEP'] < 2500000)]
df6 = df_initial_conv.loc[(df_initial_conv['TIMESTEP'] >= 2500000) & (df_initial_conv['TIMESTEP'] < 3000000)]
df7 = df_initial_conv.loc[(df_initial_conv['TIMESTEP'] >= 3000000) & (df_initial_conv['TIMESTEP'] < 3500000)]
df8 = df_initial_conv.loc[(df_initial_conv['TIMESTEP'] >= 3500000) & (df_initial_conv['TIMESTEP'] < 4000000)]
df9 = df_initial_conv.loc[(df_initial_conv['TIMESTEP'] >= 4000000) & (df_initial_conv['TIMESTEP'] < 4500000)]
df10 = df_initial_conv.loc[(df_initial_conv['TIMESTEP'] >= 4500000) & (df_initial_conv['TIMESTEP'] < 5000000)]
df11 = df_initial_conv.loc[(df_initial_conv['TIMESTEP'] >= 5000000) & (df_initial_conv['TIMESTEP'] < 5500000)]
df12 = df_initial_conv.loc[(df_initial_conv['TIMESTEP'] >= 5500000) & (df_initial_conv['TIMESTEP'] < 6000000)]
df13 = df_initial_conv.loc[(df_initial_conv['TIMESTEP'] >= 6000000) & (df_initial_conv['TIMESTEP'] < 6500000)]
df14 = df_initial_conv.loc[(df_initial_conv['TIMESTEP'] >= 6500000) & (df_initial_conv['TIMESTEP'] < 7000000)]
df15 = df_initial_conv.loc[(df_initial_conv['TIMESTEP'] >= 7000000) & (df_initial_conv['TIMESTEP'] < 7500000)]
df16 = df_initial_conv.loc[(df_initial_conv['TIMESTEP'] >= 7500000) & (df_initial_conv['TIMESTEP'] < 8000000)]
df17 = df_initial_conv.loc[(df_initial_conv['TIMESTEP'] >= 8000000) & (df_initial_conv['TIMESTEP'] < 8500000)]
df18 = df_initial_conv.loc[(df_initial_conv['TIMESTEP'] >= 8500000) & (df_initial_conv['TIMESTEP'] < 9000000)]
df19 = df_initial_conv.loc[(df_initial_conv['TIMESTEP'] >= 9000000) & (df_initial_conv['TIMESTEP'] < 9500000)]
df20 = df_initial_conv.loc[(df_initial_conv['TIMESTEP'] >= 9500000) & (df_initial_conv['TIMESTEP'] <= 10000000)]

df_negatives1 = df1.query('y < 0')
df_nonduplicate1 = df_negatives1.drop_duplicates(subset=["id"])

df_negatives2 = df2.query('y < 0')
df_nonduplicate2 = df_negatives2.drop_duplicates(subset=["id"])

df_negatives3 = df3.query('y < 0')
df_nonduplicate3 = df_negatives3.drop_duplicates(subset=["id"])

df_negatives4 = df4.query('y < 0')
df_nonduplicate4 = df_negatives4.drop_duplicates(subset=["id"])

df_negatives5 = df5.query('y < 0')
df_nonduplicate5 = df_negatives5.drop_duplicates(subset=["id"])

df_negatives6 = df6.query('y < 0')
df_nonduplicate6 = df_negatives6.drop_duplicates(subset=["id"])

df_negatives7 = df7.query('y < 0')
df_nonduplicate7 = df_negatives7.drop_duplicates(subset=["id"])

df_negatives8 = df8.query('y < 0')
df_nonduplicate8 = df_negatives8.drop_duplicates(subset=["id"])

df_negatives9 = df9.query('y < 0')
df_nonduplicate9 = df_negatives9.drop_duplicates(subset=["id"])

df_negatives10 = df10.query('y < 0')
df_nonduplicate10 = df_negatives10.drop_duplicates(subset=["id"])

df_negatives11 = df11.query('y < 0')
df_nonduplicate11 = df_negatives11.drop_duplicates(subset=["id"])

df_negatives12 = df12.query('y < 0')
df_nonduplicate12 = df_negatives12.drop_duplicates(subset=["id"])

df_negatives13 = df13.query('y < 0')
df_nonduplicate13 = df_negatives13.drop_duplicates(subset=["id"])

df_negatives14 = df14.query('y < 0')
df_nonduplicate14 = df_negatives14.drop_duplicates(subset=["id"])

df_negatives15 = df15.query('y < 0')
df_nonduplicate15 = df_negatives15.drop_duplicates(subset=["id"])

df_negatives16 = df16.query('y < 0')
df_nonduplicate16 = df_negatives16.drop_duplicates(subset=["id"])

df_negatives17 = df17.query('y < 0')
df_nonduplicate17 = df_negatives17.drop_duplicates(subset=["id"])

df_negatives18 = df18.query('y < 0')
df_nonduplicate18 = df_negatives18.drop_duplicates(subset=["id"])

df_negatives19 = df19.query('y < 0')
df_nonduplicate19 = df_negatives19.drop_duplicates(subset=["id"])

df_negatives20 = df20.query('y < 0')
df_nonduplicate20 = df_negatives20.drop_duplicates(subset=["id"])

df_nonduplicate_final = pd.concat([
    df_nonduplicate1, df_nonduplicate2, df_nonduplicate3, df_nonduplicate4,
    df_nonduplicate5, df_nonduplicate6, df_nonduplicate7, df_nonduplicate8,
    df_nonduplicate9, df_nonduplicate10, df_nonduplicate11, df_nonduplicate12,
    df_nonduplicate13, df_nonduplicate14, df_nonduplicate15, df_nonduplicate16,
    df_nonduplicate17, df_nonduplicate18, df_nonduplicate19, df_nonduplicate20,
])
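As a side note, the twenty near-identical blocks above can be collapsed into a single loop over the window starts. A minimal sketch of the same logic, reusing the column names and the 500000 step from the question (only the final window is closed on the right, as in the original code):

import pandas as pd

STEP = 500000
MAX_TS = 10000000

pieces = []
for start in range(0, MAX_TS, STEP):
    end = start + STEP
    if end >= MAX_TS:
        # the final window includes MAX_TS itself, matching df20 above
        mask = (df_initial_conv['TIMESTEP'] >= start) & (df_initial_conv['TIMESTEP'] <= MAX_TS)
    else:
        mask = (df_initial_conv['TIMESTEP'] >= start) & (df_initial_conv['TIMESTEP'] < end)
    window = df_initial_conv[mask]
    # keep one row per id among this window's negative-y rows
    pieces.append(window.query('y < 0').drop_duplicates(subset=["id"]))

df_nonduplicate_final = pd.concat(pieces)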

We can do this with pd.cut and groupby: build 500000-wide TIMESTEP bins, then keep the first negative-y row per id within each bin.

import numpy as np
import pandas as pd

# 500000-wide, left-closed bins; the extra top edge makes sure TIMESTEP == 10000000 is covered
cut_op = pd.cut(df_initial_conv['TIMESTEP'], np.arange(0, 10500001, 500000), right=False)

# pandas aligns the cut_op Series with the filtered frame by index, so we can
# group the negative-y subset by (bin, id) and keep the first row of each group
out = df_initial_conv.query('y < 0').groupby([cut_op, 'id']).head(1)
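To see what the groupby version does, here is a small self-contained example with made-up numbers (an assumed toy dataset; 30-wide bins stand in for the 500000-wide bins of the real data):

import numpy as np
import pandas as pd

# assumed toy data: two ids observed at several timesteps, some with negative y
df_initial_conv = pd.DataFrame({
    'TIMESTEP': [0, 10, 20, 30, 40, 50, 60, 70],
    'id':       [1, 1, 2, 2, 1, 2, 1, 2],
    'y':        [-1.0, -2.0, 0.5, -0.5, -3.0, -4.0, 1.0, -6.0],
})

# 30-wide, left-closed bins: [0, 30), [30, 60), [60, 90)
cut_op = pd.cut(df_initial_conv['TIMESTEP'], np.arange(0, 91, 30), right=False)

# first negative-y row per (bin, id): the rows at TIMESTEPs 0, 30, 40 and 70 survive
out = df_initial_conv.query('y < 0').groupby([cut_op, 'id']).head(1)
print(out)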
