简体   繁体   English

根据列值条件删除python pandas数据框中的重复项

[英]Removing duplicates in python pandas dataframe based in column value condition

I want to remove duplicates based on a TIMESTEP condition. Specifically, for every window of 500,000 TIMESTEP values, I want to remove duplicate rows based on the 'id' and 'y' columns (i.e., I want to remove those duplicate id's which have negative 'y' values). My data has up to 10 million timestep values. I have tried the code below. Is there a better way to do this? The image shows my df_initial_conv dataset.

# Remove duplicate negative-'y' ids within each 500,000-wide TIMESTEP window.
#
# The original approach hand-wrote twenty slices (df1..df20), ran
# query('y < 0') + drop_duplicates(subset=['id']) on each, and concatenated
# the twenty results.  That is error-prone (copy-paste boundary mistakes)
# and does twenty full scans of the frame.  The same result is obtained in
# a single pass: label each row with its window index and drop duplicates
# on (window, id) among the negative-'y' rows.


def _dedupe_negatives_per_window(df, window=500_000, max_timestep=10_000_000):
    """Return the rows of *df* with y < 0, keeping only the first
    occurrence of each 'id' inside every ``window``-wide TIMESTEP interval.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'TIMESTEP', 'id' and 'y' columns.
    window : int, default 500_000
        Width of each TIMESTEP interval (generalizes the hard-coded
        500,000 of the original code).
    max_timestep : int, default 10_000_000
        Upper bound of the covered range.  Rows with TIMESTEP outside
        [0, max_timestep] are excluded, matching the original explicit
        slices (the last slice used '<=', so max_timestep itself is kept).

    Returns
    -------
    pandas.DataFrame
        The de-duplicated negative-'y' rows.  NOTE(review): rows keep the
        input order; the original concatenated bin-by-bin, which is the
        same ordering only if the input is already sorted by TIMESTEP —
        confirm that assumption against the data source.
    """
    ts = df['TIMESTEP']
    # Keep only the rows the original twenty slices covered.
    in_range = (ts >= 0) & (ts <= max_timestep)
    negatives = df.loc[in_range & (df['y'] < 0)]
    # Window index: 0 for [0, window), 1 for [window, 2*window), ...
    # clip() folds TIMESTEP == max_timestep into the last window, exactly
    # as the original df20 slice did with its '<=' bound.
    bin_idx = negatives['TIMESTEP'].clip(upper=max_timestep - 1) // window
    # First occurrence of each id within each window: drop rows whose
    # (window, id) pair has already been seen.
    keep = ~negatives.assign(_bin=bin_idx).duplicated(subset=['_bin', 'id'])
    return negatives.loc[keep]


df_nonduplicat_final = _dedupe_negatives_per_window(df_initial_conv)

We can do this with `groupby`:

# One-pass groupby alternative.  Two fixes versus the original snippet:
#  * the bin edges must cover the question's full 0..10,000,000 range in
#    500,000-wide steps — np.arange's stop is exclusive, so 10_000_001 is
#    needed to include the final edge (the original 0..1,500,000 in 50,000
#    steps did not match the question at all);
#  * grouping by the bin alone keeps a single row per bin; to drop
#    duplicate ids *within* each bin, group by (bin, 'id').
cut_op = pd.cut(df_initial_conv['TIMESTEP'], np.arange(0, 10_000_001, 500_000))

# head(1) keeps the first row of each (bin, id) group; observed=True skips
# empty categorical bins.
out = df_initial_conv.query('y < 0').groupby([cut_op, 'id'], observed=True).head(1)

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM