How can I read in from a file, then write out to another file only if certain values are in a range?

Question

This is a sample from my peaks_ef.xpk file, which I am reading in.

label dataset sw sf
1H 1H_2
NOESY_F1eF2f.nv
4807.69238281 4803.07373047
600.402832031 600.402832031
1H.L 1H.P 1H.W 1H.B 1H.E 1H.J 1H.U 1H_2.L 1H_2.P 1H_2.W 1H_2.B 1H_2.E 1H_2.J 1H_2.U vol int stat comment flag0 flag8 flag9
0 {1.H2'} 4.93607 0.05000 0.10000 ++ {0.0} {} {1.H1'} 5.82020 0.05000 0.10000 ++ {0.0} {} 0.0 100.0000 0 {} 0 0 0
1 {1.H2'} 4.93607 0.05000 0.10000 ++ {0.0} {} {1.H1'} 5.82020 0.05000 0.10000 ++ {0.0} {} 0.0 100.0000 0 {} 0 0 0
2 {1.H3'} 4.70891 0.05000 0.10000 ++ {0.0} {} {1.H8} 8.13712 0.05000 0.10000 ++ {0.0} {} 0.0 100.0000 0 {} 0 0 0
3 {1.H2'} 4.93607 0.05000 0.10000 ++ {0.0} {} {1.H8} 8.13712 0.05000 0.10000 ++ {0.0} {} 0.0 100.0000 0 {} 0 0 0
4 {2.H2'} 4.55388 0.05000 0.10000 ++ {0.0} {} {2.H1'} 5.90291 0.05000 0.10000 ++ {0.0} {} 0.0 100.0000 0 {} 0 0 0
5 {2.H2'} 4.55388 0.05000 0.10000 ++ {0.0} {} {2.H1'} 5.90291 0.05000 0.10000 ++ {0.0} {} 0.0 100.0000 0 {} 0 0 0
6 {2.H3'} 4.60420 0.05000 0.10000 ++ {0.0} {} {2.H8} 7.61004 0.05000 0.10000 ++ {0.0} {} 0.0 100.0000 0 {} 0 0 0
7 {2.H2'} 4.55388 0.05000 0.10000 ++ {0.0} {} {2.H8} 7.61004 0.05000 0.10000 ++ {0.0} {} 0.0 100.0000 0 {} 0 0 0
8 {1.H3'} 4.70891 0.05000 0.10000 ++ {0.0} {} {2.H8} 7.61004 0.05000 0.10000 ++ {0.0} {} 0.0 100.0000 0 {} 0 0 0
9 {1.H2'} 4.93607 0.05000 0.10000 ++ {0.0} {} {2.H8} 7.61004 0.05000 0.10000 ++ {0.0} {} 0.0 100.0000 0 {} 0 0 0
10 {3.H5} 5.20481 0.05000 0.10000 ++ {0.0} {} {2.H8} 7.61004 0.05000 0.10000 ++ {0.0} {} 0.0 100.0000 0 {} 0 0 0

I want to take the values in the columns 1H.P and 1H_2.P and write them out to another file, but only the values that fall within a certain range. I thought my code was already doing that: the mask variable should "filter" the values, right?

This is my code:

import pandas as pd
import os
import sys
import re

# Step 1: turn the lines of ee_pinkH1.xpk into "Peak" rows for tclust.txt.
i=0;
# contents_peak is not referenced anywhere else in this listing.
contents_peak=[]
peak_lines=[]
with open ("ee_pinkH1.xpk","r") as peakPPM:
    for PPM in peakPPM.readlines():
        # Each match is one whitespace character, a digit 1-9, a dot and one
        # or more digits; the leading whitespace is part of the match text.
        float_num = re.findall("[\s][1-9]{1}\.[0-9]+",PPM)
        # Only lines with at least two matches become a peak; the first two
        # matches are used and any further ones are ignored.
        if (len(float_num)>1):
            i=i+1
            value = ('Peak '+ str(i) + ' ' + str(float_num[0]) + ' 0.05 ' + str(float_num[1]) + ' 0.05' + '\n')
            peak_lines.append(value)
# "w+" truncates tclust.txt; the file starts with the "rbclust " header line
# followed by one line per collected peak.
tclust_peak = open("tclust.txt","w+")
tclust_peak.write("rbclust \n")
for value in peak_lines:
    tclust_peak.write(value)
tclust_peak.close()

# Step 2: load one peak table and build a boolean mask from the 1H.P and
# 1H_2.P columns. `result` starts as an empty dict and is only replaced by a
# DataFrame at `result = df[mask]` below.
result={}
# With these hard-coded values only the 'ee' / 'ee_pinkH1.xpk' branch runs;
# the 'ef' and 'fe' blocks below are skipped.
text = 'ee'
filename= 'ee_pinkH1.xpk'

if text == 'ee':
    # skiprows=5 skips the first five lines of the file before parsing.
    df=pd.read_csv("peaks_ee.xpk",sep=" ", skiprows=5)

    shift1 = df["1H.P"]
    shift2 = df["1H_2.P"]

    # Every mask below uses strict comparisons, so the bounds are excluded.
    if filename=='ee_pinkH1.xpk':
        mask = ((shift1>5.1) & (shift1<6)) & ((shift2>7) & (shift2<8.25))
    elif filename == 'ee_pinkH2.xpk':
        mask = ((shift1>3.25)&(shift1<5))&((shift2>7)&(shift2<8.5))

if text == 'ef':
    df = pd.read_csv('peaks_ef.xpk',sep = " ",skiprows=5)
    shift1=df["1H.P"]
    shift2=df["1H_2.P"]

    if filename == 'ef_blue.xpk':
        mask = ((shift1>5) & (shift1<6)) & ((shift2>7.25) & (shift2<8.25))
    elif filename == 'ef_green.xpk':
        mask = ((shift1>7) & (shift1<9)) & ((shift2>5.25) & (shift2<6.2))
    elif filename == 'ef_orange.xpk':
        mask = ((shift1>3) & (shift1<5)) & ((shift2>5.2) & (shift2<6.25))

if text == 'fe':
    df = pd.read_csv('peaks_fe.xpk', sep=" ",skiprows=5)

    shift1= df["1H.P"]
    shift2= df["1H_2.P"]

    if filename == 'fe_yellow.xpk':
        mask = ((shift1>3) & (shift1<5)) & ((shift2>5) & (shift2<6))
    elif filename == 'fe_green.xpk':
        mask = ((shift1>5.1) & (shift1<6)) & ((shift2>7) & (shift2<8.25))

# Step 3: keep the masked rows and append their two label columns to
# tclust.txt.
# NOTE(review): `df` and `mask` only exist if a text branch and one of its
# filename branches matched; otherwise this line raises NameError.
result = df[mask]
result = result[["1H.L","1H_2.L"]]

# Remove braces and spaces from both ends of every label.
for col in result.columns:
    result[col] = result[col].str.strip("{} ")
result.drop_duplicates(keep='first', inplace=True)
# Relabel the remaining rows 'Atom 1' .. 'Atom N'.
result = result.set_index([['Atom '+str(i) for i in range(1,len(result)+1)]])
# NOTE(review): this append-mode handle is never closed explicitly.
tclust_atom=open("tclust.txt","a")
result.to_string(tclust_atom, header = False)
# Step 4: stack (label, value) pairs from both dimensions into one table.
# NOTE(review): df1 and df2 are taken from `df`, the full table that was read
# in, not from `result` or `mask`, so the rows written to tclust_ppm.txt below
# are not restricted to the selected ranges.
df1 = df.copy()[['1H.L','1H.P']]
df2 = df.copy()[['1H_2.L','1H_2.P']]

# Give the second pair the first pair's column names so concat stacks them.
df2.rename(columns={'1H_2.L': '1H.L', '1H_2.P': '1H.P'}, inplace=True)
df = pd.concat([df1,df2])
df['1H.L']=df['1H.L'].apply(lambda row: row.strip('{}'))
# Constant third column, 0.3 on every row.
df['new']=0.3
df.drop_duplicates(keep='first',inplace=True)

# NOTE(review): this handle is opened but never written to or closed;
# to_csv is given the path and opens the file itself.
tclust_atom=open("tclust_ppm.txt","w+")
df.to_csv("tclust_ppm.txt",sep=" ", index=False, header=False)

A sample of my output is:

5.H3' 4.43488 0.3
6.H2' 4.49744 0.3
7.H1' 5.95115 0.3
6.H3' 4.51612 0.3
8.H5 5.39709 0.3
7.H3' 4.62099 0.3
7.H2 7.67414 0.3
8.H2' 4.31783 0.3
9.H1' 5.91813 0.3
8.H3' 4.45577 0.3
10.H5 5.17157 0.3
9.H3' 4.66179 0.3

Based on my code, the filter or "mask" variable is in the if statement:

if text == 'ef':
    # Load the 'ef' peak table; skiprows=5 skips the first five lines.
    df = pd.read_csv('peaks_ef.xpk',sep = " ",skiprows=5)
    shift1=df["1H.P"]
    shift2=df["1H_2.P"]

    # The filename checks belong inside the 'ef' branch (as in the full
    # script above); every bound is exclusive.
    if filename == 'ef_blue.xpk':
        mask = ((shift1>5) & (shift1<6)) & ((shift2>7.25) & (shift2<8.25))
    elif filename == 'ef_green.xpk':
        mask = ((shift1>7) & (shift1<9)) & ((shift2>5.25) & (shift2<6.2))
    # NOTE(review): this literal has no '.xpk' suffix, unlike the same branch
    # in the full script above ('ef_orange.xpk').
    elif filename == 'ef_orange':
        mask = ((shift1>3) & (shift1<5)) & ((shift2>5.2) & (shift2<6.25))

The output should come from the elif filename == 'ef_orange': branch, so neither shift1 nor shift2 should be greater than 6.25, yet my output contains the value 7.67414. Why is my filtering not working, and how can I fix it?

Answer 1

by using

shift1=df["1H.P"]
shift2=df["1H_2.P"]

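you are condensing your filter down to a single Series (one column each), when what you want is to filter the entire DataFrame. It will be easier to follow if the filter is its own function that returns the filtered DataFrame, so that everything you write out afterwards comes from the filtered rows: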

def fil(df,oneLow,oneHigh,twoLow,twoHigh):
    """Return the rows of *df* whose '1H.P' and '1H_2.P' values are in range.

    A row is kept when ``oneLow < df['1H.P'] < oneHigh`` and
    ``twoLow < df['1H_2.P'] < twoHigh``. All four bounds are exclusive.
    """
    first_col = df['1H.P']
    second_col = df['1H_2.P']
    first_in_range = first_col.gt(oneLow) & first_col.lt(oneHigh)
    second_in_range = second_col.gt(twoLow) & second_col.lt(twoHigh)
    return df[first_in_range & second_in_range]


if text == 'ef':
    # Load the 'ef' peak table; skiprows=5 skips the first five lines.
    df = pd.read_csv('peaks_ef.xpk',sep = " ",skiprows=5)

    # The separate shift1/shift2 Series and the mask variable are no longer
    # needed: fil() builds the mask from the frame it receives and returns
    # the kept rows. Reassigning df means every later step only sees the
    # filtered rows.
    if filename == 'ef_blue.xpk':
        # 1H.P in (5, 6) and 1H_2.P in (7.25, 8.25)
        df = fil(df,5,6,7.25,8.25)
    elif filename == 'ef_green.xpk':
        # 1H.P in (7, 9) and 1H_2.P in (5.25, 6.2)
        df = fil(df,7,9,5.25,6.2)
    elif filename == 'ef_orange.xpk':
        # 1H.P in (3, 5) and 1H_2.P in (5.2, 6.25)
        df = fil(df,3,5,5.2,6.25)

Edit with full code

import pandas as pd
import os
import sys
import re

def fil(df,oneLow,oneHigh,twoLow,twoHigh):
    """Keep only the rows of *df* that fall inside both open intervals.

    '1H.P' must be strictly between *oneLow* and *oneHigh*, and '1H_2.P'
    strictly between *twoLow* and *twoHigh*. The filtered frame is returned.
    """
    keep = (
        df['1H.P'].gt(oneLow)
        & df['1H.P'].lt(oneHigh)
        & df['1H_2.P'].gt(twoLow)
        & df['1H_2.P'].lt(twoHigh)
    )
    return df[keep]



# Script body: (1) copy number pairs from the region file into tclust.txt as
# "Peak" rows, (2) filter one peak table to the window chosen by
# text/filename, (3) append the filtered atom labels to tclust.txt, and
# (4) write the filtered (label, value, 0.3) table to tclust_ppm.txt.

# --- 1. Peak rows ----------------------------------------------------------
# Each match is one whitespace character, a digit 1-9, a dot and one or more
# digits. The leading whitespace is part of the match, so values are written
# with that extra character, exactly as before.
shift_pattern = re.compile(r"[\s][1-9]{1}\.[0-9]+")
peak_lines = []
# NOTE(review): this path is hard-coded separately from `filename` below;
# keep the two in sync when switching region files.
with open("ee_pinkH1.xpk", "r") as peakPPM:
    for PPM in peakPPM:
        float_num = shift_pattern.findall(PPM)
        # Only lines with at least two matches describe a peak; the first two
        # matches are used.
        if len(float_num) > 1:
            i = len(peak_lines) + 1
            peak_lines.append('Peak ' + str(i) + ' ' + float_num[0] + ' 0.05 ' + float_num[1] + ' 0.05' + '\n')

# The with-block guarantees the file is flushed and closed before it is
# reopened in append mode further down.
with open("tclust.txt", "w") as tclust_peak:
    tclust_peak.write("rbclust \n")
    tclust_peak.writelines(peak_lines)

# --- 2. Select and filter the peak table -----------------------------------
text = 'ee'
filename = 'ee_pinkH1.xpk'

# Peak table to read for each value of `text`.
peak_tables = {
    'ee': 'peaks_ee.xpk',
    'ef': 'peaks_ef.xpk',
    'fe': 'peaks_fe.xpk',
}

# Exclusive (1H.P low, 1H.P high, 1H_2.P low, 1H_2.P high) bounds for each
# (text, filename) combination.
shift_windows = {
    ('ee', 'ee_pinkH1.xpk'): (5.1, 6, 7, 8.25),
    ('ee', 'ee_pinkH2.xpk'): (3.25, 5, 7, 8.5),
    ('ef', 'ef_blue.xpk'): (5, 6, 7.25, 8.25),
    ('ef', 'ef_green.xpk'): (7, 9, 5.25, 6.2),
    ('ef', 'ef_orange.xpk'): (3, 5, 5.2, 6.25),
    ('fe', 'fe_yellow.xpk'): (3, 5, 5, 6),
    ('fe', 'fe_green.xpk'): (5.1, 6, 7, 8.25),
}

# Fail with a clear message instead of carrying an empty placeholder forward
# and crashing later on an unrelated attribute lookup.
if (text, filename) not in shift_windows:
    raise ValueError(
        'no shift window defined for text=%r, filename=%r' % (text, filename)
    )

df = pd.read_csv(peak_tables[text], sep=" ", skiprows=5)
result = fil(df, *shift_windows[(text, filename)])

# --- 3. Atom labels of the filtered peaks ----------------------------------
# Only the two label columns hold strings; running .str.strip over the
# numeric columns of the filtered frame would raise. Work on a copy so the
# assignment does not target a slice of `result`.
atoms = result[["1H.L", "1H_2.L"]].copy()
for col in atoms.columns:
    atoms[col] = atoms[col].str.strip("{} ")
atoms = atoms.drop_duplicates(keep='first')
atoms.index = ['Atom ' + str(n) for n in range(1, len(atoms) + 1)]
# An empty frame would be rendered as pandas' "Empty DataFrame" placeholder
# text, so nothing is appended when no peak passed the filter.
if not atoms.empty:
    with open("tclust.txt", "a") as tclust_atom:
        atoms.to_string(tclust_atom, header=False)

# --- 4. (label, value, 0.3) table of the filtered peaks --------------------
# Built from `result`, not from the unfiltered `df`; otherwise values outside
# the selected window end up in tclust_ppm.txt.
df1 = result[['1H.L', '1H.P']]
df2 = result[['1H_2.L', '1H_2.P']].rename(
    columns={'1H_2.L': '1H.L', '1H_2.P': '1H.P'}
)
ppm = pd.concat([df1, df2])
ppm['1H.L'] = ppm['1H.L'].str.strip('{}')
ppm['new'] = 0.3
ppm = ppm.drop_duplicates(keep='first')

# to_csv opens and closes the file itself when given a path.
ppm.to_csv("tclust_ppm.txt", sep=" ", index=False, header=False)

How can I read in from a file, then write out to another file only if certain values are in a range?

Question

1 answer

solution1
0 ACCEPTED 2017-07-31 19:15:30

How can I read in from a file, then write out to another file only if certain values are in a range?

Question

1 answer

solution1 0 ACCEPTED 2017-07-31 19:15:30

solution1
0 ACCEPTED 2017-07-31 19:15:30