简体   繁体   中英

How to remove common words from list of lists in Python?

I have a large number of "groups" of words. If any of the words from one group appears in both column A and column B, I want to remove the words in that group from the two columns. How do I loop over all the groups (i.e., over the sublists in the list)?

The flawed code below only removes the common words from the last group, not all three groups (lists) in stuff. [I first create an indicator if one of the words from the groups is in the string, and then create another indicator if both strings have a word from the group. Only for the pairs of A and B where both have a word from the group, I remove the particular group words.]

How do I correctly specify the loop?

EDIT: In my suggested code, each loop iteration restarted with the original columns instead of looping over the columns with the words from the previous group(s) already removed.

The suggested solutions are more elegant and neat, but they also remove the words when they are part of another word (e.g., the word 'foo' is correctly removed from 'foo hello' but incorrectly also removed from 'foobar').


# The snippets on this page use `re` and `pd` throughout; import them up
# front so the example runs as pasted.
import re

import pandas as pd

# Input data:

# Two free-text columns; a word group is removed from a row only when both
# columns of that row contain a word from the group.
data = {'A': ['summer time third grey abc', 'yellow sky hello table', 'fourth autumnwind'],
        'B': ['defg autumn times fourth table', 'not red skies second garnet', 'first blue chair winter']
}
df = pd.DataFrame(data, columns=['A', 'B'])

                            A                               B
0  summer time third grey abc  defg autumn times fourth table
1      yellow sky hello table     not red skies second garnet
2           fourth autumnwind         first blue chair winter
# Groups of words to be removed:

# Within each group, multi-word phrases are listed before the single words
# they contain. The removal helpers walk a group in list order, so a phrase
# such as 'red skies' is stripped as a whole before 'red' alone is tried.
colors = ['red skies', 'red sky', 'yellow sky', 'yellow skies', 'red', 'blue', 'black', 'yellow', 'green', 'grey']
seasons = ['summer times', 'summer time', 'autumn times', 'autumn time', 'spring', 'summer', 'winter', 'autumn']
numbers = ['first', 'second', 'third', 'fourth']

# All groups; the main loop iterates over this list one group at a time.
stuff = [colors, seasons, numbers]



# Code below only removes the last list in stuff (numbers):

def fA(S, y):
    """Return 1 if S contains any phrase of the current group, else y.

    The group is read from the module-level name ``listed`` (set by the
    surrounding ``for listed in stuff`` loop). A phrase only counts when it
    is delimited by word boundaries, so 'red' does not match 'redder'.
    """
    found = any(
        re.search(r'\b' + re.escape(term) + r'\b', S) for term in listed
    )
    return 1 if found else y


def fB(T, y):
    """Return 1 if T contains any phrase of the current group, else y.

    Reads the group from the module-level name ``listed``. Matching uses
    word boundaries, so a phrase embedded in a longer word is ignored.
    """
    for term in listed:
        boundary_pattern = r'\b' + re.escape(term) + r'\b'
        if re.search(boundary_pattern, T) is not None:
            # One hit is enough; later phrases cannot change the result.
            return 1
    return y



def fARemove(S):
    """Replace every whole-word occurrence of the current group's phrases in S with a space.

    The group is read from the module-level name ``listed`` and processed in
    list order. Each removed phrase leaves a single space behind; callers
    are expected to collapse the resulting runs of whitespace.
    """
    cleaned = S
    for term in listed:
        # re.sub returns the string unchanged when nothing matches, so no
        # separate search is needed first.
        cleaned = re.sub(r'\b{}\b'.format(re.escape(term)), ' ', cleaned)
    return cleaned



def fBRemove(T):
    """Replace every whole-word occurrence of the current group's phrases in T with a space.

    Phrases come from the module-level name ``listed`` and are applied in
    list order; text without a match is returned unchanged.
    """
    for term in listed:
        phrase = re.compile(r'\b{}\b'.format(re.escape(term)))
        T = phrase.sub(' ', T)
    return T

# One pass per word group. `listed` is deliberately a module-level name: the
# helpers fA / fB / fARemove / fBRemove read the current group from it.
for listed in stuff:

    # Indicator columns start at 0 and are set to 1 by fA / fB when the row's
    # text contains a phrase from the current group.
    df['A_Ind'] = 0
    df['B_Ind'] = 0

    df['A_Ind'] = df.apply(lambda x: fA(x.A, x.A_Ind), axis=1)
    df['B_Ind'] = df.apply(lambda x: fB(x.B, x.B_Ind), axis=1)

    # inboth == 1 marks rows where both columns matched the current group.
    df['inboth'] = 0
    df.loc[((df.A_Ind == 1) & (df.B_Ind == 1)), 'inboth'] = 1

    # FLAW: the result columns are reset from the untouched originals on every
    # pass, and the removal below also reads x.A / x.B rather than the result
    # columns, so only the last group's removals survive the loop.
    df['A_new'] = df['A']
    df['B_new'] = df['B']

    # The helpers run on every row; the .loc mask keeps only the results for
    # rows flagged in both columns.
    df.loc[df.inboth == 1, 'A_new'] = df.apply(lambda x: fARemove(x.A), axis=1)
    df.loc[df.inboth == 1, 'B_new'] = df.apply(lambda x: fBRemove(x.B), axis=1)


    # Drop the temporary helper columns before the next pass.
    del df['inboth']
    del df['A_Ind']
    del df['B_Ind']
    
    # Collapse the runs of spaces left behind by the removals and trim the ends.
    # NOTE(review): this relies on str.replace treating the pattern as a regex;
    # newer pandas releases may default to literal matching - confirm, and
    # consider a raw string plus an explicit regex=True.
    df['A_new'] = df['A_new'].str.replace('\s{2,}', ' ')
    df['A_new'] = df['A_new'].str.strip()
    df['B_new'] = df['B_new'].str.replace('\s{2,}', ' ')
    df['B_new'] = df['B_new'].str.strip()

Expected output is:

         A_new              B_new
0     grey abc         defg table
1  hello table  not second garnet
2   autumnwind  blue chair winter

This needs Python 3.7+ to work, because it relies on dicts preserving insertion order (otherwise more code is needed). Based on your list of keywords, I think you are trying to prioritize multi-word matches.

import re

# Running counter used to give every token a unique key, so that a word
# occurring twice in one text does not collapse into a single entry when the
# token list is turned into a dict.
dummy = 0


def splitter(text):
    """Split text into tokens tagged with the index of their word group.

    Groups are taken from the module-level list ``stuff`` and tried in order;
    within a group, keywords are tried in list order, so multi-word phrases
    listed first take priority over their component words. Keywords match
    only as whole words, so 'autumn' does not match inside 'autumnwind'.

    Returns a list of ``((serial, fragment), group_index)`` tuples in text
    order. ``group_index`` is the position of the matching group in
    ``stuff``, or -1 for text that matches no keyword. ``serial`` is a unique
    number drawn from the module-level counter ``dummy``.
    """
    global dummy
    text = text.strip()
    if not text:
        return []
    for n, s in enumerate(stuff):
        for keyword in s:
            match = re.search(r'\b' + re.escape(keyword) + r'\b', text)
            if match:
                # Split around the first hit and process both sides the same way.
                before = splitter(text[:match.start()])
                dummy += 1
                token = ((dummy, keyword), n)
                return before + [token] + splitter(text[match.end():])
    # No keyword of any group occurs in this fragment: keep it as-is.
    dummy += 1
    return [((dummy, text), -1)]

def remover(row):
    """Return [new_A, new_B] for one row with the shared word groups removed.

    Both columns are tokenised with ``splitter``. Tokens whose group index
    occurs in both columns are dropped; tokens with a negative group index
    (text that matched no keyword) are always kept. The surviving fragments
    are joined with single spaces in their original order.
    """
    tokens_a = dict(splitter(row['A']))
    tokens_b = dict(splitter(row['B']))
    shared = set(tokens_a.values()) & set(tokens_b.values())

    def keep(tokens):
        # key is (serial, fragment); only the fragment goes into the output.
        return ' '.join([key[1] for key, group in tokens.items()
                         if group < 0 or group not in shared])

    return [keep(tokens_a), keep(tokens_b)]
pd.concat([df,pd.DataFrame(df.apply(remover, axis=1).to_list(), columns=['newA','newB'])],  axis=1)

import re

def flatten_list(l):
    """Return a single list with the items of every sub-list of l, in order."""
    flat = []
    for subl in l:
        flat.extend(subl)
    return flat
def remove_recursive(s, l):
    """Delete every occurrence of each string in l from s and tidy the spacing.

    The strings are removed one after another in list order using plain
    substring replacement, so a listed word is also removed when it is part
    of a longer word. Runs of spaces are then collapsed to one space and the
    result is stripped.
    """
    for fragment in l:
        s = s.replace(fragment, '')

    squeezed = re.sub(r'\ +', ' ', s)
    return squeezed.strip()


def _groups_in_both(a, b):
    """Return the groups of `stuff` that have a member contained in a and a member contained in b.

    Containment is a plain substring test, matching the removal done by
    remove_recursive.
    """
    return [l for l in stuff
            if any(e in a for e in l) and any(e in b for e in l)]


# For every row, strip the words of all groups that occur in both columns.
df['A_new'] = df.apply(lambda x: remove_recursive(x.A, flatten_list(_groups_in_both(x.A, x.B))), axis = 1)
df['B_new'] = df.apply(lambda x: remove_recursive(x.B, flatten_list(_groups_in_both(x.A, x.B))), axis = 1)

df.head()

#            A_new              B_new
# 0  time grey abc         defg table
# 1    hello table  not second garnet
# 2           wind         blue chair

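This is similar code to the one in the comments: a helper removes each listed word in turn, and a flattened list collects the words of every group that matches in both columns. Note that it matches substrings, so a word is also removed when it is part of a longer word (e.g. 'autumnwind' becomes 'wind').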

Below is the code from the original question using regex r'\b{}\b', corrected for looping over the latest strings and not the original strings.

# Groups of words to be removed:

# Within each group, multi-word phrases are listed before the single words
# they contain. fRemove walks a group in list order, so a phrase such as
# 'autumn times' is stripped as a whole before 'autumn' alone is tried.
colors = ['red skies', 'red sky', 'yellow sky', 'yellow skies', 'red', 'blue', 'black', 'yellow', 'green', 'grey']
seasons = ['summer times', 'summer time', 'autumn times', 'autumn time', 'spring', 'summer', 'winter', 'autumn']
numbers = ['first', 'second', 'third', 'fourth']

# All groups; the main loop iterates over this list one group at a time.
stuff = [colors, seasons, numbers]


# Working copies, created once before the group loop: each pass reads and
# rewrites these columns, so removals from earlier groups carry over to the
# later ones.
df['A_new'] = df['A']
df['B_new'] = df['B']


def f_indicator(S, y):
    """Return 1 if S contains a phrase of the current group as a whole word, else y.

    The group is read from the module-level name ``listed``, which the
    surrounding ``for listed in stuff`` loop rebinds on every pass.
    """
    hit = next(
        (term for term in listed
         if re.search(r'\b' + re.escape(term) + r'\b', S)),
        None,
    )
    if hit is None:
        return y
    return 1


def fRemove(S):
    """Replace each whole-word occurrence of the current group's phrases in S with a space.

    Phrases come from the module-level name ``listed`` and are applied in
    list order. The spaces left behind are collapsed by the caller.
    """
    result = S
    for term in listed:
        # Substituting directly is enough: text without a match is returned
        # unchanged by re.sub.
        result = re.sub(r'\b{}\b'.format(re.escape(term)), ' ', result)
    return result


# One pass per word group. `listed` is deliberately a module-level name:
# f_indicator and fRemove read the current group from it.
for listed in stuff:

    # Flag rows whose (already partially cleaned) text still contains a
    # whole-word match from the current group, separately for each column.
    in_a = df['A_new'].apply(lambda text: f_indicator(text, 0)) == 1
    in_b = df['B_new'].apply(lambda text: f_indicator(text, 0)) == 1
    in_both = in_a & in_b

    # Remove the group only where both columns matched. The full-length
    # result is aligned on the index by .loc, so unflagged rows keep their text.
    df.loc[in_both, 'A_new'] = df['A_new'].apply(fRemove)
    df.loc[in_both, 'B_new'] = df['B_new'].apply(fRemove)

    # fRemove leaves a space for every removed phrase: collapse runs of
    # whitespace and trim. regex=True is spelled out because str.replace
    # treats the pattern as a literal string by default in pandas >= 2.0.
    for col in ('A_new', 'B_new'):
        df[col] = df[col].str.replace(r'\s{2,}', ' ', regex=True).str.strip()

# Only the cleaned columns are kept in the result.
del df['A']
del df['B']
print(df)

Output:

         A_new              B_new
0     grey abc         defg table
1  hello table  not second garnet
2   autumnwind  blue chair winter

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM