简体   繁体   中英

Python regex to find nested parentheses outside double quotes

I have an input string that contains parentheses both inside and outside double quotes. These parentheses can be nested. I want to strip the parenthesised parts (including their contents) only where they occur outside of double quotes.

I tried this regex: r'\((?:[^)(]|\((?:[^)(]|\([^)(]*\))*\))*\)'. It matches everything enclosed in round brackets, regardless of whether it is inside or outside the double quotes.

    import re
    # Sample input: parenthesised text appears both inside the double-quoted part and after it.
    input_string = '''"Hello World (Don't want to strip this (also not this))"  anything outside round brackets should remain as is(strip this (strip this also as it is outside double quotes))'''
    # The pattern spells out three levels of parenthesis nesting explicitly and has no
    # notion of double quotes, so it also removes the groups inside the quoted part.
    result = re.sub(r'\((?:[^)(]|\((?:[^)(]|\([^)(]*\))*\))*\)','', input_string)
    print result

The actual output I am getting is:

'"Hello World "  anything outside round brackets should remain as is'

I expect the output to be:

'"Hello World (Don't want to strip this (also not this))"  anything outside round brackets should remain as is'

If your parentheses are balanced (with help of this answer):

import re
# Sample input: a double-quoted part whose parentheses must be kept, followed by
# unquoted text containing one nested and one flat parenthesised group.
input_string = '''"Hello World (Don't want to strip this (also not this))"  anything outside round brackets should remain as is(strip this (strip this also as it is outside double quotes) xxx) Also remain this (String this)'''

def strip_parentheses(g):
    """Return g with every balanced parenthesised group removed.

    Each pass deletes the innermost groups (those containing no further
    parentheses); passes repeat until one of them deletes nothing, so nested
    groups disappear from the inside out. Unbalanced parentheses are left
    in place.
    """
    flat_group = re.compile(r'\([^()]*\)')
    while True:
        g, removed = flat_group.subn('', g)
        if not removed:
            return g

def _strip_outside_quotes(match):
    """Strip parentheses from an unquoted run; return a quoted string unchanged."""
    unquoted = match.group(1)
    if unquoted:
        return strip_parentheses(unquoted)
    # Either a double-quoted string (group 1 unset) or an empty match.
    return match.group()


# Alternate between double-quoted strings and runs of quote-free text.
s = re.sub(r'".*?"|([^"]*)', _strip_outside_quotes, input_string)

print(s)

Prints:

"Hello World (Don't want to strip this (also not this))"  anything outside round brackets should remain as is Also remain this 

EDIT Running some test-cases:

import re
# NOTE(review): input_string does not appear to be used by the test loop in this snippet.
input_string = '''"Hello World (Don't want to strip this (also not this))"  anything outside round brackets should remain as is(strip this (strip this also as it is outside double quotes) xxx) Also remain this ((String this))'''

# Inputs that mix quoted and unquoted parenthesised groups, flat and nested.
# The last one places a quoted string inside an unquoted group.
test_cases = ['Normal string (strip this)',
'"Normal string (dont strip this)"',
'"Normal string (dont strip this)" but (strip this)',
'"Normal string (dont strip this)" but (strip this) and (strip this)',
'"Normal string (dont strip this)" but (strip this) and (strip this) but "dont strip (this)"',
'"Normal string (dont strip this)" but ((strip this) and this) and (strip (strip this))',
'"Normal string (dont strip this)" but ((strip this) but "remain this (xxx)") ',
]

def strip_parentheses(g):
    """Delete all balanced parenthesised groups from g, nested ones included.

    Innermost groups are removed first; the substitution is repeated until
    the text stops changing. Unmatched parentheses are kept.
    """
    previous = None
    while previous != g:
        previous = g
        # Only groups with no parentheses inside them match on a given pass.
        g = re.sub(r'\([^()]*\)', '', g)
    return g

def my_strip(s):
    """Strip balanced parenthesised groups from the unquoted parts of s.

    The text is split into double-quoted strings, which are returned
    untouched, and runs of quote-free text, which are passed through
    strip_parentheses. A group that contains a quoted string is split across
    several runs and is therefore not removed by this version.
    """
    def keep_or_strip(match):
        outside = match.group(1)
        return strip_parentheses(outside) if outside else match.group()

    return re.sub(r'".*?"|([^"]*)', keep_or_strip, s)

# Show each input followed by its stripped form and a blank separator line.
for test in test_cases:
    print(test)
    print(my_strip(test), end='\n\n')

Prints:

Normal string (strip this)
Normal string 

"Normal string (dont strip this)"
"Normal string (dont strip this)"

"Normal string (dont strip this)" but (strip this)
"Normal string (dont strip this)" but 

"Normal string (dont strip this)" but (strip this) and (strip this)
"Normal string (dont strip this)" but  and 

"Normal string (dont strip this)" but (strip this) and (strip this) but "dont strip (this)"
"Normal string (dont strip this)" but  and  but "dont strip (this)"

"Normal string (dont strip this)" but ((strip this) and this) and (strip (strip this))
"Normal string (dont strip this)" but  and 

"Normal string (dont strip this)" but ((strip this) but "remain this (xxx)") 
"Normal string (dont strip this)" but ( but "remain this (xxx)") 

EDIT: To remove all parenthesised groups, even those that contain quoted strings:

import re
# NOTE(review): input_string does not appear to be used by the test loop in this snippet.
input_string = '''"Hello World (Don't want to strip this (also not this))"  anything outside round brackets should remain as is(strip this (strip this also as it is outside double quotes) xxx) Also remain this ((String this))'''

# Inputs with quoted strings before, after and (in the last case) inside
# unquoted parenthesised groups.
test_cases = ['"Normal string (dont strip this)" but (strip this) and (strip this) but "dont strip (this)"',
'"Normal string (dont strip this)" but ((strip this) and this) and (strip (strip this))',
'"Normal string (dont strip this)" but ((strip this) but "remain this (xxx)") ',
]

def strip_parentheses(g):
    """Remove balanced parenthesised groups from g, working from the inside out.

    While any group without inner parentheses remains, all such groups are
    deleted; this peels nested groups one level per pass. Unbalanced
    parentheses survive.
    """
    innermost = re.compile(r'\([^()]*\)')
    while innermost.search(g):
        g = innermost.sub('', g)
    return g

def my_strip(s):
    """Remove parenthesised groups that lie outside double quotes.

    A group is removed together with everything inside it, including nested
    groups and any quoted strings it contains. Double-quoted strings that are
    not inside such a group are kept verbatim, parentheses and all.

    The previous two-pass version finished with a greedy ``\\(.*\\)``, which
    ran from the first unquoted ``(`` to the *last* ``)`` of the whole string
    and could swallow unrelated text and cut into a later quoted string. This
    version scans the text once and pairs each ``)`` with the nearest open
    ``(`` instead.

    Args:
        s: The text to clean.

    Returns:
        The text with every balanced, unquoted parenthesised group removed.
        Unmatched ``(`` or ``)`` and a ``"`` without a closing partner are
        left in place as ordinary characters.
    """
    kept = []        # output pieces collected so far
    open_marks = []  # index in `kept` of each still-open '('
    # Tokens: a complete quoted string, a single parenthesis, a run of other
    # text, or a stray quote that has no closing partner.
    for token in re.finditer(r'".*?"|[()]|[^"()]+|"', s):
        text = token.group()
        if text == '(':
            open_marks.append(len(kept))
            kept.append(text)
        elif text == ')' and open_marks:
            # Drop everything back to and including the matching '('.
            del kept[open_marks.pop():]
        else:
            kept.append(text)
    return ''.join(kept)

# Print every test input, its stripped result, and then an empty line.
for test in test_cases:
    print(test)
    print(my_strip(test) + '\n')

Prints:

"Normal string (dont strip this)" but (strip this) and (strip this) but "dont strip (this)"
"Normal string (dont strip this)" but  and  but "dont strip (this)"

"Normal string (dont strip this)" but ((strip this) and this) and (strip (strip this))
"Normal string (dont strip this)" but  and 

"Normal string (dont strip this)" but ((strip this) but "remain this (xxx)") 
"Normal string (dont strip this)" but  

Using the third-party regex module instead of the built-in re, you could go with

"[^"]+"(*SKIP)(*FAIL) # ignore anything between double quotes
|                     # or
\(
    (?:[^()]*|(?R))+  # match nested parentheses
\)

See a demo on regex101.com .


In Python this could be

 # Requires the third-party ``regex`` package; the built-in ``re`` module
 # supports neither (*SKIP)(*FAIL) nor the recursive (?R) construct.
 import regex as re

 data = """"Hello World (Don't want to strip this (also not this))" anything outside round brackets should remain as is(strip this (strip this also as it is outside double quotes))"""

 rx = re.compile(r'''
     "[^"]+"(*SKIP)(*FAIL)   # ignore anything between double quotes
     |                       # or
     \(
         (?:[^()]*|(?R))+    # match nested parentheses
     \)''', re.VERBOSE)

 data = rx.sub("", data)
 print(data)

Yielding

 "Hello World (Don't want to strip this (also not this))" anything outside round brackets should remain as is 

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM