[英]Python Regex matching wrong strings
I have the following python script which does regex matching using 'AND', 'OR' features as well: 我有以下python脚本,它也使用'AND','OR'功能进行正则表达式匹配:
class PyBoolReException(Exception):
def __init__(self, value):
self.value = value
def __str__(self):
return str(self.value)
class PyBoolRe:
def __init__(self, boolstr):
# Require whitespace before words?
self.__needspace = True
# whitespace re
self._wspre = re.compile('^\s*$')
# create regexp string
self.__rexplist = []
oparct = boolstr.count('(')
clparct = boolstr.count(')')
if oparct != clparct:
raise PyBoolReException, 'Mismatched parantheses!'
self.__parse(boolstr)
# if NOT is one of the members, reverse
# the list
# print self.__rexplist
if '!' in self.__rexplist:
self.__rexplist.reverse()
s = self.__makerexp(self.__rexplist)
# print s
self.__rexp = re.compile(s)
def match(self, data):
""" Match the boolean expression, behaviour
is same as the 'match' method of re """
return self.__rexp.match(data)
def search(self, data):
""" Search the boolean expression, behaviour
is same as the 'search' method of re """
return self.__rexp.search(data)
def __parse(self, s):
""" Parse the boolean regular expression string
and create the regexp list """
# The string is a nested parantheses with
# any character in between the parens.
scopy = s[:]
oparmatch, clparmatch = False, False
# Look for a NOT expression
index = scopy.rfind('(')
l = []
if index != -1:
oparmatch = True
index2 = scopy.find(')', index)
if index2 != -1:
clparmatch = True
newstr = scopy[index+1:index2]
# if the string is only of whitespace chars, skip it
if not self._wspre.match(newstr):
self.__rexplist.append(newstr)
replacestr = '(' + newstr + ')'
scopy = scopy.replace(replacestr, '')
self.__parse(scopy)
if not clparmatch and not oparmatch:
if scopy: self.__rexplist.append(scopy)
def is_inbetween(self, l, elem):
""" Find out if an element is in between
in a list """
index = l.index(elem)
if index == 0:
return False
if index>2:
if index in range(1, len(l) -1):
return True
else:
return False
else:
return True
def __makenotexpr(self, s):
""" Make a NOT expression """
if s.find('!') == 0:
return ''.join(('(?!', s[1:], ')'))
else:
return s
def __makerexp(self, rexplist):
""" Make the regular expression string for
the boolean match from the nested list """
is_list = True
if type(rexplist) is str:
is_list = False
elem = rexplist
elif type(rexplist) is list:
elem = rexplist[0]
if type(elem) is list:
elem = elem[0]
eor = False
if not is_list or len(rexplist) == 1:
eor = True
word_str = '.*'
s=''
# Implementing NOT
if elem == '!':
return ''.join(('(?!', self.__makerexp(rexplist[1:]), ')'))
# Implementing OR
elif elem.find(' | ') != -1:
listofors = elem.split(' | ')
for o in listofors:
index = listofors.index(o)
in_bet = self.is_inbetween(listofors, o)
if o:
o = self.__makenotexpr(o)
if in_bet:
s = ''.join((s, '|', word_str, o, '.*'))
else:
s = ''.join((s, word_str, o, '.*'))
# Implementing AND
elif elem.find(' & ') != -1:
listofands = elem.split(' & ')
for a in listofands:
index = listofands.index(a)
in_bet = self.is_inbetween(listofands, a)
if a:
a = self.__makenotexpr(a)
s = ''.join((s, word_str, a, '.*'))
else:
if elem:
elem = self.__makenotexpr(elem)
s = ''.join((elem, '.*'))
if eor:
return s
else:
return ''.join((s, self.__makerexp(rexplist[1:])))
When the search phrase is as follows: 当搜索词如下时:
p = PyBoolRe('Python | Perl')
s1 = 'Guido invented Python'
s2 = 'Guido Perl'
if p.match(s1):
print 'Match found for first string'
else:
print 'No match found for first string'
if p.match(s2):
print 'Match found for second string'
else:
print 'No match found for second string'
Then both s1
& s2
match 然后s1
和s2
匹配
But when the search phrase is: 但是当搜索词是:
p = PyBoolRe('Guido & (Python | Perl)')
s1 = 'Guido invented Python'
s2 = 'Guido Perl is great'
Then it should match if s1 or s2 has "Guido Python"
or "Guido Perl"
. 然后,如果s1或s2具有"Guido Python"
或"Guido Perl"
它应该匹配。 s2 has that but it does not match it. s2具有该功能,但不匹配。 On the other hand, it matches s1, which it should not. 另一方面,它匹配不应该的s1。 Why is that? 这是为什么?
Please help!! 请帮忙!! How can I get it to work?? 我如何使它工作?
Your generated expression is 您生成的表达式是
.*Python.*|.*Perl.*.*Guido.*
while it should look like 虽然看起来像
(?=.*Guido.*)(?:.*Python.*|.*Perl.*)
So the parser needs some revision. 因此,解析器需要进行一些修订。
1) x|y
should be enclosed into (?:...)
(at least when used inside another block). 1) x|y
应该包含在(?:...)
(至少在另一个块中使用时)。 Otherwise, |
否则, |
unluckily takes the global priority in the regexp. 不幸的是在正则表达式中将全局优先级放在第一位。
2) x & y
should be converted into (?=x)y
(trailing context may be used to express the and
between regular expressions) 2) x & y
应该转换为(?=x)y
(可使用尾随上下文来表示and
之间的正则表达式)
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.