如何编辑解析器以正确组合“ AND”和“ OR”谓词？

Question

我目前正在尝试编写一个小型解析器，能够解析非常简单的key = value查询。 但是它应该足够聪明以处理AND和OR组，并且AND具有更高的优先级。 文本输入示例：

a = 10 && b = 20
a = 10 || b = 20
a = 10 && b = 20 || c = 30

前两个很简单。 最后一个应将前两个谓词归为“ AND”组，然后应将该组归为“ OR”组。

我掌握了基础知识 ，但仍无法正确分组。 我正在使用使用flex / bison / lex / yacc语法定义语法的ply 。 如果我完全按照我现有的语法走错了路，请告诉我...这将是有关解析器的宝贵学习经验。

我尝试设置优先级，但是我不认为这实际上是由减少/减少冲突引起的。 我认为这更多是关于我总体上定义语法的方式的问题，但是我无法弄清楚需要更改什么。

以下是我当前的实现和一个单元测试文件。 测试文件应有助于理解预期的输出。 当前有一项测试失败。 那是让我头疼的那件事。

可以使用内置的unittest模块运行测试，但是，由于我在测试中执行了一些print语句，因此我建议使用pytest因为它可以拦截这些print语句，并且pytest造成混乱。 例如（假设两个文件都在同一文件夹中）：

python -m venv env
./env/bin/pip install pytest
./env/bin/pytest test_query_string.py

文件名： `queryparser.py`

import logging
from collections import namedtuple

import ply.lex as lex
import ply.yacc as yacc


LOG = logging.getLogger(__name__)

Predicate = namedtuple('Predicate', 'key operator value')


class Production:

    def __repr__(self):
        preds = [repr(pred) for pred in self._predicates]
        return '%s(%s)' % (self.__class__.__name__, ', '.join(preds))

    def __eq__(self, other):
        return (
            self.__class__ == other.__class__ and
            self._predicates == other._predicates)

    def debug(self, indent=0, aslist=False):
        lines = []
        lines.append('    ' * indent + self.__class__.__name__)
        for predicate in self._predicates:
            if hasattr(predicate, 'debug'):
                lines.extend(predicate.debug(indent + 1, aslist=True))
            else:
                lines.append('    ' * (indent+1) + repr(predicate))
        if aslist:
            return lines
        else:
            return '\n'.join(lines)



class Conjunction(Production):

    def __init__(self, *predicates):
        self._predicates = predicates


class Disjunction(Production):

    def __init__(self, *predicates):
        self._predicates = predicates


def parse(query: str, debug=False) -> Predicate:
    lexer = QueryLexer().build()
    parser = QueryParser().build()
    if debug:
        output = parser.parse(query, debug=LOG)
    else:
        output = parser.parse(query)
    return output or []


class QueryLexer:

    tokens = (
        'WORD',
        'OPERATOR',
        'QUOTE',
        'AND',
        'OR'
    )

    t_ignore = ' \t'
    t_QUOTE = '"'

    def t_error(self, t):
        LOG.warning('Illegal character %r', t.value[0])
        t.lexer.skip(1)

    def t_WORD(self, t):
        r'\w+'
        return t

    def t_OPERATOR(self, t):
        r'(=|!=|>|<|<=|>=)'
        return t

    def t_AND(self, t):
        r'&&'
        return t

    def t_OR(self, t):
        r'\|\|'
        return t

    def build(self, **kwargs):
        self.lexer = lex.lex(module=self, **kwargs)


class QueryParser:

    precedence = (
        ('nonassoc', 'OR'),
        ('nonassoc', 'AND'),
    )

    def p_query(self, p):
        '''
        query : disjunction
              | conjunction
              | predicate
        '''
        p[0] = p[1]

    def p_disjunction(self, p):
        '''
        disjunction : predicate OR predicate
                    | predicate OR conjunction
                    | predicate OR disjunction
        '''
        output = [p[1]]
        if p.slice[3].type == 'disjunction':
            # We can merge multiple chanined disjunctions together
            output.extend(p[3]._predicates)
        else:
            output.append(p[3])
        p[0] = Disjunction(*output)

    def p_conjunction(self, p):
        '''
        conjunction : predicate AND predicate
                    | predicate AND conjunction
                    | predicate AND disjunction
        '''
        if len(p) == 4:
            output = [p[1]]
            if p.slice[3].type == 'conjunction':
                # We can merge multiple chanined disjunctions together
                output.extend(p[3]._predicates)
            else:
                output.append(p[3])
            p[0] = Conjunction(*output)
        else:
            p[0] = Conjunction(p[1])

    def p_predicate(self, p):
        '''
        predicate : maybequoted OPERATOR maybequoted
        '''
        p[0] = Predicate(p[1], p[2], p[3])

    def p_maybequoted(self, p):
        '''
        maybequoted : WORD
                    | QUOTE WORD QUOTE
        '''
        if len(p) == 4:
            p[0] = p[2]
        else:
            p[0] = p[1]

    def p_error(self, p):
        """
        Panic-mode rule for parser errors.
        """
        if not p:
            LOG.debug('Syntax error at EOF')
        else:
            self.parser.errok()
        LOG.error('Syntax Error at %r', p)

    def build(self):
        self.tokens = QueryLexer.tokens
        self.parser = yacc.yacc(module=self, outputdir='/tmp', debug=True)
        return self.parser

文件名： `test_query_string.py`

import unittest

from queryparser import parse, Conjunction, Disjunction, Predicate


class TestQueryString(unittest.TestCase):

    def test_single_equals(self):
        result = parse('hostname = foo')
        self.assertEqual(result, Predicate('hostname', '=', 'foo'))

    def test_single_equals_quoted(self):
        result = parse('hostname = "foo"')
        self.assertEqual(result, Predicate('hostname', '=', 'foo'))

    def test_anded_equals(self):
        result = parse('hostname = foo && role=cpe')
        self.assertEqual(result, Conjunction(
            Predicate('hostname', '=', 'foo'),
            Predicate('role', '=', 'cpe'),
        ))

    def test_ored_equals(self):
        result = parse('hostname = foo || role=cpe')
        self.assertEqual(result, Disjunction(
            Predicate('hostname', '=', 'foo'),
            Predicate('role', '=', 'cpe'),
        ))

    def test_chained_conjunction(self):
        result = parse('hostname = foo && role=cpe && bla=blub')
        print(result.debug())  # XXX debug statement
        self.assertEqual(result, Conjunction(
            Predicate('hostname', '=', 'foo'),
            Predicate('role', '=', 'cpe'),
            Predicate('bla', '=', 'blub'),
        ))

    def test_chained_disjunction(self):
        result = parse('hostname = foo || role=cpe || bla=blub')
        print(result.debug())  # XXX debug statement
        self.assertEqual(result, Disjunction(
            Predicate('hostname', '=', 'foo'),
            Predicate('role', '=', 'cpe'),
            Predicate('bla', '=', 'blub'),
        ))

    def test_mixed_predicates(self):
        result = parse('hostname = foo || role=cpe && bla=blub')
        print(result.debug())  # XXX debug statement
        self.assertEqual(result, Disjunction(
            Predicate('hostname', '=', 'foo'),
            Conjunction(
                Predicate('role', '=', 'cpe'),
                Predicate('bla', '=', 'blub'),
            )
        ))

    def test_mixed_predicate_and_first(self):
        result = parse('hostname = foo && role=cpe || bla=blub')
        print(result.debug())  # XXX debug statement
        self.assertEqual(result, Conjunction(
            Predicate('hostname', '=', 'foo'),
            Disjunction(
                Predicate('role', '=', 'cpe'),
                Predicate('bla', '=', 'blub'),
            )
        ))

    def test_complex(self):
        result = parse(
            'a=1 && b=2 || c=3 && d=4 || e=5 || f=6 && g=7 && h=8',
            debug=True
        )
        print(result.debug())  # XXX debug statement

        expected = Disjunction(
            Conjunction(
                Predicate('a', '=', '1'),
                Predicate('b', '=', '2'),
            ),
            Conjunction(
                Predicate('c', '=', '3'),
                Predicate('d', '=', '4'),
            ),
            Predicate('e', '=', '5'),
            Conjunction(
                Predicate('f', '=', '6'),
                Predicate('g', '=', '7'),
                Predicate('h', '=', '8'),
            ),
        )

        self.assertEqual(result, expected)

Answer 1

如果使用优先级声明，则AND和OR都应声明为left ，而不是nonassoc 。 nonassoc表示a OR b OR c非法； left表示将被解释为(a OR b) OR c) ， right表示将被解释为a OR (b OR c) 。 （鉴于AND和OR的语义，选择left还是right并没有什么区别，但是在这种情况下通常最好选择left 。）

使用优先级关系，可以编写一个非常简单的语法：

query: predicate
     | query AND query
     | query OR query

（通常，还会有一个带括号的表达式的条目。）

上面没有做您要寻找的链接。 您可以通过走树来进行后期解析，这通常是我的偏爱。 但是，也可以使用具有明确优先级的语法动态地进行链接。

显式优先意味着语法本身定义了可能的形式。 特别是，由于AND绑定比OR绑定更紧密，因此不可能有conjunction: predicate AND disjunction恰恰是因为该产生意味着AND的第二个操作数可能是析取，这不是期望的结果。 对于这种情况，您需要通用的级联序列：

query       : disjunction  # Redundant, but possibly useful for didactic purposes
disjunction : conjunction
            | disjunction OR conjunction   # Left associative
conjunction : predicate
            | conjunction AND predicate

有了这种语法，链接是简单明了的，但是它需要像您的操作中那样进行显式测试（例如， if p.slice(1).type == 'conjunction: ，这可能有点难看。

理想情况下，我们希望直接从语法中触发正确的操作，这暗示着这样的事情（与您的语法非常相似）：

conjunction: predicate
                # p[0] = p[1]
           | predicate AND predicate
                # p[0] = Conjunction(p[1], p[3])
           | conjunction AND predicate
                # p[0] = Conjunction(*(p[1]._predicates + [p[3]])

上述规则的问题是，第二和第三都适用于a AND b ，因为减少后a到predicate ，我们有两个将其降低到选项conjunction或转移AND马上。 在这种情况下，我们希望解析器通过无条件移位来解决移位减少冲突，这将这样做，但仅在产生警告之后才能进行。 对于一个明确的解决方案，我们需要conjunction在第三个规则是一个真正的结合，与至少一个AND运营商。

考虑到这一点，我们可以将单位生产转移到级联的顶部，从而得到以下结果：

query      : disjunction
           | conjunction
           | predicate
disjunction: predicate OR predicate
           | conjunction OR predicate
           | disjunction OR predicate
conjunction: predicate AND predicate
           | conjunction AND predicate

现在，我们不需要在操作中使用条件，因为我们确切地知道每种情况下的条件。

def p_query(self, p):
    '''
    query : disjunction
          | conjunction
          | predicate
    '''
    p[0] = p[1]

def p_disjunction1(self, p):
    '''
    disjunction: predicate OR predicate
               | conjunction OR predicate
    '''
    p[0] = Disjunction(p[1], p[3])

def p_disjunction2(self, p):
    '''
    disjunction: disjunction OR predicate
    '''
    p[0] = Disjunction(*(p[1]._predicate + [p[3]])

def p_conjunction1(self, p):
    '''
    conjunction: predicate AND predicate
    '''
    p[0] = Conjunction(p[1], p[3])

def p_conjunction2(self, p):
    '''
    conjunction: conjunction AND predicate
    '''
    p[0] = Disjunction(*(p[1]._predicate + [p[3]])

笔记

提供的语法适用于两个优先级别的情况，但是产生的数量最终在级别数量上是二次的。 如果这很烦人，那就是具有更多单元产量的替代模型：

 query : disjunction disjunction : conjunction | disjunction_2 disjunction_2 : conjunction OR predicate | disjunction_2 OR predicate conjunction : predicate | conjunction_2 conjunction_2 : predicate AND predicate | conjunction_2 AND predicate

如果您不坚持认为解析器对象是不可变的，则可以将两个链接函数（ p_conjunction2和p_disjunction2 ）组合为一个函数：

 def p_chain(self, p): ''' conjunction: conjunction AND predicate disjunction: disjunction OR predicate ''' p[0] = p[1] p[0]._predicate.append(p[3])

附加的简化可以通过将运营商令牌的价值实现AND和OR 构造函数 ，而不是匹配的字符串。 （无论如何，匹配的字符串实际上都是多余的。）这将允许构造函数（ p_disjunction1和p_conjunction1也可以用单个函数替换：

 def t_AND(self, t): r'&&' t.value = Conjunction return t def t_OR(self, t): r'\\|\\|' t.value = Disjunction return t # ... def p_construct(self, p): ''' disjunction: predicate OR predicate | conjunction OR predicate conjunction: predicate AND predicate ''' p[0] = p[2](p[1], p[3])

如何编辑解析器以正确组合“ AND”和“ OR”谓词？

问题描述

文件名： `queryparser.py`

文件名： `test_query_string.py`

1 个解决方案

解决方案1
1 2017-07-25 18:36:46

笔记

如何编辑解析器以正确组合“ AND”和“ OR”谓词？

问题描述

文件名： queryparser.py

文件名： test_query_string.py

1 个解决方案

解决方案1 1 2017-07-25 18:36:46

笔记

文件名： `queryparser.py`

文件名： `test_query_string.py`

解决方案1
1 2017-07-25 18:36:46