简体   繁体   中英

Syntax error in PLY (lex/yacc) when the input text file is not in the same order as the grammar

The following code works well when the sections of the text file appear in the same order as in the grammar, i.e., Introduction first and then Information, but it gives an error if Information comes before Introduction. What would be the solution to handle this using lex/yacc? Thanks in advance.

import ply.lex as lex

# Names of every token type the lexer can produce. PLY requires a
# module-level ``tokens`` sequence; the grammar rules refer to these names.
tokens = (
    ['CheckupInformation', 'Introduction', 'Information']  # section keywords
    + ['perfect', 'sick']                                  # status keywords
    + ['LPAREN', 'RPAREN', 'CHAR', 'NUMBER']               # punctuation, values
)
# Keyword rules. Each function's docstring is the pattern PLY matches, and the
# token is returned unchanged. They are written as functions (rather than
# strings) and kept in this order, which is the order PLY tries them in.
def t_CheckupInformation(t):
    'CheckupInformation'
    return t


def t_Introduction(t):
    'Introduction'
    return t


def t_Information(t):
    'Information'
    return t


def t_perfect(t):
    'perfect'
    return t


def t_sick(t):
    'sick'
    return t



# Token rules given as plain regular-expression strings (no action code).
t_LPAREN  = r'\('
t_RPAREN  = r'\)'
# CHAR: a letter or underscore followed by letters, digits, '_' or '-'.
t_CHAR = r'[a-zA-Z_][a-zA-Z0-9_\-]*'
# Characters the lexer skips between tokens: space and tab.
t_ignore = " \t"
# Integer literals: the matched text is converted to an int in place.
# NOTE(review): the counter named lineno is advanced here by the number of
# characters in the lexeme, not by a number of lines.
def t_NUMBER(t):
    r'[+\-0-9_][0-9_]*'
    lexeme = t.value
    t.lexer.lineno = t.lexer.lineno + len(lexeme)
    try:
        number = int(lexeme)
    except ValueError:
        # int() rejected the matched text (the pattern also accepts a bare
        # sign, for example); report it and fall back to zero.
        print("Integer value too large %s" % lexeme)
        number = 0
    t.value = number
    return t
def t_SEMICOLON(t):
    r'\;.*'
    # Matches a ';' together with whatever follows it on the same line.
    # Nothing is returned, so the match yields no token; the only effect is
    # that lineno grows by the length of the matched text.
    matched_length = len(t.value)
    t.lexer.lineno += matched_length
def t_newline(t):
    r'\n+'
    # Line tracking: a run of newlines advances lineno by one per newline.
    # No token is returned for the run.
    newline_count = len(t.value)
    t.lexer.lineno += newline_count
# Lexer error hook: report the character that no rule matched, then step
# over it so scanning can resume with the next character.
def t_error(t):
    offending = t.value[0]
    print("Illegal character '%s'" % offending)
    t.lexer.skip(1)

 # Build the lexer
lexer = lex.lex()  # built from the tokens list and t_* rules defined above
# Container for the pieces of a CheckupInformation statement. The intro and
# body lists are filled in by the grammar actions while parsing.
class stat:
    def __init__(self):
        # Separate list objects so the two collections never alias.
        self.intro, self.body = [], []
        self.statement = ""


# Module-level accumulator: p_Intro and p_Body append to it during parsing,
# and the script at the bottom of the file prints its contents.
P=stat()
def p_stat(p):
    'Stat : LPAREN CheckupInformation statIntro statBody RPAREN'
    # The value of the whole statement is the 5-tuple of its right-hand-side
    # values, in order.
    p[0] = tuple(p[position] for position in range(1, 6))

def p_Intro(p):
    '''statIntro : LPAREN Introduction Name RPAREN
                 | statIntro LPAREN Introduction Name RPAREN
                 | empty'''
    # Where the Name symbol sits for each alternative, keyed by len(p).
    # Any other length (the empty alternative) has no Name and yields None.
    name_slot = {5: 3, 6: 4}.get(len(p))
    p[0] = p[name_slot] if name_slot is not None else None
    # Every reduction, including the empty one, is recorded on the global P.
    P.intro.append(p[0])

def p_Name(p):
    'Name : CHAR'
    # A Name carries the value of its single CHAR token.
    char_value = p[1]
    p[0] = char_value



def p_Body(p):
    '''statBody : LPAREN Information bodyinfo RPAREN
                | statBody LPAREN Information bodyinfo RPAREN'''
    size = len(p)
    if size in (5, 6):
        # In both alternatives bodyinfo is the second-to-last symbol.
        p[0] = p[size - 2]
    # Each reduction records its bodyinfo value on the global P.
    P.body.append(p[0])
def p_bodyinfo(p):
    '''bodyinfo : LPAREN CHAR perfect RPAREN
                | LPAREN CHAR sick RPAREN'''
    # Result: (value of the CHAR token, value of the status keyword token).
    identifier, status = p[2], p[3]
    p[0] = (identifier, status)


def p_empty(p):
    'empty :  '
    # Empty production: prints a trace line each time it is reduced and
    # leaves p[0] untouched.
    print("This function is called")
# Parser error hook.
#   p -- the token at which parsing failed, or None when the input ended
#        before the grammar was satisfied (PLY passes None in that case).
def p_error(p):
    if p is None:
        # There is no token to describe; p.value would raise AttributeError.
        print("Syntax error: unexpected end of input!")
        return
    # The one-element tuple keeps the formatting correct for any token value.
    print("Syntax error in input '%s'!" % (p.value,))

import ply.yacc as yacc
parser = yacc.yacc()  # built from the p_* grammar rules defined above
import sys
# Command-line driver: parse the file named by the first argument, then print
# what the grammar actions collected in the global P.
if len(sys.argv) < 2 :
    sys.exit("Usage: %s <filename>" % sys.argv[0])
# The context manager closes the file even if reading fails.
with open(sys.argv[1]) as fp:
    contents = fp.read()
result = parser.parse(contents)

print("(CheckupInformation")
# P.intro is always a list, so it can be iterated without a None check.
for name in P.intro:
    print("    (Introduction %s)" % (name,))
# Each body entry is the (identifier, status) pair built by p_bodyinfo.
for entry in P.body:
    print("    (Information( %s %s))" % entry)
print(")")

The code works well for File1 but cannot handle File2.

ERROR: Syntax error in input '(Introduction'! (CheckupInformation (Introduction None) (Information( Anonymous1 perfect)) )

File1:

(CheckupInformation
  (Introduction John)
  (Introduction Patt)
  (Information(Anonymous1 perfect))
  (Information(Anonymous2 sick))
)

File2:

(CheckupInformation

  (Information(Anonymous1 perfect))
  (Information(Anonymous2 sick))
  (Introduction John)
  (Introduction Patt)
)

This might not be the answer you wanted, but I found myself unable to just change one or two lines in your code. The following is still far from perfect, but I think it is approaching a reasonable approach to your problem. I tried to annotate it with useful comments. Please read through it carefully and try to understand why I did what I did, referring to the Ply manual as necessary (some references are in the code comments, but there's lots of useful background information in the document which I didn't reference specifically).

Good luck.

import ply.lex as lex

# Reserved words and the token type each one maps to (the keyword prefixed
# with 'TK_'). Keyword handling follows the Ply manual,
# https://www.dabeaz.com/ply/ply.html#ply_nn6
reserved = dict(
    (keyword, 'TK_' + keyword)
    for keyword in (
        'CheckupInformation',
        'Introduction',
        'Information',
        'perfect',
        'sick',
    )
)

# CHAR was renamed to WORD because CHAR suggests a single character.
tokens = ['NUMBER', 'WORD']
tokens.extend(reserved.values())

def t_WORD(t):
    r'[a-zA-Z_][a-zA-Z0-9_-]*'
    # One rule matches every identifier-like lexeme; a reserved word is then
    # re-tagged with its own token type, anything else stays a WORD.
    if t.value in reserved:
        t.type = reserved[t.value]
    else:
        t.type = 'WORD'
    return t

# Single-character literal tokens; the grammar rules below refer to them
# directly as '(' and ')'.
# See the Ply manual: https://www.dabeaz.com/ply/ply.html#ply_nn11
literals = '()'

# Spaces, tabs and newlines are skipped between tokens. The t_ignore_ name
# prefix makes Ply discard whatever the pattern matches: here a ';' and the
# rest of its line.
# See the Ply manual: https://www.dabeaz.com/ply/ply.html#ply_nn8
t_ignore = ' \t\n'
t_ignore_COMMENT = r'\;.*'

# Integer literals. A sign is accepted only as the first character.
def t_NUMBER(t):
    r'[+-]?[0-9_]+'
    lexeme = t.value
    try:
        converted = int(lexeme)
    except ValueError:
        # int() refuses some lexemes the pattern accepts (e.g. '1__2');
        # report the lexeme and substitute zero.
        print("Integer value too large %s" % lexeme)
        converted = 0
    t.value = converted
    return t

# Error handling rule: report the character no rule could match, together
# with the line it is on, then skip it so lexing can continue.
# lineno_for_token is attached to lex.Lexer further down in this module.
def t_error(t):
    # The message has balanced quotes: only the character itself is quoted.
    print("Illegal character '%s' at line %d" % (
        t.value[0], t.lexer.lineno_for_token(t)))
    t.lexer.skip(1)

# Build the lexer
lexer = lex.lex()


# Ply records the character offset of every token as lexpos, so lineno does
# not have to double as a position counter; it should be the line number of
# the token, as its name says.
# Line numbers are only needed for the occasional error message, so they are
# computed on demand: count the newlines before the token's position and
# add one. (Perhaps this should be attached to the lexer instance built
# above rather than to the Lexer class.)
def _lineno_for_token(self, t):
    newlines_before = self.lexdata.count('\n', 0, t.lexpos)
    return newlines_before + 1


lex.Lexer.lineno_for_token = _lineno_for_token

# Object to hold a top-level description.
# (Renamed to an upper-case class name and derived from object.)
class Stat(object):
    # Names of the attributes that hold parsed components
    components = {'intro', 'body'}

    def __init__(self, component_dict):
        self.statement = ""  # I don't know what this is used for
        # Every known component becomes an attribute; one that is missing
        # from the dictionary defaults to an empty tuple.
        for name in self.components:
            value = component_dict[name] if name in component_dict else ()
            setattr(self, name, value)
        # Warn about dictionary keys that are not known components.
        for key in component_dict:
            if key in self.components:
                continue
            print("Warning! Ignoring " + key
                  + " because it is not in Stat.components")

    # Arrange for the object to print as expected
    def __repr__(self):
        return '(CheckupInformation %r %r)' % (self.intro, self.body)

# The parse result is a Stat object returned through p[0], rather than a
# global "P" object (whose name is not very useful).
def p_stat(p):
    """ stat : '(' TK_CheckupInformation components ')' """
    collected = p[3]  # dictionary built up by the components rules
    p[0] = Stat(collected)

# All components are optional, order independent and repeatable here;
# the grammar could be made more precise than that.

# components is a dictionary whose values are lists; the empty alternative
# starts it off with a new, empty dictionary.
def p_components_empty(p):
    """ components : """
    p[0] = dict()

def p_components_append(p):
    """ components : components component """
    # Reuse (and mutate) the dictionary accumulated so far.
    collected = p[1]
    # The component is a two-element tuple
    key, value = p[2]
    # The first value for a key starts a new list; later ones extend it.
    collected.setdefault(key, []).append(value)
    p[0] = collected

# One component (a single element, not a list), represented as a
# (key, value) tuple. Whichever alternative matched, its value is passed
# through unchanged.
def p_component(p):
    """ component : statIntro
                  | statBody
    """
    key_value_pair = p[1]
    p[0] = key_value_pair

def p_statIntro(p):
    """statIntro : '(' TK_Introduction WORD ')' """
    # Tag the WORD value with the 'intro' key.
    name = p[3]
    p[0] = ('intro', name)

def p_statBody(p):
    """statBody : '(' TK_Information bodyinfo ')' """
    # Tag the bodyinfo value with the 'body' key.
    info = p[3]
    p[0] = ('body', info)

# bodyinfo is a tuple of (identifier, status): the WORD value followed by
# the value of the status keyword that matched.
def p_bodyinfo(p):
    """bodyinfo : '(' WORD TK_perfect ')'
                | '(' WORD TK_sick ')'
    """
    identifier, status = p[2], p[3]
    p[0] = (identifier, status)

# Parser error hook.
#   p -- the token at which parsing failed, or None when the input ended
#        before the grammar was satisfied (Ply passes None in that case).
def p_error(p):
    if p is None:
        # No token means no value and no position to report; touching
        # p.value here would raise AttributeError.
        print("Syntax error: unexpected end of input!")
        return
    print("Syntax error in input '%s'! at line %d" % (
        p.value, p.lexer.lineno_for_token(p)))

import ply.yacc as yacc

parser = yacc.yacc()  # built from the p_* grammar rules defined above

# Command-line entry point; nothing below runs when the module is imported.
if __name__ == "__main__":
    import sys

    if len(sys.argv) < 2:
        sys.exit("Usage: %s <filename>" % sys.argv[0])

    # Read the whole input first, then parse it.
    with open(sys.argv[1]) as source_file:
        text = source_file.read()
    stat = parser.parse(text)

    # Nothing is printed when the parser produced no result.
    if stat is not None:
        print("(CheckupInformation")
        for name in stat.intro:
            print("    (Introduction %s)" % (name,))
        # Each body entry is an (identifier, status) pair.
        for entry in stat.body:
            print("    (Information( %s %s))" % entry)
        print(")")

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM