Dynamic operator tokens in ANTLR4

Question

I'm trying to make a calculator in ANTLR4 that can use almost every possible symbol as mathematical operator.
Concrete:
- The user defines operations consisting of an operator and a precedence. The operator can be any combination of symbols except for some system symbols (parentheses, commas, ...). Precedence is a positive integer number. Operations are stored in a java HashMap.
- There are three different kinds of operations: left side (unary minus, ...), right side (factorial, ...) and binary (addition, ...)
- The operations should be requested at runtime, so that operations can be (de)activated during the parse. If this is not possible, then the operators should be requested at parser creation.
- For the precedence: full dynamic precedence is preferable(at runtime the precedence of an encountered operation is requested), but if it isn't possible then there should be different precedence presets. (multiplication, addition, ...)

What I've got:
- Working code for operator recognition
- Precedence climbing code which produces a correct parse tree, but gives an error: rule expr failed predicate: (getPrecedence($op) >= $_p)?

UPDATE: fixed operator recognition code, and found code for the precedence climbing mechanism

tokens { PREOP, POSTOP, BINOP, ERROR }
@lexer::members {

    private static List<String> binaryOperators;
    private static List<String> prefixOperators;
    private static List<String> postfixOperators;
    {
        binaryOperators = new ArrayList<String>();
        binaryOperators.add("+");
        binaryOperators.add("*");
        binaryOperators.add("-");
        binaryOperators.add("/");

        prefixOperators = new ArrayList<String>();
        prefixOperators.add("-");

        postfixOperators = new ArrayList<String>();
        postfixOperators.add("!");
    }

    private Deque<Token> deque = new LinkedList<Token>();
    private Token previousToken;
    private Token nextToken;

    @Override
    public Token nextToken() {
        if (!deque.isEmpty()) {
            return previousToken = deque.pollFirst();
        }

        Token next = super.nextToken();
        if (next.getType() != SYMBOL) {
            return previousToken = next;
        }

        StringBuilder builder = new StringBuilder();
        while (next.getType() == SYMBOL) {
            builder.append(next.getText());
            next = super.nextToken();
        }
        deque.addLast(nextToken = next);

        List<Token> tokens = findOperatorCombination(builder.toString(), getOperatorType());
        for (int i = tokens.size() - 1; i >= 0; i--) {
            deque.addFirst(tokens.get(i));
        }
        return deque.pollFirst();       
    }


    private static List<Token> findOperatorCombination(String sequence, OperatorType type) {
        switch (type) {
        case POSTFIX:
            return getPostfixCombination(sequence);
        case PREFIX:
            return getPrefixCombination(sequence);
        case BINARY:
            return getBinaryCombination(sequence);
        default:
            break;
        }
        return null;
    }

    private static List<Token> getPrefixCombination(String sequence) {
        if (isPrefixOperator(sequence)) {
            List<Token> seq = new ArrayList<Token>(1);
            seq.add(0, new CommonToken(MathParser.PREOP, sequence));
            return seq;
        }
        if (sequence.length() <= 1) {
            return null;
        }

        for (int i = 1; i < sequence.length(); i++) {
            List<Token> seq1 = getPrefixCombination(sequence.substring(0, i));
            List<Token> seq2 = getPrefixCombination(sequence.substring(i, sequence.length()));
            if (seq1 != null & seq2 != null) {
                seq1.addAll(seq2);
                return seq1;
            }
        }
        return null;
    }

    private static List<Token> getPostfixCombination(String sequence) {
        if (isPostfixOperator(sequence)) {
            List<Token> seq = new ArrayList<Token>(1);
            seq.add(0, new CommonToken(MathParser.POSTOP, sequence));
            return seq;
        }
        if (sequence.length() <= 1) {
            return null;
        }

        for (int i = 1; i < sequence.length(); i++) {
            List<Token> seq1 = getPostfixCombination(sequence.substring(0, i));
            List<Token> seq2 = getPostfixCombination(sequence.substring(i, sequence.length()));
            if (seq1 != null && seq2 != null) {
                seq1.addAll(seq2);
                return seq1;
            }
        }
        return null;
    }


    private static List<Token> getBinaryCombination(String sequence) {
        for (int i = 0; i < sequence.length(); i++) { // i is number of postfix spaces
            for (int j = 0; j < sequence.length() - i; j++) { // j is number of prefix spaces
                String seqPost = sequence.substring(0, i);
                List<Token> post = getPostfixCombination(seqPost);

                String seqPre = sequence.substring(sequence.length()-j, sequence.length());
                List<Token> pre = getPrefixCombination(seqPre);

                String seqBin = sequence.substring(i, sequence.length()-j);

                if ((post != null || seqPost.isEmpty()) && 
                    (pre != null || seqPre.isEmpty()) && 
                    isBinaryOperator(seqBin)) {
                    List<Token> res = new ArrayList<Token>();
                    if (post != null)
                        res.addAll(post);
                    res.add(new CommonToken(MathParser.BINOP, seqBin));
                    if (pre != null)
                        res.addAll(pre);
                    return res;
                }
            }
        }
        return null;
    }


    /**
     * Returns the expected operator type based on the previous and next token
     */
    private OperatorType getOperatorType() {
        if (isValueEnd(previousToken.getType())) {
            if (isValueStart(nextToken.getType())) {
                return OperatorType.BINARY;
            }
            return OperatorType.POSTFIX;
        }
        return OperatorType.PREFIX;
    }
    private enum OperatorType { BINARY, PREFIX, POSTFIX };


    /**
     * Checks whether the given token is a token found at the start of value elements
     * @param tokenType
     * @return
     */
    private static boolean isValueStart(int tokenType) {
        return tokenType == MathParser.INT;

    }
    /**
     * Checks whether the given token is a token found at the end of value elements
     * @param tokenType
     * @return
     */
    private static boolean isValueEnd(int tokenType) {
        return tokenType == MathParser.INT;

    }

    private static boolean isBinaryOperator(String operator) {
        return binaryOperators.contains(operator);
    }
    private static boolean isPrefixOperator(String operator) {
        return prefixOperators.contains(operator);
    }
    private static boolean isPostfixOperator(String operator) {
        return postfixOperators.contains(operator);
    }
}

Precedence climbing code:

@parser::members {
    static Map<String, Integer> precedenceMap = new HashMap<String, Integer>();
    static {
        precedenceMap.put("*", 2);
        precedenceMap.put("+", 1);
        precedenceMap.put("^", 4);
        precedenceMap.put("-", 3);
        precedenceMap.put("!", 5);
    }
    public static Integer getPrecedence(Token op) {
        return precedenceMap.get(op.getText());
    }
    public static Integer getNextPrecedence(Token op) {
        Integer p = getPrecedence(op);
        if (op.getType() == PREOP)          return p;
        else if (op.getText().equals("^"))  return p;
        else if (op.getType() == BINOP)     return p+1;
        else if (op.getType() == POSTOP)    return p+1;
        throw new IllegalArgumentException(op.getText());
    }
}

prog
    :   expr[0]
    ;


expr [int _p]
    :   aexpr 
        (   {getPrecedence(_input.LT(1)) >= $_p}? op=BINOP expr[getNextPrecedence($op)] 
        |   {getPrecedence(_input.LT(1)) >= $_p}? POSTOP
        )* 
    ;

atom
    :   INT 
    |   '(' expr[0] ')'
    |   op=PREOP expr[getNextPrecedence($op)]
    ;

So now the question is what can do about this predicate failure error

Answer 1

You can't define precedence/associativity rules for Antlr at runtime. What you can, however, is parse all of the operators (built-in in the language or user-defined) as a single chained list (like ArrayList<> ) in the parse, then apply your own algorithm for precedence and associativity in a visitor (or in grammar actions, if you really want to).

The algorithm itself isn't that hard, if you iterate the list many times. For example, you can first fetch the precedence of each operator in the list, then check the one with highest precedence, see if its right or left-associative, and from there you've built your first (bottom-most) tree node. Keep applying until the list is empty, and you've built your own "parse tree", but without the parsing (you're not working with abstract-input strings anymore).

Alternatively, at runtime make externals calls for Antlr to compile the .g4 and to javac to compile the generated Antlr code, then use reflection to call it. However, it is probably much slower and arguably harder to pull off.

Answer 2

A parser rule that will work 'correctly' according to some runtime definition of Symbol precedence is possible. While not initially appearing to be an idiomatic choice, the standard alternative of deferring semantic analysis out of the parser would produce a very poorly differentiated parse tree -- making this a reasonable exception to the standard design rule.

In (overly simplified) form, the parser rule would be:

expr : LParen expr RParen  # group
     | expr Symbol expr    # binary
     | expr Symbol         # postfix
     | Symbol expr         # prefix
     | Int+                # value
     ;

To cure the ambiguity, add inline predicates:

expr : LParen expr RParen                    # group
     | expr s=Symbol { binary($s) }? expr    # binary
     | expr s=Symbol { postfix($s) }?        # postfix
     | s=Symbol { prefix($s) }? expr         # prefix
     | Int+                                  # value
     ;

For any given Symbol, a single predicate method should evaluate as true.

Extending to multiple Symbol strings will add a bit of complexity (ex, differentiating a binary from a postfix followed by a prefix) but the mechanics remain largely the same.

Answer 3

I think your approach is the right way. I propose following grammar:

grammar Op;

options {
  superClass=PrecedenceParser;
}

prog :  expr[0] ;

expr[int _p] locals[Token op]:  INT ({$op = _input.LT(1);} {getPrecedence($op) >= $_p}? OP expr[getPrecedence($op)])*;

INT :   ( '0'..'9' )+ ;

OP : '+' | '*'; // all allowed symbols, should be extended

WS  : [ \t\r\n]+ -> skip ; // skip spaces, tabs, newlines

The rule for op should contain all allowed operator symbols. My restriction to + and * is only for simplicity. The parser super class would be:

public abstract class PrecedenceParser extends Parser {

    private Map<String, Integer> precedences;

    public PrecedenceParser(TokenStream input) {
        super(input);
        this.precedences = new HashMap<>();
    }

    public PrecedenceParser putOperator(String op, int p) {
        precedences.put(op, p);
        return this;
    }

    public int getPrecedence(Token operator) {
        Integer p = precedences.get(operator.getText());
        if (p == null) {
            return Integer.MAX_VALUE; 
        } else {
            return p;
        }
    }

}

Results

with precedences {+ : 4, * : 3 }

(prog (expr 1 + (expr 2) * (expr 3 + (expr 4))))

with precedences {+ : 3, * : 4 }

(prog (expr 1 + (expr 2 * (expr 3) + (expr 4))))

Evaluating these sequences from left to right is equivalent to evaluating them with precedence.

This approach should work for larger sets of operators. ANTLR4 uses this approach internally for precedence climbing yet ANTLR uses constants instead of the precedences map (because it assumes that precedence is determined at parser build time).

Answer 4

Thanks to the other contributors I have found a complete (and actually reasonably clean) solution for my problem.

Operator matching:
By looking at the tokens before and after the encountered series of symbols, it is possible to detect the fixity of the operator. After that, apply an algorithm which detects a sequence of valid operators in the symbol series. Then inject those tokens in the token stream (in nextToken() ). Just make sure you define all hardcoded tokens before the SYMBOL definition.

Precedence climbing:
Actually this wasn't that hard, it is exactly the same as ANTLR4's internal strategy.

grammar Math;


tokens { PREOP, POSTOP, BINOP, ERROR }

@header {
    import java.util.*;
}

@lexer::members {

    private static List<String> binaryOperators;
    private static List<String> prefixOperators;
    private static List<String> postfixOperators;
    {
        binaryOperators = new ArrayList<String>();
        binaryOperators.add("+");
        binaryOperators.add("*");
        binaryOperators.add("-");
        binaryOperators.add("/");
        System.out.println(binaryOperators);

        prefixOperators = new ArrayList<String>();
        prefixOperators.add("-");
        System.out.println(prefixOperators);

        postfixOperators = new ArrayList<String>();
        postfixOperators.add("!");
        System.out.println(postfixOperators);
    }

    private Deque<Token> deque = new LinkedList<Token>();

    private Token previousToken;
    private Token nextToken;

    @Override
    public Token nextToken() {
        if (!deque.isEmpty()) {
            return previousToken = deque.pollFirst();
        }

        Token next = super.nextToken();
        if (next.getType() != SYMBOL) {
            return previousToken = next;
        }

        StringBuilder builder = new StringBuilder();
        while (next.getType() == SYMBOL) {
            builder.append(next.getText());
            next = super.nextToken();
        }
        deque.addLast(nextToken = next);

        List<Token> tokens = findOperatorCombination(builder.toString(), getOperatorType());
        for (int i = tokens.size() - 1; i >= 0; i--) {
            deque.addFirst(tokens.get(i));
        }
        return deque.pollFirst();       
    }


    private static List<Token> findOperatorCombination(String sequence, OperatorType type) {
        switch (type) {
        case POSTFIX:
            return getPostfixCombination(sequence);
        case PREFIX:
            return getPrefixCombination(sequence);
        case BINARY:
            return getBinaryCombination(sequence);
        default:
            break;
        }
        return null;
    }

    private static List<Token> getPrefixCombination(String sequence) {
        if (isPrefixOperator(sequence)) {
            List<Token> seq = new ArrayList<Token>(1);
            seq.add(0, new CommonToken(MathParser.PREOP, sequence));
            return seq;
        }
        if (sequence.length() <= 1) {
            return null;
        }

        for (int i = 1; i < sequence.length(); i++) {
            List<Token> seq1 = getPrefixCombination(sequence.substring(0, i));
            List<Token> seq2 = getPrefixCombination(sequence.substring(i, sequence.length()));
            if (seq1 != null & seq2 != null) {
                seq1.addAll(seq2);
                return seq1;
            }
        }
        return null;
    }

    private static List<Token> getPostfixCombination(String sequence) {
        if (isPostfixOperator(sequence)) {
            List<Token> seq = new ArrayList<Token>(1);
            seq.add(0, new CommonToken(MathParser.POSTOP, sequence));
            return seq;
        }
        if (sequence.length() <= 1) {
            return null;
        }

        for (int i = 1; i < sequence.length(); i++) {
            List<Token> seq1 = getPostfixCombination(sequence.substring(0, i));
            List<Token> seq2 = getPostfixCombination(sequence.substring(i, sequence.length()));
            if (seq1 != null && seq2 != null) {
                seq1.addAll(seq2);
                return seq1;
            }
        }
        return null;
    }


    private static List<Token> getBinaryCombination(String sequence) {
        for (int i = 0; i < sequence.length(); i++) { // i is number of postfix spaces
            for (int j = 0; j < sequence.length() - i; j++) { // j is number of prefix spaces
                String seqPost = sequence.substring(0, i);
                List<Token> post = getPostfixCombination(seqPost);

                String seqPre = sequence.substring(sequence.length()-j, sequence.length());
                List<Token> pre = getPrefixCombination(seqPre);

                String seqBin = sequence.substring(i, sequence.length()-j);

                if ((post != null || seqPost.isEmpty()) && 
                    (pre != null || seqPre.isEmpty()) && 
                    isBinaryOperator(seqBin)) {
                    List<Token> res = new ArrayList<Token>();
                    if (post != null)
                        res.addAll(post);
                    res.add(new CommonToken(MathParser.BINOP, seqBin));
                    if (pre != null)
                        res.addAll(pre);
                    return res;
                }
            }
        }
        return null;
    }


    /**
     * Returns the expected operator type based on the previous and next token
     */
    private OperatorType getOperatorType() {
        if (isAfterAtom()) {
            if (isBeforeAtom()) {
                return OperatorType.BINARY;
            }
            return OperatorType.POSTFIX;
        }
        return OperatorType.PREFIX;
    }
    private enum OperatorType { BINARY, PREFIX, POSTFIX };


    /**
     * Checks whether the current token is a token found at the start of atom elements
     * @return
     */
    private boolean isBeforeAtom() {
        int tokenType = nextToken.getType();
        return tokenType == MathParser.INT || 
                tokenType == MathParser.PLEFT;

    }
    /**
     * Checks whether the current token is a token found at the end of atom elements
     * @return
     */
    private boolean isAfterAtom() {
        int tokenType = previousToken.getType();
        return tokenType == MathParser.INT ||
                tokenType == MathParser.PRIGHT;

    }

    private static boolean isBinaryOperator(String operator) {
        return binaryOperators.contains(operator);
    }
    private static boolean isPrefixOperator(String operator) {
        return prefixOperators.contains(operator);
    }
    private static boolean isPostfixOperator(String operator) {
        return postfixOperators.contains(operator);
    }

}

@parser::members {
    static Map<String, Integer> precedenceMap = new HashMap<String, Integer>();
    static {
        precedenceMap.put("*", 2);
        precedenceMap.put("+", 1);
        precedenceMap.put("^", 4);
        precedenceMap.put("-", 3);
        precedenceMap.put("!", 5);
    }
    public static Integer getPrecedence(Token op) {
        return precedenceMap.get(op.getText());
    }
    public static Integer getNextPrecedence(Token op) {
        Integer p = getPrecedence(op);
        if (op.getType() == PREOP)          return p;
        else if (op.getText().equals("^"))  return p;
        else if (op.getType() == BINOP)     return p+1;
        throw new IllegalArgumentException(op.getText());
    }
}

prog
    :   expr[0]
    ;


expr [int _p]
    :   atom
        (   {getPrecedence(_input.LT(1)) >= $_p}? op=BINOP expr[getNextPrecedence($op)] 
        |   {getPrecedence(_input.LT(1)) >= $_p}? POSTOP
        )* 
    ;

atom
    :   INT 
    |   PLEFT expr[0] PRIGHT
    |   op=PREOP expr[getNextPrecedence($op)]
    ;

INT
    :   ( '0'..'9' )+
    ;

PLEFT   :   '(' ;
PRIGHT  :   ')' ;

WS
    : [ \t\r\n]+ -> skip ; // skip spaces, tabs, newlines

SYMBOL
    :   .
    ;

Note: code is meant as an example, not as my real code (operators and precedence will be requested externally)

Dynamic operator tokens in ANTLR4

Question

4 answers

solution1
0 2015-04-27 15:21:12

solution2
0 2015-04-27 19:26:43

solution3
0 2015-04-29 02:28:53

solution4
0 ACCPTED 2015-05-01 21:34:57

Dynamic operator tokens in ANTLR4

Question

4 answers

solution1 0 2015-04-27 15:21:12

solution2 0 2015-04-27 19:26:43

solution3 0 2015-04-29 02:28:53

solution4 0 ACCPTED 2015-05-01 21:34:57

solution1
0 2015-04-27 15:21:12

solution2
0 2015-04-27 19:26:43

solution3
0 2015-04-29 02:28:53

solution4
0 ACCPTED 2015-05-01 21:34:57