简体   繁体   中英

How to parse complex BibTex items with JavaScript and RegEx

I'm trying to parse a BibTeX file with JavaScript using regex and I can't seem to find a proper solution. In the following example, bj is an array with the children of a bibliography item. I had to write quite a long regex to handle items whose values can be split across multiple lines, lack curly braces ( {} ), or have syntactically wrong commas at the end (e.g. the last field should not end with a comma, but some TeX editors don't complain about that).

This is what I am using to test my regex:

@inproceedings{Carrel2005,
    title      = {{Algorithm} for near-optimal autonomous resource management},
    author     = {Carrel, Ândrew and Palmer, Phil},
    notes      = nonote ,
    booktitle  = {8th International Symposium on Artificial {Intelligence, 
                 Robotics}, and Automation in Space},
    year       = {2005}
    blahblah   = error,
}

As you can see, some values are split in two lines and can have curly braces inside. The regex I've been trying to improve is the following:

var txt = "@inproceedings{Carrel2005, \n" +
          "    title      = {{Algorithm} for near-optimal autonomous resource management}, \n" +
          "    author     = {Carrel, Ândrew and Palmer, Phil}, \n" +
          "    notes      = nonote ,\n" +
          "    booktitle  = {8th International Symposium on Artificial Intelligence, \n" +
          "                  Robotics and Automation in Space}, \n" +
          "    year       = {2005} \n" +
          "    blahblah   = error,\n}";

bj = txt.match(/\w*[\t ]*=[\t ]*(\{[\u0020-\u0080\u00A1-\u00FF\u0300-\u036F\t\r\n]*?}|[a-zA-Z0-9]+)[\t ]*(,(?!\s*}))?/g);

Explained:

\w*               A word for the field name.
[\t ]*=[\t ]*     Any number of spaces or tabs after and before the equal sign.
(                 Start of group 1.
  \{              Option 11: starts by an opening curly brace.
  [               Start of character class AAA.
    unicode-set   Letters (basic Latin plus some extensions)
    \t\r\n        ... or whitespace.
  ]*?             End of character class AAA (with LAZY repetition)
|                 End of option 11, start of option 12:
  [a-zA-Z0-9]+    One or more characters (no underscore or whitespace allowed).
)                 End of option 12 and group 1.
[\t ]*            Any number of tabs or spaces.
(                 Start of group 2:
  ,               A literal comma
    (?!\s*})      ...if it is not followed by whitespace and closing curly braces.
)?                End of group 2. ? denotes it is optional.

I haven't been able to match fields that start with multiple curly braces (e.g. {{Algorithm} for near... ) nor to correctly match those where the sequence }, is found inside the value.

As I mentioned in the comments, it's not possible to match arbitrarily deep nested braces with a regular expression, as that would require some state to store the number of open braces you've seen. You need a parser, and then add the state in. It would look something like:

// Sample BibTeX entry fed to the parser. Each array element is one physical
// line of the entry; trailing spaces inside the strings are significant.
var txt = [
    "@inproceedings{Carrel2005, ",
    "    title      = {{Algorithm} for near-optimal autonomous resource management}, ",
    "    author     = {Carrel, Ândrew and Palmer, Phil}, ",
    "    notes      = nonote ,",
    "    booktitle  = {8th International Symposium on Artificial Intelligence, ",
    "                  Robotics and Automation in Space}, ",
    "    year       = {2005} ",
    "    blahblah   = error,",
    "}"
].join("\n");


/**
 * Parse one `name = value` field from the start of `text`.
 *
 * The value is either brace-delimited (braces may nest and may span several
 * lines) or a bare token that runs up to the first comma, line break, closing
 * brace of the enclosing entry, or end of input.
 *
 * @param {string} text Remaining entry text, starting at a field.
 * @returns {{field: string, value: string, length: number}}
 *   `field`  - the field name;
 *   `value`  - the raw value text; a braced value keeps its outer braces, and
 *              a bare value keeps the comma or line break that terminated it
 *              (a terminating closing brace is NOT included);
 *   `length` - number of characters of `text` consumed: the field, plus a
 *              following comma when the value itself did not end with one.
 *              Never includes the enclosing entry's closing brace.
 * @throws {Error} If `text` does not start with `name =`, or if the braces of
 *   the value are never closed.
 */
function parseBibTexLine (text) {
    // The name stops at whitespace or '=' so that `title={a=b}` yields
    // "title" rather than backtracking into the value.
    var m = text.match(/^\s*([^\s=]+)\s*=\s*/);
    if (!m) {
        console.log('line: "' + text + '"');
        throw new Error('Unrecogonised line format');
    }
    var name = m[1];
    var prefixLength = m[0].length;
    var search = text.slice(prefixLength);
    var re = /[\n\r,{}]/g;
    var braceCount = 0;
    // Default: no delimiter at all, the value runs to the end of the input.
    var consumed = search.length;
    var endsWithComma = false;
    do {
        m = re.exec(search);
        if (m === null) {
            if (braceCount > 0) {
                throw new Error('Unbalanced braces in value of field "' + name + '"');
            }
            break;
        }
        if (m[0] === '{') {
            braceCount++;
        } else if (m[0] === '}') {
            if (braceCount === 0) {
                // This brace closes the enclosing entry, not the value:
                // stop in front of it and leave it for the caller.
                consumed = m.index;
                break;
            }
            braceCount--;
        }
        // re.lastIndex already points just past the delimiter.
        consumed = re.lastIndex;
        endsWithComma = m[0] === ',';
    } while (braceCount > 0);

    var value = search.slice(0, consumed);

    // Swallow the separating comma after a braced (or newline-terminated)
    // value, but nothing else, so the next field or the entry's closing
    // brace is left intact.
    if (!endsWithComma) {
        var separator = search.slice(consumed).match(/^\s*,/);
        if (separator) {
            consumed += separator[0].length;
        }
    }

    return {
        field: name,
        value: value,
        length: prefixLength + consumed
    };
}

/**
 * Parse a single BibTeX entry of the form `@type{key, field = value, ... }`.
 *
 * @param {string} text Source text beginning (after optional whitespace)
 *   with the `@type{key` header.
 * @returns {Object} An object holding `typeName`, `citationKey`, and one
 *   property per field as reported by parseBibTexLine.
 * @throws {Error} If the header cannot be recognised; errors raised by
 *   parseBibTexLine propagate unchanged.
 */
function parseBibTex (text) {
    var header = text.match(/^\s*@([^{]+){([^,\n]+)[,\n]/);
    if (header === null) {
        throw new Error('Unrecogonised header format');
    }

    var entry = {};
    entry.typeName = header[1].trim();
    entry.citationKey = header[2].trim();

    // Everything after the header; shrinks as fields are consumed.
    var remaining = text.slice(header[0].length).trim();
    for (;;) {
        // A leading closing brace marks the end of the entry.
        if (remaining[0] === '}') {
            return entry;
        }
        var parsed = parseBibTexLine(remaining);
        entry[parsed.field] = parsed.value;
        remaining = remaining.slice(parsed.length).trim();
    }
}

// Demo: parse the sample entry held in txt and print the resulting object.
console.log(parseBibTex(txt));

I certainly haven't tested this deeply, but when run on your input I get:

{ typeName: 'inproceedings',
  citationKey: 'Carrel2005',
  title: '{{Algorithm} for near-optimal autonomous resource management}',
  author: '{Carrel, Ândrew and Palmer, Phil}',
  notes: 'nonote ,',
  booktitle: '{8th International Symposium on Artificial Intelligence, \n                  Robotics and Automation in Space}',
  year: '{2005}',
  blahblah: 'error,' }

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM