I'm trying to parse a BibTeX file with JavaScript using a regex, and I can't seem to find a proper solution. In the following example, `bj`
is an array with the children (fields) of a bibliography item. I had to write quite a long regex to handle items whose values can be split across multiple lines, lack curly braces (`{}`),
or have syntactically wrong commas at the end (e.g. the last field should not end with a comma, but some TeX editors don't complain about that).
This is what I am using to test my regex:
@inproceedings{Carrel2005,
title = {{Algorithm} for near-optimal autonomous resource management},
author = {Carrel, Ândrew and Palmer, Phil},
notes = nonote ,
booktitle = {8th International Symposium on Artificial {Intelligence,
Robotics}, and Automation in Space},
year = {2005}
blahblah = error,
}
As you can see, some values are split across two lines and can contain nested curly braces. The regex I've been trying to improve is the following:
// Sample entry used to exercise the field-matching regex below. It is
// deliberately sloppy: a bare value, a value spanning two lines, a missing
// comma after `year`, and a trailing comma after the last field.
var txt = "@inproceedings{Carrel2005, \n" +
  " title = {{Algorithm} for near-optimal autonomous resource management}, \n" +
  " author = {Carrel, Ândrew and Palmer, Phil}, \n" +
  " notes = nonote ,\n" +
  " booktitle = {8th International Symposium on Artificial Intelligence, \n" +
  " Robotics and Automation in Space}, \n" +
  " year = {2005} \n" +
  " blahblah = error,\n}";

// All `name = value` fields found in `txt`, one matched substring per field.
// Declared with `var` so the assignment does not create an implicit global
// (which is a ReferenceError in strict mode and in ES modules).
// Known limitation: the lazy `*?` stops at the first `}`, so a value with
// nested braces is cut short at the first closing brace.
var bj = txt.match(/\w*[\t ]*=[\t ]*(\{[\u0020-\u0080\u00A1-\u00FF\u0300-\u036F\t\r\n]*?}|[a-zA-Z0-9]+)[\t ]*(,(?!\s*}))?/g);
Explained:
\w* A word for the field name.
[\t ]*=[\t ]* Any number of spaces or tabs after and before the equal sign.
( Start of group 1.
\{ Option 11: starts by an opening curly brace.
[ Start of character class AAA.
unicode-set Letters (basic Latin plus some extensions)
\t\r\n ... or whitespace.
]*? End of character class AAA (with LAZY repetition)
| End of option 11, start of option 12:
[a-zA-Z0-9]+ One or more characters (no underscore or whitespace allowed).
) End of option 12 and group 1.
[\t ]* Any number of tabs or spaces.
( Start of group 2:
, A literal comma
(?!\s*}) ...if it is not followed by whitespace and closing curly braces.
)? End of group 2. ? denotes it is optional.
I haven't been able to match fields that start with multiple curly braces (e.g. `{{Algorithm} for near...`),
nor to correctly match those where the sequence `},`
is found inside the value.
As I mentioned in the comments, it's not possible to match arbitrarily deeply nested braces with a regular expression, because that requires state to keep track of how many open braces you've seen so far. You need a parser that carries that state (a brace counter). It would look something like:
// Sample BibTeX entry fed to the parser below. Each array element is one
// physical line of the entry; trailing spaces are part of the sample.
var txt = [
  "@inproceedings{Carrel2005, ",
  " title = {{Algorithm} for near-optimal autonomous resource management}, ",
  " author = {Carrel, Ândrew and Palmer, Phil}, ",
  " notes = nonote ,",
  " booktitle = {8th International Symposium on Artificial Intelligence, ",
  " Robotics and Automation in Space}, ",
  " year = {2005} ",
  " blahblah = error,",
  "}"
].join("\n");
/**
 * Parses a single `name = value` field from the start of `text`.
 *
 * A value that opens with `{` runs to its matching `}`; nested braces are
 * tracked with a counter, so commas and line breaks inside braces belong to
 * the value. A bare value runs to the first comma or line break, or to the
 * `}` that closes the enclosing entry.
 *
 * @param {string} text Entry text positioned at the start of a field.
 * @returns {{field: string, value: string, length: number}}
 *   `field` is the field name. `value` is the raw value: a braced value keeps
 *   its outer braces, and a bare value ended by a comma or line break keeps
 *   that terminating character. `length` is the number of characters of
 *   `text` consumed, including a comma that follows a braced value but never
 *   the `}` that closes the entry.
 * @throws {Error} If `text` does not start with `name =`, or if the value is
 *   never terminated (for example, unbalanced braces).
 */
function parseBibTexLine (text) {
  // The name stops at the first '=' so that an '=' inside the value (e.g. a
  // URL query string in `url={...?a=b}`) cannot be absorbed into the name.
  var m = text.match(/^\s*([^\s=]+)\s*=\s*/);
  if (!m) {
    console.log('line: "' + text + '"');
    throw new Error('Unrecogonised line format');
  }
  var name = m[1];
  var prefixLength = m[0].length;
  var search = text.slice(prefixLength);
  var re = /[\n\r,{}]/g;
  var braceCount = 0;
  var valueEnd;
  var consumed;

  for (;;) {
    m = re.exec(search);
    if (!m) {
      // No further delimiter: the value (or a brace group in it) never ends.
      throw new Error('Unterminated value for field "' + name + '"');
    }
    if (m[0] === '{') {
      braceCount++;
      continue;
    }
    if (m[0] === '}') {
      if (braceCount === 0) {
        // This brace closes the enclosing entry, not the value: end the bare
        // value here and leave the brace for the caller to see.
        valueEnd = m.index;
        consumed = m.index;
        break;
      }
      braceCount--;
      if (braceCount === 0) {
        valueEnd = re.lastIndex;
        // Consume the separator after a braced value: optional whitespace and
        // at most one comma. Anything else (notably the entry's closing
        // brace) is left untouched.
        consumed = valueEnd;
        while (consumed < search.length && /\s/.test(search.charAt(consumed))) {
          consumed++;
        }
        if (search.charAt(consumed) === ',') {
          consumed++;
        }
        break;
      }
      continue;
    }
    // Comma or line break: only significant outside of braces.
    if (braceCount === 0) {
      valueEnd = re.lastIndex;
      consumed = re.lastIndex;
      break;
    }
  }

  return {
    field: name,
    value: search.slice(0, valueEnd),
    length: prefixLength + consumed
  };
}
/**
 * Parses one BibTeX entry of the form `@type{key, name = value, ... }` into
 * a plain object.
 *
 * @param {string} text The entry source, starting (after optional leading
 *   whitespace) with `@`.
 * @returns {Object} An object with `typeName` and `citationKey` taken from
 *   the header, plus one property per field holding the raw value returned
 *   by `parseBibTexLine`. A field that is itself named `typeName` or
 *   `citationKey` overwrites the header value.
 * @throws {Error} If the header is not recognised, if a field cannot be
 *   parsed, or if the input ends before the entry's closing brace is seen.
 */
function parseBibTex (text) {
  // Header: "@" type "{" key, ended by a comma or a line break.
  var m = text.match(/^\s*@([^{]+){([^,\n]+)[,\n]/);
  if (!m) {
    throw new Error('Unrecogonised header format');
  }
  var result = {
    typeName: m[1].trim(),
    citationKey: m[2].trim()
  };
  var rest = text.slice(m[0].length).trim();
  // Read fields until the brace that closes the entry.
  while (rest.charAt(0) !== '}') {
    if (rest.length === 0) {
      // Without this guard the empty remainder would be handed to the field
      // parser, which reports a misleading "line format" error.
      throw new Error('Unexpected end of input while looking for a field or the closing "}"');
    }
    var pair = parseBibTexLine(rest);
    result[pair.field] = pair.value;
    rest = rest.slice(pair.length).trim();
  }
  return result;
}
// Parse the sample entry and print the resulting field map.
(function () {
  var entry = parseBibTex(txt);
  console.log(entry);
}());
I certainly haven't tested this deeply, but when run on your input I get:
{ typeName: 'inproceedings',
citationKey: 'Carrel2005',
title: '{{Algorithm} for near-optimal autonomous resource management}',
author: '{Carrel, Ândrew and Palmer, Phil}',
notes: 'nonote ,',
booktitle: '{8th International Symposium on Artificial Intelligence, \n Robotics and Automation in Space}',
year: '{2005}',
blahblah: 'error,' }
The technical posts on this site are licensed under CC BY-SA 4.0. If you reprint them, please indicate the site URL or the original address. For any questions, please contact: yoyou2525@163.com.