I have two regular expressions: one to match a Python-style comment, and one to match a file path.
When I try to see if a comment matches the file path expression it throws an error if the comment string is longer than ~15 characters. Otherwise it acts as expected.
How can I modify my regex so that it doesn't have this problem?
sample code:
#include <string>
#include "boost/regex.hpp"
using namespace std;
using namespace boost;
int main(int argc, char** argv)
{
boost::regex re_comment("\\s*#[^\\r\\n]*");
boost::regex re_path("\"?([A-Za-z]:)?[\\\\/]?(([^(\\\\/:*?\"<>|\\r\\n)]+[\\\\/]?)+)?\\.[\\w]+\"?");
string shortComment = " #comment ";
string longComment = "#123456789012345678901234567890";
string myPath = "C:/this/is.a/path.doc";
regex_match(shortComment,re_comment); //evaluates to true
regex_match(longComment,re_comment); //evaluates to true
regex_match(myPath, re_path); //evaluates to true
regex_match(shortComment, re_path); //evaluates to false
regex.match(longComment, re_path); //throws error
}
This is the error that gets thrown
terminate called after throwing an instance of
'boost::exception_detail::clone_impl<boost::exception_detail
::error_info_injector<std::runtime_error> >'
what(): The complexity of matching the regular expression exceeded predefined
bounds. Try refactoring the regular expression to make each choice made by the
state machine unambiguous. This exception is thrown to prevent "eternal" matches
that take an indefinite period time to locate.
I know it is tempting to always create one huge regex to solve all of the world's problems, and indeed there may be performance reasons for doing so, but you also have to consider the maintenance nightmare you are creating when you build such a monstrosity. That being said, I propose to break the problem down into manageable parts.
Basically take care of quotes, split the string on dir separators, and regex each part of the path.
#include <algorithm>
#include <iostream>
#include <string>
#include <vector>
#include "boost/regex.hpp"
#include "boost/algorithm/string.hpp"
using namespace std;
using namespace boost;
bool my_path_match(std::string line)
{
bool ret = true;
string drive = "([a-zA-Z]\\:)?";
string pathElem = "(\\w|\\.|\\s)+";
boost::regex re_pathElem(pathElem);
boost::regex re_drive("(" + drive + "|" + pathElem + ")");
vector<string> split_line;
vector<string>::iterator it;
if ((line.front() == '"') && (line.back() == '"'))
{
line.erase(0, 1); // erase the first character
line.erase(line.size() - 1); // erase the last character
}
split(split_line, line, is_any_of("/\\"));
if (regex_match(split_line[0], re_drive) == false)
{
ret = false;
}
else
{
for (it = (split_line.begin() + 1); it != split_line.end(); it++)
{
if (regex_match(*it, re_pathElem) == false)
{
ret = false;
break;
}
}
}
return ret;
}
int main(int argc, char** argv)
{
boost::regex re_comment("^.*#.*$");
string shortComment = " #comment ";
string longComment = "#123456789012345678901234567890";
vector<string> testpaths;
vector<string> paths;
vector<string>::iterator it;
testpaths.push_back("C:/this/is.a/path.doc");
testpaths.push_back("C:/this/is also .a/path.doc");
testpaths.push_back("/this/is also .a/path.doc");
testpaths.push_back("./this/is also .a/path.doc");
testpaths.push_back("this/is also .a/path.doc");
testpaths.push_back("this/is 1 /path.doc");
bool ret;
ret = regex_match(shortComment, re_comment); //evaluates to true
cout<<"should be true = "<<ret<<endl;
ret = regex_match(longComment, re_comment); //evaluates to true
cout<<"should be true = "<<ret<<endl;
string quotes;
for (it = testpaths.begin(); it != testpaths.end(); it++)
{
paths.push_back(*it);
quotes = "\"" + *it + "\""; // test quoted paths
paths.push_back(quotes);
std::replace(it->begin(), it->end(), '/', '\\'); // test backslash paths
std::replace(quotes.begin(), quotes.end(), '/', '\\'); // test backslash quoted paths
paths.push_back(*it);
paths.push_back(quotes);
}
for (it = paths.begin(); it != paths.end(); it++)
{
ret = my_path_match(*it); //evaluates to true
cout<<"should be true = "<<ret<<"\t"<<*it<<endl;
}
ret = my_path_match(shortComment); //evaluates to false
cout<<"should be false = "<<ret<<endl;
ret = my_path_match(longComment); //evaluates to false
cout<<"should be false = "<<ret<<endl;
}
Yes, it will (probably) be slower than just a single regex, but it will work, it doesn't throw errors on the Python comment lines, and if you find a path/comment that fails, you should be able to figure out what is wrong and fix it (i.e., it is maintainable).
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.