Why does this regex matcher block take a very long time to complete?

Question

I have a program that extracts text from PDF documents using regex patterns.

My problem is that the matcher blocks take a long time to execute for some PDF files.

This is the code:

// Regex sources for the bibliographic fields pulled from the first page.
// NOTE(review): every '.' in these patterns is either escaped or inside a
// character class, so the (?s) (DOTALL) flag appears to have no effect here.
String title = "(?s)\\(54\\)\\s*([\\w\\s,-]+)|(?s)\\[54\\]\\s*([\\w\\s,-]+)";
// Inventor block. NOTE(review): alternatives 1/4 and 3/6 are textually
// identical, and 2/5 differ only in the opening \\[ versus \\(. Two shapes in
// here can backtrack very heavily when the trailing part fails to match:
//   - \\s*\\w*([\\w\\d,.\\s)(-]+) : adjacent repeated classes that overlap
//   - ([...]+)*[\\w\\']*(?=\\n)   : a quantified group nested in another quantifier
String in ="((?s)\\(\\d\\d\\)\\s+Inventor\\w*:\\s*\\w*([\\w\\d,.\\s)(-]+);([\\w\\s.\\',();-]+)(?=\\(\\d*\\)\\s+Assignee:))|((?s)\\[\\d\\d\\)\\s+Inventor:\\s*([\\-\\w\\d\\s,\\.\\(\\)-]+)*[\\w\\']*(?=\\n))|(Inventor\\w*:\\s*\\w*([\\w\\d,.\\s)(-]+);([\\w\\s.\\',();-]+)(?=Assignee:))|((?s)\\(\\d\\d\\)\\s+Inventor\\w*:\\s*\\w*([\\w\\d,.\\s)(-]+);([\\w\\s.\\',();-]+)(?=\\(\\d*\\)\\s+Assignee:))|((?s)\\(\\d\\d\\)\\s+Inventor:\\s*([\\-\\w\\d\\s,\\.\\(\\)-]+)*[\\w\\']*(?=\\n))|(Inventor\\w*:\\s*\\w*([\\w\\d,.\\s)(-]+);([\\w\\s.\\',();-]+)(?=Assignee:))";
// Assignee block; same overlapping-class and nested-quantifier shapes as the
// inventor pattern above.
String as ="((?s)\\(\\d\\d\\)\\s+Assignee\\w*:\\s*\\w*([\\w\\d,.\\s)(-]+);([\\w\\s.\\',();-]+)(?=\\(\\d*\\)\\s+Notice:))|((?s)\\(\\d\\d\\)\\s+Assignee:\\s*([\\-\\w\\d\\s,\\.\\(\\)-]+)*[\\w\\']*(?=\\n))|(Assignee\\w*:\\s*\\w*([\\w\\d,.\\s)(-]+);([\\w\\s.\\',();-]+)(?=Notice:))|(Assignee\\w*:\\s*\\w*([\\w\\d,.\\s)(-]+)(?=Notice:))";
// Application number, introduced by (21) or [21].
String app_no ="(?s)\\(21\\)\\s*([\\w\\s,.://-]+)|(?s)\\[21\\]\\s*([\\w\\s,.://-]+)";
// Filing date, introduced by (22) or [22].
// NOTE(review): alternatives 2 and 4 only add a lookahead to alternatives 1
// and 3, which are tried first, so they look unreachable.
String filed ="((?s)\\(22\\)\\s*([\\w\\s,.://-]+))|((?s)\\(22\\)\\s*([\\w\\s,.://-]+)(?=\\s*\\n\\s*Related))|((?s)\\[22\\]\\s*([\\w\\s,.://-]+))|((?s)\\[22\\]\\s*([\\w\\s,.://-]+)(?=\\s*\\n\\s*Related))";
// Declared but never compiled or matched in this excerpt; both alternatives
// are identical.
String term ="((?s)\\s*Term\\s*([\\w\\s,.://-]+))|((?s)\\s*Term\\s*([\\w\\s,.://-]+))";
// Patent number, introduced by "Patent No.:" or "Patent Number:".
String pat_no = "(?s)\\s*Patent No\\.\\:\\s*([\\w\\d\\s,.://-]+)|(?s)\\s*Patent Number\\:\\s*([\\w\\d\\s,.://-]+)";
// Date of patent, introduced by (45) or [45], optionally bounded by a
// following Inventor:/Inventors: label.
String pat_dt = "(?s)\\(45\\)\\s*Date([\\*\\w\\d\\s,.://-]+)(?=\\(\\d*\\)\\s+Inventor:)|(?s)\\(45\\)\\s*Date([\\*\\w\\d\\s,.://-]+)(?=\\(\\d*\\)\\s+Inventors:)|(?s)\\(45\\)\\s*Date([\\*\\w\\d\\s,.://-]+)|(?s)\\[45\\]\\s*Date([\\*\\w\\d\\s,.://-]+)(?=\\[\\d*\\]\\s+Inventor:)|(?s)\\[45\\]\\s*Date([\\*\\w\\d\\s,.://-]+)(?=\\(\\d*\\)\\s+Inventors:)|(?s)\\[45\\]\\s*Date([\\*\\w\\d\\s,.://-]+)";

// Text of the two page regions registered on the stripper ("class1" and
// "class2"); stripper is created outside this excerpt.
String region = stripper.getTextForRegion( "class1" );
String regiont = stripper.getTextForRegion( "class2" );

// One compiled pattern and one matcher per field. m, m2, m3, m5 and m6 scan
// region; m4 and m7 scan regiont.
Pattern p = Pattern.compile(in);
Matcher m = p.matcher(region);

Pattern p2 = Pattern.compile(as);
Matcher m2 = p2.matcher(region);

Pattern p3 = Pattern.compile(title);
Matcher m3 = p3.matcher(region);

Pattern p4 = Pattern.compile(pat_no);
Matcher m4 = p4.matcher(regiont);

Pattern p5 = Pattern.compile(app_no);
Matcher m5 = p5.matcher(region);

Pattern p6 = Pattern.compile(filed);
Matcher m6 = p6.matcher(region);

Pattern p7 = Pattern.compile(pat_dt);
Matcher m7 = p7.matcher(regiont);

// The timeout is only evaluated between find() calls in the m3..m7 loops. It
// cannot interrupt a single find() that is itself slow, and the m and m2
// loops below do not check it at all.
long TIMEOUT = 15000l; // 15 seconds
long now = System.currentTimeMillis(); // init the long just above the while

System.out.println("find start");

// Inventor matches: iterated, but the results are discarded.
while(m.find())
{
    // System.out.println(m.group());
}

// Timestamp taken after the inventor loop; not read again in this excerpt.
Long nowtime = System.currentTimeMillis() ;

// Assignee matches: iterated, but the results are discarded.
while(m2.find())
{
    // System.out.println(m2.group());

}

// Title: each match overwrites patit, so the last match wins.
while(m3.find()  && (System.currentTimeMillis() - now) < TIMEOUT)
{
    // System.out.println(m3.group());
    patit = m3.group().replace("(54)", " ");
    patit = patit.trim();
    // System.out.println("m3");
}

// Patent number: blank out the label fragments in order, then trim.
while(m4.find()  && (System.currentTimeMillis() - now) < TIMEOUT)
{
    // System.out.println(m4.group());
    patno = m4.group().replace("Patent No.: ", " ");
    patno = patno.replace("Patent No: ", " ");
    patno = patno.replace("Patent", " ");
    patno = patno.replace("No.:", " ");
    patno = patno.replace("No:", " ");
    patno = patno.replace("Number: ", " ");
    patno = patno.replace("Number.: ", " ");
    patno =  patno.trim();
    // System.out.println("m4");
}

// Application number: blank out the (21) code and the "Appl. No." label.
while(m5.find()  && (System.currentTimeMillis() - now) < TIMEOUT)
{
    //   System.out.println(m5.group());
    appno = m5.group().replace("(21)", " ");
    appno = appno.replace("Appl. No.: ", " ");
    appno = appno.replace("Appl.", " ");
    appno = appno.replace("No.", " ");
    appno = appno.replace(":"," ");
    appno = appno.trim();
    // System.out.println("m5");
}


// Filing date: blank out the labels, then remove line breaks entirely (they
// are deleted, not replaced by a space).
while(m6.find() && (System.currentTimeMillis() - now) < TIMEOUT)
{
    // System.out.println(m6.group());
    patfilled = m6.group().replace("(22)", " ");
    patfilled = patfilled.replace("Filed", " ");
    patfilled= patfilled.replace("PCT", " ");
    patfilled = patfilled.replace(":", " ");
    patfilled = patfilled.replace("\n", "");
    patfilled= patfilled.trim();
    // System.out.println("m6");
}

// Date of patent: the longest labels are blanked first, then the shorter
// fragments that may remain.
while (m7.find() && (System.currentTimeMillis() - now) < TIMEOUT)
{
    patdate = m7.group().replace("(45) Date of Patent: ", " ");
    patdate = patdate.replace("(45) Date of Patent.: ", " ");
    patdate = patdate.replace("(45)", " ");
    patdate = patdate.replace("Date", " ");
    patdate = patdate.replace("of", " ");
    patdate = patdate.replace("Patent.: ", " ");
    patdate = patdate.replace("Patent: ", " ");
    patdate = patdate.replace("Reissued", " ");
    patdate = patdate.replace(":", " ");
    patdate = patdate.replace("Patent", " ");
    patdate = patdate.replace("*", " ");
    patdate = patdate.trim();
    // System.out.println("m7");
}

System.out.println("find end");

In the above code, the mX.find() calls take a long time to execute on some iterations, i.e. for some files. On those iterations the execution freezes right after System.out.println("find start");.

This is the sample output (scroll to see):

    -------
     find start
    1ms Elasped
    1841
    File name:06377334.pdf
    US 6,377,334 B2
    METHOD FOR CONTROLLING IMAGE 
    SIZE OF INTEGRATED CIRCUITS ON 
    WAFERS SUPPORTED ON HOT PLATES 
    DURING POST EXPOSURE BAKING OF THE 
    WAFERS
    Apr. 23, 2002
    Jan. 24, 2001 
    Related U.S. Application Data
    09/768,973
    -------
    find start
    1ms Elasped
    1842
    File name:06377337.pdf
    US 6,377,337 B1
    PROJECTION EXPOSURE APPARATUS
    Apr. 23, 2002
    Apr. 27, 1999
    09/299,558
    -------
    find start
    1843
    File name:06377338.pdf
    US 6,377,338 B1
    EXPOSURE APPARATUS AND METHOD
    Apr. 23, 2002
    Oct. 13, 2000 
    Related U.S. Application Data
    09/299,558
    -------
    find start
    1844
    File name:06377339.pdf
    US 6,377,339 B1
    DOCUMENT IMAGING SYSTEM 
    INCORPORATING A SELECTIVELY 
    OPAQUE
    Apr. 23, 2002
    Mar. 29, 1999
    09/280,186
    -------
     find start
    1845
    File name:06377340.pdf
    US 6,377,340 B1
    METHOD OF DETECTION OF NATURAL 
    DIAMONDS THAT HAVE BEEN PROCESSED 
    AT HIGH PRESSURE AND HIGH 
    TEMPERATURES
    Apr. 23, 2002
    Oct. 29, 1999
    09/430,477
    -------
    find start
    1846
    File name:06377341.pdf
    US 6,377,341 B1
    REFRACTIVE INDEX BASED DETECTOR 
    SYSTEM FOR LIQUID CHROMATOGRAPHY
    Apr. 23, 2002
    Aug. 3, 1999
    09/368,310
    -------
    find start

(execution freezes here)

Why does this happen? Why do the regex matchers take such a long time?

Here is the whole program:

import java.awt.Rectangle;
import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.filefilter.TrueFileFilter;
import org.apache.commons.io.filefilter.WildcardFileFilter;
import org.apache.pdfbox.exceptions.InvalidPasswordException;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.util.PDFTextStripperByArea;


public class PatentAdder {

    /**
     * @param args
     */

    public static String patno,patit,patdate,patfilled,appno;
    private static int File;
    /**
     * Extracts bibliographic fields from the first page of every file in the
     * hard-coded patent directory and reports them on the console and in
     * {@code output.txt} inside that directory.
     *
     * <p>For each file the first page is split into two rectangular regions.
     * Title, application number and filing date are searched in region
     * {@code class1}; patent number and patent date in region {@code class2}.
     * When a pattern matches several times the last match is kept; when it
     * does not match at all the corresponding field is {@code null} for that
     * file (it never carries over the value of the previous file).
     *
     * <p>Files that cannot be read as PDF are reported on {@code System.err}
     * and skipped. A document encrypted with a non-empty password terminates
     * the program with exit status 1.
     *
     * @param args command-line arguments; when exactly one argument is given
     *             nothing is processed
     */
    public static void main(String[] args) {
        if (args.length == 1) {
            // Placeholder for a usage message; nothing is processed in this case.
            return;
        }

        File dataDir = new File("F:/patents/test/tittest/USP2002w17/06/378/pdfs");
        File outputFile = new File("F:/patents/test/tittest/USP2002w17/06/378/pdfs/output.txt");

        // The patterns are the same for every file, so compile them once.
        Pattern titlePattern = Pattern.compile(
                "(?s)\\(54\\)\\s*([\\w\\s,-]+)|(?s)\\[54\\]\\s*([\\w\\s,-]+)");
        Pattern patentNoPattern = Pattern.compile(
                "(?s)\\s*Patent No\\.\\:\\s*([\\w\\d\\s,.://-]+)|(?s)\\s*Patent Number\\:\\s*([\\w\\d\\s,.://-]+)");
        Pattern appNoPattern = Pattern.compile(
                "(?s)\\(21\\)\\s*([\\w\\s,.://-]+)|(?s)\\[21\\]\\s*([\\w\\s,.://-]+)");
        Pattern filedPattern = Pattern.compile(
                "((?s)\\(22\\)\\s*([\\w\\s,.://-]+))|((?s)\\(22\\)\\s*([\\w\\s,.://-]+)(?=\\s*\\n\\s*Related))|((?s)\\[22\\]\\s*([\\w\\s,.://-]+))|((?s)\\[22\\]\\s*([\\w\\s,.://-]+)(?=\\s*\\n\\s*Related))");
        Pattern patentDatePattern = Pattern.compile(
                "(?s)\\(45\\)\\s*Date([\\*\\w\\d\\s,.://-]+)(?=\\(\\d*\\)\\s+Inventor:)|(?s)\\(45\\)\\s*Date([\\*\\w\\d\\s,.://-]+)(?=\\(\\d*\\)\\s+Inventors:)|(?s)\\(45\\)\\s*Date([\\*\\w\\d\\s,.://-]+)|(?s)\\[45\\]\\s*Date([\\*\\w\\d\\s,.://-]+)(?=\\[\\d*\\]\\s+Inventor:)|(?s)\\[45\\]\\s*Date([\\*\\w\\d\\s,.://-]+)(?=\\(\\d*\\)\\s+Inventors:)|(?s)\\[45\\]\\s*Date([\\*\\w\\d\\s,.://-]+)");

        PrintWriter out = null;
        try {
            File[] files = dataDir.listFiles();
            if (files == null) {
                // listFiles() returns null when the path is not a readable directory.
                System.err.println("Error: cannot list directory " + dataDir);
                return;
            }

            // Opened once for the whole run; opening it per file would truncate
            // the report each time and keep only the last entry.
            out = new PrintWriter(outputFile);

            int count = 0;
            for (File file : files) {
                // The report lives in the scanned directory, so skip it along
                // with any sub-directories.
                if (file.isDirectory() || file.equals(outputFile)) {
                    continue;
                }

                PDDocument document = null;
                try {
                    document = PDDocument.load(file.getAbsolutePath());
                    if (document.isEncrypted()) {
                        try {
                            document.decrypt("");
                        } catch (InvalidPasswordException e) {
                            System.err.println("Error: Document is encrypted with a password.");
                            // System.exit() does not run finally blocks, so
                            // close the report explicitly to keep what was written.
                            out.close();
                            System.exit(1);
                        }
                    }

                    List<?> allPages = document.getDocumentCatalog().getAllPages();
                    if (allPages.isEmpty()) {
                        System.err.println("Skipping " + file.getName() + ": document has no pages");
                        continue;
                    }
                    PDPage firstPage = (PDPage) allPages.get(0);

                    PDFTextStripperByArea stripper = new PDFTextStripperByArea();
                    stripper.setSortByPosition(true);
                    stripper.addRegion("class1", new Rectangle(55, 108, 230, 600));
                    stripper.addRegion("class2", new Rectangle(288, 60, 222, 40));
                    stripper.extractRegions(firstPage);

                    String region = stripper.getTextForRegion("class1");
                    String regiont = stripper.getTextForRegion("class2");

                    System.out.println("find start");
                    long started = System.currentTimeMillis();

                    // Each field is assigned unconditionally, so a field that
                    // is missing in this file becomes null instead of keeping
                    // the previous file's value.
                    patit = clean(lastMatch(titlePattern, region), "(54)", "[54]");
                    patno = clean(lastMatch(patentNoPattern, regiont),
                            "Patent No.: ", "Patent No: ", "Patent", "No.:", "No:",
                            "Number: ", "Number.: ");
                    appno = clean(lastMatch(appNoPattern, region),
                            "(21)", "[21]", "Appl. No.: ", "Appl.", "No.", ":");
                    patfilled = clean(lastMatch(filedPattern, region),
                            "(22)", "[22]", "Filed", "PCT", ":");
                    if (patfilled != null) {
                        // Line breaks inside the filing date are deleted, not
                        // replaced by a space.
                        patfilled = patfilled.replace("\n", "").trim();
                    }
                    patdate = clean(lastMatch(patentDatePattern, regiont),
                            "(45) Date of Patent: ", "(45) Date of Patent.: ", "(45)", "[45]",
                            "Date", "of", "Patent.: ", "Patent: ", "Reissued", ":", "Patent", "*");

                    System.out.println(count);
                    out.println(count);

                    System.out.println("File name:" + file.getName());
                    out.println("File name:" + file.getName());

                    String summary = patno + "\n" + patit + "\n" + patdate + "\n"
                            + patfilled + "\n" + appno + "\n-------";
                    System.out.println(summary);
                    out.println(summary);

                    long elapsed = System.currentTimeMillis() - started;
                    System.out.println(elapsed + "ms Elasped");
                    out.println(elapsed + "ms Elasped");

                    // Flush per file so the report is usable even if a later
                    // file hangs or the run is aborted.
                    out.flush();

                    count++;
                } catch (IOException e) {
                    // Not a readable PDF (or an I/O failure): report and move on.
                    System.err.println("Skipping " + file.getName() + ": " + e.getMessage());
                } finally {
                    closeQuietly(document);
                }
            }

            System.out.print("-----Finised " + count + " Files------ \n");
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (out != null) {
                out.close();
            }
        }
    }

    /** Returns the text of the last match of {@code pattern} in {@code text}, or {@code null} if there is none. */
    private static String lastMatch(Pattern pattern, String text) {
        String last = null;
        Matcher matcher = pattern.matcher(text);
        while (matcher.find()) {
            last = matcher.group();
        }
        return last;
    }

    /**
     * Replaces each label, in the given order, with a single space and trims
     * the result; returns {@code null} when {@code raw} is {@code null}.
     */
    private static String clean(String raw, String... labels) {
        if (raw == null) {
            return null;
        }
        String value = raw;
        for (String label : labels) {
            value = value.replace(label, " ");
        }
        return value.trim();
    }

    /** Closes {@code document} if it is non-null, reporting (not propagating) a failure to close. */
    private static void closeQuietly(PDDocument document) {
        if (document == null) {
            return;
        }
        try {
            document.close();
        } catch (IOException e) {
            System.err.println("Warning: could not close document: " + e.getMessage());
        }
    }

Please tell me how to optimize the regexes and solve this execution-freezing issue.

Answer 1

I love to use regexes myself, but it looks like they are not the ideal approach to what you are trying to do. Regexes are good for extracting a specific bit of information from a text. But applying several regexes over and over again to the same text is a sign that a parsing approach would be better.

One problem with your approach is that each of your while loops reads the whole text again. This can be avoided by writing your own parser and letting it work through the document once.

Another problem with your regexes is that they have lots of optional parts (\\s* etc.). Optional parts make the evaluation of a regex more costly. It might be a good approach to use a very simple regex instead and recheck its matching positions for false positives. For example, instead of your regex

String term ="((?s)\\s*Term\\s*([\\w\\s,.://-]+))|((?s)\\s*Term\\s*([\\w\\s,.://-]+))";

you could just look for

String simple_term ="Term";

And then check on each occurrence of Term if it really is the part you are looking for.

By the way, looking at the string I picked rather randomly from your code, I noticed that it is more complicated than it has to be. Just remove the alternation (|), because the first and second alternatives are exactly the same.

Answer 2

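You are running into backtracking hell (catastrophic backtracking), since you have constructs similar to \\s*\\s*c: consecutive repeated character classes that have a non-empty intersection, followed by a sequel that neither class can match. When the sequel fails to match, the engine retries every possible way of splitting the same run of characters between the adjacent quantifiers before it gives up.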

Let us look at the String in (as seen by the engine):

((?s)\(\d\d\)\s+Inventor\w*:\s*\w*([\w\d,.\s)(-]+);([\w\s.\',();-]+)(?=\(\d*\)\s+Assignee:))
|
((?s)\[\d\d\)\s+Inventor:\s*([\-\w\d\s,\.\(\)-]+)*[\w\']*(?=\n))
|
(Inventor\w*:\s*\w*([\w\d,.\s)(-]+);([\w\s.\',();-]+)(?=Assignee:))
|
((?s)\(\d\d\)\s+Inventor\w*:\s*\w*([\w\d,.\s)(-]+);([\w\s.\',();-]+)(?=\(\d*\)\s+Assignee:))
|
((?s)\(\d\d\)\s+Inventor:\s*([\-\w\d\s,\.\(\)-]+)*[\w\']*(?=\n))
|
(Inventor\w*:\s*\w*([\w\d,.\s)(-]+);([\w\s.\',();-]+)(?=Assignee:))

You have plenty of such patterns in your regex:

((?s)\(\d\d\)\s+Inventor\w*:\s*\w*([\w\d,.\s)(-]+);([\w\s.\',();-]+)(?=\(\d*\)\s+Assignee:))
                               ^^^^^^^^^^^^^^^^^^^^

((?s)\[\d\d\)\s+Inventor:\s*([\-\w\d\s,\.\(\)-]+)*[\w\']*(?=\n))
                         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

(Inventor\w*:\s*\w*([\w\d,.\s)(-]+);([\w\s.\',();-]+)(?=Assignee:))
                ^^^^^^^^^^^^^^^^^^^^

Not to mention that you include essentially the same three subpatterns twice in your regex (the second and fifth alternatives differ only in the opening \[ versus \(), which is redundant.

Why this regex matcher block takes very long time to complete?

Question

2 answers

solution1
0 2014-02-12 17:41:18

solution2
0 2014-02-12 18:12:37

Why this regex matcher block takes very long time to complete?

Question

2 answers

solution1 0 2014-02-12 17:41:18

solution2 0 2014-02-12 18:12:37

solution1
0 2014-02-12 17:41:18

solution2
0 2014-02-12 18:12:37