Regex Replace on IBMi

I am looking for a way to use regex replace functions on IBM i (iSeries).

As far as I know, I can use the C/C++ library (regex.h) ( source ). With it, I can only match a regex, not replace (using regcomp() to compile the regex and regexec() to match it).

Does anyone know a way to do it ?

It's true that the C/C++ POSIX regular expression library doesn't have a built in regexp replace function, but you can accomplish the same thing using positional information from regexec() and the RPGLE %replace() built in function. (I'm assuming you're going to use RPGLE but you could use another language.)

For example, if you wanted to mask all but the last four digits of a phone number you could do this:

  * Demo: mask the first two digit groups of every phone number in
  * a string, using the POSIX regex functions prototyped in the
  * regex_h copy book together with the %replace() built-in.
  /include qcpysrc,regex_h

  * Compiled pattern, filled in by regcomp()
 d regex_phone_number...
 d                 ds                  inz likeds(regex_t)
  * Match positions, filled in by regexec(): element 1 is the whole
  * match, elements 2..4 are the three parenthesized groups
 d dsrm            ds                  inz likeds(regmatch_t) dim(20)

 d data            s             52a   inz varying
 d pattern         s            256a   inz varying
 d rc              s             10i 0 inz(0)

   *inlr = *on ;
   data = 'My phone #''s are: (444) 555 - 6666 and 777.888.9999' ;

   dsply data ;

   pattern = '\(?([0-9]{3})[ .)]*([0-9]{3})[ .-]*([0-9]{4})' ;
   rc = regcomp(regex_phone_number :pattern :REG_EXTENDED) ;
   if rc = 0 ;
     // Keep searching until regexec() reports no (further) match.
     // Each pass overwrites the digits it found with '*', so the
     // same phone number can never match again and the loop ends.
     // nmatch is re_nsub + 1: one slot for the whole match plus one
     // per group (well inside the 20 elements of dsrm).
     dow regexec(regex_phone_number :data
            :regex_phone_number.re_nsub + 1 :%addr(dsrm) :0) = 0 ;

       // rm_so is a zero-based offset, %replace() wants a 1-based
       // position. The replacement is as long as the group (three
       // characters), so the offsets of group 2 (dsrm(3)) are
       // still valid after group 1 (dsrm(2)) has been replaced.
       data = %replace('***': data :dsrm(2).rm_so+1
                :dsrm(2).rm_eo - dsrm(2).rm_so) ;
       data = %replace('***': data :dsrm(3).rm_so+1
                :dsrm(3).rm_eo - dsrm(3).rm_so) ;
     enddo ;

     // Only a successfully compiled expression is released.
     regfree(regex_phone_number) ;
   else ;
     dsply ('regcomp failed, rc=' + %char(rc)) ;
   endif ;

   dsply data ;

Here's what the copy book regex_h looks like:

  ** Header file for calling the "Regular Expression" functions
  **   provided by the ILE C Runtime Library from an RPG IV
  **   program.                 Scott Klement, 2001-05-04
  **                       Converted to qualified DS 2003-11-29
  **                       Modified by Jarrett Gilliam 2014-11-05
  ** This copy book is for using the C regular expression library, regex.h, in RPG.
  ** You can go to http://www.regular-expressions.info/ to learn more about
  ** regular expressions. This regex flavor is POSIX ERE. You can go to
  ** http://www-01.ibm.com/support/knowledgecenter/ssw_ibm_i_71/rtref/regexec.htm
  ** to learn more about how the C functions work.

 d* Include guard: when REGEX_H is already defined this copy book
 d* has been included before, so stop reading it (/eof); otherwise
 d* define REGEX_H and continue with the definitions below.
 d/if defined(REGEX_H)
 d/eof
 d/endif
 d/define REGEX_H

  * cflags for regcomp() -- combine by adding the values together
 d REG_BASIC       c                   CONST(0)
 d REG_EXTENDED    c                   CONST(1)
 d REG_ICASE       c                   CONST(2)
 d REG_NEWLINE     c                   CONST(4)
 d REG_NOSUB       c                   CONST(8)

  * eflags for regexec() -- combine by adding the values together
 d REG_NOTBOL      c                   CONST(256)
 d REG_NOTEOL      c                   CONST(512)

  *  Error numbers returned by regcomp()/regexec(); 0 means success
  * RE pattern not found
 d REG_NOMATCH     c                   CONST(1)
  * Invalid Regular Expression
 d REG_BADPAT      c                   CONST(2)
  * Invalid collating element
 d REG_ECOLLATE    c                   CONST(3)
  * Invalid character class
 d REG_ECTYPE      c                   CONST(4)
  * Last character is \
 d REG_EESCAPE     c                   CONST(5)
  * Invalid number in \digit
 d REG_ESUBREG     c                   CONST(6)
  * [ ] imbalance
 d REG_EBRACK      c                   CONST(7)
  * \( \) or () imbalance
 d REG_EPAREN      c                   CONST(8)
  * \{ \} or { } imbalance
 d REG_EBRACE      c                   CONST(9)
  * Invalid \{ \} range exp
 d REG_BADBR       c                   CONST(10)
  * Invalid range exp endpoint
 d REG_ERANGE      c                   CONST(11)
  * Out of memory
 d REG_ESPACE      c                   CONST(12)
  * ?*+ not preceded by valid RE
 d REG_BADRPT      c                   CONST(13)
  * invalid multibyte character
 d REG_ECHAR       c                   CONST(14)
  * ^ (caret) anchor and not BOL
 d REG_EBOL        c                   CONST(15)
  * $ anchor and not EOL
 d REG_EEOL        c                   CONST(16)
  * Unknown error in regcomp() call
 d REG_ECOMP       c                   CONST(17)
  * Unknown error in regexec() call
 d REG_EEXEC       c                   CONST(18)

  *  Structure of a compiled regular expression.
  *  Declared BASED on a pointer that is never set in this copy
  *  book, so it only describes the layout; programs declare their
  *  own copy with LIKEDS(regex_t) and pass it to regcomp().
  *  re_nsub is the subfield the sample program reads (it is passed
  *  to regexec() as nmatch). The other subfields are presumably
  *  internal to the C runtime -- treat them as opaque (confirm
  *  against the C header before relying on them).
  *  REG_SUBEXP_MAX sizes the re_lsub / re_esub pointer arrays.
 d REG_SUBEXP_MAX  c                   20
 d regex_t         ds                  qualified align based(template)
 d   re_nsub                     10i 0
 d   re_comp                       *
 d   re_cflags                   10i 0
 d   re_erroff                   10i 0
 d   re_len                      10i 0
 d   re_ucoll                    10i 0 dim(2)
 d   re_lsub                       *   DIM(REG_SUBEXP_MAX)
 d   re_esub                       *   DIM(REG_SUBEXP_MAX)
 d   re_map                     256a
 d   re_shift                     5i 0
 d   re_dbcs                      5i 0

  *  structure used to report matches found by regexec()
  *  Layout only (BASED); declare the real array with
  *  LIKEDS(regmatch_t) DIM(n) and pass its address as pmatch.
  *    rm_so = zero-based offset where the match starts (the sample
  *            program adds 1 to get an RPG string position)
  *    rm_eo = offset just past the match, so rm_eo - rm_so is the
  *            match length (as used by the sample program)
  *    rm_ss / rm_es = not used by the sample program; presumably
  *            shift-state information -- confirm in the C header
 d regmatch_t      ds                  qualified align based(template)
 d   rm_so                       10i 0
 d   rm_ss                        5i 0
 d   rm_eo                       10i 0
 d   rm_es                        5i 0

  * regcomp() -- Compile a Regular Expression ("RE")
  *     int regcomp(regex_t *preg, const char *pattern,
  *              int cflags);
  * where:
  *       preg (output) = the compiled regular expression.
  *                       Passed by reference; supply a data
  *                       structure declared LIKEDS(regex_t).
  *    pattern (input)  = the RE to be compiled. OPTIONS(*STRING)
  *                       lets an RPG character value be passed.
  *     cflags (input)  = the sum of the cflag constants
  *                       (listed above) for this RE.
  * Returns 0 = success, otherwise an error number
  *   (one of the REG_* error constants above).
 d regcomp         pr            10i 0 extproc('regcomp')
 d   preg                              like(regex_t)
 d   pattern                       *   value options(*string)
 d   cflags                      10i 0 value

  * regexec() -- Execute a compiled Regular Expression ("RE")
  *     int regexec(const regex_t *preg, const char *string,
  *              size_t nmatch, regmatch_t *pmatch, int eflags);
  * where:
  *       preg (input)  = the compiled regular expression
  *                       (the output of regcomp())
  *     string (input)  = string to run the RE upon
  *     nmatch (input)  = the number of matches to return.
  *                       NOTE(review): presumably must not exceed
  *                       the number of elements in the pmatch
  *                       array -- confirm.
  *     pmatch (output) = array of regmatch_t DS's
  *                       showing what matches were found.
  *                       Pass %addr() of the array. In the sample
  *                       program element 1 describes the whole
  *                       match and elements 2.. the groups.
  *     eflags (input)  = the sum of the flags (constants
  *                       provided above) modifying the RE
  * Returns 0 = success, otherwise an error number
  *   (REG_NOMATCH when the pattern was not found).
 d regexec         pr            10i 0 extproc('regexec')
 d   preg                              like(regex_t) const
 d   string                        *   value options(*string)
 d   nmatch                      10u 0 value
 d   pmatch                        *   value
 d   eflags                      10i 0 value

  * regerror() -- return error information from regcomp/regexec
  *   size_t regerror(int errcode, const regex_t *preg,
  *              char *errbuf, size_t errbuf_size);
  *  where:
  *    errcode (input)  = the error code to return info on
  *                      (obtained as the return value from
  *                      either regcomp() or regexec())
  *       preg (input)  = the (compiled) RE to return the
  *                      error for.
  *     errbuf (output) = buffer containing human-readable
  *                      error message. Passed as a pointer by
  *                      value: supply %addr() of a character
  *                      field.
  * errbuf_size (input) = size of errbuf (max length of msg
  *                      that will be returned)
  *                      NOTE(review): declared signed (10i)
  *                      here while the C prototype above uses
  *                      size_t -- confirm this is intended.
  * returns:  length of buffer needed to get entire error msg
 d regerror        pr            10u 0 extproc('regerror')
 d   errcode                     10i 0 value
 d   preg                              like(regex_t) const
 d   errbuf                        *   value
 d   errbuf_size                 10i 0 value

  * regfree() -- free memory locked by Regular Expression
  *    void regfree(regex_t *preg);
  *   where:
  *        preg (input) = regular expression to free mem for
  *                       (the structure filled by regcomp()).
  *   NOTE:  regcomp() will always allocate extra memory
  *        to be pointed to by the various pointers in
  *        the regex_t structure.  if you don't call this,
  *        that memory will never be returned to the system!
  *   NOTE(review): presumably this should only be called for a
  *        structure that regcomp() compiled successfully --
  *        confirm against the regfree() documentation.
 d regfree         pr                  extproc('regfree')
 d   preg                              like(regex_t)

Here's the output:

DSPLY  My phone #'s are: (444) 555 - 6666 and 777.888.9999
DSPLY  My phone #'s are: (***) *** - 6666 and ***.***.9999

The code could be improved by extracting the replace logic into a procedure of its own, creating a custom regexp replace function based on the POSIX library, but it's not absolutely necessary.

The ILE C/C++ runtime library does not have a regex replace function available.

Java, however, has excellent support for regular expressions and integrates easily with RPGLE.

I succeeded in using regex with Java. I was inspired by this code from Scott Klement and that code from IBM. The mix works well; I just added the replace function.


 D* Regex replace through Java: prototypes for java.lang.String and
 D* java.util.regex.Pattern / Matcher, followed by a small demo.
 D* NOTE(review): jstring is not declared in this snippet; it is
 D* presumably supplied by the JNI copy book (QSYSINC/QRPGLESRC,JNI)
 D* included elsewhere in the source -- confirm.
 D*
 D* new String(byte[]) -- build a Java String from RPG character data
 D newString       pr              O   CLASS(*JAVA:'java.lang.String')
 D                                     EXTPROC(*JAVA:'java.lang.String':
 D                                     *CONSTRUCTOR)
 D    bytearray               32767A   VARYING CONST
 D* String.getBytes() -- bring a Java String back as RPG characters
 D getBytes        PR         65535A    VARYING
 D                                      EXTPROC(*JAVA:
 D                                       'java.lang.String':
 D                                       'getBytes')
 D* Pattern.compile(String) -- static method, returns a Pattern
 D PatternCompile  pr              O   CLASS(*JAVA:
 D                                     'java.util.regex.Pattern')
 D                                     EXTPROC(*JAVA:
 D                                     'java.util.regex.Pattern':
 D                                     'compile') STATIC
 D    pattern                      O   CLASS(*JAVA:'java.lang.String')
 D* Pattern.matcher(CharSequence) -- instance method: the Pattern
 D* object is passed as the first argument on the call
 D PatternMatcher  pr              O   CLASS(*JAVA:
 D                                     'java.util.regex.Matcher')
 D                                     EXTPROC(*JAVA:
 D                                     'java.util.regex.Pattern':
 D                                     'matcher')
 D    comparestr                   O   CLASS(*JAVA
 D                                     :'java.lang.CharSequence')
 D* Matcher.matches() -- returns an indicator
 D CheckMatches    pr             1N   EXTPROC(*JAVA
 D                                     :'java.util.regex.Matcher'
 D                                     :'matches')
 D* Matcher.replaceAll(String) -- returns the replaced String
 D DoReplace       pr              O   CLASS(*JAVA:'java.lang.String')
 D                                     EXTPROC(*JAVA
 D                                     :'java.util.regex.Matcher'
 D                                     :'replaceAll')
 D    replacement                  O   CLASS(*JAVA
 D                                     :'java.lang.String')
 D RegExPattern    s               O   CLASS(*JAVA:
 D                                      'java.util.regex.Pattern')
 D RegExMatcher    s               O   CLASS(*JAVA:
 D                                     'java.util.regex.Matcher')
 D jstrStmt        s                   like(jstring)
 D jPatStr         s                   like(jstring)
 D jRepStr         s                   like(jstring)
 D jRepStr2        s                   like(jstring)
 D result          S             30A
    // Pattern: '+33' or '0' followed by nine digits (the first one
    // non-zero); group 2 captures those nine digits.
    jPatStr = newString('^(\+33|0)([1-9][0-9]{8})$');
    jstrStmt = newString('+33123456789');
    // Replacement: a literal 0 followed by capture group 2.
    jRepStr = newString('0$2');
    RegExPattern = PatternCompile(jPatStr);
    RegExMatcher = PatternMatcher(RegExPattern : jstrStmt);
    if (CheckMatches(RegExMatcher) = *ON);
        dsply ('it matches');
    else;
        dsply ('it doesn''t match');
    endif;
    // The replace runs whether or not the input matched.
    jRepStr2 = DoReplace(RegExMatcher : jRepStr);
    result = getBytes(jRepStr2);

    dsply (%subst(result : 1 : 30));
    *inlr = *on;

It works, but with Java. I still work on the PASE Solution WarrenT suggested, but using PASE in an ILE program is such a pain...

The Young i Professionals Wiki has a page of Open Source Binaries . In the list is the PCRE Library (Perl Compatible Regular Expressions).

Let us know how this works out. I may try it myself ;-)

For excellent SQLRPGLE example and explanation refer to :


REGEXP_REPLACE ( source-string , pattern-expression , replacement-string , start , occurrence , flags )

