How to compare the data of two files (.xml and .html) using perl(regex)?

Question

I tried using regex by using foreach loop for .xml files as there are many .xml files and only one .html file. I opened, read and closed the directory. But when it comes to searching for a particular pattern in both the files, the code doesn't enter the while/if loop.

xml data: #pattern in xml format

<gname>abc</gname>
<pname>xyz</pname>

html data: #pattern in html format

<p>ABC</p>
<p><i>xyz</i></p>

Here, I need to match abc and xyz between the xml and html files, ignoring case (abc in the xml should match ABC in the html).

open( F2, "<F2>" );
my $xml_list1 = "(.*)\.html";

Here, the text captured by the parentheses also appears when printed. Say the file name is abc.html: I want the "abc" part to be interchangeable, so that I don't need to modify the code when a file name other than abc.html occurs.

close F2;
#print $xml_list1."\n";

# For every entry of @filenames: read a file into $data, derive an ".xml"
# name from $f, and loop over regex matches against that name.
foreach my $f (@filenames) {
    #print $f."\n";
    # NOTE(review): the path given to open is the fixed string below, not $f,
    # and the result of open is not checked.
    open( F1, "<F1>" );
    # Whole file joined into one string.
    my $data = join( "", <F1> );
    close F1;
    # Part of $f that precedes its first '.'.
    my $filename = substr( $f, 0, index( $f, '.' ) );
    # This is a file NAME (e.g. "abc.xml"), not the contents of that file.
    my $xml_list = $filename . ".xml";

    # NOTE(review): the match runs against the name held in $xml_list; the
    # file contents in $data are never searched. The pattern itself is left
    # empty in this excerpt.
    while ( $xml_list =~ m//ig ) {
        ...;
    }
}

The code doesn't enter the while/if loop; it seems that something goes wrong when reading the file named by $xml_list.

I want to match both the data without the use of parsers.

Can someone please help me out?

UPDATE: CODE:

#!/usr/bin/perl
# Script as posted in the question (kept verbatim; review notes added).
# Intended flow, as far as the code shows: copy .xml/.html files from
# ./Input into the current directory, scan each XML file for <personName>
# entries and an HTML text for an italic FONT paragraph, write results to
# Ref.txt, then delete the XML files from the current directory.
use strict;
use Cwd;
use File::Copy;
use File::Basename;

my $path1=getcwd;

# NOTE(review): the result of opendir is not checked.
opendir(INP, "$path1\/Input");
# NOTE(review): the leading '.' in both patterns is unescaped, so it matches
# any character, not only a literal dot.
my @out = grep(/.(xml)$/,readdir(INP));
# NOTE(review): readdir in list context returns all remaining entries, so
# this second call on the same handle has nothing left to return and @out1
# stays empty.
my @out1 = grep(/.(html)$/,readdir(INP));
# NOTE(review): INP was opened with opendir; the matching call is closedir.
close INP;

# Copy each XML file from Input/ into the current directory under the same name.
foreach my $final(@out)
{
 my $filetobecopied = "Input\/".$final;
 my $newfile = $final;
 copy($filetobecopied, $newfile) or die "File cannot be copied.";
}

# Same copy step for the HTML files (a no-op while @out1 is empty).
foreach my $final1(@out1)
{
 my $filetobecopied1 = "Input\/".$final1;
 my $newfile1 = $final1;
 copy($filetobecopied1, $newfile1) or die "File cannot be copied.";
}

# List the current directory; same double-readdir pattern as above, so
# @files1 receives nothing.
opendir DIR, $path1 or die "cant open dir";
my @files = grep /(.*?)\.(xml)$/,(readdir DIR);
my @files1 = grep /(.*?)\.(html)$/,(readdir DIR);
closedir DIR;

# Report file; NOTE(review): the result of open is not checked.
open(F6, ">Ref.txt");
print F6 "FileName\tError Instance\tOutput\n";

# open(F2,"<F2>");
# my $xml_list1="abc.html";
# my $data1=join("",<F1>);
# my $xml_list2=$xml_list1;

foreach my $f(@files)
 {
# NOTE(review): the die message reports $files[0], not the file ($f) that failed.
open(F1, "<$f") or die "Cannot open file: $files[0]";
my $data=join("", <F1>);
close F1;
my $xml_list=$data;
#print "$f\n";

# NOTE(review): this opens the same XML file $f a second time, so $data1 and
# $xml_list1 hold XML text, not the HTML. $xml_listt is assigned but never used.
open(F2, "<$f") or die "Cannot open file: $files[0]";
my $xml_listt="abc.html";
my $data1=join("", <F2>);
my $xml_list1=$data1;

print $xml_list1."\n";

# Walk over every <personName>...</personName> element of the XML text.
while($xml_list=~m/(<personName>(.*?)<\/personName>)/isg)
{
        my $full=$1;
        my $name=$2;
        #print F6 $f."\t".$full."\n";
# Split the element into given name ($2) and family name ($4); at most one
# optional newline is allowed between and after the two tags.
if($full=~m/(<givenNames>(\w+)<\/givenNames>(\n)?<familyName>(\w+)<\/familyName>(\n)?(.*?))/isg)
        {
        my $fg=$1;
        my $gname=$2;
        my $fname=$4;
        #print F6 $f."\t".$gname."\t".$fname."\n";
        }
     }
# NOTE(review): "While" is capitalised here; Perl keywords are case-sensitive
# and the loop keyword is lowercase "while".
While($xml_list1=~m/(<p><FONT FACE="(.*?)" SIZE="(\d+)"><I>(.*?)<\/I><\/FONT><\/p>)/igs)        
    {
    my $hfull=$1;
    print F6 $f."\n";   #.$hfull."\n";
    }
close F2;
# NOTE(review): F1 was already closed right after it was read above.
close F1;
}
# Remove every file listed in @files (the *.xml entries of the current directory).
foreach my $del(@files)
{
unlink $del;
}

Answer 1

@flora: I have modified your program, and I have also added an optimized solution for it.

Modified Code :(Modification to your program)

 #!/usr/bin/perl
    # Workflow:
    #   1. copy every .xml and .html file from ./Input into the current directory;
    #   2. for each .xml file now present in the current directory, pull out the
    #      <personName> entries, and write the XML file name to Ref.txt once for
    #      every italic FONT paragraph found in Input/abc.html;
    #   3. remove the .xml files from the current directory again.
    use strict;
    use warnings;
    use Cwd;
    use File::Copy;
    use File::Basename;

    my $path1 = getcwd;

    # readdir in list context drains the handle, so a second readdir on the same
    # handle returns nothing. List the directory once and filter that one list.
    opendir(my $inp, "$path1/Input") or die "cant open dir $path1/Input - $!";
    my @entries = readdir($inp);
    closedir($inp);    # a handle from opendir is released with closedir, not close

    my @out  = grep { /\.xml\z/ } @entries;
    my @out1 = grep { /\.html\z/ } @entries;

    foreach my $final (@out, @out1) {
        copy("Input/$final", $final) or die "File cannot be copied.";
    }

    # The XML files to process are whatever *.xml the current directory holds
    # after the copy step.
    opendir(my $dir, $path1) or die "cant open dir";
    my @files = grep { /\.xml\z/ } readdir($dir);
    closedir($dir);

    open(my $ref, '>', 'Ref.txt') or die "Cannot open file: Ref.txt - $!";
    print {$ref} "FileName\tError Instance\tOutput\n";

    # abc.html does not change between XML files: read and scan it once.
    open(my $html_fh, '<', "$path1/Input/abc.html")
        or die "Cannot open file: abc.html - $!";
    my $xml_list1 = do { local $/; <$html_fh> };
    close $html_fh;

    # Number of italic FONT paragraphs in the HTML (list assignment in scalar
    # context yields the match count).
    my $html_hits = () = $xml_list1
        =~ m{<p><FONT FACE=".*?" SIZE="\d+"><I>.*?</I></FONT></p>}isg;

    foreach my $f (@files) {
        open(my $xml_fh, '<', $f) or die "Cannot open file: $f - $!";
        my $xml_list = do { local $/; <$xml_fh> };
        close $xml_fh;

        print $xml_list1 . "\n";

        while ($xml_list =~ m{(<personName>.*?</personName>)}isg) {
            my $full = $1;
            #print {$ref} $f."\t".$full."\n";
            # No /g here: a plain match is all an if-test needs.
            if ($full =~ m{<givenNames>\s*(\w+)\s*</givenNames>\s*<familyName>\s*(\w+)\s*</familyName>}is) {
                my ($gname, $fname) = ($1, $2);
                #print {$ref} $f."\t".$gname."\t".$fname."\n";
            }
        }

        # One line per HTML hit, exactly as the per-file loop used to produce.
        print {$ref} "$f\n" x $html_hits;
    }

    close($ref) or die "Cannot close file: Ref.txt - $!";

    # Remove the XML working copies; the HTML copies are left in place.
    foreach my $del (@files) {
        unlink $del or warn "Cannot remove $del - $!\n";
    }

Optimized solution:

#!/usr/bin/perl
# For every *.xml file in the current directory, extract the <personName>
# entries and write the XML file name to Ref.txt once per italic FONT
# paragraph found in abc.html (also in the current directory).
use strict;
use warnings;

my $html_file = 'abc.html';
my @files = grep { -f } glob('*.xml');

open(my $ref, '>', 'Ref.txt') or die "Cannot open file: Ref.txt - $!";
print {$ref} "FileName\tError Instance\tOutput\n";

# The HTML is the same for every XML file, so slurp it once up front.
my $xml_list1 = do {
    local $/ = undef;
    open my $fh, '<', $html_file or die "Cannot open file: $html_file - $!";
    <$fh>;
};

# Count the italic FONT paragraphs once (list assignment in scalar context
# yields the number of matches).
my $html_hits = () = $xml_list1
    =~ m{<p><FONT FACE=".*?"\s+SIZE="\d+"><I>.*?</I></FONT></p>}isg;

foreach my $f (@files) {
    my $xml_list = do {
        local $/ = undef;
        open my $fh, '<', $f or die "Cannot open file: $f - $!";
        <$fh>;
    };

    print $xml_list1 . "\n";

    while ($xml_list =~ m{(<personName>.*?</personName>)}isg) {
        my $full = $1;
        #print {$ref} $f."\t".$full."\n";
        # Plain match (no /g): only the first name pair of the element is needed.
        if ($full =~ m{<givenNames>\s*(\w+)\s*</givenNames>\s*<familyName>\s*(\w+)\s*</familyName>}is) {
            my ($gname, $fname) = ($1, $2);
            #print {$ref} $f."\t".$gname."\t".$fname."\n";
        }
    }

    # One output line per HTML hit for this XML file.
    print {$ref} "$f\n" x $html_hits;
}

close($ref) or die "Cannot close file: Ref.txt - $!";

# No unlink here: this version reads the *.xml files in place instead of
# working on copies, so deleting @files would destroy the original inputs.

Answer 2

@flora: Try this code. I was busy with other work, so I couldn't reply sooner. This code will generate the following output:

Input File:(sample.xml)

<creators> 
<creator affiliationRef="#01" creatorRole="author" xml:id="01"> 
<personName><givenNames>Kazumitsu</givenNames><familyName>Sugiura</familyName></personName>
</creator> 
<creator affiliationRef="#01" creatorRole="author" xml:id="02"> 
<personName><givenNames>Yoshinao</givenNames><familyName>Muro</familyName></personName>
</creator> 
<creator affiliationRef="#01" creatorRole="author" xml:id="03"> 
<personName><givenNames>Masashi</givenNames><familyName>Akiyama</familyName></personName> 
</creator>
</creators>

InputFile(abc.html):

<P><FONT FACE="hello" SIZE="14"><I>Kazumitsu SUGIURA, Yoshinao Muro, and Masashi Akiyama</I></FONT></P>

Output:

FileName    MatchedString   Output
Matched Sugiura(sample.xml)->SUGIURA(abc.html)
Matched Muro(sample.xml)->Muro(abc.html)
Matched Akiyama(sample.xml)->Akiyama(abc.html)

Code:

     #!/usr/bin/perl
    # Workflow:
    #   1. copy every .xml and .html file from ./Input into the current directory;
    #   2. collect the <familyName> of each <personName> in every XML file and
    #      the italic FONT paragraph texts of every HTML file;
    #   3. for each XML/HTML pair, write a "Matched ..." line to Ref.txt for
    #      every family name that occurs (ignoring case) in an HTML text;
    #   4. remove the XML working copies.
    use strict;
    use warnings;
    use Cwd;
    use File::Copy;
    use File::Basename;

    my $path1 = getcwd;

    foreach my $final (list_matching("$path1/Input", qr/\.(?:xml|html)\z/)) {
        copy("Input/$final", $final) or die "File cannot be copied.";
    }

    my @files  = list_matching($path1, qr/\.xml\z/);
    my @files1 = list_matching($path1, qr/\.html\z/);

    open(my $ref, '>', 'Ref.txt') or die "Cannot open file: Ref.txt - $!";
    print {$ref} "FileName\tMatchedString\tOutput\n";

    # Each HTML file is read and scanned once, not once per XML file.
    my %hfull_in;
    foreach my $f1 (@files1) {
        my $xml_list1 = read_file("$path1/$f1");
        # In list context a /g match returns every captured <I>...</I> text.
        $hfull_in{$f1} = [
            $xml_list1 =~ m{<p><FONT FACE=".*?" SIZE="\d+"><I>(.*?)</I></FONT></p>}isg
        ];
    }

    foreach my $f (@files) {
        # Read the copy in the current directory: that is where @files was
        # listed, so the name is guaranteed to exist there.
        my @fname = family_names(read_file("$path1/$f"));

        foreach my $f1 (@files1) {
            print $f1 . "\n";
            foreach my $family (@fname) {
                foreach my $text (@{ $hfull_in{$f1} }) {
                    # No /g: a scalar-context /g match remembers pos() on $text,
                    # so the next name would only be searched for after the
                    # previous hit and names in a different order were missed.
                    # \Q..\E keeps the name literal; the capture keeps the
                    # spelling used in the HTML.
                    if (my ($found) = $text =~ /(\Q$family\E)/i) {
                        print {$ref} "Matched ${family}(${f})->${found}(${f1})\n";
                    }
                }
            }
        }
    }

    close($ref) or die "Cannot close file: Ref.txt - $!";

    # Remove the XML working copies; the HTML copies are left in place.
    foreach my $del (@files) {
        unlink $del or warn "Cannot remove $del - $!\n";
    }

    # Return the entries of directory $dir whose names match $pattern.
    sub list_matching {
        my ($dir, $pattern) = @_;
        opendir(my $dh, $dir) or die "cant open dir $dir - $!";
        my @names = grep { $_ =~ $pattern } readdir($dh);
        closedir($dh);
        return @names;
    }

    # Return the whole content of $path as one string; dies if it cannot be opened.
    sub read_file {
        my ($path) = @_;
        open(my $fh, '<', $path) or die "Cannot open file: $path - $!";
        my $content = do { local $/; <$fh> };
        close $fh;
        return $content;
    }

    # Return the <familyName> of every <personName> element in $xml that holds
    # a <givenNames>/<familyName> pair.
    sub family_names {
        my ($xml) = @_;
        my @names;
        while ($xml =~ m{<personName>(.*?)</personName>}isg) {
            my $person = $1;
            if ($person =~ m{<givenNames>\s*\w+\s*</givenNames>\s*<familyName>\s*(\w+)\s*</familyName>}is) {
                push @names, $1;
            }
        }
        return @names;
    }

Optimized code:

#!/usr/bin/perl
    # Compare every *.xml file with every *.html file of the current directory:
    # each <familyName> found in an XML <personName> element is looked up
    # (ignoring case) in the italic FONT paragraph texts of the HTML, and every
    # hit is written to Ref.txt as "Matched <xml name>(<xml file>)-><html
    # spelling>(<html file>)".
    #
    # The files are read in place: nothing is copied from ./Input and nothing
    # is deleted afterwards.
    use strict;
    use warnings;
    use Cwd;
    use File::Copy;
    use File::Basename;

    my $path1 = getcwd;

    # One directory listing serves both filters (readdir in list context
    # drains the handle).
    opendir(my $dir, $path1) or die "cant open dir";
    my @entries = readdir($dir);
    closedir($dir);

    my @files  = grep { /\.xml\z/ } @entries;
    my @files1 = grep { /\.html\z/ } @entries;

    open(my $ref, '>', 'Ref.txt') or die "Cannot open file: Ref.txt - $!";
    print {$ref} "FileName\tMatchedString\tOutput\n";

    # Italic paragraph texts per HTML file, gathered once up front.
    my %hfull_in;
    foreach my $f1 (@files1) {
        open(my $html_fh, '<', "$path1/$f1") or die "Cannot open file: $f1 - $!";
        my $xml_list1 = do { local $/; <$html_fh> };
        close $html_fh;

        # List-context /g match: returns the captured <I>...</I> text of every hit.
        my @hfull = $xml_list1
            =~ m{<p><FONT FACE=".*?" SIZE="\d+"><I>(.*?)</I></FONT></p>}isg;
        $hfull_in{$f1} = \@hfull;
    }

    foreach my $f (@files) {
        open(my $xml_fh, '<', "$path1/$f") or die "Cannot open file: $f - $!";
        my $xml_list = do { local $/; <$xml_fh> };
        close $xml_fh;

        # Family names of this XML file; independent of the HTML file, so they
        # are extracted once per XML file.
        my @fname;
        while ($xml_list =~ m{<personName>(.*?)</personName>}isg) {
            my $person = $1;
            if ($person =~ m{<givenNames>\s*\w+\s*</givenNames>\s*<familyName>\s*(\w+)\s*</familyName>}is) {
                push @fname, $1;
            }
        }

        foreach my $f1 (@files1) {
            print $f1 . "\n";
            foreach my $family (@fname) {
                foreach my $text (@{ $hfull_in{$f1} }) {
                    # Deliberately no /g: in an if-test /g would leave pos() set
                    # on $text and make the next name start searching after the
                    # previous hit. The capture returns the HTML spelling.
                    if (my ($found) = $text =~ /(\Q$family\E)/i) {
                        print {$ref} "Matched ${family}(${f})->${found}(${f1})\n";
                    }
                }
            }
        }
    }

    close($ref) or die "Cannot close file: Ref.txt - $!";

Answer 3

@flora: This is the final solution I am posting. Here, from the xml files, I have combined <givenNames> with <familyName> and matched the result against the data in the html. The program now takes a single argument: the one pattern that I want to check against the xml and html files. For example, I passed the argument "Kazumitsu Sugiura" to the program; the program takes the values of <givenNames> and <familyName> from the xml and combines them as "Kazumitsu Sugiura". This pattern is then looked up in the html file, and if it matches, the file names are displayed as shown below:

InputFile: (sample.xml)

<creators> 
<creator affiliationRef="#01" creatorRole="author" xml:id="01"> 
<personName><givenNames>Kazumitsu</givenNames><familyName>Sugiura</familyName></personName>
</creator> 
<creator affiliationRef="#01" creatorRole="author" xml:id="02"> 
<personName><givenNames>Yoshinao</givenNames><familyName>Muro</familyName></personName>
</creator> 
<creator affiliationRef="#01" creatorRole="author" xml:id="03"> 
<personName><givenNames>Masashi</givenNames><familyName>Akiyama</familyName></personName> 
</creator>
</creators>

InputFile: (test.xml)

<creators> 
<creator affiliationRef="#01" creatorRole="author" xml:id="01"> 
<personName><givenNames>Kazumitsu</givenNames><familyName>Sugiura</familyName></personName>
</creator> 
</creators>

InputFile: (test.html)

<P><FONT FACE="hello" SIZE="14"><I>Kazumitsu SUGIURA, Yoshinao Muro, and Masashi Akiyama</I></FONT></P>

Code:

#!/usr/bin/perl
    # Usage: perl filename.pl "GivenName FamilyName"
    #
    # Builds "GivenName FamilyName" for every <personName> element of each
    # *.xml file in the current directory. For every such name that contains
    # the command-line argument (ignoring case), each *.html file in the
    # current directory is searched for the argument inside its italic FONT
    # paragraphs; every hit is written to Ref.txt.
    use strict;
    use warnings;
    use Cwd;
    use File::Copy;
    use File::Basename;

    my $path1 = getcwd;

    # An undefined or empty argument would yield the empty pattern, which Perl
    # treats as "reuse the last successful regex" -- refuse it up front.
    my $PatternName = $ARGV[0];
    die "Usage: $0 \"GivenName FamilyName\"\n"
        unless defined $PatternName && length $PatternName;

    # The argument is a literal name, not a regex: quote it once and reuse it.
    my $name_re = qr/\Q$PatternName\E/i;

    # One listing serves both filters (readdir in list context drains the handle).
    opendir(my $dir, $path1) or die "cant open dir";
    my @entries = readdir($dir);
    closedir($dir);

    my @files  = grep { /\.xml\z/ } @entries;
    my @files1 = grep { /\.html\z/ } @entries;

    open(my $ref, '>', 'Ref.txt') or die "Cannot open file: Ref.txt - $!";
    print {$ref} "FileName\tMatchedString\tOutput\n";

    # Italic paragraph texts per HTML file, gathered once up front.
    my %hfull_in;
    foreach my $f1 (@files1) {
        open(my $html_fh, '<', "$path1/$f1") or die "Cannot open file: $f1 - $!";
        my $xml_list1 = do { local $/; <$html_fh> };
        close $html_fh;

        # List-context /g match: returns the captured <I>...</I> text of every hit.
        my @hfull = $xml_list1
            =~ m{<p><FONT FACE=".*?" SIZE="\d+"><I>(.*?)</I></FONT></p>}isg;
        $hfull_in{$f1} = \@hfull;
    }

    foreach my $f (@files) {
        open(my $xml_fh, '<', "$path1/$f") or die "Cannot open file: $f - $!";
        my $xml_list = do { local $/; <$xml_fh> };
        close $xml_fh;

        # "Given Family" for every <personName> element of this XML file.
        my @fname;
        while ($xml_list =~ m{<personName>(.*?)</personName>}isg) {
            my $person = $1;
            if ($person =~ m{<givenNames>\s*(\w+)\s*</givenNames>\s*<familyName>\s*(\w+)\s*</familyName>}is) {
                push @fname, "$1 $2";
            }
        }

        # Only the XML names that contain the requested name are of interest.
        my @wanted = grep { $_ =~ $name_re } @fname;

        foreach my $f1 (@files1) {
            foreach my $full_name (@wanted) {
                foreach my $text (@{ $hfull_in{$f1} }) {
                    # Plain match, no /g: /g in an if-test would leave pos()
                    # set on $text and make a later test of the same text fail.
                    # The capture returns the spelling used in the HTML.
                    if (my ($found) = $text =~ /($name_re)/) {
                        print $PatternName . "\n";
                        print {$ref} "Matched ${full_name}(${f})->${found}(${f1})\n";
                    }
                }
            }
        }
    }

    close($ref) or die "Cannot close file: Ref.txt - $!";

Program Execution :

perl filename.pl "Kazumitsu Sugiura"

Output:

Matched Kazumitsu Sugiura(sample.xml)->Kazumitsu SUGIURA(test.html)
Matched Kazumitsu Sugiura(test.xml)->Kazumitsu SUGIURA(test.html)

How to compare the data of two files (.xml and .html) using perl(regex)?

Question

3 answers

solution1
0 2014-09-23 10:46:59

solution2
0 2014-09-25 11:26:50

solution3
0 ACCEPTED 2014-09-29 16:38:32

How to compare the data of two files (.xml and .html) using perl(regex)?

Question

3 answers

solution1 0 2014-09-23 10:46:59

solution2 0 2014-09-25 11:26:50

solution3 0 ACCEPTED 2014-09-29 16:38:32

solution1
0 2014-09-23 10:46:59

solution2
0 2014-09-25 11:26:50

solution3
0 ACCEPTED 2014-09-29 16:38:32