I tried using a regex inside a foreach loop over the .xml files, as there are many .xml files and only one .html file. I opened, read and closed the directory. But when it comes to searching for a particular pattern in both files, the code doesn't enter the while/if block.
xml data: #pattern in xml format
<gname>abc</gname>
<pname>xyz</pname>
html data: #pattern in html format
<p>ABC</p>
<p><i>xyz</i></p>
Here, I need to match abc and xyz in both the xml and html files (case sensitive).
# NOTE(review): two-argument open with the literal string "<F2>" -- this tries
# to read a file named "F2>", not an .html file; presumably a placeholder for
# the real file name. TODO confirm.
open( F2, "<F2>" );
# NOTE(review): this is an ordinary double-quoted string, not a regex or a
# glob, so nothing is matched against any file name here; inside double
# quotes "\." also collapses to a plain ".".
my $xml_list1 = "(.*)\.html";
Here the data enclosed inside the parentheses also appears when printed. Say the file name is abc.html: I want to keep "abc" interchangeable, so that I don't need to modify the code if a file name other than abc.html occurs.
close F2;
#print $xml_list1."\n";
# For every entry in @filenames: read a file, derive the matching ".xml" name
# and run a pattern match against it.
foreach my $f (@filenames) {
#print $f."\n";
# NOTE(review): "<F1>" is a literal string, so this opens a file named "F1>"
# rather than $f; presumably a placeholder. TODO confirm.
open( F1, "<F1>" );
# Slurp everything readable from F1 into one string.
my $data = join( "", <F1> );
close F1;
# Everything before the first "." in $f, with ".xml" appended.
my $filename = substr( $f, 0, index( $f, '.' ) );
my $xml_list = $filename . ".xml";
# NOTE(review): $xml_list holds the file *name* built above, not the file's
# contents ($data is never searched), and the pattern between the slashes is
# empty, so this loop cannot find any XML element.
while ( $xml_list =~ m//ig ) {
...;
}
}
The code doesn't enter the while/if block; it seems that something goes wrong when reading the file name $xml_list.
I want to match both the data without the use of parsers.
Can someone please help me out?
UPDATE: CODE:
#!/usr/bin/perl
use strict;
use Cwd;
use File::Copy;
use File::Basename;
# Stage every .xml/.html file from ./Input in the current working directory,
# then scan the staged files with regexes and write hits to Ref.txt.
my $path1=getcwd;
opendir(INP, "$path1\/Input");
# NOTE(review): the leading "." in the next two patterns is unescaped, so it
# matches any character rather than a literal dot.
my @out = grep(/.(xml)$/,readdir(INP));
# NOTE(review): the readdir above ran in list context and consumed every
# directory entry, so this second readdir returns nothing and @out1 is
# always empty.
my @out1 = grep(/.(html)$/,readdir(INP));
# NOTE(review): INP was opened with opendir; closedir is the matching call.
close INP;
# Copy each XML input into the current working directory under the same name.
foreach my $final(@out)
{
my $filetobecopied = "Input\/".$final;
my $newfile = $final;
copy($filetobecopied, $newfile) or die "File cannot be copied.";
}
# Same copy step for the HTML inputs.
foreach my $final1(@out1)
{
my $filetobecopied1 = "Input\/".$final1;
my $newfile1 = $final1;
copy($filetobecopied1, $newfile1) or die "File cannot be copied.";
}
# List the working directory to find the staged copies.
opendir DIR, $path1 or die "cant open dir";
my @files = grep /(.*?)\.(xml)$/,(readdir DIR);
# NOTE(review): second readdir on an already exhausted handle again, so
# @files1 is always empty.
my @files1 = grep /(.*?)\.(html)$/,(readdir DIR);
closedir DIR;
# Report file, truncated on every run; the result of open is not checked.
open(F6, ">Ref.txt");
print F6 "FileName\tError Instance\tOutput\n";
# open(F2,"<F2>");
# my $xml_list1="abc.html";
# my $data1=join("",<F1>);
# my $xml_list2=$xml_list1;
foreach my $f(@files)
{
# Slurp the whole XML file into one string.
# NOTE(review): the die message names $files[0], not the file being opened ($f).
open(F1, "<$f") or die "Cannot open file: $files[0]";
my $data=join("", <F1>);
close F1;
my $xml_list=$data;
#print "$f\n";
# NOTE(review): this reopens the same XML file $f, so $data1/$xml_list1 hold
# XML text, not HTML; $xml_listt ("abc.html") is assigned but never used.
open(F2, "<$f") or die "Cannot open file: $files[0]";
my $xml_listt="abc.html";
my $data1=join("", <F2>);
my $xml_list1=$data1;
print $xml_list1."\n";
# Walk every <personName>...</personName> element in the XML text.
while($xml_list=~m/(<personName>(.*?)<\/personName>)/isg)
{
my $full=$1;
my $name=$2;
#print F6 $f."\t".$full."\n";
# Pull the given name ($2) and the family name ($4) out of the element.
if($full=~m/(<givenNames>(\w+)<\/givenNames>(\n)?<familyName>(\w+)<\/familyName>(\n)?(.*?))/isg)
{
my $fg=$1;
my $gname=$2;
my $fname=$4;
#print F6 $f."\t".$gname."\t".$fname."\n";
}
}
# NOTE(review): "While" is capitalised; Perl keywords are case-sensitive, so
# this line is not parsed as a while loop.
While($xml_list1=~m/(<p><FONT FACE="(.*?)" SIZE="(\d+)"><I>(.*?)<\/I><\/FONT><\/p>)/igs)
{
my $hfull=$1;
print F6 $f."\n"; #.$hfull."\n";
}
close F2;
# NOTE(review): F1 was already closed right after it was read.
close F1;
}
# Remove every .xml file that was listed from the working directory.
foreach my $del(@files)
{
unlink $del;
}
@flora: I have modified your program and also i have put the optimized solution for your program.
Modified Code :(Modification to your program)
#!/usr/bin/perl
use strict;
use Cwd;
use File::Copy;
use File::Basename;
# Stage the .xml/.html inputs from ./Input in the working directory, scan each
# staged XML file together with Input/abc.html, and log hits to Ref.txt.
my $path1 = getcwd;

# Read the Input listing ONCE. readdir in list context drains the handle, so
# grepping a second readdir on the same handle always yields an empty list
# (which is why the HTML files were never picked up before).
opendir(my $inp, "$path1/Input") or die "cant open dir";
my @input_entries = readdir $inp;
closedir $inp;    # directory handles are released with closedir, not close
my @out  = grep { /\.(xml)$/ } @input_entries;
my @out1 = grep { /\.(html)$/ } @input_entries;

# Copy every input into the working directory under the same name.
foreach my $final (@out, @out1) {
    copy("Input/$final", $final) or die "File cannot be copied.";
}

# The XML files to process are whatever .xml files now sit in the working
# directory.
opendir(my $dir, $path1) or die "cant open dir";
my @files = grep { /\.(xml)$/ } readdir $dir;
closedir $dir;

# Report file, truncated on every run.
open(my $report, '>', 'Ref.txt') or die "Cannot open file: Ref.txt - $!";
print {$report} "FileName\tError Instance\tOutput\n";

foreach my $f (@files) {
    # Slurp the XML file into one string.
    open(my $xml_fh, '<', $f) or die "Cannot open file: $f";
    my $xml_list = join("", <$xml_fh>);
    close $xml_fh;

    # Slurp the reference HTML file and echo it to STDOUT.
    open(my $html_fh, '<', "$path1/Input/abc.html")
        or die "Cannot open file: abc.html - $!";
    my $xml_list1 = join("", <$html_fh>);
    close $html_fh;
    print $xml_list1."\n";

    # Walk every <personName> element and split it into given/family name.
    # The names are currently only extracted, not reported.
    while ($xml_list =~ m/(<personName>.*?<\/personName>)/isg) {
        my $full = $1;
        if ($full =~ m/<givenNames>\s*(\w+)\s*<\/givenNames>\s*<familyName>\s*(\w+)\s*<\/familyName>/is) {
            my ($gname, $fname) = ($1, $2);
            #print {$report} $f."\t".$gname."\t".$fname."\n";
        }
    }

    # One report line (the XML file name) per italic <FONT> paragraph found
    # in the HTML text.
    while ($xml_list1 =~ m/<p><FONT FACE=".*?" SIZE="\d+"><I>.*?<\/I><\/FONT><\/p>/isg) {
        print {$report} $f."\n";
    }
}

# Buffered write errors only surface when the handle is closed.
close $report or die "Cannot close file: Ref.txt - $!";

# Remove the .xml files from the working directory again.
unlink @files;
Optimized solution:
#!/usr/bin/perl
use strict;
use warnings;
# Scan every .xml file in the working directory together with abc.html and
# log hits to Ref.txt.
my @files = grep {-f} glob("*.xml");

# Report file, truncated on every run.
open(my $report, '>', 'Ref.txt') or die "Cannot open file: Ref.txt - $!";
print {$report} "FileName\tError Instance\tOutput\n";

foreach my $f (@files) {
    # Slurp the XML file ($/ is undefined only inside the do block).
    my $xml_list = do {
        local $/ = undef;
        open my $fh, '<', $f or die "Cannot open file: $f - $!";
        <$fh>;
    };

    # Slurp the reference HTML file. The error message names the file that
    # actually failed to open (abc.html), not the XML file being processed.
    my $xml_list1 = do {
        local $/ = undef;
        open my $fh, '<', 'abc.html' or die "Cannot open file: abc.html - $!";
        <$fh>;
    };
    print $xml_list1."\n";

    # Walk every <personName> element and split it into given/family name.
    # The names are currently only extracted, not reported.
    while ($xml_list =~ m/(<personName>.*?<\/personName>)/isg) {
        my $full = $1;
        if ($full =~ m/<givenNames>\s*(\w+)\s*<\/givenNames>\s*<familyName>\s*(\w+)\s*<\/familyName>/is) {
            my ($gname, $fname) = ($1, $2);
            #print {$report} $f."\t".$gname."\t".$fname."\n";
        }
    }

    # One report line (the XML file name) per italic <FONT> paragraph found
    # in the HTML text.
    while ($xml_list1 =~ m/<p><FONT FACE=".*?"\s+SIZE="\d+"><I>.*?<\/I><\/FONT><\/p>/isg) {
        print {$report} $f . "\n";
    }
}

# Buffered write errors only surface when the handle is closed.
close $report or die "Cannot close file: Ref.txt - $!";

# NOTE(review): this version has no copy step, so the files removed here are
# the .xml files that were found in the working directory themselves.
# Confirm that deleting them is intended.
foreach my $del (@files) {
    unlink $del;
}
@flora : Try this code . I was busy with other work so couldn't reply soon. Now this code will generate following output :
Input File:(sample.xml)
<creators>
<creator affiliationRef="#01" creatorRole="author" xml:id="01">
<personName><givenNames>Kazumitsu</givenNames><familyName>Sugiura</familyName></personName>
</creator>
<creator affiliationRef="#01" creatorRole="author" xml:id="02">
<personName><givenNames>Yoshinao</givenNames><familyName>Muro</familyName></personName>
</creator>
<creator affiliationRef="#01" creatorRole="author" xml:id="03">
<personName><givenNames>Masashi</givenNames><familyName>Akiyama</familyName></personName>
</creator>
</creators>
InputFile(abc.html):
<P><FONT FACE="hello" SIZE="14"><I>Kazumitsu SUGIURA, Yoshinao Muro, and Masashi Akiyama</I></FONT></P>
Output:
FileName MatchedString Output
Matched Sugiura(sample.xml)->SUGIURA(abc.html)
Matched Muro(sample.xml)->Muro(abc.html)
Matched Akiyama(sample.xml)->Akiyama(abc.html)
Code:
#!/usr/bin/perl
use strict;
use warnings;
use Cwd;
use File::Copy;
use File::Basename;
# Stage the .xml/.html inputs from ./Input in the working directory, then
# report every family name from the XML files that also occurs
# (case-insensitively) in the italic text of the HTML files. Results are
# written to Ref.txt.
my $path1 = getcwd;
#print $path1;

# One pass over Input/ is enough: keep the listing and filter it twice.
opendir(my $inp, "$path1/Input") or die "cant open dir";
my @input_entries = readdir $inp;
closedir $inp;
my @out  = grep { /\.(xml)$/ } @input_entries;
my @out1 = grep { /\.(html)$/ } @input_entries;

# Copy every input into the working directory under the same name.
foreach my $final (@out, @out1) {
    copy("Input/$final", $final) or die "File cannot be copied.";
}

# The files to process are the .xml/.html files now present in the working
# directory.
opendir(my $dir, $path1) or die "cant open dir";
my @cwd_entries = readdir $dir;
closedir $dir;
my @files  = grep { /\.(xml)$/ } @cwd_entries;
my @files1 = grep { /\.(html)$/ } @cwd_entries;
#print @files1;

# Report file, truncated on every run.
open(my $report, '>', 'Ref.txt') or die "Cannot open file: Ref.txt - $!";
print {$report} "FileName\tMatchedString\tOutput\n";

foreach my $f (@files) {
    # The content is read from Input/; the working-directory listing only
    # supplies the file name.
    open(my $xml_fh, '<', "$path1/Input/$f") or die "Cannot open file: $f - $!";
    my $xml_list = join("", <$xml_fh>);
    close $xml_fh;

    # The family names depend only on the XML file, so collect them once per
    # XML file instead of once per HTML file.
    my @fname;
    while ($xml_list =~ m/(<personName>.*?<\/personName>)/isg) {
        my $full = $1;
        if ($full =~ m/<givenNames>\s*\w+\s*<\/givenNames>\s*<familyName>\s*(\w+)\s*<\/familyName>/is) {
            push @fname, $1;
        }
    }

    foreach my $f1 (@files1) {
        print $f1 . "\n";
        open(my $html_fh, '<', "$path1/Input/$f1") or die "Cannot open file: $f1 - $!";
        my $xml_list1 = join("", <$html_fh>);
        close $html_fh;

        # Text of every italic <FONT> paragraph in the HTML file.
        my @hfull;
        while ($xml_list1 =~ m/<p><FONT FACE=".*?" SIZE="\d+"><I>(.*?)<\/I><\/FONT><\/p>/isg) {
            push @hfull, $1;
        }

        foreach my $name (@fname) {
            foreach my $text (@hfull) {
                # No /g here on purpose: in scalar context /g remembers the
                # match position on $text (an alias of the @hfull element),
                # so the search for the NEXT name would start after the
                # previous hit and miss names that occur earlier in the text.
                # \Q...\E keeps the name literal; the capture is the text as
                # spelled in the HTML.
                if ($text =~ m/(\Q$name\E)/is) {
                    my $line = $1;
                    print {$report} "Matched $name($f)\->$line($f1)\n";
                }
            }
        }
    }
}

# Buffered write errors only surface when the handle is closed.
close $report or die "Cannot close file: Ref.txt - $!";

# Remove the .xml files from the working directory again.
unlink @files;
Optimized code:
#!/usr/bin/perl
use strict;
use warnings;
use Cwd;
use File::Copy;
use File::Basename;
# Report every family name from the .xml files in the working directory that
# also occurs (case-insensitively) in the italic text of the .html files in
# the same directory. Results are written to Ref.txt. The files are processed
# in place: nothing is copied and nothing is deleted.
my $path1 = getcwd;
#print $path1;

# One pass over the directory is enough: keep the listing and filter it twice.
opendir(my $dir, $path1) or die "cant open dir";
my @cwd_entries = readdir $dir;
closedir $dir;
my @files  = grep { /\.(xml)$/ } @cwd_entries;
my @files1 = grep { /\.(html)$/ } @cwd_entries;
#print @files1;

# Report file, truncated on every run.
open(my $report, '>', 'Ref.txt') or die "Cannot open file: Ref.txt - $!";
print {$report} "FileName\tMatchedString\tOutput\n";

foreach my $f (@files) {
    open(my $xml_fh, '<', "$path1/$f") or die "Cannot open file: $f - $!";
    my $xml_list = join("", <$xml_fh>);
    close $xml_fh;

    # The family names depend only on the XML file, so collect them once per
    # XML file instead of once per HTML file.
    my @fname;
    while ($xml_list =~ m/(<personName>.*?<\/personName>)/isg) {
        my $full = $1;
        if ($full =~ m/<givenNames>\s*\w+\s*<\/givenNames>\s*<familyName>\s*(\w+)\s*<\/familyName>/is) {
            push @fname, $1;
        }
    }

    foreach my $f1 (@files1) {
        print $f1 . "\n";
        open(my $html_fh, '<', "$path1/$f1") or die "Cannot open file: $f1 - $!";
        my $xml_list1 = join("", <$html_fh>);
        close $html_fh;

        # Text of every italic <FONT> paragraph in the HTML file.
        my @hfull;
        while ($xml_list1 =~ m/<p><FONT FACE=".*?" SIZE="\d+"><I>(.*?)<\/I><\/FONT><\/p>/isg) {
            push @hfull, $1;
        }

        foreach my $name (@fname) {
            foreach my $text (@hfull) {
                # No /g here on purpose: in scalar context /g remembers the
                # match position on $text (an alias of the @hfull element),
                # so the search for the NEXT name would start after the
                # previous hit and miss names that occur earlier in the text.
                # \Q...\E keeps the name literal; the capture is the text as
                # spelled in the HTML.
                if ($text =~ m/(\Q$name\E)/is) {
                    my $line = $1;
                    print {$report} "Matched $name($f)\->$line($f1)\n";
                }
            }
        }
    }
}

# Buffered write errors only surface when the handle is closed.
close $report or die "Cannot close file: Ref.txt - $!";
@flora : This is the final solution I am posting. Here I have combined <givenNames> with <familyName> from the xml files and matched the result against the data in the html. The program now takes a single argument: the one pattern that I want to check against the xml and html files. For example, I have passed the argument "Kazumitsu Sugiura" to the program; the program takes the values of <givenNames> and <familyName> from the xml and combines them as "Kazumitsu Sugiura". This pattern is then looked up in the html file, and if it matches, the file names are displayed as shown below:
InputFile: (sample.xml)
<creators>
<creator affiliationRef="#01" creatorRole="author" xml:id="01">
<personName><givenNames>Kazumitsu</givenNames><familyName>Sugiura</familyName></personName>
</creator>
<creator affiliationRef="#01" creatorRole="author" xml:id="02">
<personName><givenNames>Yoshinao</givenNames><familyName>Muro</familyName></personName>
</creator>
<creator affiliationRef="#01" creatorRole="author" xml:id="03">
<personName><givenNames>Masashi</givenNames><familyName>Akiyama</familyName></personName>
</creator>
</creators>
InputFile: (test.xml)
<creators>
<creator affiliationRef="#01" creatorRole="author" xml:id="01">
<personName><givenNames>Kazumitsu</givenNames><familyName>Sugiura</familyName></personName>
</creator>
</creators>
InputFile: (test.html)
<P><FONT FACE="hello" SIZE="14"><I>Kazumitsu SUGIURA, Yoshinao Muro, and Masashi Akiyama</I></FONT></P>
Code:
#!/usr/bin/perl
use strict;
use warnings;
use Cwd;
use File::Copy;
use File::Basename;
# Usage: perl filename.pl "Given Family"
#
# Combine <givenNames> and <familyName> of every <personName> in the .xml
# files of the working directory into "Given Family", keep the names that
# contain the pattern given on the command line, and report where that
# pattern also occurs (case-insensitively) in the italic text of the .html
# files. Results are written to Ref.txt.
my $path1 = getcwd;
my $PatternName = $ARGV[0];

# An empty pattern would not mean "match nothing": Perl treats an empty
# regex as "reuse the last successful pattern". Refuse to run without one.
die "Usage: $0 \"Given Family\"\n"
    unless defined $PatternName && length $PatternName;

# The argument is treated as literal text, not as a regular expression.
my $pattern_re = qr/\Q$PatternName\E/i;

# One pass over the directory is enough: keep the listing and filter it twice.
opendir(my $dir, $path1) or die "cant open dir";
my @cwd_entries = readdir $dir;
closedir $dir;
my @files  = grep { /\.(xml)$/ } @cwd_entries;
my @files1 = grep { /\.(html)$/ } @cwd_entries;
#print @files1;

# Report file, truncated on every run.
open(my $report, '>', 'Ref.txt') or die "Cannot open file: Ref.txt - $!";
print {$report} "FileName\tMatchedString\tOutput\n";

foreach my $f (@files) {
    open(my $xml_fh, '<', "$path1/$f") or die "Cannot open file: $f - $!";
    my $xml_list = join("", <$xml_fh>);
    close $xml_fh;

    # "Given Family" for every <personName>; this depends only on the XML
    # file, so it is built once per XML file.
    my @fname;
    while ($xml_list =~ m/(<personName>.*?<\/personName>)/isg) {
        my $full = $1;
        if ($full =~ m/<givenNames>\s*(\w+)\s*<\/givenNames>\s*<familyName>\s*(\w+)\s*<\/familyName>/is) {
            push @fname, "$1 $2";
        }
    }

    foreach my $f1 (@files1) {
        #print $f1 . "\n";
        open(my $html_fh, '<', "$path1/$f1") or die "Cannot open file: $f1 - $!";
        my $xml_list1 = join("", <$html_fh>);
        close $html_fh;

        # Text of every italic <FONT> paragraph in the HTML file.
        my @hfull;
        while ($xml_list1 =~ m/<p><FONT FACE=".*?" SIZE="\d+"><I>(.*?)<\/I><\/FONT><\/p>/isg) {
            push @hfull, $1;
        }

        foreach my $name (@fname) {
            next unless $name =~ $pattern_re;
            foreach my $text (@hfull) {
                # No /g here on purpose: in scalar context /g remembers the
                # match position on $text (an alias of the @hfull element),
                # so a second XML name matching the pattern would resume
                # after the previous hit and could fail to match again.
                # The capture is the text as spelled in the HTML.
                if ($text =~ m/($pattern_re)/) {
                    my $line = $1;
                    print $PatternName . "\n";
                    print {$report} "Matched $name($f)\->$line($f1)\n";
                }
            }
        }
    }
}

# Buffered write errors only surface when the handle is closed.
close $report or die "Cannot close file: Ref.txt - $!";
Program Execution :
perl filename.pl "Kazumitsu Sugiura"
Output:
Matched Kazumitsu Sugiura(sample.xml)->Kazumitsu SUGIURA(test.html)
Matched Kazumitsu Sugiura(test.xml)->Kazumitsu SUGIURA(test.html)
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.