#!/usr/local/bin/perl5.8.0
# WebGenDB Utility
# This script replaces the angle brackets of tags that appear as text inside a
# BatchLoad tag with the corresponding entities (&lt; and &gt;),
# e.g.
#   First draft of <title>JavaScript</title>
# becomes
#   First draft of &lt;title&gt;JavaScript&lt;/title&gt;
use Getopt::Long;
use strict;
use warnings;

# Usage text shown for -h and for missing or invalid arguments.
my $usage = "usage: replaceTags.pl -f <input> -o <output>\n";

# Both -f and -o take a value, so a valid call has at least two arguments.
if (@ARGV < 2) {
    print $usage;
    exit;
}

my ($input, $output, $help);
GetOptions(
    'f=s' => \$input,
    'o=s' => \$output,
    'h'   => \$help,
) or die $usage;

if (defined $help) {
    print $usage;
    exit;
}
die $usage unless defined $input && defined $output;

# Read the whole input file into one string.  $/ is undefined locally so the
# read runs to end of file and the global record separator is left untouched.
my $inFile = do {
    open(my $in, '<', $input)
        or die "Cannot open input file '$input': $!\n";
    local $/;
    my $content = <$in>;
    close($in);
    $content;
};
$inFile = '' unless defined $inFile;

# Assumption: all the EAD tags are always entirely lowercase, and all
# BatchLoad tags either start with or contain a capital letter.
#
# Replace the brackets of every tag made up only of lowercase letters, digits,
# whitespace and the characters = _ " @ : . / with entities.  This covers
# opening tags, closing tags, tags with lowercase attributes and
# self-closing tags.
$inFile =~ s/<(\/?[a-z=_"\s\d\@:.\/]*\/?)>/&lt;$1&gt;/g;

# Also escape tags whose name is lowercase but whose attributes contain
# capital letters; the previous pattern skips those.
$inFile =~ s/<([a-z]+ [a-zA-Z:="\s]*)>/&lt;$1&gt;/g;

# Collapse every run of whitespace to a single space.  This also removes the
# newlines that came in with the EAD content.
$inFile =~ s/\s+/ /g;

# Put a newline after every closing tag that still has literal brackets, for
# readability and because of the "String too long" exception.
# A similar newline insertion for the EAD tags is intentionally not done,
# because the EAD content does not need extra newlines.
$inFile =~ s/(<\/.*?>)/$1\n/g;

# Write the result, reporting any failure to open, write or flush the file.
open(my $out, '>', $output)
    or die "Cannot open output file '$output': $!\n";
print {$out} $inFile
    or die "Cannot write to output file '$output': $!\n";
close($out)
    or die "Cannot close output file '$output': $!\n";