#!/usr/local/bin/perl5.8.0

# WebGenDB Utility
#
# Escapes the EAD tags that appear as text inside BatchLoad tags by replacing
# their angle brackets with the entities &lt; and &gt;, e.g.
#     First draft of <title>JavaScript</title>
# becomes
#     First draft of &lt;title&gt;JavaScript&lt;/title&gt;
#
# Usage:
#     replaceTags.pl -f <input file> -o <output file>
#     replaceTags.pl -h
#
# The whole input file is read into memory, transformed, and written to the
# output file (which is overwritten if it already exists).

use strict;
use warnings;
use Getopt::Long;

my $USAGE = "usage: replaceTags.pl -f <input file> -o <output file>\n";

# Fewer than two command-line words can never carry both -f and -o.
if (@ARGV < 2) {
    print $USAGE;
    exit;
}

my ($input, $output, $help);
GetOptions('f=s' => \$input, 'o=s' => \$output, 'h' => \$help);

if (defined $help) {
    print $USAGE;
    exit;
}
die $USAGE unless defined $input && defined $output;

# Slurp the whole file.  The record separator must be undef for this; it is
# localised so that the change cannot leak into any other read.
open(my $in_fh, '<', $input)
    or die "replaceTags.pl: cannot open '$input' for reading: $!\n";
my $inFile = do { local $/; <$in_fh> };
close($in_fh)
    or die "replaceTags.pl: error while reading '$input': $!\n";

$inFile = escape_ead_tags($inFile);

open(my $out_fh, '>', $output)
    or die "replaceTags.pl: cannot open '$output' for writing: $!\n";
print {$out_fh} $inFile
    or die "replaceTags.pl: cannot write to '$output': $!\n";
# Buffered write errors only surface at close time, so check it as well.
close($out_fh)
    or die "replaceTags.pl: cannot close '$output': $!\n";

# escape_ead_tags($text)
#
# Returns a copy of $text in which the EAD tags have their angle brackets
# replaced with entities, all whitespace runs are collapsed to one space, and
# a newline follows every closing tag that is still a real tag.
#
# Assumptions:
#   * EAD tag names are always entirely lower case;
#   * BatchLoad tags always start with, or contain, a capital letter,
#     so they are never matched by the patterns below.
sub escape_ead_tags {
    my ($text) = @_;

    # Opening, closing and self-closing tags made only of lower-case letters,
    # digits and attribute punctuation.  This covers attribute values such as
    #   <note show="new" id="fn1">
    #   <archref show="embed" actuate="onload"
    #            href="http://www.oac.cdlib.org/findaid/ark:/13030/kt200014h4">
    $text =~ s/<(\/?[a-z=_"\s\d\@:.\/]*\/?)>/&lt;$1&gt;/g;

    # Lower-case tags whose attribute values contain capital letters, e.g.
    #   <note label="General Note">
    $text =~ s/<([a-z]+ [a-zA-Z:="\s]*)>/&lt;$1&gt;/g;

    # Remove the line breaks introduced by the EAD and collapse every run of
    # whitespace (newlines included) into a single space.
    $text =~ s/\s+/ /g;

    # Put a newline after every remaining closing tag, for readability and to
    # avoid the "String too long" exception on very long lines.  Escaped EAD
    # closing tags now start with "&lt;" and are deliberately left alone: the
    # EAD content does not need extra line breaks.
    $text =~ s/(<\/.*?>)/$1\n/g;

    return $text;
}