#!/usr/bin/perl
# html2xhtml.pl
#####################################################################
#
# Converts a compliant HTML file to be (almost) XHTML 1.0 Compliant
# Original file must be HTML-compliant with no < or > characters in the CTEXT
# (convert these to < and >).
# Creates an optional check file that should be identical to the original
# (check afterwards by running the command FC oldfile checkfile)
# Does not check for missing
or tags.
#
# Written by David Ireland of DI Management Services Pty Ltd
#
# First published 24 August 2004.
#
# Copyright (c) 2004 by DI Management Services Pty Ltd
# This program is free software; you can redistribute it
# and/or modify it under the same terms as Perl itself.
#
#####################################################################
use strict;
my (@tags);
my ($rem, $content, $tag);
my ($output);
my $intag = 0;
my $filename;
my $outfile;
my $chkfile;
my $GET_TAG = '<(?:"[^"]*"|\'[^\']*\'|[^\'">])*>'; #NB quoted apostrophes
my $MAKE_CHECK = 0;
my $DEBUG = 0;
# Change "1.0 Transational" to "1.0 Strict" and "transitional.dtd" to "strict.dtd"
# if your original file is compliant with HTML 4 Strict
# Change to Frames if you use frames.
my $doctype = '';
my $newhtml = '';
# List of empty elements
my @empty_tags =
('area', 'basefont', 'br', 'col', 'frame', 'hr', 'img', 'input', 'isindex', 'link', 'meta', 'param');
# parse command line
if ($#ARGV < 1)
{
print STDERR "Usage: html2xhtml infile outfile [checkfile]\n";
exit;
}
$filename = $ARGV[0];
$outfile = $ARGV[1];
if ($#ARGV >= 2)
{
$chkfile = $ARGV[2];
$MAKE_CHECK = 1;
open(CHK, ">$chkfile") or die "Can't create file $chkfile, $!";
}
# Read input file completely to scalar variable
open(DATA, "<$filename") or die "Can't open file $filename, $!";
undef $/; # full-slurp mode
$_ = ;
close(DATA);
#print "$_\n";
##@tags = /($GET_TAG)/g;
##foreach (@tags)
##{
## print "[$_]\n";
##}
open(OUT, ">$outfile") or die "Can't create file $outfile, $!";
$rem = $_;
$intag = 1 if (substr($rem, 0, 1) eq '<');
while ($rem)
{
# Alternate between reading in tags and content
if ($intag)
{
# read next tag
($tag, $rem) = ($rem =~ /($GET_TAG)(.*)/s);
if ($tag)
{
print "-[$tag]\n" if ($DEBUG);
print CHK $tag if ($MAKE_CHECK);
# output XHTML-transformed tag
$tag = &x_trans($tag);
$output .= $tag;
print "+[$tag]\n" if ($DEBUG);
}
$intag = !$intag;
}
else
{
# read content up to next tag
($content, $rem) = ($rem =~ /(.*?)(?=<)(.*)/s);
if ($content)
{
print CHK $content if ($MAKE_CHECK);
# output unadulterated content
$output .= $content;
# debug output with ~ for NL
$content =~ tr/\n/~/;
#print "{$content}\n";
}
$intag = !$intag;
}
}
# dump output in one go
print OUT $output;
close(OUT);
close(CHK) if ($MAKE_CHECK);
sub x_trans()
# Convert tag to xhtml-compliant form, as well as we can
{
my $tag = shift;
my ($tagname, $attribs, $newattribs);
my ($attrname, $value, $rem);
# Check for !DOCTYPE or or comments
if ($tag =~ /^$/);
# Convert tagname to lower case
$tagname = lc($tagname);
# Split attributes into names and values
$newattribs = "";
$rem = $attribs;
while ($rem)
{
$_ = $rem;
if (/(\w+)\s*=/) # we have attribute name=value
{
($attrname, $value, $rem) =
($rem =~ m{
(\w+)\s* #$attrname
=\s* # '='
( # $value is one of...
"[^"]*" # "double-quoted"
| # or
'[^']*' # 'single-quoted'
| # or
\S+ # atom
)
(.*) # keep remainder for next time
}sx);
if ($attrname && defined($value))
{
# strip any surrounding quotes first
$value =~ s/^"//;
$value =~ s/^'//;
$value =~ s/"$//;
$value =~ s/'$//;
$newattribs .= lc($attrname) . "=\"$value\" ";
}
elsif ($attrname)
{
$newattribs .= lc($attrname) . " ";
}
}
else
{ # we don't have a name=value pair, so we must have an empty value like `disabled',
# or maybe a closing '/' which we ignore here and add later if we think we should
($attrname, $rem) = ($rem =~ /(\w+)\s*(.*)/);
if ($attrname)
{
# Found an `empty value', so make xxx="xxx"
$attrname = lc($attrname);
$newattribs .= "$attrname=\"$attrname\" ";
}
}
}
# Add a fragment identifier if missing
$_ = $newattribs;
if ( (/\bname="/) && !(/\s+id="/) )
{
s/\bname="([^"]*)"/name="$1" id="$1"/;
$newattribs = $_;
}
# Reassemble tag
if ($newattribs)
{
$newattribs =~ s/\s+$//g;
$tag = "<$tagname $newattribs>";
}
else
{
$tag = "<$tagname>";
}
# Add closing '/' to known empty tags
foreach (@empty_tags)
{
if ($tagname eq $_)
{
$tag =~ s#>$# />#;
last;
}
}
return $tag;
}