#!/usr/bin/perl # html2xhtml.pl ##################################################################### # # Converts a compliant HTML file to be (almost) XHTML 1.0 Compliant # Original file must be HTML-compliant with no < or > characters in the CTEXT # (convert these to < and >). # Creates an optional check file that should be identical to the original # (check afterwards by running the command FC oldfile checkfile) # Does not check for missing

or tags. # # Written by David Ireland of DI Management Services Pty Ltd # # First published 24 August 2004. # # Copyright (c) 2004 by DI Management Services Pty Ltd # This program is free software; you can redistribute it # and/or modify it under the same terms as Perl itself. # ##################################################################### use strict; my (@tags); my ($rem, $content, $tag); my ($output); my $intag = 0; my $filename; my $outfile; my $chkfile; my $GET_TAG = '<(?:"[^"]*"|\'[^\']*\'|[^\'">])*>'; #NB quoted apostrophes my $MAKE_CHECK = 0; my $DEBUG = 0; # Change "1.0 Transational" to "1.0 Strict" and "transitional.dtd" to "strict.dtd" # if your original file is compliant with HTML 4 Strict # Change to Frames if you use frames. my $doctype = ''; my $newhtml = ''; # List of empty elements my @empty_tags = ('area', 'basefont', 'br', 'col', 'frame', 'hr', 'img', 'input', 'isindex', 'link', 'meta', 'param'); # parse command line if ($#ARGV < 1) { print STDERR "Usage: html2xhtml infile outfile [checkfile]\n"; exit; } $filename = $ARGV[0]; $outfile = $ARGV[1]; if ($#ARGV >= 2) { $chkfile = $ARGV[2]; $MAKE_CHECK = 1; open(CHK, ">$chkfile") or die "Can't create file $chkfile, $!"; } # Read input file completely to scalar variable open(DATA, "<$filename") or die "Can't open file $filename, $!"; undef $/; # full-slurp mode $_ = ; close(DATA); #print "$_\n"; ##@tags = /($GET_TAG)/g; ##foreach (@tags) ##{ ## print "[$_]\n"; ##} open(OUT, ">$outfile") or die "Can't create file $outfile, $!"; $rem = $_; $intag = 1 if (substr($rem, 0, 1) eq '<'); while ($rem) { # Alternate between reading in tags and content if ($intag) { # read next tag ($tag, $rem) = ($rem =~ /($GET_TAG)(.*)/s); if ($tag) { print "-[$tag]\n" if ($DEBUG); print CHK $tag if ($MAKE_CHECK); # output XHTML-transformed tag $tag = &x_trans($tag); $output .= $tag; print "+[$tag]\n" if ($DEBUG); } $intag = !$intag; } else { # read content up to next tag ($content, $rem) = ($rem =~ /(.*?)(?=<)(.*)/s); if ($content) { print CHK $content if ($MAKE_CHECK); # output unadulterated content $output .= $content; # debug output with ~ for NL $content =~ tr/\n/~/; #print "{$content}\n"; } $intag = !$intag; } } # dump output in one go print OUT $output; close(OUT); close(CHK) if ($MAKE_CHECK); sub x_trans() # Convert tag to xhtml-compliant form, as well as we can { my $tag = shift; my ($tagname, $attribs, $newattribs); my ($attrname, $value, $rem); # Check for !DOCTYPE or or comments if ($tag =~ /^$/); # Convert tagname to lower case $tagname = lc($tagname); # Split attributes into names and values $newattribs = ""; $rem = $attribs; while ($rem) { $_ = $rem; if (/(\w+)\s*=/) # we have attribute name=value { ($attrname, $value, $rem) = ($rem =~ m{ (\w+)\s* #$attrname =\s* # '=' ( # $value is one of... "[^"]*" # "double-quoted" | # or '[^']*' # 'single-quoted' | # or \S+ # atom ) (.*) # keep remainder for next time }sx); if ($attrname && defined($value)) { # strip any surrounding quotes first $value =~ s/^"//; $value =~ s/^'//; $value =~ s/"$//; $value =~ s/'$//; $newattribs .= lc($attrname) . "=\"$value\" "; } elsif ($attrname) { $newattribs .= lc($attrname) . " "; } } else { # we don't have a name=value pair, so we must have an empty value like `disabled', # or maybe a closing '/' which we ignore here and add later if we think we should ($attrname, $rem) = ($rem =~ /(\w+)\s*(.*)/); if ($attrname) { # Found an `empty value', so make xxx="xxx" $attrname = lc($attrname); $newattribs .= "$attrname=\"$attrname\" "; } } } # Add a fragment identifier if missing $_ = $newattribs; if ( (/\bname="/) && !(/\s+id="/) ) { s/\bname="([^"]*)"/name="$1" id="$1"/; $newattribs = $_; } # Reassemble tag if ($newattribs) { $newattribs =~ s/\s+$//g; $tag = "<$tagname $newattribs>"; } else { $tag = "<$tagname>"; } # Add closing '/' to known empty tags foreach (@empty_tags) { if ($tagname eq $_) { $tag =~ s#>$# />#; last; } } return $tag; }