# $Id: FixEntities.pm,v 1.6 2002/12/09 18:51:12 abs Exp $
#
$HTML::FixEntities::VERSION = '1.03';
##++
##
##     Copyright (c) 2002 David Brownlee. All Rights Reserved
##
##     E-Mail: <abs@mono.org>
##
##     Permission  to  use,  copy, and distribute is hereby granted,
##     providing that the above copyright notice and this permission
##     appear in all copies and in supporting documentation.
##--

=head1 NAME

HTML::FixEntities - Convert latin1 characters to HTML entities

=head1 SYNOPSIS

HTML::FixEntities converts any latin1 characters found to their HTML
entity equivalents.

=head1 DESCRIPTION


=head2 Methods

=over 4

=item new HTML::FixEntities ($text, $convert_badchars)

Generates a new fix object, converting $text. If $convert_badchars is true
characters which are unknown will be removed.

HTML::FixEntities will not adjust any text inside <? and ?> (for php scripts)
=item $fix->text;

Returns the fixed text.

=item $fix->totalchanges

Total number of characters changed

=back

=cut

use strict;

package HTML::FixEntities;

my(%charmap) = (
    "\x80",	'&euro;',	# euro symbol
    "\x84",	'"',		# German " mark (on baseline), no entity exists
    "\x85",	'...',		# ellipsis (&hellip;) - doesn't work in Netscape
    "\x8d",	'[]',		# 8d is a small box from Word
    "\x91",	"'",		# left singlequote (&lsquo;)
    "\x92",	"'",		# right singlequote (&rsquo;)
    "\x93",	'"',		# left doublequote (&ldquo;)
    "\x94",	'"',		# right doublequote (&rdquo;)
    "\x96",	'-',		# hyphen
    "\x99",	'&#153;',	# TM - &trade; doesn't work in Netscape
    "\xa0",	'&nbsp;',	# non-breaking space
    "\xa1",	'&iexcl;',	# inverted exclamation mark
    "\xa2",	'&cent;',	# cent
    "\xa3",	'&pound;',	# yer Great British pound
    "\xa4",	'&curren;',	# currency  (strange exploding o symbol)
    "\xa5",	'&yen;',	# yen
    "\xa6",	'&brvbar;',	# broken vertical bar
    "\xa7",	'&sect;',	# section symbol  (S with double lines)
    "\xa8",	'&uml;',	# umlaut  (diaeresis)
    "\xa9",	'&copy;',	# copyright
    "\xaa",	'&ordf;',	# feminine ordinal
    "\xab",	'&laquo;',	# left-pointing double angle  (quotation mark)
    "\xac",	'&not;',	# not symbol
    "\xad",	'&shy;',	# soft hyphen (discretionary) (except in Netscape)
    "\xae",	'&reg;',	# registered
    "\xaf",	'&macr;',	# macron  (spacing macron / overline)
    "\xb0",	'&deg;',	# degree
    "\xb1",	'&plusmn;',	# plus/minus
    "\xb2",	'&sup2;',	# superscript 2
    "\xb3",	'&sup3;',	# superscript 3
    "\xb4",	'&acute;',	# acute accent
    "\xb5",	'&micro;',	# micro symbol 
    "\xb6",	'&para;',	# paragraph symbol  (pilcrow)
    "\xb7",	'&middot;',	# middle dot  (Georgian comma)
    "\xb8",	'&cedil;',	# cedilla
    "\xb9",	'&sup1;',	# superscript 1
    "\xba",	'&ordm;',	# masculine ordinal indicator
    "\xbb",	'&raquo;',	# right-pointing double angle  (quotation mark)
    "\xbc",	'&frac14;',	# vulgar fraction one quarter
    "\xbd",	'&frac12;',	# vulgar fraction one half
    "\xbe",	'&frac34;',	# vulgar fraction three quarters
    "\xbf",	'&iquest;',	# inverted question mark
    "\xc0",	'&Agrave;',	# A grave
    "\xc1",	'&Aacute;',	# A acute
    "\xc2",	'&Acirc;',	# A circumflex
    "\xc3",	'&Atilde;',	# A tilde
    "\xc4",	'&Auml;',	# A umlaut
    "\xc5",	'&Aring;',	# A ring
    "\xc6",	'&AElig;',	# AE capital
    "\xc7",	'&Ccedil;',	# C cedilla
    "\xc8",	'&Egrave;',	# E grave
    "\xc9",	'&Eacute;',	# E acute
    "\xca",	'&Ecirc;',	# E circumflex
    "\xcb",	'&Euml;',	# E umlaut
    "\xcc",	'&Igrave;',	# I grave
    "\xcd",	'&Iacute;',	# I acute
    "\xce",	'&Icirc;',	# I circumflex
    "\xcf",	'&Iuml;',	# I umlaut
    "\xd0",	'&ETH;',	# ETH capital  (Latin)
    "\xd1",	'&Ntilde;',	# N tilde
    "\xd2",	'&Ograve;',	# O grave
    "\xd3",	'&Oacute;',	# O acute
    "\xd4",	'&Ocirc;',	# O circumflex
    "\xd5",	'&Otilde;',	# O tilde
    "\xd6",	'&Ouml;',	# O umlaut
    "\xd7",	'&times;',	# multiplication symbol
    "\xd8",	'&Oslash;',	# O slash
    "\xd9",	'&Ugrave;',	# U grave
    "\xda",	'&Uacute;',	# U acute
    "\xdb",	'&Ucirc;',	# U circumflex
    "\xdc",	'&Uuml;',	# U umlaut
    "\xdd",	'&Yacute;',	# Y acute
    "\xde",	'&THORN;',	# THORN capital  (Latin)
    "\xdf",	'&szlig;',	# ess-zed
    "\xe0",	'&agrave;',	# a grave
    "\xe1",	'&aacute;',	# a acute
    "\xe2",	'&acirc;',	# a circumflex
    "\xe3",	'&atilde;',	# a tilde
    "\xe4",	'&auml;',	# a umlaut
    "\xe5",	'&aring;',	# a ring
    "\xe6",	'&aelig;',	# ae  (Latin)
    "\xe7",	'&ccedil;',	# c cedilla
    "\xe8",	'&egrave;',	# e grave
    "\xe9",	'&eacute;',	# e acute
    "\xea",	'&ecirc;',	# e circumflex
    "\xeb",	'&euml;',	# e umlaut
    "\xec",	'&igrave;',	# i grave
    "\xed",	'&iacute;',	# i acute
    "\xee",	'&icirc;',	# i circumflex
    "\xef",	'&iuml;',	# i umlaut
    "\xf0",	'&eth;',	# eth  (Latin)
    "\xf1",	'&ntilde;',	# n tilde
    "\xf2",	'&ograve;',	# o grave
    "\xf3",	'&oacute;',	# o acute
    "\xf4",	'&ocirc;',	# o circumflex
    "\xf5",	'&otilde;',	# o tilde
    "\xf6",	'&ouml;',	# o umlaut
    "\xf7",	'&divide;',	# division symbol
    "\xf8",	'&oslash;',	# o slash
    "\xf9",	'&ugrave;',	# u grave
    "\xfa",	'&uacute;',	# u acute
    "\xfb",	'&ucirc;',	# u circumflex
    "\xfc",	'&uuml;',	# u umlaut
    "\xfd",	'&yacute;',	# y acute
    "\xfe",	'&thorn;',	# thorn  (Latin)
    "\xff",	'&yuml;',	# y umlaut
);

sub new
    {
    my $class = shift;
    my($self) = bless {}, $class;
    $self->{_replaceunknown} = $_[1];
    $self->fix($_[0]);
    $self;
    }

sub totalchanges
    {
    my $self  = shift;
    $self->{_totalchanges} ?$self->{_totalchanges} :0;
    }

sub fix
    {
    my $self  = shift;
    my($work) = @_;
    my($badchar_regex);

    $badchar_regex = '[\x00-\x08\x0b\x0c\x0e\x0f\x80-\xff]';

    if ($work !~ /$badchar_regex/)
	{
	$self->{_text} = $work;
	return;
	}

    while ($work =~ /($badchar_regex)/)
	{
	$self->{_text} .= $`;
	if (index($`, '<?') != -1 )
	    {
	    my $rest = $1.$';
	    if ($rest =~ /^(.*?\?>)/s)
		{
		$self->{_text} .= $1;
		$work = $';
		}
	    else
		{ $work = $rest; }
	    next;
	    }
	if ($charmap{$1})
	    {
	    $self->{_text} .= $charmap{$1};
	    ++$self->{_changes}{$1};
	    }
	else
	    {
	    ++$self->{_badchars}{$1};
	    if (!$self->{_replaceunknown})
		{ $self->{_text} .= $1; }
	    }
	++$self->{_totalchanges};
	$work = $';
	}
    $self->{_text} .= $work;
    }

sub text
    {
    my $self  = shift;
    $self->{_text};
    }
