#!/usr/bin/perl 

# ########################################################################### #
# Html2Wml                                                                    #
# ========                                                                    #
# Author: Sebastien Aperghis-Tramoni <maddingue@free.fr>                      #
#                                                                             #
# This program converts HTML pages to WML pages.                              #
# See the documentation for more informations.                                #
#                                                                             #
# This program is available under the GNU General Public License.             #
#                                                                             #
# Copyright (c)2000, 2001 Sebastien Aperghis-Tramoni                          #
# ########################################################################### #

use strict;
use CGI;
use FileHandle;
use File::Basename;
use Getopt::Long;
use HTML::Parser;
use LWP::UserAgent;
use Text::Template;
use URI;
use URI::URL;

use vars qw($program $version);
$program = 'Html2Wml';
$version = '0.4.1';


# 
# globals
# 
use vars qw($cgi);
$cgi = 0;
my $result;  ## WML deck in text format
my $binary;  ## WML deck in binary format
my $xmlckres = '';
my $complres = '';

my %options = (
    help           => 0,    ## show the usage and exit
    version        => 0,    ## show the program name and version and exit
    
    ## conversion options
    ascii          => 0,    ## convert named entities to US-ASCII
    collapse       => 1,    ## collapse white space characters
    compile        => 0,    ## compile WML to binary 
   'ignore-images' => 0,    ## completly ignore image links
    linearize      => 1,    ## suppress the tables tags
    nopre          => 0,    ## don't use PRE tag
    
    ## links reconstruction options
    hreftmpl       => '{FILEPATH}{FILENAME}{$FILETYPE =~ s/s?html?/wml/o; $FILETYPE}', 
    srctmpl        => '{FILEPATH}{FILENAME}{$FILETYPE =~ s/gif|png|jpe?g/wbmp/o; $FILETYPE}', 

    ## card splitting options
   'max-card-size'        => 1_500,  ## maximum size of data per card
   'card-split-threshold' =>    50,  ## card split threshold
   'next-card-label'      => '[&gt;&gt;]',  ## label of the link to go to the next card
    
    ## debugging options
    debug          => 0,    ## activate the debug mode
    xmlcheck       => 0,    ## perform a well-formedness check (using XML::Parser)
);

## used by the html parser
use vars qw(%state);
%state = (
    doc_uri  => '',        ## document absolute URI
    output   => '',        ## buffer for storing output
    skip     => 0,         ## skip switch (on/off)
    stack    => [],        ## tag stack
    cardsize => 0,         ## size of the current card
    cardid   => 'wcf000',  ## ID of the current card (stands for "WML Card Fragment 000")
    title    => '',        ## title of the WML deck
);

my %entities;  ## named entities conversion table

# 
# The following two hashes are based on the WML DTD. They are the hardcoded 
# conversion tables which describe the legal syntax of WML tags. 
# 
my %dtdent = ();
    $dtdent{emph}   = 'em,strong,b,i,u,big,small';
    $dtdent{layout} = 'br';
    $dtdent{text}   = $dtdent{emph};
    $dtdent{flow}   = "$dtdent{text},$dtdent{layout},img,anchor,a,table";
    $dtdent{fields} = "$dtdent{flow},input,select,fieldset";

my %with = (
    html     => { action => 'replace',  new_value => 'wml'  }, 
    wml      => { action => 'keep',     nest => 'head,template,card'  }, 
    
    ## header tags
    head     => { action => 'keep',     nest => 'meta,access' }, 
   # meta     => { action => 'keep',     nest => 'EMPTY',  attributes => 'http-equiv,name,content' }, 
    template => { action => 'keep',     nest => 'do,onevent' }, 
    title    => { action => 'skip' }, 
    style    => { action => 'skip' }, 
    script   => { action => 'skip' }, 
    
    ## structural tags
    body     => { action => 'replace',  new_value => 'card' }, 
    card     => { action => 'keep',     nest => 'do,p,pre' }, 
    h1       => { action => 'replace',  new_value => 'p',  render => 'big,b',  special => 'nowidow' }, 
    h2       => { action => 'replace',  new_value => 'p',  render => 'big',  special => 'nowidow'}, 
    h3       => { action => 'replace',  new_value => 'p',  render => 'b',  special => 'nowidow' }, 
    h4       => { action => 'replace',  new_value => 'p',  special => 'nowidow' }, 
    h5       => { action => 'replace',  new_value => 'p',  special => 'nowidow' }, 
    h6       => { action => 'replace',  new_value => 'p',  special => 'nowidow' }, 
    li       => { action => 'replace',  new_value => 'p' }, 
    dt       => { action => 'replace',  new_value => 'p' }, 
    dd       => { action => 'replace',  new_value => 'p' }, 
    div      => { action => 'replace',  new_value => 'p' }, 
    p        => { action => 'keep',     nest => "$dtdent{fields},do",  attributes => 'align' }, 
    br       => { action => 'keep',     nest => 'EMPTY' }, 
    pre      => { action => 'keep',     nest => 'a,br,i,b,em,strong,input,select' }, 
    tt       => { action => 'replace',  new_value => 'pre' }, 
    
    ## tables tags
    table    => { action => 'keep',     nest => 'tr',  attributes => 'title,align' }, 
    caption  => { action => 'skip' }, 
   'tr'      => { action => 'keep',     nest => 'td' }, 
    th       => { action => 'replace',  new_value => 'td' }, 
    td       => { action => 'keep',     nest => "$dtdent{emph},$dtdent{layout},img,a,anchor" }, 
    
    ## link tags
    a        => { action => 'keep',     nest => 'br,img',  attributes => 'id,name,href,title', 
                                        attrconv => { name => 'id' } }, 
    img      => { action => 'keep',     nest => 'EMPTY',  attributes => 'id,src,alt,align' }, 
    
    ## style tags
    em       => { action => 'keep',     nest => $dtdent{flow} }, 
    strong   => { action => 'keep',     nest => $dtdent{flow} }, 
    b        => { action => 'keep',     nest => $dtdent{flow} }, 
    i        => { action => 'keep',     nest => $dtdent{flow} }, 
    u        => { action => 'keep',     nest => $dtdent{flow} }, 
    big      => { action => 'keep',     nest => $dtdent{flow} }, 
    small    => { action => 'keep',     nest => $dtdent{flow} }
    
    ## form tags -- currently not handled
   #'select'  => { action => 'keep',     nest => 'optgroup,option',  attributes => 'title,name,value,multiple' }, 
   # optgroup => { action => 'keep',     nest => 'optgroup,option',  attributes => 'title' }, 
   # option   => { action => 'keep',     nest => 'onevent',  attributes => 'title,value' }, 
   # input    => { action => 'keep',     nest => 'EMPTY',  attributes => 'name,type,value,title,size,maxlength'}, 
);


# 
# The following hash hardcodes the parent-lookup for each element 
# of the WML syntax, i.e. for each element, it gives the prefered 
# parent element. 
# 
my %reverse = (
    ## head tags
    head => 'wml',      meta => 'head',     access => 'head',
    template => 'wml',  onevent => 'template', 
    
    ## structural tags
    card => 'wml',      p => 'card',        pre => 'card',      br => 'p',
    
    ## tables tags
    table => 'p',       tr => 'table',      td => 'tr',
    
    ## link tags
    a => 'p',           anchor => 'p',      img => 'p',
    
    ## style tags
    b => 'p',           i => 'p',           u => 'p', 
    strong => 'p',      em => 'p', 
    big => 'p',         small => 'p', 
    
    ## form tags
   'select' => 'p',     option => 'select', optgroup => 'select', 
    do => 'p',          input => 'p',       fieldset => 'p',
);



# 
# main
# 
$| = 1;

fileparse_set_fstype('Unix');  ## this is because I use fileparse() to 
                               ## split the URL fragments

## CGI security options
$CGI::POST_MAX = 1024 * 1;  # max 1K posts
$CGI::DISABLE_UPLOADS = 1;  # no uploads

load_entities();

if(@ARGV) {
    ## launched from shell
    
    ## getting options
    GetOptions(\%options, 
        qw(help version  
           ascii! collapse! compile hreftmpl=s ignore-images linearize! nopre srctmpl=s
           max-card-size=i card-split-threshold=i next-card-label=s
           debug! xmlcheck!
        )
    );
    version() if $options{version};
    usage() if $options{help};
    usage() unless @ARGV;
    apply_options();
    
    ## converting the file
    $result = html2wml(shift);
    
} else {
    ## launched from web
    $cgi = new CGI;
    
    ## get the options
    for my $option (keys %options) {
        $options{$option} = $cgi->param($option) if defined $cgi->param($option)
    }
    
    ## as there is currently no support for converting images, we disable 
    ## the image links
    $options{'ignore-images'};
    
    apply_options();
    
    print $cgi->header if $options{debug};
    
    $result = html2wml($cgi->param('url') || '/');
}


## XML check
if($options{xmlcheck}) {
    eval {
      require XML::Parser;
      my $xmlparser = new XML::Parser Style => 'Tree', ErrorContext => 2;
      $xmlparser->parse($result);
    };
    $xmlckres = $@ ? $@ : "Well-formed";
}

## XML compile
if($options{compile}) {
    $binary = '';
    my $buf;
    
    eval {
      require IPC::Open2;
      my $in  = new FileHandle;
      my $out = new FileHandle;
      IPC::Open2::open2($out, $in, 'wmlc', '-', '/proc/self/fd/1');
      #syswrite($in, $result, length $result);
      #while(sysread($out, $buf, 1024) == 1024) { $binary .= $buf }
      print $in $result;
      $binary = join '', <$out>;
    };
    
    $complres = $@
}


## output: normal or debug
if($options{debug}) {
    my $i = 1;
    $result .= "\n";
    $result =~ s/^/@{[sprintf '%3d', $i++]}: /gm;  ## add lines number
    $result = simple_wrap($result);
    
    if($cgi) {
        print qq|<html>\n<head>\n<title>$program -- Debug Mode</title>\n|, 
              qq|<style type="text/css">\n  BODY { background-color: #ffffff}\n|, 
              qq|  .tag { color: #8811BB }\n  .attr { color: #553399 }\n </style>\n|, 
              qq|</head>\n<body>\n<h1>$program -- Debug Mode</h1>\n|, 
              qq|<p>This is the result of the conversion of the document |, 
              qq|<a href="$state{doc_uri}">$state{doc_uri}</a> by $program v$version.</p>\n|, 
              qq|<hr />\n|, 
              htmlize($result), 
              qq|<hr />\n<p>Result of XML check:</p>\n|, 
              htmlize($xmlckres); 
        
        print qq|<hr />\n<p>Result of WML compilation:</p>\n<pre>|, 
              htmlize(hextype($binary)), "</pre>\n"  if $options{compile}; 
        
        print qq|\n</body>\n</html>|
        
    } else {
        my $s = "$program -- Debug Mode\n";
        print $s, '-'x length($s), "\n", 
              $result, "\n", ' -'x5, "\n", 
              $xmlckres, "\n";
        print ' -'x5, "Compiled WML\n", ' -'x5, 
              ($complres ? "$complres\n" : hextype($binary)) 
              if $options{compile};
    }
    
} else {
    print $cgi->header(-type => 'text/vnd.wap.wml') if $cgi;
    print $result;
}



# 
# apply_options()
# -------------
sub apply_options {
    if($options{linearize}) {
        delete @with{qw(table tr td th)};
        $with{'caption'} = { action => 'replace', new_value => 'p', render => 'b' };
        $with{'tr'} = { action => 'replace', new_value => 'p' };
        delete @reverse{qw(table tr td)};
    }
    
    if($options{'ignore-images'}) {
        delete $with{img};
    }
    
    if($options{debug}) {
        $options{xmlcheck} = 1;
    }
    
    if($options{nopre}) {
        delete $with{pre};
        $with{'pre'} = { action => 'replace', new_value => 'p' };
    }
}


# 
# html2wml()
# --------
sub html2wml {
    my $url = shift;
    my $file;
    my $type;
    my $converter = new HTML::Parser api_version => 3;
    my $date = localtime;
    
    return unless $url;
    
    ## read the file 
    if($url =~ m{http://}) {  ## absolute uri
        ($file,$type) = get_url($url)
    
    } elsif(not $cgi and -f $url) {  ## local file
        $file = read_file($url)
    
    } else {  ## absolute url relative to the server
        ($file,$type) = get_url( URI::URL->new($url, $cgi->url)->abs )
    }
    
    $type ||= '';
    
    ## if it's an image, call send_image()
    if($url =~ /\.(?:gif|jpg|png)$/i or $type =~ /image/) {
        @_ = ($file, $url);
        goto &send_image
    }
    
    ## get the document title
    if($file =~ m|<title>([^<]+)</title>|) {
        $state{title} = convert_entities(clean_spaces($1))
    }
    
    ## WML header
    $state{skip} = 0;
    $state{output} = q|<?xml version="1.0"?>| . "\n" 
                   . q|<!DOCTYPE wml PUBLIC "-//WAPFORUM//DTD WML 1.2//EN" |
                   . q|"http://www.wapforum.org/DTD/wml12.dtd">| . "\n" 
                   .qq|<!-- Converted by $program $version on $date -->\n|;
    
    ## affectation of the HTML::Parser handlers
    $converter->unbroken_text(1);
    $converter->handler(start       => \&start,   'tagname, attr, attrseq, text');
    $converter->handler(end         => \&end,     'tagname, text');
    $converter->handler(text        => \&text,    'text, is_cdata');
    $converter->handler(comment     => \&comment, 'tokens');
    $converter->handler(declaration => '');
    $converter->handler(process     => '');
    
    ## conversion
    $converter->parse($file);
    $converter->eof;
    
    ## flush the stack
    while(my $tag = pop @{$state{stack}}) {
        $state{output} .= "</$tag>"
    }
    
    ## convert the named HTML entities to numeric entities
    $state{output} = convert_entities($state{output});
    
    ## escape $ chars
    $state{output} =~ s/\$/\$\$/go;
    
    ## post-conversion clean-up
    $state{output} =~ s/\015\012|\012|\015/\n/go;  ## converts CR/LF to native eol
    
    $state{output} =~ s|\s+>|>|go;
    $state{output} =~ s|\s+/>|/>|go;
    $state{output} =~ s|<(\w+) +|<$1 |g;
    $state{output} =~ s|<p[^>]*>\s*</p>||go          if $options{collapse};
    $state{output} =~ s|<p[^>]*>\s*<br/>\s*</p>||go  if $options{collapse};
    $state{output} =~ s|<p[^>]*>\s*&nbsp;\s*</p>||go if $options{collapse};
    $state{output} =~ s|<p[^>]*>\s*&#32;\s*</p>||go  if $options{collapse};
    $state{output} =~ s|<(\w+)>\s*</\1>||go          if $options{collapse};
    
    $state{output} =~ s/\n+/\n/go       if $options{collapse};
    $state{output} =~ s/(?: \n)+/\n/go  if $options{collapse};
    
    ## set the title of the card
    if(length $state{title}) {
        $state{output} =~ s/<card/<card title="$state{title}"/g;
    }
    
    return $state{output}
}


# 
# get_url()
# -------
# This function gets and returns the file from the given URI. 
# If called in a array context, returns the file content and the associated 
# MIME type (as given by the server). 
# 
sub get_url {
    my $uri = shift;
    my $quiet = shift || 0;
    my $ua = new LWP::UserAgent;
    $ua->agent( $cgi ? $cgi->user_agent." [through $program/$version ".$ua->agent."]" 
                     : "$program/$version ".$ua->agent );
    my $request = new HTTP::Request GET => $uri;
    my $response = $ua->request($request);
    
    if($response->is_error) {
        return $quiet ? '' : $response->error_as_HTML
    }
    
    $state{doc_uri} = $uri;
    return wantarray ? ($response->content, $response->content_type) : $response->content
}


# 
# read_file()
# ---------
# This function reads and returns the file from the local disk. 
# 
sub read_file {
    my $file = shift;
    my $quiet = shift || 0;
    open(FILE, $file) or ($quiet ? return '' : fatal("Can't read file '$file': $!\n"));
    local $/ = undef;
    $file = <FILE>;
    close(FILE);
    return $file
}


# 
# send_image()
# ----------
# This function allow Html2Wml to send WBMP images to the client. 
# Currently, it send an empty hardcoded image, but support for 
# conversion from common formats (GIF, JPEG, PNG) will be added soon. 
# 
sub send_image {
    my $data = shift;
    my $path = shift;
    
    my $pixel = pack 'C*', 0, 0, 1, 1, 0xFF;  ## this is one white pixel
    
    ## [here there will be the conversion]
    
    print $cgi->header(-type => 'image/wbmp'), $pixel;
    exit
}


# 
# convert_entities()
# ----------------
# This function converts the named HTML entities into numeric entities. 
# 
sub convert_entities {
    my $text = shift;
    my $ascii = $options{ascii};
    
    my $code = q|  while($text =~ /&(\w+);/g) {                   |
             . q|      my $ent = $1;                              |
             . q|      if(exists $entities{$ent}) {               |
    .($ascii ? q|          my $chr = $entities{$ent}[1];          |
             : q|          my $chr = '&#'.$entities{$ent}[0].';'; | )
             . q|          $text =~ s/&$ent;/$chr/g               | 
             . q|      }                                          |
             . q|  }                                              |;
    
    eval $code;
    
    return $text
}


# 
# clean_spaces()
# ------------
sub clean_spaces {
    my $str = shift;
    $str =~ s/\t+/ /go;
    $str =~ s/^\s+/ /go;
    $str =~ s/ +/ /go;
    return $str
}


# 
# HTML::Parser start tag handler
# 
sub start {
    my($tag, $attr) = @_;
    return unless exists $with{$tag};
    my $curr_tag = ($with{$tag}{action} eq 'replace' ? $with{$tag}{new_value} : $tag);
    
    ## reconstruct well-formed attributes list with only the allowed ones
    if(scalar keys %$attr and exists $with{$curr_tag}{attributes}) {
        $attr = join ' ', 
            map { (exists $with{$tag}{attrconv}{$_} ? $with{$tag}{attrconv}{$_} : $_) 
                  . '="' . 
                  (/href|src/ ? xlate_url($attr->{$_}, $_) : convert_entities($attr->{$_})) 
                  . '"' if exists $attr->{$_} 
            } split(',', $with{$curr_tag}{attributes});
        $attr = ' ' . $attr if length $attr;
    } else {
        $attr = ''
    }
    
    ## set the skip mode state
    $state{skip} = 1 if $with{$curr_tag}{action} eq 'skip';
    
   # print "(start tag) $tag => action: ", 
   #       ($with{$tag}{action} ? $with{$tag}{action} : 'clear'), 
   #       ($curr_tag ne $tag ? " with $curr_tag " : ''), 
   #       ($attr? " attributes:$attr" : ''), 
   #       "<br />\n" if $options{debug};
    
    if($with{$curr_tag}{action} eq 'keep') { # and $with{$curr_tag}{nest} ne 'EMPTY') {
        
        if(@{$state{stack}}) { 
            ## syntax repair: close the tags that were left opened
            while(my $prev_tag = pop @{$state{stack}}) {
                if($with{$prev_tag}{nest} =~ /\b$curr_tag\b/) {
                    push @{$state{stack}}, $prev_tag;
                    last
                }
                $state{output} .= "</$prev_tag>";
            }
        }
    
        ## syntax repair: open the tags that should have been opened
        my $outtertag = ${$state{stack}}[0] || $curr_tag;
        my @nesting_tags = ();
        
        while($outtertag ne 'wml') {
            last unless $reverse{$outtertag};
            $outtertag = $reverse{$outtertag};
            unshift @{$state{stack}}, $outtertag;
            unshift @nesting_tags, $outtertag;
        }
        
        for my $t (@nesting_tags) { $state{output} .= "<$t>" }
    }
    
    ## check for special treatment
    my $restsize = $options{'max-card-size'} - $state{cardsize};
    if($restsize < $options{'card-split-threshold'} and $with{$tag}{special} =~ /nowidow/) {
        split_card()
    }
    
    ## simple tag translation
    if($with{$curr_tag}{action} eq 'keep') {
        if($with{$curr_tag}{nest} eq 'EMPTY') {
            $state{cardsize} += length($curr_tag) + length($attr);
            $state{output} .= "<$curr_tag$attr/>"
        } else {
            $state{cardsize} += length($curr_tag) + length($attr);
            $state{output} .= "<$curr_tag$attr>";
            push @{$state{stack}}, $curr_tag;
        }
    
    } else {
        ## do nothing
    }
    
    ## additional rendering effects
    if(defined $with{$tag}{render}) {  ## note that it's $tag, not $curr_tag
        for my $t (split ',', $with{$tag}{render}) {
            $state{cardsize} += length $t;
            $state{output} .= "<$t>"
        }
    }
}


# 
# HTML::Parser end tag handler
# 
sub end {
    my($tag) = @_;
    return unless exists $with{$tag};
    my $curr_tag = ($with{$tag}{action} eq 'replace' ? $with{$tag}{new_value} : $tag);
    
   # print "( end tag ) $curr_tag, stack = (@{$state{stack}}) <br />\n" if $options{debug};
    
    $state{skip} = 0 if $with{$tag}{action} eq 'skip';
    return if exists $with{$tag}{nest} and $with{$tag}{nest} eq 'EMPTY';
    
    ## additional rendering effects
    if(defined $with{$tag}{render}) {  ## note that it's $tag, not $curr_tag
        for my $t (reverse split ',', $with{$tag}{render}) {
            $state{cardsize} += length $t;
            $state{output} .= "</$t>"
        }
    }
    
    ## closing element
    if(${$state{stack}}[-1] eq $curr_tag  and  $with{$curr_tag}{action} eq 'keep') {
        $state{cardsize} += length $curr_tag;
        $state{output} .= "</$curr_tag> ";
        pop @{$state{stack}};
    
    } else {
        ## do nothing
    }
    
    ## check current card size
    if($state{cardsize} > $options{'max-card-size'}) {
        split_card()
    }
}


# 
# HTML::Parser text handler
# 
sub text {
    my($text) = @_;
    
    return if $state{skip};
    
    ## add a para tag if we're on the card node
    if(${$state{stack}}[-1] eq 'card') {
        $state{cardsize} += 4;
        $state{output} .= "\n<p>";
        push @{$state{stack}}, 'p';
    }
    
    $text = clean_spaces($text) if $options{collapse} and ${$state{stack}}[-1] ne 'pre';
    $state{cardsize} += length $text;
    $state{output} .= $text;
}


# 
# HTML::Parser comment tag handler
# 
sub comment {
    my($comment) = @_;
    local $_;
    
    $comment = join '', @$comment;
    
    ## SSI engine
    if($comment =~ /^\s*\[(\w+)\s+(.*)\]\s*$/) {
        my $element = $1;
        my %attributes = map { /\G(\w+)=["']([^"']+)["']/g } split /\s+/, $2;
        
        for my $attr (keys %attributes) {
            if($attr eq 'virtual' and $attributes{virtual} !~ /^http:/) {
                $attributes{virtual} = URI::URL->new( $attributes{virtual}, $state{doc_uri} )->abs
            }
        }
        
        for($element) {
            /include/ and do {
                if(defined $attributes{virtual}) { $state{output} .= get_url($attributes{virtual}, 1) }
                elsif(defined $attributes{file}) { $state{output} .= read_file($attributes{file}, 1) }
            };
            
            /fsize/ and do {
                if(defined $attributes{virtual}) { $state{output} .= length get_url($attributes{virtual}, 1) }
                elsif(defined $attributes{file}) { $state{output} .= length read_file($attributes{file}, 1) }
            };
        }
    }
}


# 
# split_card()
# ----------
# This function closes the current card and creates a new one. 
# 
sub split_card {
    my @stack = @{$state{stack}};
    shift @stack;  ## shift the <wml> tag
    shift @stack;  ## shift the <card> tag
    
    $state{cardid}++;
    $state{cardsize} = 0;
    
    for my $tag (reverse @stack) { $state{output} .= "</$tag>" }
    
    $state{output} .= qq|\n<p align="right"><a href="#$state{cardid}">$options{'next-card-label'}</a></p>\n</card>\n| 
                    . qq|\n<card id="$state{cardid}">\n|;
    
    for my $tag (@stack) { $state{output} .= "<$tag>" }
}


# 
# xlate_url()
# ---------
# This function translates the given url so that the pointed document will 
# pass through this CGI for conversion when in CGI mode, or construct a url 
# that fits the needs of the webmaster using the given template, if present. 
# 
sub xlate_url {
    my $url  = shift;  ## $url is the url from a href or a src attribute
    my $type = shift;  ## $type is 'src' or 'href'
    my $cgi_options = '';
    
    ## we only treat http URLs
    return $url if $url =~ /^(\w+):/ and lc($1) ne 'http';
    
    ## escape some characters
    $url =~ s/\$/\%24/go;
    $url =~ s/\&/&amp;/go;
    
    if($cgi) {
        ## CGI mode
        my $absurl = URI::URL->new($url, $state{doc_uri})->abs;
        
        for my $option (keys %options) {
            $cgi_options .= "$option=$options{$option}\&amp;" if $cgi->param($option)
        }
        
        return $cgi->url . "?${cgi_options}url=$absurl"
        
    } else {
        ## shell mode
        
        ## This is where the links reconstruction engine lives. 
        
        if($options{"${type}tmpl"} and $url !~ m|^http://|) { 
            ## we don't touch absolute urls
            
            my $tmpl = $options{"${type}tmpl"};
            my $uri = new URI $url, 'http';
            
            if($uri->path) {
                my($filename,$filepath,$filetype) = fileparse($uri->path, '((?:\.\w+)+)');
                
                my $init_vars = qq|{
                    sub FILEPATH { q<$filepath> }
                    sub FILENAME { q<$filename> }
                    sub FILETYPE { q<$filetype> }
                    sub URL { q<$url> }
                }|;
                
                my $new_url = new Text::Template TYPE => 'STRING', SOURCE => $init_vars.$tmpl
                    or fatal("Can't construct template: $Text::Template::ERROR\n"); 
                
                return $new_url->fill_in(HASH => {
                    'FILEPATH' => $filepath,  
                    'FILENAME' => $filename, 
                    'FILETYPE' => $filetype, 
                    'URL' => $url
                }) or fatal("$Text::Template::ERROR\n")
                
            } else {
                return $url
            }
            
        } else {
            return $url
        }
    }
}


# 
# htmlize()
# -------
# This function translate the given text into HTML, and add line numbers. 
# 
sub htmlize {
    my $str = shift;
    my @res = ();
    
    ## convert special chars to entities
    $str =~ s/&/\&amp;/go;
    $str =~ s/</\&lt;/go;
    $str =~ s/>/\&gt;/go;
    
    ## add a small syntax highlighting
    $str =~ s{(\&lt;[?/]?)(\w+)(.*?)(\&gt;)}
             {<b>$1<span class="tag">$2</span></b><span class="attr">$3</span><b>$4</b>}g;
    
    return "<pre>$str</pre>"
}


# 
# hextype()
# -------
# This function generates a human readable representation of binary data
# 
sub hextype {
    my $data = shift;            ## data to print
    my $colwidth = shift || 16;  ## width of ASCII column
    
    my $half = $colwidth/2;
    my $line = 1;
    my $out = '';
    
    while(length $data) {
        my @hex = unpack 'C'x$colwidth, substr($data, 0, $colwidth);
        substr($data, 0, $colwidth) = '';
        $out .= sprintf '%3d:  '. ((('%02x 'x$half).' ')x2) .'   ', $line++, @hex;
        $out .= sprintf ''.('%s'x$half)x2 . "\n", map { $_ > 32 ? chr : '.' } @hex; 
    }
    
    return $out
}


# 
# simple_wrap()
# -----------
# This function wraps the text given in parameter. 
# 
sub simple_wrap {
    my $orig = ref $_[0] ? $_[0] : \$_[0];
    my $text = '';
    my $curlen = 0;
    my $beg = ' 'x5;
    my $cols = 75;
    
    while($$orig =~ m/(\s*\S+\s+)/gm) {
        if($curlen + length($1) > $cols) {
            $text .= "\n$beg$1";
            $curlen = 1 + length($beg) + length($1)
        } else {
            $text .= $1;
            $curlen += length $1;
        }
        $curlen = 0 if index($1, "\n") >= 0;
    }
    
    return $text
}


# 
# load_entities()
# -------------
# 
sub load_entities {
    %entities = (
        nbsp     => [ 32, ' '], 
        iexcl    => [161, '!'], 
        cent     => [162, '-c-'], 
        pound    => [163, '-L-'], 
        curren   => [164, 'CUR'], 
        yen      => [165, 'YEN'], 
        brvbar   => [166, '|'], 
        sect     => [167, 'S:'], 
        uml      => [168, '"'], 
        copy     => [169, '(C)'], 
        ordf     => [170, '-a'], 
        laquo    => [171, '<<'], 
       'not'     => [172, 'NOT'], 
        shy      => [173, '-'], 
        reg      => [174, '(R)'], 
        macr     => [175, '-'], 
        deg      => [176, 'DEG'], 
        plusmn   => [177, '+/-'], 
        sup2     => [178, '^2'], 
        sup3     => [179, '^3'], 
        acute    => [180, '\''], 
        micro    => [181, 'u'], 
        para     => [182, 'P:'], 
        middot   => [183, '.'], 
        cedil    => [184, ','], 
        sup1     => [185, '^1'], 
        ordm     => [186, '-o'], 
        raquo    => [187, '>>'], 
        frac14   => [188, ' 1/4'], 
        frac12   => [189, ' 1/2'], 
        frac34   => [190, ' 3/4'], 
        iquest   => [191, '?'], 
        Agrave   => [192, 'A'], 
        Aacute   => [193, 'A'], 
        Acirc    => [194, 'A'], 
        Atilde   => [195, 'A'], 
        Auml     => [196, 'Ae'], 
        Aring    => [197, 'A'], 
        AElig    => [198, 'AE'], 
        Ccedil   => [199, 'C'], 
        Egrave   => [200, 'E'], 
        Eacute   => [201, 'E'], 
        Ecirc    => [202, 'E'], 
        Euml     => [203, 'E'], 
        Igrave   => [204, 'I'], 
        Iacute   => [205, 'I'], 
        Icirc    => [206, 'I'], 
        Iuml     => [207, 'I'], 
        ETH      => [208, 'DH'], 
        Ntilde   => [209, 'N'], 
        Ograve   => [210, 'O'], 
        Oacute   => [211, 'O'], 
        Ocirc    => [212, 'O'], 
        Otilde   => [213, 'O'], 
        Ouml     => [214, 'Oe'], 
       'times'   => [215, '*'], 
        Oslash   => [216, 'O'], 
        Ugrave   => [217, 'U'], 
        Uacute   => [218, 'U'], 
        Ucirc    => [219, 'U'], 
        Uuml     => [220, 'Ue'], 
        Yacute   => [221, 'Y'], 
        THORN    => [222, 'P'], 
        szlig    => [223, 'ss'], 
        agrave   => [224, 'a'], 
        aacute   => [225, 'a'], 
        acirc    => [226, 'a'], 
        atilde   => [227, 'a'], 
        auml     => [228, 'ae'], 
        aring    => [229, 'a'], 
        aelig    => [230, 'ae'], 
        ccedil   => [231, 'c'], 
        egrave   => [232, 'e'], 
        eacute   => [233, 'e'], 
        ecirc    => [234, 'e'], 
        euml     => [235, 'e'], 
        igrave   => [236, 'i'], 
        iacute   => [237, 'i'], 
        icirc    => [238, 'i'], 
        iuml     => [239, 'i'], 
        eth      => [240, 'e'], 
        ntilde   => [241, 'n'], 
        ograve   => [242, 'o'], 
        oacute   => [243, 'o'], 
        ocirc    => [244, 'o'], 
        otilde   => [245, 'o'], 
        ouml     => [246, 'o'], 
        divide   => [247, '/'], 
        oslash   => [248, 'o'], 
        ugrave   => [249, 'u'], 
        uacute   => [250, 'u'], 
        ucirc    => [251, 'u'], 
        uuml     => [252, 'u'], 
        yacute   => [253, 'y'], 
        thorn    => [254, 'p'], 
        yuml     => [255, 'y'], 
    );
}


# 
# error()
# -----
sub error {
    print STDERR @_
}


# 
# fatal()
# -----
sub fatal {
    print STDERR @_;
    exit -1;
}


# 
# version()
# -------
sub version {
    print "$program/$version\n"; exit
}


# 
# usage()
# -----
sub usage {
    print STDERR <<"USAGE"; exit
usage: $0 [options] file

options: 
  --ascii               use 7 bits ASCII emulation to convert named entities
  --nocollapse          don't collapse spaces and empty paragraphs
  --hreftmpl=template   set the template for the links reconstruction engine
  --ignore-images       completly ignore image links
  --nolinearize         don't linearize the tables
  --nopre               don't use the <pre> tag
  
  --max-card-size=size          set the card size upper limit
  --card-split-threshold=size   set the card splitting threshold 
  --next-card-label=label       set the label of the link to the next card
   
  --debug       activate the debug mode
  --xmlcheck    activate the XML check: output is passed through XML::Parser
  
  --help        show this help screen and exit
  --version     show the program name and version and exit

Read the documentation for more information. 
USAGE
}


# 
# cgiusage()
# --------
sub cgiusage {
    print $cgi->header, <<"USAGE"; exit
<html>
<head>
<title>Error</title>
</head>
<body>
<h1>Error</h1>
<p>This CGI was called with incorrect parameters. 
Check your request and try again </p>
</body>
</html>
USAGE
}


1;

