#! /usr/bin/perl
#
# Script to collect all <A> links in an HTML file, number them, and
# print a list of them at the end of the file.
#
# Part of HTML-XML-utils, see:
# http://www.w3.org/Tools/HTML-XML-utils/
#
# Author: Bert Bos <bert@w3.org>
# Created: 1 Feb 2001
# Version: $Id: hxprintlinks,v 1.2 2009/01/08 14:36:13 bbos Exp $

use Getopt::Std;
use strict;

sub START { 0 }
sub SEEN_A { 1 }		# After "<a"
sub SEEN_HREF { 2 }		# After "<a" and "href"
sub SEEN_EQ { 3 }		# After "<a", "href" and "="
sub SEEN_URL { 4 }		# After "<a", "href", "=" and a URL

# urlexpand -- expands a relative URL to an absolute one
sub urlexpand($$) {
  my ($url, $base) = @_;
  my $result;
  if ($url =~ /^\w+:/) {	# Already absolute
    $result = $url;		# Keep as is
  } elsif ($url =~ /^\//o) {	# Starts with '/'
    $base =~ /^\w+:(\/\/[^\/]*)?/o;
    $result = $&.$url;		# Prefix protocol and possibly machine
  } elsif ($url =~ /^[\#?]/o) {	# URL is fragment or query
    $result = $base.$url;	# Combine base and fragment/query
  } else {			# Starts with path segment
    $base =~ /[^\/:]*$/;
    $result = $`.$url;		# Prefix everything except last segment
    $result =~ s/\/[^\/]*\/\.\.//go; # Remove ".." where possible
  }
  return $result;
}

# print_links -- print OL list of URLs
sub print_links($$@) {
  my ($base, $n, @urls) = @_;
  print "\n<ol>\n";
  for (my $i = 1; $i <= $n; $i++) {
    print "<li>";
    print defined $base ? urlexpand($urls[$i], $base) : $urls[$i];
    print "</li>\n";
  }
  print "</ol>\n";
}

my $state = START;		# State machine
my $url;			# Most recent URL
my $n = 0;			# Length of @urls
my @urls;			# All URLs seen so far
my %options;			# Command line options
my $base;			# Base URL to make URLs absolute

my $PROG = substr($0, rindex($0, "/") + 1);
my $USAGE = "Usage: $PROG [-b base] [file]\n";

getopts('b:', \%options) || die $USAGE;
$base = $options{b} if defined $options{b};

while (<>) {			# Loop over lines
  while (/./) {			# Loop over tokens in a line
    if ($state == START) {
      if (/<a\b\s*/io) {
	print $`, $&;
	$_ = $';
	$state = SEEN_A;
      } elsif (/<\/body|<\/html/io) {
	print $`;
	print_links($base, $n, @urls) if ($n);
	$n = 0;			# Avoid printing list twice
	print $&;
	$_ = $';
      } else {
	print;
	$_ = '';
      }
    } elsif ($state == SEEN_A) {
      if (/[^>]*\bhref\b\s*/io) {
	print $`, $&;
	$_ = $';
	$state = SEEN_HREF;
      } elsif (/[^>]*>/o) {
	print $`, $&;
	$_ = $';
	$state = START;
      } else {
	print;
	$_ = '';
      }
    } elsif ($state == SEEN_HREF) {
      if (/\s*=\s*/o) {
	print $`, $&;
	$_ = $';
	$state = SEEN_EQ;
      } elsif (/[^>]*>/o) {
	print $`, $&;
	$_ = $';
	$state = START;
      } else {
	print;
	$_ = '';
      }
    } elsif ($state == SEEN_EQ) {
      if (/\s*\"([^\"]*)\"\s*/o) {
	$url = $1;
	print $`, $&;
	$_ = $';
	$state = SEEN_URL;
      } elsif (/\s*\'([^\']*)\'\s*/o) {
	$url = $1;
	print $`, $&;
	$_ = $';
	$state = SEEN_URL;
      } elsif (/[^>]*>/o) {
	print $`, $&;
	$_ = $';
	$state = START;
      } else {
	print;
	$_ = '';
      }
    } elsif ($state == SEEN_URL) {
      if (/[^>]*>/o) {
	print $`, $&;
	print "[", ++$n, "]";
	$urls[$n] = $url;
	$_ = $';
	$state = START;
      } else {
	print;
	$_ = '';
      }
    } else {
      die "Cannot happen";
    }
  }
}

print_links($base, $n, @urls) if ($n); # Seen no </body> or </html>
