#!/usr/bin/perl
# ======================================================================
#
# makewhatis
#
# Version 1.0f - 01-OCT-1996
# (c) 1995,1996 Steve "Mr. Bassman" Bryant <bassman@bork.bb.bawue.de>
#
#
# Description:
#
# 	This script creates the "whatis" files, used by apropos(1) and
# man(1).  It requires gzip(1) or zcat(1) to be able to read compressed
# files; this can be configured at the top of the code (below).  It will
# also handle the /usr/man/preformat directory if your system has it
# (you may have to uncomment the call to the preformat hack routine).
# If you would rather have one global whatis file (like /usr/lib/whatis)
# instead of having one whatis file per man directory, use the "-g"
# option; you can also configure where this file should be (below).
# (Note: HP-UX uses "/usr/share/lib/whatis".)
#
#
# Background:
#
# 	I wrote this because the one I had didn't work properly.  This
# seems to be my usual reason for writing anything these days outside of
# my job.  I could have just modified the other, but in re-writing from
# scratch in Perl (instead of sh/awk/sed/sort/whatever), I managed to
# drop the run-time down to a quarter of what it was before (nearly
# 8 mins down to less than 2 on my box).
#
#	I've tested it as thoroughly as I can, but if it still doesn't
# work, let me know and I'll fix it.  {8-]
#
#
# Usage:
#	makewhatis [-g] [-h] [-v] [-u] [-w] [manpath]
#
#
# Copyright:
#
# 	Permission is hereby given to redistribute this program and
# accompanying manual page at no cost to the recipient (media fee
# notwithstanding), providing it is distributed in its entirety.  It
# may not be modified or be included in any commercial product without
# the author's permission.
#
#
# Warning:
#	This program is provided "as-is", with no warranty whatsoever.
# Use it entirely at your own risk.  The author may not be held
# accountable for any damage this program might do.
#
#
# History:
#  07/95 1.0  - Initial version
#  08/95 1.0a - Corrected preformat bug (apropos likes to read
#               /usr/man/preformat/whatis as well so it seems !)
#  01/96 1.0b - Modified RegExp so .SH "NAME" is recognised with quotes.
#  05/96 1.0c - Fixed ".Nm" handling used in Berkeley pages.
#  08/96 1.0d - Added option for using one global whatis file
#               (eg: /usr/lib/whatis).
#  09/96 1.0e - Added support for language dependant man paths.
#  10/96 1.0f - Much improved (and optimised) nroff handling.
#
# Credits (and thanks):
#  Franz Schmausser <oin@mousel.mch.sni.de> for 1.0b.
#  Phil Pennock <bear@dcs.warwick.ac.uk> for 1.0c.
#
# ======================================================================


# General variables:

$use_global       = undef;
$update           = undef;
$verbose          = undef;
$manpath          = undef;
$official_manpath = undef;
%done_dirs        = ();
$zcat             = "gzip -cd";		# Can be "zcat" if no gzip.
$whatis_file      = "whatis";
$global_whatis    = "/usr/lib/whatis";
$language         = "C";		# Default value if no $LANG


umask 022;

select( STDERR ); $| = 1;	# Unbuffers output on this stream
select( STDOUT ); $| = 1;	# Leave STDOUT as the default !


# Ensure the $PATH environment variable contains /bin and /usr/bin:

$ENV{'PATH'} .= ":/bin:/usr/bin";


# Process command line options

while ( $arg = shift( @ARGV ) )
{
    # See if it starts with a dash

    if ( $arg =~ /^-/ )
    {
	# A dash argument may have more than one letter following it,
	# so we split the argument into an array of letters, remove the
	# dash from it, and process what remains.

	@letters = split( '', $arg );
	shift( @letters );

	foreach $_ ( @letters )
	{
	    SWITCH:
	    {
		/g/ && do
		{
		    $use_global  = 1;
		    $whatis_file = $global_whatis;
		    last SWITCH;
		};
		/h/ && do
		{
		    &print_help;
		    exit;
		};
		/u/ && do
		{
		    $update = 1;
		    last SWITCH;
		};
		/v/ && do
		{
		    $verbose = 1;
		    last SWITCH;
		};
		/w/ && do
		{
		    next		# Don't bother including it twice
			if ( $official_manpath );	

		    if ( $official_manpath = `man -w` )
		    {
			chomp $official_manpath;	# Remove \n from end

			$manpath .= ":"
			    if ( $manpath );

			$manpath .= $official_manpath;
		    }
		    else
		    {
			warn "$0: can't execute 'man -w'\n";
		    }
		    last SWITCH;
		};
		/c/ && last SWITCH;	# Ignored on purpose


		# Default case here:

		die "$0: unknown option: $arg\n";
	    }
	}
    }
    else
    {
	# Add the argument to the manpath (with colon if necessary).
	$manpath .= ":"
	    if ( $manpath );

	$manpath .= $arg;
    }
}


# If the user hasn't specified a manpath or set the -w switch, then we
# use the contents of the $MANPATH environment variable.

$manpath = $ENV{'MANPATH'}
    if ( !$manpath );


# Check what language we want to generate for (this only takes effect
# where "%L" appears in the MANPATH).  The environment variable LANG
# is used, and if this is undefined, it the default value of "C" will be
# used (as defined at the top of the code).

$language = $ENV{'LANG'}
    if ( defined( $ENV{'LANG'} ) );


# If we're using one global file (and we're not updating it), then
# we empty it now, and add to it as we go (otherwise it'd get
# overwritten for each directory in the path).

if ( $use_global && !$update )
{
    if ( open( WHATIS, ">$global_whatis" ) )
    {
	close( WHATIS );
    }
    else
    {
	die "$0: can't open $global_whatis: $!\n";
    }
}


# Now, we cycle through each element in the manpath, and create a
# whatis file for it.

@elements = split( /:/, $manpath );
foreach $directory ( @elements )
{
    &make_whatis( $directory );
}



# Now a little hack for /usr/man/preformat - comment this out if your
# system simply uses the cat directories.
#
# NOTE:
#	As of 1.0a we don't do this hacklet any more.  The code is
# left in, in case you want to.  The reason is this: it seems that
# "man -k" will actually look at /usr/man/preformat/whatis after
# all.  This means we don't need the hack, and if we put it in,
# things start appearing twice.
#
#	If you didn't have a preformat directory, it won't affect
# you anyhow !

#&do_preformat_hack
#    if ( $manpath =~ /(^|:)\/usr\/man(\/preformat)?(:|$)/ );


exit;



# ======================================================================
#
# Subroutines
#
# ======================================================================


# Print out a brief help message

sub print_help
{
    print <<EOT;
usage: makewhatis [-g] [-h] [-u] [-v] [-w] [manpath]
options:
  -g   use the global whatis DB file: $global_whatis
  -h   print this help and exit.
  -u   update whatis database with newer man pages only.
  -v   verbose
  -w   use `man -w` to obtain the manpath.  This may be used in
       addition to a user-specified manpath; this is not supported on
       all systems - see man(1).

    If a manpath is specified, the the contents of that path will be
processed.  If none is specified, then the contents of \$MANPATH will
be processed unless the -w switch is set.
EOT

}



# This routine processes the specified directory and creates (or updates)
# the "whatis" file.

sub make_whatis
{
    local ( $directory ) = @_;


    # Do language substitution:

    $directory =~ s/\%L/$language/g;


    # See if we've already done this directory...

    if ( $done_dirs{ $directory } )
    {
	warn "$0: already processed directory: $directory\n"
	    if ( $verbose );

	return;
    }
    else
    {
	$done_dirs{ $directory } = 1;
    }


    # Make sure we can cd to the directory

    if ( !chdir( $directory ) )
    {
	warn "$0: can't chdir to $directory: $!\n";
	return;
    }


    print "Processing directory: $directory\n"
	if ( $verbose );


    # Initialise the "whatis" array - all the entries are stored here and
    # written out to file at the end.

    %whatis = ();


    # If this is only an update, the we load in the existing whatis file for
    # modification - usually it would just be overwritten.

    if ( $update || $use_global )
    {
	$whatis_mtime = 0;

	if ( -e $whatis_file )
	{
	    if ( open( WHATIS, "<$whatis_file" ) )
	    {
		while ( $line = <WHATIS> )
		{
		    chop $line;
		    $whatis{ $line } = 1;
		}
		close( WHATIS );

		$whatis_mtime = &mtime( $whatis_file );	# No fail check
	    }
	    else
	    {
		if ( $use_global )
		{
		    warn "$0: can't open $whatis_file: $!\n";
		}
		else
		{
		    warn "$0: can't open $directory/whatis: $!\n";
		}
		return;
	    }
	}
    }


    # Now we have to loop through each of the subdirectories and process their
    # contents.  Not all subdirectories will be processed - ones which match
    # the pattern man[0-9a-z] will be processed as nroff/groff source manual
    # pages, and cat[0-9a-z] directories will be processed as plain text
    # manual pages.  When I can be bothered, I'll put in a bit that deals
    # with [man|cat].[0-9a-z].Z (ie: specifically compressed directories).
    # Subdirectories that are links are ignored.

    if ( !opendir( CURRENT_DIR, "." ) )
    {
	warn "$0: can't read directory $directory: $!\n";
	return;
    }


    SUBDIR:
    while ( $subdir = readdir( CURRENT_DIR ) )
    {
	# Ignore links

	next
	    if ( -l $subdir );


	# Ignore non-man subdirs

	next
	    if ( $subdir !~ /^(man|cat)[0-9a-z](\.Z)?$/ );


	# Ignore non-directories

	if ( ! -d $subdir )
	{
	    warn "$0: $directory/$subdir is not a dirctory\n"
		if ( $verbose );
	    next;
	}


	# If we're updating only, check whether the directory has a newer
	# modification time than the whatis file.

	next
	    if ( $update                               &&
	         ( &mtime( $subdir ) < $whatis_mtime )    );


	print "Processing subdirectory: $subdir\n"
	    if ( $verbose );


	# Set up some info about the subdir we're processing...

	$section = substr( $subdir, 3, 1 );	# Assumes 1 char only
	$type = "m";
	$type = "c"
	    if ( substr( $subdir, 0, 3 ) eq "cat" );
	$compressed_dir = undef;
	$compressed_dir = 1
	    if ($subdir =~ /.Z$/);


	# Process the files in the subdirectory...

	if ( !opendir( SUBDIR, $subdir ) )
	{
	    warn "$0: can't read directory $directory/$subdir: $!\n";
	    next;
	}

	FILE:
	while ( $filename = readdir( SUBDIR ) )
	{
	    # Ignore "." and ".."

	    next
		if ( ( $filename eq "."  ) ||
		     ( $filename eq ".." )    );


	    # If this is only an update, ignore files older that the
	    # whatis file.

	    next
		if ( $update                                           &&
		     ( &mtime( "$subdir/$filename" ) < $whatis_mtime )    );


	    # Work out what section this manpage thinks it's in.

	    @name_bits = split( /\./, $filename );
	    pop( @name_bits )			# Remove 'Z' or 'gz' from end.
		if ( $name_bits[ $#name_bits ] =~ /(Z|z|gz)/ );
	    $manpage_section = pop( @name_bits );


	    if ( substr( $manpage_section, 0, 1 ) ne $section )	# Wrong section
	    {
		warn "$0: $directory/$subdir/$filename seems to be in the " .
		    "wrong section !\n";
		next;
	    }


	    print "Adding: $filename\n"
		if ( $verbose );


	    # Now we have to read the file (it may be compressed), and
	    # determine the "synopsis" of the man page (it may be pre-
	    # formatted).

	    if ( $compressed_dir               ||
	         ( $filename =~ /.(Z|z|gz)$/ )    )
	    {
		unless ( $pid = open( MANPAGE, "$zcat $subdir/$filename|" ) )
		{
		    warn "$0: can't open $directory/$subdir/$filename: $!\n";
		    next;
		}
	    }
	    else
	    {
		$pid = undef;
		if ( !open( MANPAGE, "<$subdir/$filename" ) )
		{
		    warn "$0: can't open $directory/$subdir/$filename: $!\n";
		    next;
		}
	    }


	    # Get the NAME info from the manpage

	    if ( $type eq "m" )
	    {
		$name = &process_source_page;
	    }
	    else
	    {
		$name = &process_ascii_page;
	    }


	    # Close the file

	    kill 9, $pid    # Avoids zcat/gzip complaining about broken pipes
		if ( $pid );
	    close( MANPAGE );


	    # Do some processing on the manpage info (if one was returned).

	    if ( $name )
	    {
		$name =~ s/--/ - /;	# Change "--" to " - "
		$name =~ s/:/ - /	# Change ":" to " - "
		    if ( $name !~ /-/ );	# if they didn't use a dash
		$name =~ s/-/ - /	# Fix old cat pages
		    if ( $name !~ / - / );
		$name =~ tr/\t/ /;	# Change tabs to spaces
		$name =~ s/\s+/ /g;	# Collapse whitespace
		$name =~ s/^ //;	# Delete whitespace from start of line
		$name =~ s/ $//;	# Delete whitespace from end of line
		$name =~ s/ , /, /g;	# Fix comma spacing


		# We now format the line - we split it into "name" and
		# "description", and space them out nicely.

		@name_bits = split( /\s+-\s+/, $name, 2 );

		$name = "$name_bits[0] ($manpage_section) ";
		while ( length( $name ) < 21 )
		{
		    $name .= " ";
		}

		$name .= "- $name_bits[1]";

		$whatis{ $name } = 1;
	    }
	    elsif ( $verbose )
            {
		warn "$0: no name for $directory/$subdir/$filename\n";
            }
	}

	closedir( SUBDIR );
    }


    closedir( CURRENT_DIR );


    # At this point, we have an array called %whatis which contains all
    # the database entries.  These simply have to be sorted and written
    # to the whatis file.

    if ( !open( WHATIS, ">$whatis_file" ) )
    {
	if ( $use_global )
	{
	    warn "$0: can't open $whatis_file: $!\n";
	}
	else
	{
	    warn "$0: can't open $directory/whatis: $!\n";
	}

	return undef;
    }

    foreach $name ( sort keys %whatis )
    {
	print WHATIS "$name\n";
    }

    close( WHATIS );
}



# This routine returns the modification time of the given filename,
# or undefined if the stat fails.

sub mtime
{
    local ( $filename ) = @_;

    unless ( local( $mtime ) = ( stat( $filename ) )[9] )
    {
	warn "$0: can't stat $filename: $!\n";
	return undef;
    }

    return $mtime;
}



# This routine processes a "source" manual page.
# It looks for a section called "NAME", as denoted by the ".SH" keyword,
# and returns the first line of this section (this section is usually only
# one line long anyhow).
# If no such section is found, it return undefined.

sub process_source_page
{
    local ( $line, $name );	# Local variables


    while ( $line = <MANPAGE> )
    {
	chop $line;		# Remove trailing \n

	$line =~ s/^\s*//g;	# Remove whitespace from start of line
	$line =~ tr/a-z/A-Z/;	# Convert to upper case


	if ( $line =~ /^\.SH\s+["]?NAME/ )	# Found .SH NAME
	{
	    $name = "";

	    while ( $line = <MANPAGE> )	# Read next line
	    {
		chop $line;		# Remove trailing \n
		$line =~ s/\\-/-/;	# Change "\-" to "-".
		$line =~ s/^\s+//;	# Remove whitespace from start of line
		$line =~ s/\s+$//;	# Remove whitespace from end of line


		# If the line just read is the next section or is blank,
		# then we're done and can return what we've found.

		return $name
		    if ( ( $line eq "" )            ||
		         ( $line =~ /^\.S[HS]/ )    );


		# Is there are any more nroff commands, they must either
		# be removed or interpreted; lines starting with a dot
		# are nroff commands

		if ( substr( $line, 0, 1 ) eq "." )
		{
		    # Remove comments
		    $line =~ s/^[\.]+[\/\\]\".*//;

		    # Delete lines with these commands
		    $line =~ s/^\.(iX|PP|z[AZ]|X[ENS]|ds).*$//;


		    # For Berkeley pages
                    $line =~ s/^\.Nm(.*)/$1 -/;


		    # Remove all other nroff commands
		    $line =~ s/^\.[A-Za-z]+//;
		}


		if ( $line =~ /\\/ )	# Inline commands
		{
		    # Remove font settings and string interpolation

		    $line =~ s/\\[f\*]\(..//g;
		    $line =~ s/\\f.//g;
		    $line =~ s/\\\*.//g;


		    # Remove font sizings

		    $line =~ s/\\s('?)[+-]?\d+$1//g;
		    $line =~ s/\\s\(\d\d//g;


		    # Some interpretaions

		    $line =~ s/\\\(em/--/g;
		    $line =~ s/\\\(hy/-/g;
		    $line =~ s/\\\(ul/_/g;


		    $line =~ tr/\\//d;	# Delete any remaining backslashes
		}


		# Add a space on the end if the line's not blank, and
		# if it doesn't end with a dash.

		$line .= " "
		    if ( $line && ( $line !~ /\S-$/ ) );


		# Add this line to the name, and delete hyphenation.

		$name =~ s/\S-$//
		    if ( $line );
		$name .= $line;
	    }


	    # If we reached the end of the file, return what we found;
	    # it may be undefined (which is ok).

	    return $line;
	}
    }


    # Reached EOF

    return undef;
}



# This routine processes a "cat" man-page, looking for the same as the above
# routine.  However, we often have to contend with backspace characters in
# these files, as they are used to do underlining and emboldening.

sub process_ascii_page
{
    local ( $line, $name );	# Local variables


    while ( $line = <MANPAGE> )
    {
	chop $line;		# Remove trailing \n

	$line =~ s/^\s*//g;	# Remove whitespace from start of line
	$line =~ s/.\010//g;	# Remove the "backspace effect"

	if ( $line =~ /^NAME/ )
	{
	    $name = "";

	    while ( $line = <MANPAGE> )
	    {
		chop $line;
		$line =~ s/.\010//g;	# Remove the "backspace effect"
		$line =~ s/^\s*//;	# Remove whitespace from the start


		# We keep going until we get a blank line

		return $name
		    if ( $line eq "" );


		# Add a space on the end if the line's not blank, and
		# if it doesn't end with a dash.

		$line .= " "
		    if ( $line && ( $line !~ /\S-$/ ) );


		# Add this line to the name, and delete hyphenation.

		$name =~ s/\S-$//
		    if ( $line );
		$name .= $line;
	    }


	    # Return what we've found

	    return $name;
	}
    }

    return undef;
}



# A hack for the /usr/man/preformat directory found on many Linux systems.
#
# This directory doesn't usually have its own whatis database; the man
# won't usually look there.  What this program does is to create a whatis
# database for /usr/man/preformat, and then insert its contents into
# /usr/man/whatis (or the global whatis file).  It doesn't delete
# /usr/man/preformat/whatis so that future updates can be done more quickly.

sub do_preformat_hack
{
    $directory = "/usr/man";
    $subdir = "preformat";


    # Firstly - is it there ?

    if ( ! -d "$directory/$subdir" )
    {
	warn "$0: can't incorporate whatis database from /usr/man/preformat: "
	    . "no such directory\n"
	    if ( $verbose );
    }


    if ( !chdir( $directory ) )
    {
	warn "$0: can't cd to $directory: $!\n"
	    if ( $verbose );	# This error will also appear earlier

	return undef;
    }


    return undef
	if ( ! -e "$subdir/whatis" );	# No whatis file


    return undef
	if ( ! -e $whatis_file );


    # If we're in "update" mode, it isn't be necessary to go through
    # with this routine...

    return undef
	if ( $update                                                 &&
	     ( &mtime( "$subdir/whatis" ) < &mtime( $whatis_file ) )    );


    # Load the preformat whatis database into the %whatis array

    if ( !open( PREFORMAT, "<$subdir/whatis" ) )
    {
	warn "$0: can't open $directory/$subdir/whatis: $!\n";
	return undef;
    }

    %whatis = ();

    while ( $line = <PREFORMAT> )
    {
	chop $line;		# Remove trailing \n
	$whatis{ $line } = 1;
    }

    close( PREFORMAT );


    # Now we load in the main whatis database into the same array (which
    # will eliminate duplicates).

    if ( !open( WHATIS, "<$whatis_file" ) )
    {
	if ( $use_global )
	{
	    warn "$0: can't open $whatis_file: $!\n";
	}
	else
	{
	    warn "$0: can't open $directory/whatis: $!\n";
	}
	return undef;
    }

    while( $line = <WHATIS> )
    {
	chop $line;		# Remove trailing \n
	$whatis{ $line } = 1;
    }

    close( WHATIS );


    # Now we sort the entries and write them out to the one file.

    if ( !open( WHATIS, ">$whatis_file" ) )
    {
	if ( $use_global )
	{
	    warn "$0: can't open $whatis_file: $!\n";
	}
	else
	{
	    warn "$0: can't open $directory/whatis: $!\n";
	}
	return undef;
    }

    foreach $line ( sort keys %whatis )
    {
	print WHATIS "$line\n";
    }

    close( WHATIS );
}

# ======================================================================
#
# End of file: makewhatis
#
# ======================================================================
