#!/usr/bin/env perl

### $Id: kotexindy.pl,v 1.8 2011/08/09 00:32:27 nomos Exp $
### texindy wrapper for ko.tex
### written by Dohyun Kim
### public domain

#use warnings;
#use strict;
use 5.8.0;
use Getopt::Long qw(:config no_ignore_case);

my @args = @ARGV;
my ($opt_version,
    $opt_help,
    $opt_quiet,
    $opt_verbose,
    $opt_stdin,
    $opt_german,
    $opt_no_ranges,
    $opt_letter_ordering,
    @opt_debug,
    $opt_out_file,
    $opt_log_file,
    $opt_language,
    $opt_codepage,
    @opt_module,
    $opt_input_markup);

GetOptions (
    'version|V'		=> \$opt_version,
    'help|?'		=> \$opt_help,
    'quiet'		=> \$opt_quiet,
    'verbose'		=> \$opt_verbose,
    'stdin|i'		=> \$opt_stdin,
    'german'		=> \$opt_german,
    'no-ranges|r'	=> \$opt_no_ranges,
    'letter-ordering|l'	=> \$opt_letter_ordering,
    'debug=s'		=> \@opt_debug,
    'out-file=s'	=> \$opt_out_file,
    'log-file|t=s'	=> \$opt_log_file,
    'language|L=s'	=> \$opt_language,
    'codepage|C=s'	=> \$opt_codepage,
    'module|M=s'	=> \@opt_module,
    'input-markup|I=s'	=> \$opt_input_markup);

if ($opt_version or $opt_help) {
    system "texindy @args";
    exit;
}

### obtain output file name
my @idxfiles = @ARGV;
my $indfile = $opt_out_file;
if (!$indfile and @idxfiles) {
    $indfile = $idxfiles[0];
    $indfile =~ s/\.idx$/\.ind/;
}

$indfile or die "Failed to obtain output file name!";

# support stdin option
$opt_stdin and @idxfiles = ('-');

# remove idxfiles from @args
for my $i (@idxfiles) {
    for (0 .. $#args) {
	$i eq $args[$_] and $args[$_] = '';
    }
}

my (@idxarr, @indarr);

### variables for subroutines
my @hanja_to_hangul = get_hanja_hangul_table("hanja_hg.tab");
my @hanjacompat_to_hangul = get_hanja_hangul_table("hjcom_hg.tab");
my @hanjaextA_to_hangul = get_hanja_hangul_table("hjexa_hg.tab");

my @jamo_cho_comp = ( # 0x115F HCF => 0x314F ㅏ
    0x3131, 0x3132, 0x3134, 0x3137, 0x3138, 0x3139, 0x3141, 0x3142,
    0x3143, 0x3145, 0x3146, 0x3147, 0x3148, 0x3149, 0x314A, 0x314B,
    0x314C, 0x314D, 0x314E, 0x1113, 0x3165, 0x3166, 0x1116, 0x1117,
    0x1118, 0x1119, 0x3140, 0x111B, 0x316E, 0x3171, 0x3172, 0x111F,
    0x3173, 0x3144, 0x3174, 0x3175, 0x1124, 0x1125, 0x1126, 0x3176,
    0x1128, 0x3177, 0x112A, 0x3178, 0x3179, 0x317A, 0x317B, 0x317C,
    0x1130, 0x1131, 0x317D, 0x1133, 0x1134, 0x1135, 0x317E, 0x1137,
    0x1138, 0x1139, 0x113A, 0x113B, 0x113C, 0x113D, 0x113E, 0x113F,
    0x317F, 0x1141, 0x1142, 0x1143, 0x1144, 0x1145, 0x1146, 0x3180,
    0x1148, 0x1149, 0x114A, 0x114B, 0x3181, 0x114D, 0x114E, 0x114F,
    0x1150, 0x1151, 0x1152, 0x1153, 0x1154, 0x1155, 0x1156, 0x3184,
    0x3185, 0x3186, 0x115A, 0x3167, 0x3135, 0x3136, 0x115E, 0x314F);

my $cho   = "\x{1100}-\x{115F}\x{A960}-\x{A97C}";
my $jung  = "\x{1160}-\x{11A7}\x{D7B0}-\x{D7C6}";
my $jong  = "\x{11A8}-\x{11FF}\x{D7CB}-\x{D7FB}";
my $hanja = "\x{3400}-\x{4DB5}\x{4E00}-\x{9FA5}\x{F900}-\x{FA2D}";

my $ist_keyword = '\indexentry';
my $ist_actual = '@';
my $ist_encap = '|';
my $ist_level = '!';
my $ist_quote = '"';
my $ist_arg_open = '{';
my $ist_arg_close = '}';

### processing input files
foreach my $file (@idxfiles) {
    open IDX, "<$file" or die "$file: $!";
    binmode IDX, ":utf8";
    while (<IDX>) {
	#    \indexentry{ ..... }{ .. }
	# -> $pre         $body $post
	if (/(\Q$ist_keyword\E\s*\Q$ist_arg_open\E)
	    (.*?[^\Q$ist_quote\E])
	    (\Q$ist_arg_close$ist_arg_open\E.+?\Q$ist_arg_close\E)$/x) {

	    my($pre,$body,$post) = ($1,$2,$3);

	    #    \indexentry{ ..... | .. }{ .. }
	    # -> $pre         $body $post
	    my @xbody = split /(?<!\Q$ist_quote\E)\Q$ist_encap/, $body;
	    for ( my $i=$#xbody; $i>0; $i--) {
		$post = $ist_encap.$xbody[$i].$post;
	    }
	    $body = $xbody[0];

	    # !을 경계로 가름
	    @xbody = split /(?<!\Q$ist_quote\E)\Q$ist_level/, $body;

	    for (@xbody) {
		# @이 없으면... 넣어준다.
		/[^\Q$ist_quote\E]\Q$ist_actual/ or $_ = $_.$ist_actual.$_;

		# @을 경계로 가름.
		my @ybody = split /(?<!\Q$ist_quote\E])\Q$ist_actual\E/, $_;
		$_ = $ybody[0];

		&hanja_to_hangul;
		&syllable_to_jamo;
		&insertjongsongfiller;

		$ybody[0] = $_;
		if ($ybody[1] =~ /[$hanja]/) {
		    $ybody[0] .= "\x{0001}$ybody[1]"; # bug of xindy?
		}

		$_ = join $ist_actual, @ybody;
	    }

	    $body = join $ist_level, @xbody;
	    $_ = "$pre$body$post\n";
	}
	push @idxarr, $_;
    }
    close IDX;
}

### running texindy
open MAKE, "| texindy @args -I omega -M lang/korean/utf8 -o $indfile -i" or die "$!";
binmode MAKE, ":utf8";
print MAKE @idxarr;
close MAKE;
$? >> 8 and die "\ntexindy failed!\n";

### processing output file
open IND, "<:utf8", $indfile or die "$indfile: $!";
while (<IND>) {
    if (/\\lettergroup/) {
	&insertfillers;
	&jamo_to_jamocomp;
    }
    push @indarr, $_;
}
close IND;

open IND, ">:utf8", $indfile or die "$indfile: $!";
print IND @indarr;
close IND;

########## SUBROUTINES ##########

sub syllable_to_jamo {
    s/([\x{AC00}-\x{D7A3}])/do_syllable_to_jamo($1)/ge;
}

sub do_syllable_to_jamo {
    my $syl  = ord shift;
    my $cho  = ($syl - 0xac00) / (21 * 28) + 0x1100;
    my $jung = ($syl - 0xac00) % (21 * 28) / 28 + 0x1161;
    my $jong = ($syl - 0xac00) % 28;
    if ($jong) {
	$jong += 0x11a7;
	return chr($cho).chr($jung).chr($jong);
    }
    return chr($cho).chr($jung);
}

sub hanja_to_hangul {
    s/([\x{3400}-\x{4DB5}])/chr($hanjaextA_to_hangul[ord($1)-0x3400])/ge;
    s/([\x{4E00}-\x{9FA5}])/chr($hanja_to_hangul[ord($1)-0x4E00])/ge;
    s/([\x{F900}-\x{FA2D}])/chr($hanjacompat_to_hangul[ord($1)-0xF900])/ge;
}

sub get_hanja_hangul_table {
    my $file = shift;
    my @HJHG;

    $file = `kpsewhich $file`;
    chomp $file;

    open TAB, $file or die "$file : $!\n";
    @HJHG = <TAB>;
    close TAB;

    chomp @HJHG;
    return @HJHG;
}

sub insertfillers {
    s/([$cho])([$jong])/$1\x{1160}\x{115F}\x{1160}$2/g;
    s/(^|[^$jung$jong])([$jong])/$1\x{115F}\x{1160}$2/g;
    s/(^|[^$cho$jung])([$jung])/$1\x{115F}$2/g;
    s/([$cho])([^$cho$jung]|$)/$1\x{1160}$2/g;
}

sub insertjongsongfiller { # 0xF86A as jongseong filler
    s/([$cho][$jung])([^$jong]|$)/$1\x{F86A}$2/g;
}

sub jamo_to_jamocomp {
    s/([\x{1100}-\x{115F}])\x{1160}([^$jong]|$)/&cjamo_by_jamo_cho($1).$2/ge;
}

sub cjamo_by_jamo_cho {
    my $jamo = ord shift;
    my $cjamo = $jamo_cho_comp[$jamo - 0x1100];
    if ($cjamo < 0x11FF) {
	return chr($cjamo) . chr(0x1160);
    }
    else {
	return chr($cjamo);
    }
}

### EOF
