#!/home/martink/bin/perl

=pod

=head1 NAME

parse-table - parse tabular data, such as created by make-table, and report values suitable for make-conf which creates Circos input files to visualize the table

=head1 SYNOPSIS
  
  cat table.txt | parse-table
  
  # or

  parse-table -file table.txt

=head1 DESCRIPTION

This script parses tabular data and generates the circos input and configuration files used to create circular table views as described here

 http://mkweb.bcgsc.ca/circos/tableviewer/

=head1 INPUT FORMAT

The basic table format is
 
  - D E F
  A 1 2 3
  B 4 5 6
  C 7 8 9

The first row contains the column labels and the first column contains row labels. Value of top-left cell is unimportant. 

Two additional rows and columns can be added - these specify the order of rows/columns and their color.

Order values are used to determine placement of column and row segments.
 
  # table with order values (set col_order_row=yes and row_order_col=yes)
  - - 3 6 5
  - - D E F
  1 A 1 2 3
  4 B 4 5 6
  2 C 7 8 9

  # table with color values (set col_color_row=yes and row_color_col=yes)
  -         - 255,0,0 0,255,0 0,0,255
  -         - D E F
  255,255,0 A 1 2 3
  255,0,255 B 4 5 6
  0,255,255 C 7 8 9

  # table with row order and color values
  -         - - 3 6 5
  -         - - 255,0,0 0,255,0 0,0,255
  -         - - D E F
  1 255,255,0 A 1 2 3
  4 255,0,255 B 4 5 6
  2 0,255,255 C 7 8 9

To specify the color of a subset of columns and rows, use "-" in color cells for which there is no color value.

=head1 SAMPLE DATA

There are three sample data files in samples/. Each represents the same 5x5 table, but differ in additional control parameters like row/cell order and row/cell color.

=head2 samples/table-basic.txt

Must be used with 

  col_order_row=no
  row_order_col=no
  col_color_row=no
  row_color_col=no

=head2 samples/table-ordered.txt

Must be used with

  col_order_row=yes
  row_order_col=yes
  col_color_row=no
  row_color_col=no

=head2 samples/table-ordered-colored.txt

Must be used with

  col_order_row=yes
  row_order_col=yes
  col_color_row=yes
  row_color_col=yes

=head1 CONFIGURATION

All configuration parameters are controlled by the configuration file - see etc/parse-table.conf.

=head1 HISTORY

=over

=item * 21 Jan 2009 v0.1

Standardized configuration and added segment/cell order features.

Bundled with Circos distribution. 

=item * 25 Jun 2008 

Added more error traps.

=item * 11 June 2008

Continuing to refine and debug.

=item * 2 June 2008

Versioned and updated.

=back 

=head1 BUGS

=head1 AUTHOR

Martin Krzywinski

=head1 CONTACT

  Martin Krzywinski
  Genome Sciences Centre
  Vancouver BC Canada
  www.bcgsc.ca
  martink@bcgsc.ca

=cut

################################################################
#
# Copyright 2002-2008 Martin Krzywinski
#
# This file is part of the Genome Sciences Centre Perl code base.
#
# This script is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This script is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this script; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#
################################################################

################################################################
#                           Martin Krzywinski (martink@bcgsc.ca)
#                                                           2008
################################################################

use strict;
use Config::General;
use Data::Dumper;
use File::Basename;
use FindBin;
use Getopt::Long;
use IO::File;
use Math::VecStat qw(sum min max average);
use Math::Round qw(round);
use Pod::Usage;
use Set::IntSpan;
use Statistics::Descriptive;
use Storable;
use Time::HiRes qw(gettimeofday tv_interval);
use lib "$FindBin::RealBin";
use lib "$FindBin::RealBin/../lib";
use lib "$FindBin::RealBin/lib";
use vars qw(%OPT %CONF);

use Graphics::ColorObject;

################################################################
#
# *** YOUR MODULE IMPORTS HERE
#
################################################################

GetOptions(\%OPT,

	   "file=s",
	   "color_prefix=s",
	   "skip_rows=i",

	   "header",
	   "header_skip_cols=i",

	   "row_order_col",
	   "col_order_row",
	   "use_row_order_col",
	   "use_col_order_row",

	   "row_color_col",
	   "col_color_row",
	   "use_row_color_col",
	   "use_col_color_row",


	   "intra_cell_exclude",
	   "intra_cell_size_remove",

	   "size_recompute",

	   "use_scaling",
	   "scale_factor=f",

	   "percentile_sampling=f",

	   "percentile_hue_start=i",
	   "percentile_hue_end=i",

	   "percentile_saturation_start=i",
	   "percentile_saturation_end=i",

	   "percentile_brightness_start=i",
	   "percentile_brightness_end=i",

	   "cell_q1_color=s",
	   "cell_q2_color=s",
	   "cell_q3_color=s",
	   "cell_q1_nostroke",
	   "cell_q2_nostroke",
	   "cell_q3_nostroke",

	   "cell_min_percentile=f",
	   "cell_max_percentile=f",
	   "cell_min_value=f",
	   "cell_max_value=f",

	   "data_mult=f",

	   "color_idx_mult=f",
	   "color_remap",
	   "color_autoremap",

	   "configfile=s","help","man","debug+");

pod2usage() if $OPT{help};
pod2usage(-verbose=>2) if $OPT{man};
loadconfiguration($OPT{configfile});
populateconfiguration(); # copy command line options to config hash
validateconfiguration(); 
if($CONF{debug} > 1) {
  $Data::Dumper::Pad = "debug parameters";
  $Data::Dumper::Indent = 1;
  $Data::Dumper::Quotekeys = 0;
  $Data::Dumper::Terse = 1;
  print Dumper(\%CONF);
}

################################################################
# setup the input handle to be the file, if specified, otherwise
# read from STDIN
my $inputhandle;
if(my $file = $CONF{file}) {
  die "No such file $file" unless -e $file;
  open(F,$file);
  $inputhandle = \*FILE;
} else {
  printdebug("using STDIN");
  $inputhandle = \*STDIN;
}

# all the table data (row/column/cell) is stored in $table
# all statistics (row/column/cell/label) is stored in $stat
my ($table,$stat);

# counters for things such as skipped lines, row and column index live in $counter
my $counter;

# determine the number of columns to skip in the header
$CONF{header_skip_cols} = 1 + ($CONF{row_order_col} > 0) + ($CONF{row_size_col} > 0);

my ($ncols,$seen,$row_idx,$label_order,$label_size,$row_idx,$label_color,$nskip);

LINE:
while(<$inputhandle>) {
  chomp;
  s/^\s*//;
  next if /^debug/;
  my $line = $_;
  next if $CONF{skip_rows} && $counter->{skipped_lines}++ < $CONF{skip_rows};
  my @tok = clean_and_split_line($_);
  next unless @tok;
  
  # determine the number of columns - do this every time to catch any lines
  # with different number of fields
  printdebug("read line",int(@tok)+1,"columns");
  printdebug("line",@tok);
  my $ncols_this_line = @tok;
  if(! defined $ncols) {
    $ncols = $ncols_this_line;
  } elsif ($ncols_this_line != $ncols) {
    report_error("line has a different number of columns ($ncols_this_line) ($_) than previous lines($ncols)",$line);
  }

  # parse column information
  # - extract column names
  # - extract column size, if available
  # - extract column order, if available
  # column information is stored in in $table->{col}
  # - $table->{col}{list}[IDX] = $coldata;
  #                              {name,idx,size?,order?}

  if($CONF{col_order_row} && ! $seen->{col_order_row}++) {
    my $skip_cols = $CONF{use_row_order_col} + $CONF{use_row_color_col} + 1;
    printdebug("reading column order starting at col",$skip_cols);
    for my $i ( $skip_cols .. $ncols - 1) {
      my $col_idx  = $i - $skip_cols;
      last if $col_idx >= $CONF{max_col_num};
      my $col_order = $tok[$i];
      printdebug("read column order",$col_order,"from column",$i,"col_idx",$col_idx);
      if($CONF{use_col_order_row}) {
	report_error("column at order value",$tok[$i],"already exists") if $seen->{col_order}{$col_order}++;
	$table->{col}{list}[$col_idx]{order} = $col_order;
	report_error("column order value [$col_order] is not numeric",$line) unless $col_order =~ /^\d+$/;
	printdebug("found column order",$col_idx,$col_order);
      }
    }
    next LINE;
  }
  if($CONF{col_color_row} && ! $seen->{col_color_row}++) {
    my $skip_cols = $CONF{use_row_order_col} + $CONF{use_row_color_col} + 1;
    printdebug("reading column color starting at col",$skip_cols);
    for my $i ( $skip_cols .. $ncols - 1) {
      my $col_idx  = $i - $skip_cols;
      last if $col_idx >= $CONF{max_col_num};
      my $col_color = $tok[$i];
      next unless $col_color =~ /^\d+,\d+,\d+$/;
      if($CONF{use_col_color_row}) {
	$table->{col}{list}[$col_idx]{color} = $col_color;
	printdebug("found column color",$col_idx,$col_color);
      }
    }
    next LINE;
  }
  if($CONF{header} && ! $seen->{header}++) {
    my $skip_cols = $CONF{use_row_order_col} + $CONF{use_row_color_col} + 1;
    printdebug("reading column names starting at col",$skip_cols);
    for my $i ( $skip_cols .. $ncols - 1) {
      my $col_idx = $i - $skip_cols;
      last if $col_idx >= $CONF{max_col_num};
      my $name    = shorten_text($tok[$i]);
      report_error("duplicate column name [$name]",$line) if $seen->{col_name}{$name}++;
      my $col_data = { name=>$name,idx=>$col_idx,j=>$i};
      @{$table->{col}{list}[$col_idx]}{qw(name idx j)} = ($name,$col_idx,$i);
      $label_order->{$name} = $table->{col}{list}[$col_idx]{order};
      $label_size->{$name}  = $table->{col}{list}[$col_idx]{size};
      printdebug("found column name",$col_idx,$name);

    }
    report_error("could not parse column header") unless $table->{col}{list};
    next LINE;
  }

  # parse each data row of table
  # - extract row names and store this in $table->{row}
  # - row identification is stored in the same was as columns, but in $table->{row}
  # - extract cell data

  my $col_idx = 0;
  my $row_data;
  if($CONF{row_order_col}) {
    my $order = $tok[ $col_idx++ ];
    if($CONF{use_row_order_col}) {
      $row_data->{order} = $order;
      report_error("row at order value",$row_data->{order},"already exists") if $seen->{row_order}{$row_data->{order}}++;
      report_error("row order value [$row_data->{order}] is not numeric",$line) unless $row_data->{order} =~ /^\d+$/;
      printdebug("found row order",$row_idx,$row_data->{order});
    }
  }
  if($CONF{row_color_col}) {
    my $color = $tok[ $col_idx++ ];
    printdebug("found color",$color);
    if($color =~ /^\d+,\d+,\d+$/ && $CONF{use_row_color_col}) {
      $row_data->{color} = $color;
      printdebug("found row color",$row_data->{color});
    }
  }
  my $row_name = shorten_text($tok[ $col_idx++ ]);
  report_error("duplicate row name [$row_name]",$line) if $seen->{row_name}{$row_name}++;
  my $row_idx  = $table->{row}{list} ? @{$table->{row}{list}} : 0;
  last LINE if $row_idx >= $CONF{max_row_num};

  @{$row_data}{qw(name idx)} = ($row_name,$row_idx);
  printdebug("found row name",$row_idx,$row_name);
  $label_order->{$row_name} ||= $row_data->{order};
  $label_size->{$row_name}  += $row_data->{size};
  $row_idx++;

  push @{$table->{row}{list}}, $row_data;

  for my $col_data ( @{$table->{col}{list}} ) {
    my $j = $col_data->{j};
    my $cell_value = $tok[$j];
    my $cell_value_clean = clean_value($cell_value);
    $cell_value_clean = $cell_value ne "" && $cell_value > 0 ? $CONF{data_mult} * $cell_value : $CONF{missing_cell_value};
    if($CONF{intra_cell_exclude} && $col_data->{name} eq $row_data->{name}) {
      printdebug("excluded intra-cell value",$row_data->{name},$col_data->{name},$cell_value);
      if($CONF{intra_cell_size_remove}) {
	if($table->{row}{list}[$row_data->{idx}]{size}) {
	  $table->{row}{list}[$row_data->{idx}]{size} -= $cell_value_clean;
	  printdebug("shrank row size",$row_data->{name},$row_data->{size},$cell_value_clean);
	}
	if($table->{col}{list}[$col_data->{idx}]{size}) {
	  $table->{col}{list}[$col_data->{idx}]{size} -= $cell_value_clean;
	  printdebug("shrank col size",$col_data->{name},$col_data->{size},$cell_value_clean);
	}
      }
    } else {
      my $cell_data = { col=>$col_data,
			row=>$row_data,
			raw_value=>$cell_value,
			value=>$cell_value_clean };
      if($cell_value_clean eq $CONF{missing_cell_value}) {
	$cell_data->{missing} = 1;
      }
      printdebug("found cell value",$row_data->{name},$col_data->{name},$cell_value);
      report_error("all cell values must be non-negative, but you have ($row_data->{name},$col_data->{name}=)$cell_value",$line) if $cell_value < 0;
      push @{$table->{cell_list}}, $cell_data;
      $table->{cell}{$row_data->{name}}{$col_data->{name}} = $cell_data;
    }
  }
}

for my $row (@{$table->{row}{list}}) {
  $table->{row}{label}{$row->{name}} = $row;
}
for my $col (@{$table->{col}{list}}) {
  $table->{col}{label}{$col->{name}} = $col;
}

#printdumper($table);
#exit;
################################################################
#
# Compile cells statistics

$stat->{cell}{raw} = Statistics::Descriptive::Full->new();
$stat->{cell}{raw}->add_data(map { $_->{value} } grep(! $_->{missing}, @{$table->{cell_list}}));
printdebug("cell stats","min",$stat->{cell}{raw}->min);
printdebug("cell stats","average",$stat->{cell}{raw}->mean);
printdebug("cell stats","median",$stat->{cell}{raw}->median);
printdebug("cell stats","max",$stat->{cell}{raw}->max);

################################################################
#
# Filter cell values and update the cell list with those
# cells that have not been excluded with the min/max filters

for my $cell (grep(! $_->{missing}, @{$table->{cell_list}})) {
  my $reject;
  $reject = 1 if defined $CONF{cell_min_value} && $cell->{value} < $CONF{cell_min_value};
  $reject = 1 if defined $CONF{cell_max_value} && $cell->{value} > $CONF{cell_max_value};
  $reject = 1 if defined $CONF{cell_min_percentile} && $cell->{value} < $stat->{cell}{raw}->percentile($CONF{cell_min_percentile});
  $reject = 1 if defined $CONF{cell_max_percentile} && $cell->{value} > $stat->{cell}{raw}->percentile($CONF{cell_max_percentile});
  if($reject) {
    $cell->{delete} = 1;
  }
}

@{$table->{cell_list}} = grep(! exists $_->{delete}, @{$table->{cell_list}});

################################################################
#
# create colors that encode cell value by percentile
#

for my $i (0 .. 100/$CONF{percentile_sampling}) {
  my $percentile = $i * $CONF{percentile_sampling};
  my $value = $stat->{cell}{raw}->percentile($percentile);
  my $h_range = $CONF{colors}{h1} - $CONF{colors}{h0};
  my $s_range = $CONF{colors}{s1} - $CONF{colors}{s0};
  my $v_range = $CONF{colors}{v1} - $CONF{colors}{v0};
  my $h = int($CONF{colors}{h0} + $h_range * $percentile/100);
  my $s = $CONF{colors}{s0} + $s_range * $percentile/100;
  my $v = $CONF{colors}{v0} + $v_range * $percentile/100;
  my $c = Graphics::ColorObject->new_HSV([ $h,$s,$v ]);
  my @rgb = @{$c->as_RGB255()};
  printinfo("hsvpercentile",$percentile,$h,$s,$v);
  printinfo("colorpercentile",sprintf("percentile%03d %d,%d,%d",$percentile,@rgb));
}

################################################################
#
# scale cell values
#

for my $cell_data (grep(! $_->{missing}, @{$table->{cell_list}})) {
  my $value_scaled = scale_value($cell_data->{value},$stat->{cell}{raw}->max);
  $cell_data->{scaled_value} = $value_scaled;
  printinfo("scale",$CONF{scaling_type},$cell_data->{value},$value_scaled);
}
my $value_type = $CONF{use_scaling} ? "scaled_value" : "value";

################################################################
#
# report parsed data
#

for my $type (qw(row col)) {
  for my $data (@{$table->{$type}{list}}) {
    printinfo("table report",$type,
	      join(" ",@{$data}{qw(idx name order)}));
  }
}

for my $cell (@{$table->{cell_list}}) {
  printinfo("table report cell",
	    join(" ",
		 $cell->{col}{idx},
		 $cell->{col}{name},
		 $cell->{row}{idx},
		 $cell->{row}{name},
		 "raw",
		 $cell->{value},
		 "scaled",
		 $cell->{$value_type}));
}

#
################################################################

################################################################
#
# calculate cell (scaled or raw, as requested) statistics
# for row, column and labels
#

my @types = qw(row col);
for my $cell (@{$table->{cell_list}}) {
  for my $type (qw(row col)) {
    my $name = $cell->{$type}{name};
    $stat->{$type}{$name} ||= Statistics::Descriptive::Full->new();
    $stat->{label}{$name} ||= Statistics::Descriptive::Full->new();
    $stat->{cells}        ||= Statistics::Descriptive::Full->new();
    if($CONF{intra_cell_exclude} && $cell->{col}{name} eq $cell->{row}{name}) {
      # don't count this
    } else {
      $stat->{$type}{$name}->add_data( $cell->{$value_type} );
      $stat->{label}{$name}->add_data( $cell->{$value_type} );
      $stat->{cells}->add_data( $cell->{$value_type});
    }
  }
}

for my $stattype (qw(row col label)) {
  for my $name (sort keys %{$stat->{$stattype}}) {
    printdebug("row/col/label stat",$stattype,$name,
	       "sum",
	       $stat->{$stattype}{$name}->sum,
	       "n",
	       $stat->{$stattype}{$name}->count,
	       "min",
	       $stat->{$stattype}{$name}->min,
	       "max",
	       $stat->{$stattype}{$name}->max,
	       "avg",
	       $stat->{$stattype}{$name}->mean);
  }
}

################################################################
#
# compile a list of unique labels and assign a chromosome number
# to each label
#
# each label will correspond to an ideogram
#

my @unique_labels = unique ( map { $_->{row}{name}, $_->{col}{name}} @{$table->{cell_list}} );
# first, add labels for which a label order exists
my @labels = grep(defined $label_order->{$_}, @unique_labels);
printdebug("label order preorder",@labels);
# now deal with remaining labels
my @labels_to_order = grep(! defined $label_order->{$_}, @unique_labels);
my @labels_ordered;
my @labels_row = grep( $table->{row}{label}{$_}, @labels_to_order);
my @labels_col = grep( $table->{col}{label}{$_}, @labels_to_order);

if ($CONF{segment_order} =~ /ascii/) {
  if($CONF{segment_order} =~ /row_major/) {
    @labels_ordered = unique (  ( sort @labels_row ), ( sort @labels_col ) );
  } elsif ($CONF{segment_order} =~ /col_major/) {
    @labels_ordered = unique (  ( sort @labels_col ), ( sort @labels_row ) );
  } else {
    @labels_ordered = unique ( sort (@labels_row,@labels_col) );
  }
} elsif ($CONF{segment_order} =~ /size_asc/) {
  if($CONF{segment_order} =~ /row_major/) {
    @labels_ordered = unique (  ( sort {$stat->{label}{$a}->sum <=> $stat->{label}{$b}->{sum}} @labels_row ),
				( sort {$stat->{label}{$a}->sum <=> $stat->{label}{$b}->{sum}} @labels_col ) );
  } elsif ($CONF{segment_order} =~ /col_major/) {
    @labels_ordered = unique (  ( sort {$stat->{label}{$a}->sum <=> $stat->{label}{$b}->{sum}} @labels_col ),
				( sort {$stat->{label}{$a}->sum <=> $stat->{label}{$b}->{sum}} @labels_row ) );
  } else {
    @labels_ordered = unique (  ( sort {$stat->{label}{$a}->sum <=> $stat->{label}{$b}->{sum}} (@labels_row,@labels_col)) );
  }
} elsif ($CONF{segment_order} =~ /size_desc/) {
  if($CONF{segment_order} =~ /row_major/) {
    @labels_ordered = unique (  ( sort {$stat->{label}{$b}->sum <=> $stat->{label}{$a}->{sum}} @labels_row ),
				( sort {$stat->{label}{$b}->sum <=> $stat->{label}{$a}->{sum}} @labels_col ) );
  } elsif ($CONF{segment_order} =~ /col_major/) {
    @labels_ordered = unique (  ( sort {$stat->{label}{$b}->sum <=> $stat->{label}{$a}->{sum}} @labels_col ),
				( sort {$stat->{label}{$b}->sum <=> $stat->{label}{$a}->{sum}} @labels_row ) );
  } else {
    @labels_ordered = unique (  ( sort {$stat->{label}{$b}->sum <=> $stat->{label}{$a}->{sum}} (@labels_row,@labels_col)) );
  }
} elsif ($CONF{segment_order} =~ /row_major/) {
  @labels_ordered = unique ( @labels_row, @labels_col );
} elsif ($CONF{segment_order} =~ /col_major/) {
  @labels_ordered = unique ( @labels_col, @labels_row );
} else {
  die "don't understand segment_order value [$CONF{segment_order}]";
}
push @labels, @labels_ordered;

printinfo("labels ordered",@labels);

my $label2chr;
for my $label_idx (0..@labels-1) {
  $label2chr->{ $labels[$label_idx] } = $label_idx;
  printinfo("label2chr",$label_idx,$labels[$label_idx],sprintf("%s%s",$CONF{chr_prefix},$label_idx));
}

################################################################ 
# compile a list of colors associated with a label

for my $type (qw(row col)) {
  for my $elem (@{$table->{$type}{list}}) {
    if($label_color->{$elem->{name}} && $label_color->{$elem->{name}} ne $elem->{color}) {
      report_error("color for label [$elem->{name}] is already defined to be [$label_color->{$elem->{name}}] but saw new definition [$elem->{color}]");
    } elsif ($elem->{color}) {
      $label_color->{$elem->{name}} = $elem->{color};
      printdebug("assigned color from data file",$elem->{name},$elem->{color});
    }
  }
}

# how many labels don't yet have color
my @label_for_color = grep(! defined $label_color->{$_}, @labels);
my $cumul_size = 0;
for my $i (0..@label_for_color-1) {
  my ($h,$s,$v);
  if($CONF{segment_colors}{interpolate_type} =~ /size/) {
    my $tsize = sum ( map { $stat->{label}{$_}->sum } @label_for_color );
    my $x = $cumul_size / $tsize;
    $h = $CONF{segment_colors}{h1}*$x + $CONF{segment_colors}{h0}*(1-$x);
    $s = $CONF{segment_colors}{s1}*$x + $CONF{segment_colors}{s0}*(1-$x);
    $v = $CONF{segment_colors}{v1}*$x + $CONF{segment_colors}{v0}*(1-$x);
    $cumul_size += $stat->{label}{ $label_for_color[$i]}->sum;
  } else {
    $h = ( $CONF{segment_colors}{h1}*$i + $CONF{segment_colors}{h0}*(@label_for_color-1-$i) ) / (@label_for_color-1);
    $s = ( $CONF{segment_colors}{s1}*$i + $CONF{segment_colors}{s0}*(@label_for_color-1-$i) ) / (@label_for_color-1);
    $v = ( $CONF{segment_colors}{v1}*$i + $CONF{segment_colors}{v0}*(@label_for_color-1-$i) ) / (@label_for_color-1);
  }
  printdebug("interpolated hsv for segment",$h,$s,$v);
  my $c = Graphics::ColorObject->new_HSV([ $h,$s,$v ]);
  my @rgb = @{$c->as_RGB255()};
  $label_color->{$label_for_color[$i]} = join(",",@rgb);
  printdebug("assigned color from interpolated range",$label_for_color[$i],join(",",@rgb));
}

################################################################
#
# create new color definitions

for my $label (keys %$label_color) {
  printinfo(sprintf("colordef %s%s %s",$CONF{chr_prefix},$label2chr->{$label},$label_color->{$label}));
}

################################################################
#
# create highlights based on row and column values
#

for my $type (qw(row col)) {
  my $atype = $type eq "row" ? "col" : "row";
  # create a hash of all cell values that lead from $type/label combination (e.g. row/A)
  # to $atype/label combination (e.g. col/B)
  my %values;
  for my $label (keys %{$table->{$type}{label}}) {
    for my $cell (@{$table->{cell_list}}) {
      #printdumper($cell);
      if($cell->{$type}{name} eq $label) {
	$values{$label}{$cell->{$atype}{name}} = $cell->{$value_type};
      }
    }
  }

  #printdumper(\%values);
  for my $label ( keys %values ) {
    my $cumul_pos = 0;
    for my $alabel ( sort {$values{$label}{$b} <=> $values{$label}{$a}} keys %{$values{$label}} ) {
      printinfo(sprintf("%s %s %s%d %d %d fill_color=%s%d",
			"highlight",
			$type,
			$CONF{chr_prefix},
			$label2chr->{ $label },
			$cumul_pos,
			$cumul_pos + $values{$label}{$alabel},
			$CONF{chr_prefix},$label2chr->{$alabel}));
      $cumul_pos += $values{$label}{$alabel};
    }
  }
}

# create combined row/col highlights
for my $label (@labels) {
  my $cumul_pos = 0;
  my %values;
  for my $cell (@{$table->{cell_list}}) {
    if($cell->{row}{name} eq $label) {
      $values{$cell->{col}{name}} += $cell->{$value_type};
    }
    if ($cell->{col}{name} eq $label) {
      $values{$cell->{row}{name}} += $cell->{$value_type};
    }
  }
  my $sum = $stat->{label}{$label}->sum;
  for my $alabel ( sort {$values{$b} <=> $values{$a}} keys %values ) {
    printinfo(sprintf("%s %s %s%d %d %d fill_color=%s%d",
		      "highlight",
		      "all",
		      $CONF{chr_prefix},
		      $label2chr->{ $label },
		      $cumul_pos,
		      $cumul_pos + $values{$alabel},
		      $CONF{chr_prefix},$label2chr->{$alabel}));
    $cumul_pos += $values{$alabel};
  }
}

################################################################
#
# generate karyotype (segment sizes) and coordinates for
# each segment label 

for my $label ( @labels) {
  my $size = $stat->{label}{$label}->sum;
  next unless $size;
  printinfo("karyotype",
	    sprintf("chr - %s%s %s 0 %d %s%d",
		    $CONF{chr_prefix},
		    $label2chr->{$label},
		    $label2chr->{$label},
		    $size,
		    $CONF{chr_prefix},$label2chr->{$label}));
  (my $text_label = $label) =~ s/ /:/g;
  if($CONF{segment_size_range}) {
    my ($min_size,$max_size) = split(/,/,$CONF{segment_size_range});
    my $min_label = min( map {$stat->{label}{$_}->sum} @labels);
    my $max_label = max( map {$stat->{label}{$_}->sum} @labels);
    my $label_size;
    if($max_label - $min_label) {
      my $k = $CONF{segment_size_progression} || 1;
      $label_size = $min_size + ($max_size-$min_size)*(($stat->{label}{$label}->sum - $min_label)/($max_label-$min_label))**(1/$k);
    } else {
      $label_size = $max_size;
    }
    printinfo("segmentlabel",
	      sprintf("%s%s %s %s %s label_size=%dp",
		      $CONF{chr_prefix},
		      $label2chr->{$label},
		      $size/2,
		      $size/2,
		      $label,$label_size));
  } else {
    printinfo("segmentlabel",
	      sprintf("%s%s %s %s %s",
		      $CONF{chr_prefix},
		      $label2chr->{$label},
		      $size/2,
		      $size/2,
		      $label));
  }
}

################################################################
#
# assign segment position for each cell
#

my $pos;
for my $label (@labels) {
  for my $type (split(/,/,$CONF{placement_order})) {
    my $atype = $type eq "row" ? "col" : "row";
    my @cells = grep(! $_->{missing} && $_->{$type}{name} eq $label, @{$table->{cell_list}});
    my @cells_sorted;
    if($CONF{ribbon_bundle_order} =~ /size/) {
      @cells_sorted =  sort {$b->{$value_type} <=> $a->{$value_type}} @cells;
    } elsif ($CONF{ribbon_bundle_order} =~ /ascii/) {
      @cells_sorted =  sort {$a->{$atype}{name} cmp $b->{$atype}{name}} @cells;
    } elsif ($CONF{ribbon_bundle_order} =~ /native/) {
      @cells_sorted =  sort {$label2chr->{$a->{$atype}{name}} <=> $label2chr->{$b->{$atype}{name}}} @cells;
    } else {
      die "ribbon_bundle_order value [$CONF{ribbon_bundle_order}] not supported";
    }
    #printdumper(\@cells_sorted);
      
    for my $cell (@cells_sorted) {
      my $value_norm = $cell->{$value_type};
      if( ($type eq "row" && $CONF{reverse_rows})
	||
	  ($type eq "col" && $CONF{reverse_columns})
	) {
	$pos->{$label} ||= $stat->{label}{$label}->sum;
	$cell->{$type."_end"}   = $pos->{$label};
	$cell->{$type."_start"} = $pos->{$label} - $value_norm;
	$pos->{$label} = $cell->{$type."_start"};
      } else {
	$cell->{$type."_start"} = $pos->{$label};
	$cell->{$type."_end"}   = $cell->{$type."_start"} + $value_norm;
	$pos->{$label} = $cell->{$type."_end"};
      }
    }
  }
}
    
my $linkid=0;
for my $cell ( grep(! $_->{missing}, @{$table->{cell_list}}) ) {

  my $z  = int( $cell->{$value_type} );
  my %param = (z=>$z,
	       color=>sprintf("%s%d",$CONF{chr_prefix},$label2chr->{$cell->{row}{name}}),
	       stroke_color=>"black");

  if($CONF{linkcolor}{color_remap}) {
    if($CONF{linkcolor}{percentile} && ! $CONF{linkcolor}{color_autoremap}) {
      for my $p (sort {$a <=> $b} keys %{$CONF{linkcolor}{percentile}}) {
	if ($cell->{$value_type} <= int($stat->{cells}->percentile($p))) {
	  for my $param (keys %{$CONF{linkcolor}{percentile}{$p}}) {
	    my $value = $CONF{linkcolor}{percentile}{$p}{$param};
	    $param{$param} = $value;
	  }
	  last;
	}
      }
    } else {
      for my $i (0 .. 100/$CONF{percentile_sampling}) {
	my $p = $i * $CONF{percentile_sampling};
	if ($cell->{$value_type} <= int($stat->{cells}->percentile($p))) {
	  $param{color} = sprintf("percentile%03d",$p);
	  last;
	}
      }
    }
  }
  
  if($cell->{$value_type} < $stat->{cell}{raw}->percentile(25)) {
    $param{color} = $CONF{cell_q1_color} if $CONF{cell_q1_color};
    $param{stroke_thickness} = 0 if $CONF{cell_q1_nostroke};
  } elsif ($cell->{$value_type} < $stat->{cell}{raw}->percentile(50)) {
    $param{color} = $CONF{cell_q2_color} if $CONF{cell_q2_color};
    $param{stroke_thickness} = 0 if $CONF{cell_q2_nostroke};
  } elsif ($cell->{$value_type} < $stat->{cell}{raw}->percentile(75)) {
    $param{color} = $CONF{cell_q3_color} if $CONF{cell_q3_color};
    $param{stroke_thickness} = 0 if $CONF{cell_q3_nostroke};
  } elsif ($cell->{$value_type} <= $stat->{cell}{raw}->percentile(100)) {
    $param{color} = $CONF{cell_q4_color} if $CONF{cell_q4_color};
    $param{stroke_thickness} = 0 if $CONF{cell_q4_nostroke};
  }

  printinfo("link",
	    sprintf("cell_%04d %s%s %d %d %s",
		    $linkid,
		    $CONF{chr_prefix},
		    $label2chr->{ $cell->{row}{name} },
		    $cell->{row_start},
		    $cell->{row_end},
		    join(",", map {sprintf("%s=%s",$_,$param{$_})} keys %param)
		   ));

  printinfo("link",
	    sprintf("cell_%04d %s%s %d %d %s",
		    $linkid,
		    $CONF{chr_prefix},
		    $label2chr->{ $cell->{col}{name} },
		    $cell->{col_start},
		    $cell->{col_end},
		    join(",", map {sprintf("%s=%s",$_,$param{$_})} keys %param)
		   ));
  $linkid++;
}

report_error("could not parse any cell values to draw") if ! $linkid;

sub limit_array_size {
  my $max   = shift;
  my @array = @_;
  if(@_ <= $max) {
    return @_;
  } else {
    return splice(@_,0,$max);
  }
}

sub clean_and_split_line {
  my $line = shift;
  chomp $line;
  $line =~ s/^\s*//;
  return () unless $line;
  return () if $line =~ /^\#/;
  my @tok = map {clean_field($_)} split_line($line);
  return @tok;
}

sub split_line {
  my $str = shift;
  return split(/[\s\t]+/,$str);
}

sub clean_field {
  my $str = shift;
  $str =~ s/\"//g;
  return $str;
}

sub scale_value {
  my ($v,$vm) = @_;
  my $vscaled;
  if($CONF{scaling_type} eq "atten_small") {
    my $k = (exp($CONF{scale_factor}*$v/$vm)-exp(0)) / (exp($CONF{scale_factor})-exp(0));
    $vscaled = $k * $vm;
  } else {
    my $k = log($v*$CONF{scale_factor})/log($vm*$CONF{scale_factor});
    $vscaled = $k * $vm;
  }
  #printinfo("factor",$v,$v/$vm,$k);
  return $vscaled;
}

sub clean_value {
  my $val = shift;
  (my $new_val = $val) =~ s/[^\w$CONF{missing_cell_value}]*//g;
  report_error("could not correctly parse cell value [$val] into a number") unless $new_val ne "";
  return $new_val;
}

sub unique {
  my %seen;
  my @new;
  for my $elem (@_) {
    push @new, $elem if ! $seen{$elem}++;
  }
  return @new;
}

sub validateconfiguration {
  $CONF{data_mult}      ||= 1;
  $CONF{color_idx_mult} ||= 1;

  $CONF{colors}{h0} = $CONF{percentile_hue_start} if defined $CONF{percentile_hue_start};
  $CONF{colors}{h1} = $CONF{percentile_hue_end} if defined $CONF{percentile_hue_end};

  $CONF{colors}{s0} = $CONF{percentile_saturation_start} if defined $CONF{percentile_saturation_start};
  $CONF{colors}{s1} = $CONF{percentile_saturation_end} if defined $CONF{percentile_saturation_end};

  $CONF{colors}{v0} = $CONF{percentile_brightness_start} if defined $CONF{percentile_brightness_start};
  $CONF{colors}{v1} = $CONF{percentile_brightness_end} if defined $CONF{percentile_brightness_end};

  $CONF{linkcolor}{color_remap}     ||= $CONF{color_remap};
  $CONF{linkcolor}{color_autoremap} ||= $CONF{color_autoremap};
}

sub shorten_text {
  my $string = shift;
  if($CONF{shorten_text}) {
    for my $word (sort {length($b) <=> length($a)} keys %{$CONF{string_replace}}) {
      $string =~ s/$word/$CONF{string_replace}{$word}/i;
    }
  }
  return $string;
}

sub report_error {
  my $msg = shift;
  my $line = shift;
  print STDERR uc "error parsing your table file\n\n";
  print STDERR "problem: $msg\n\n";
  if(defined $line) {
    print STDERR "offending line was\n";
    print STDERR $line,"\n";
  }
  die if $CONF{strict_sanity};
}

################################################################
#
# *** DO NOT EDIT BELOW THIS LINE ***
#
################################################################

sub populateconfiguration {
  foreach my $key (keys %OPT) {
    $CONF{$key} = $OPT{$key};
  }

  # any configuration fields of the form __XXX__ are parsed and replaced with eval(XXX). The configuration
  # can therefore depend on itself.
  #
  # flag = 10
  # note = __2*$CONF{flag}__ # would become 2*10 = 20

  for my $key (keys %CONF) {
    my $value = $CONF{$key};
    while($value =~ /__([^_].+?)__/g) {
      my $source = "__" . $1 . "__";
      my $target = eval $1;
      $value =~ s/\Q$source\E/$target/g;
      #printinfo($source,$target,$value);
    }
    $CONF{$key} = $value;
  }

}

sub loadconfiguration {
  my $file = shift;
  my ($scriptname) = fileparse($0);
  if(-e $file && -r _) {
    # great the file exists
  } elsif (-e "/home/$ENV{LOGNAME}/.$scriptname.conf" && -r _) {
    $file = "/home/$ENV{LOGNAME}/.$scriptname.conf";
  } elsif (-e "$FindBin::RealBin/$scriptname.conf" && -r _) {
    $file = "$FindBin::RealBin/$scriptname.conf";
  } elsif (-e "$FindBin::RealBin/etc/$scriptname.conf" && -r _) {
    $file = "$FindBin::RealBin/etc/$scriptname.conf";
  } elsif (-e "$FindBin::RealBin/../etc/$scriptname.conf" && -r _) {
    $file = "$FindBin::RealBin/../etc/$scriptname.conf";
  } else {
    return undef;
  }
  $OPT{configfile} = $file;
  my $conf = new Config::General(-ConfigFile=>$file,
				 -AllowMultiOptions=>"yes",
				 -LowerCaseNames=>1,
				 -AutoTrue=>1);
  %CONF = $conf->getall;
}

sub printdumper {
  printinfo(Dumper(@_));
}

sub printdebug {
  printinfo("debug",@_)  if $CONF{debug};
}

sub printinfo {
  printf("%s\n",join(" ",@_));
}

sub printdumper {
  printinfo(Dumper(@_));
}

