#!/home/martink/bin/perl 

=pod

=head1 NAME

binlinks - collate link ends into bins and generate link density track 

=head1 SYNOPSIS

  binlinks -links linkfile.txt -link_end 0|1|2
           -output_style 0|1|2
           [-min_link_size SIZE] [-max_link_size SIZE]
           [-color_by_chr]

=head1 DESCRIPTION

This script collates link starts and ends within a bin and reports the total size of link start and end spans for each bin. The output can be interpreted as link density and is very helpful in demonstrating how many links start/end at a region of the genome.

The output is suitable for 2D track display, such as the histogram.

The reported data points will be formatted with the color named after the corresponding target chromosome. The way the color is determined depends on the value of the -output_style. For example, for -output_style=0, the color will be that of the target chromosome with the largest contribution to the link size sum in a given bin.

=head1 OUTPUT

The output is controlled by the value passed to -output_style.

=over 

=item * -output_style 0

Link density is reported for each bin. The density is the sum of link ends (as controlled by -link_end) for all target chromosomes. 

Use -color_by_chr to color the link by the color of the target chromosome with the largest contribution.

=item * -output_style 1

Link density for largest contribution, as determined by the target chromosome. The link is colored by the color code of the target chromosome.

=item * -output_style 2

For each bin, the link density for each target chromosome is reported. The link is colored by the color code of the target chromosome.

=item * -output_style 3

Like -output_style=2, but the data is formatted for a stacked histogram track (multiple density values for the same bin are comma-delimited).

The output values are ordered based on a sorted list of all target chromosomes. You will need to have a list of corresponding colors for the values, which is provided to you in STDERR.

  ./binlinks -output_style 3 > density.txt 2> colors.txt

The color list is meant to be used as the argument to the color parameter within the histogram <plot> track.

  <plot>
  file = density.txt
  color = [STDERR output of script]
  type = histogram
  ...
  </plot>

=back

=head1 OPTIONS

=over

=item * -links FILE

Reads the links from FILE, which must be formatted as a Circos link list: each link is prepresented by a pair of lines, identified by a unique linkid string.

  linkid chr1 start1 end1
  linkid chr1 start2 end2

=item * -link_end 0|1|2

Controls which end(s) of the link are processed. (0 - start, 1 - end, 2 - both). The start of the link is considered to be the end defined by the first link line, and the link end is defined by the second link line.

=item * -min_link_size SIZE, -max_link_size SIZE

Filters for links, based on the size of the link end. If either is defined, links smaller (or larger) than the cutoff will be ignored in calculating the link density.

=item * -color_by_chr

The output data will be formatted with the color of the target chromosome.

If -output_style=0 is used, the color will be that of the largest contributor to the bin.

=item * -normalize

The density values are normalized by the sum of the bin. 

This setting does nothing when -output_style=0 (otherwise, all values would be reported as 1).

=back

=head1 HISTORY

=over

=item * 7 Sep 2008

Additional documentation.

=item * 6 Aug 2008

Started and versioned.

=back 

=head1 BUGS

=head1 AUTHOR

Martin Krzywinski

=head1 CONTACT

  Martin Krzywinski
  Genome Sciences Centre
  Vancouver BC Canada
  www.bcgsc.ca
  martink@bcgsc.ca

=cut

################################################################
#
# Copyright 2002-2008 Martin Krzywinski
#
# This file is part of the Genome Sciences Centre Perl code base.
#
# This script is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This script is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this script; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#
################################################################

################################################################
#                           Martin Krzywinski (martink@bcgsc.ca)
#                                                           2008
################################################################

use strict;
use Config::General;
use Data::Dumper;
use File::Basename;
use FindBin;
use Getopt::Long;
use IO::File;
use Math::VecStat qw(sum min max average);
use Memoize;
#use Devel::DProf;
use Pod::Usage;
use Set::IntSpan;
use Time::HiRes qw(gettimeofday tv_interval);
use lib "$FindBin::RealBin";
use lib "$FindBin::RealBin/../lib";
use lib "$FindBin::RealBin/lib";
use vars qw(%OPT %CONF);

################################################################
#
# *** YOUR MODULE IMPORTS HERE
#
################################################################

GetOptions(\%OPT,
	   "links=s",
	   "bin_size=f",
	   "min_link_size=f",
	   "max_link_size=f",
	   "link_end=i",
	   "color_by_chr",
	   "output_style=i",
	   "normalize",
	   "configfile=s","help","man","debug+");

pod2usage() if $OPT{help};
pod2usage(-verbose=>2) if $OPT{man};
loadconfiguration($OPT{configfile});
populateconfiguration(); # copy command line options to config hash
validateconfiguration(); 
if($CONF{debug} > 1) {
  $Data::Dumper::Pad = "debug parameters";
  $Data::Dumper::Indent = 1;
  $Data::Dumper::Quotekeys = 0;
  $Data::Dumper::Terse = 1;
  print Dumper(\%CONF);
}

my ($links,$chr1list,$chr2list) = parse_links($CONF{links});

if($CONF{output_style} == 3) {
  print STDERR "color=".join(",",@$chr2list),"\n";
}
for my $chr1 (keys %$links) {
  for my $bin (sort {$a <=> $b} keys %{$links->{$chr1}}) {
    my @chr2 = @$chr2list;
    my $sum = sum(values %{$links->{$chr1}{$bin}});
    if($CONF{output_style} != 3) {
      @chr2 = sort { $links->{$chr1}{$bin}{$b} <=> $links->{$chr1}{$bin}{$a} } @chr2;
    }
    if($CONF{output_style} == 0) {
      if($CONF{color_by_chr}) {
	printinfo($chr1,$bin*$CONF{bin_size},($bin+1)*$CONF{bin_size}-1,$sum,"color=".$chr2[0]);
      } else {
	printinfo($chr1,$bin*$CONF{bin_size},($bin+1)*$CONF{bin_size}-1,$sum);
      }
    } else {
      my @values = map { $links->{$chr1}{$bin}{$_} } @chr2;
      if($sum && $CONF{normalize}) {
	@values = map { $_/$sum } @values;
      }
      if($CONF{output_style} == 3) {
	printinfo($chr1,$bin*$CONF{bin_size},($bin+1)*$CONF{bin_size}-1,
		  join(",",map {sprintf("%.4f",$_)} @values));
      } else {
	for my $i (0..@chr2-1) {
	  printinfo($chr1,$bin*$CONF{bin_size},($bin+1)*$CONF{bin_size}-1,
		    sprintf("%.4f",$values[$i]),"color=".$chr2[$i]);
	  last if $CONF{output_style} == 1;
	}
      }
    }
  }
}

sub parse_links {
  my $file = shift;
  open(F,$file);
  my $links;
  my $chash;
  my $chrseen;
  while(<F>) {
    chomp;
    my @tok1 = split(" ",lc $_);
    my $line2 = <F>;
    last unless $line2;
    chomp $line2;
    my @tok2 = split(" ",lc $line2);
    my $link = {
		set=>[Set::IntSpan->new(sprintf("%d-%d",@tok1[2,3])),
		      Set::IntSpan->new(sprintf("%d-%d",@tok2[2,3]))],
		chr=>[$tok1[1],$tok2[1]]};

    $chrseen->{chr1}{$tok1[1]}++;
    $chrseen->{chr2}{$tok2[1]}++;

    next if $CONF{min_link_size} && $link->{set}[0]->cardinality < $CONF{min_link_size};
    next if $CONF{min_link_size} && $link->{set}[1]->cardinality < $CONF{min_link_size};
    next if $CONF{max_link_size} && $link->{set}[0]->cardinality > $CONF{max_link_size};
    next if $CONF{max_link_size} && $link->{set}[1]->cardinality < $CONF{max_link_size};
    my @ends = $CONF{link_end} == 2 ? (0,1) : $CONF{link_end} == 1 ? (1) : 0;
    for my $end (@ends) {
      # bin index for this end of the link
      my @bins = (int($link->{set}[ $end ]->min / $CONF{bin_size}) ..
		  int($link->{set}[ $end ]->max / $CONF{bin_size}));
      printdebug($link->{set}[$CONF{link_end}]->run_list,@bins) if @bins > 1;
      for my $b (@bins) {
	# span of the bin
	my $bset = Set::IntSpan->new(sprintf("%d-%d",$b*$CONF{bin_size},
					     ($b+1)*$CONF{bin_size}-1));
	# intersection between bin and this end of the link
	my $intersect = $bset->intersect( $link->{set}[$end] )->cardinality;
	# the chromosome of this (and other) end
	my $thischr = $link->{chr}[ $end ];
	my $otherchr = $link->{chr}[ ! $end ];
	printdebug($b,$intersect,$thischr,$otherchr);
	$links->{$thischr}{$b}{$otherchr} += $intersect;
      }
    }

    if($CONF{debug}) {
      for my $tc (keys %$links) {
	for my $bin (sort {$a <=> $b} keys %{$links->{$tc}}) {
	  for my $oc (sort {$links->{$tc}{$bin}{$b} <=> $links->{$tc}{$bin}{$a}} 
		      keys %{$links->{$tc}{$bin}}) {
	    printdebug($tc,$bin,$oc,$links->{$tc}{$bin}{$oc});
	    last;
	  }
	}
      }
    }
  }
  return ($links,
	  [sort { my ($x1) = $a =~ /(\d+)/g; my ($x2) = $b=~/(\d+)/g; $x1<=>$x2 } keys %{$chrseen->{chr1}}],
	  [sort { my ($x1) = $a =~ /(\d+)/g; my ($x2) = $b=~/(\d+)/g; $x1<=>$x2 } keys %{$chrseen->{chr2}}]);
}


sub validateconfiguration {

}

################################################################
#
# *** DO NOT EDIT BELOW THIS LINE ***
#
################################################################

sub populateconfiguration {
  foreach my $key (keys %OPT) {
    $CONF{$key} = $OPT{$key};
  }

  # any configuration fields of the form __XXX__ are parsed and replaced with eval(XXX). The configuration
  # can therefore depend on itself.
  #
  # flag = 10
  # note = __2*$CONF{flag}__ # would become 2*10 = 20

  for my $key (keys %CONF) {
    my $value = $CONF{$key};
    while($value =~ /__([^_].+?)__/g) {
      my $source = "__" . $1 . "__";
      my $target = eval $1;
      $value =~ s/\Q$source\E/$target/g;
      #printinfo($source,$target,$value);
    }
    $CONF{$key} = $value;
  }

}

sub loadconfiguration {
  my $file = shift;
  my ($scriptname) = fileparse($0);
  if(-e $file && -r _) {
    # great the file exists
  } elsif (-e "/home/$ENV{LOGNAME}/.$scriptname.conf" && -r _) {
    $file = "/home/$ENV{LOGNAME}/.$scriptname.conf";
  } elsif (-e "$FindBin::RealBin/$scriptname.conf" && -r _) {
    $file = "$FindBin::RealBin/$scriptname.conf";
  } elsif (-e "$FindBin::RealBin/etc/$scriptname.conf" && -r _) {
    $file = "$FindBin::RealBin/etc/$scriptname.conf";
  } elsif (-e "$FindBin::RealBin/../etc/$scriptname.conf" && -r _) {
    $file = "$FindBin::RealBin/../etc/$scriptname.conf";
  } else {
    return undef;
  }
  $OPT{configfile} = $file;
  my $conf = new Config::General(-ConfigFile=>$file,
				 -AllowMultiOptions=>"yes",
				 -LowerCaseNames=>1,
				 -AutoTrue=>1);
  %CONF = $conf->getall;
}

sub printdebug {
  printinfo("debug",@_)  if $CONF{debug};
}

sub printinfo {
  printf("%s\n",join(" ",@_));
}

sub printerr {
  printf STDERR ("%s\n",join(" ",@_));
}
