#!/usr/bin/env perl

=pod

=head1 NAME

matrix - convert data in x,y,z format to a tabular format

=head1 SYNOPSIS

  matrix FILE [-delim CHR] [-outdelim CHR] [-missing STR] [-width 5] [-total] [-title jobs] [-nsort] [-sort] [-byrow] [-bycol] [-sum]

  # or

  cat FILE | matrix {OPTIONS}

=head1 DESCRIPTION

This script converts data reported in x,y,z format

  > cat matrix.txt
  # 3 x 5 matrix
  r1 c1 11
  r2 c1 21
  r3 c1 31
  r1 c2 12
  r2 c2 22
  r3 c2 32
  r1 c3 13
  r2 c3 23
  r3 c3 33
  r1 c4 14
  r2 c4 24
  r3 c4 34
  r1 c5 15
  r2 c5 25
  r3 c5 35

into a tabular format

  > cat matrix.txt | matrix -title "xx"
  xx c1 c2 c3 c4 c5
  r1 11 12 13 14 15
  r2 21 22 23 24 25
  r3 31 32 33 34 35

=head1 OPTIONS

=head2 Row vs Column Dominance

By default the input is expected to be of the form

  row col value

You can specify -byrow to explicitly remind yourself of this fact. 

If your input has the column as the first field

  col row value

then use -bycol

  > cat matrix.txt| ./matrix -title "xx" -bycol
  xx r1 r2 r3
  c1 11 21 31
  c2 12 22 32
  c3 13 23 33
  c4 14 24 34
  c5 15 25 35

=head2 Missing Data

Missing data will be denoted by "-", by default. 

  > cat matrix.missing.txt | ./matrix -title "xx"
  xx c1 c2 c3 c4 c5
  r1 11 12 - - 15
  r2 21 - 23 - -
  r3 - 32 33 34 -

The missing token can be adjusted with -missing.

  > cat matrix.missing.txt | ./matrix -title "xx" -missing 0
  xx c1 c2 c3 c4 c5
  r1 11 12 0 0 15
  r2 21 0 23 0 0
  r3 0 32 33 34 0

=head2 Aggregating Values

If your input refers to a row,col pair several times

  r1 c3 1
  r1 c3 1.5
  r1 c3 2.5

then by default an error will be returned. 

You have the option of using the numerical sum of the entries with -sum. The result is equivalent to the input

  r1 c3 5

=head2 Row and Column Sum Statistics

Using -total will add a final row and column that gives the column and row totals, as well as the grand total.

  > cat matrix.missing.txt | ./matrix -title "xx" -sum -tot -width 5
     xx    c1    c2    c3    c4    c5 total
     r1    11    12     -     -    15    38
     r2    21     -    23     -     -    44
     r3     -    32    33    34     -    99
  total    32    44    56    34    15   181

=head2 Row and Column Order

By default the order of rows and columns is arbitrary. 

You can apply asciibetic or numerical sort on the row and column entries using -sort and -nsort.

  > cat matrix.sort.txt
  # 3 x 3 matrix, unsorted
  b b bb
  c a ca
  b a ba
  a a aa
  a b ab
  a c ac

  > cat matrix.sort.txt | ./matrix -title "x" -width 3
  x   c   a   b
  c   -  ca   -
  a  ac  aa  ab
  b   -  ba  bb

  > cat matrix.sort.txt | ./matrix -title "x" -width 3 -sort
  x   a   b   c
  a  aa  ab  ac
  b  ba  bb   -
  c  ca   -   -

=head1 OTHER OPTIONS

=head2 -missing

Missing data string. You'll probably want fixed-field rows, and "-" does nicely here. You can force missing data to be 0 by using -missing 0.

=head2 -outdelim

Output field delimiter. This is a space by default.

=head2 -delim

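Using -delim STRING sets the field separator. By default this is general whitespace, which should be correct for most cases (space separated, tab separated). Any run of one or more of the characters in STRING is treated as a single separator, so for comma-separated input use -delim ",". To specify a tab delimiter explicitly, use -delim "tab".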

=head2 -width

Adjust the width of the output field. 

  > ./matrix matrix.width.txt 
  table c a b d
  4 4c - - -
  1 - 1a 1b -
  3 - - 3b -
  30 30c 30a - 30d
  10 - - 10b -
  2 - 2a - 2d
  15 - - - 15d
  5 - 5a - -

  > ./matrix matrix.width.txt -width 5    
  table     c     a     b     d
      4    4c     -     -     -
      1     -    1a    1b     -
      3     -     -    3b     -
     30   30c   30a     -   30d
     10     -     -   10b     -
      2     -    2a     -    2d
     15     -     -     -   15d
      5     -    5a     -     -

=head1 HISTORY

=over 

=item 3 Jun 2014

Added -sort, -nsort, -byrow, -bycol.

Expanded documentation.

=item 19 Mar 2014

Fixed bug in column headers.

Added totals with -tot.

=item 18 Feb 2005

Added variable output field width.

=item 10 May 2004

Created.

=back 

=head1 BUGS

=head1 AUTHOR

Martin Krzywinski

=head1 CONTACT

  Martin Krzywinski
  Genome Sciences Centre
  Vancouver BC Canada
  www.bcgsc.ca
  martink@bcgsc.ca

=cut

use strict;
use Config::General;
use Data::Dumper;
use FindBin;
use File::Basename;
use Math::VecStat qw(min max average sum minabs maxabs);
use Getopt::Long;
use Pod::Usage;
use lib "$FindBin::RealBin";
use lib "$FindBin::RealBin/../lib";
use lib "$FindBin::RealBin/lib";
use vars qw(%OPT %CONF);

#use Statistics::Descriptive;
#use Math::Round;
use Regexp::Common;

# Parse the command line into %OPT. Plain names are boolean flags, "=s"
# options take a string, "=i" an integer, and "debug+" counts how many times
# -debug was given. "configfile" is declared here because its value is handed
# to loadconfiguration() below; without it a configuration file could never
# be named on the command line.
GetOptions(\%OPT,
           "configfile=s",
           "delim=s",
           "outdelim=s",
           "missing=s",
           "title=s",
           "width=i",
           "sort",
           "nsort",
           "byrow",
           "bycol",
           "sum",
           "total",
           "version",
           "man","help","debug+");

our $VERSION = 0.1;

# -version: report the version number and stop
if($OPT{version}) {
  printinfo($VERSION);
  exit;
}

# -help prints the usage summary, -man the full manual page
pod2usage() if $OPT{help};
pod2usage(-verbose=>2) if $OPT{man};

# Build %CONF: values from a configuration file (if one is found) are loaded
# first, command-line options are copied over them, and finally defaults are
# filled in for anything still unset.
loadconfiguration($OPT{configfile});
populateconfiguration(); # copy command line options to config hash
validateconfiguration();

# Dump the effective configuration when the debug level is above 1
# (for example, -debug -debug).
if($CONF{debug} > 1) {
  $Data::Dumper::Pad = "debug parameters";
  $Data::Dumper::Indent = 1;
  $Data::Dumper::Quotekeys = 0;
  $Data::Dumper::Terse = 1;
  print Dumper(\%CONF);
}

# Choose the input stream: the file named by the first remaining
# command-line argument, or STDIN when no file is given.
my $inputhandle;
if(defined $ARGV[0] && length $ARGV[0]) {
  my $file = $ARGV[0];
  die "No such file $file" unless -e $file;
  # Three-argument open on a lexical handle: the file name can never be
  # mistaken for an open mode, and a failure (e.g. no read permission) is
  # reported instead of silently producing an empty table.
  open($inputhandle, "<", $file) or die "Cannot open file $file: $!";
} else {
  $inputhandle = \*STDIN;
}

# Read the input line by line and fill in the matrix.
#
#   $matrix->{ROW}{COL}  cell value
#   %x, %y               row and column labels seen so far
my $matrix;
my (%x,%y);
while(<$inputhandle>) {
  chomp;
  # skip comment lines and blank lines
  next if /^\s*#/;
  next if /^\s*$/;
  # any run of delimiter characters separates two fields
  my @tok = split(/[$CONF{delim}]+/,$_);
  die "Expecting exactly 3 fields on each line. Saw [$_]" unless @tok == 3;
  my ($x,$y,$z) = @tok;
  # with -bycol the first field is the column and the second is the row
  ($x,$y) = ($y,$x) if $CONF{bycol};
  if($CONF{sum}) {
    # The pattern is anchored at both ends so the whole field must be a
    # number; unanchored, values such as "abc1" would slip through and be
    # summed as 0.
    die "Entry [$z] for $x,$y is not numerical but you asked for -sum" unless $z =~ /^$RE{num}{real}$/;
    $matrix->{$x}{$y} += $z;
  } else {
    # without -sum each row,col pair may appear only once
    die "Duplicate entry for $x,$y" if exists $matrix->{$x}{$y};
    $matrix->{$x}{$y} = $z;
  }
  $x{$x} ||= 1;
  $y{$y} ||= 1;
}

# Row labels (@x) and column labels (@y). They come out in hash order unless
# a sort was requested; -nsort (numerical) takes precedence over -sort
# (string comparison) when both are given.
my @x = keys %x;
my @y = keys %y;

for my $labels (\@x, \@y) {
  if($CONF{nsort}) {
    @$labels = sort {$a <=> $b} @$labels;
  } elsif ($CONF{sort}) {
    @$labels = sort @$labels;
  }
}

# -total: add a "total" row and a "total" column, plus the grand total in
# their intersection.
if($CONF{total}) {
  # The totals are stored in the matrix itself under the label "total". A
  # data row or column carrying that label would be overwritten and double
  # counted while summing, so refuse such input rather than print wrong
  # numbers.
  die "Row or column label [total] conflicts with -total" if grep { $_ eq "total" } @x, @y;

  my $sum = 0;
  for my $x (@x) {
    for my $y (@y) {
      # a missing cell is undef and contributes 0 to every total
      my $value = $matrix->{$x}{$y};
      $sum += $value;
      $matrix->{total}{$y} += $value;
      $matrix->{$x}{total} += $value;
    }
  }
  $matrix->{total}{total} = $sum;

  # the totals are printed as the last column and the last row
  push @y, "total";
  push @x, "total";
}

# header: the table title followed by the column labels, each formatted to
# the requested field width
my @row = map {sprintf("%$CONF{width}s",$_)} ($CONF{title},@y);
print join($CONF{outdelim},@row),"\n";

# rows: the row label followed by one cell per column
foreach my $x (@x) {
  my @values = ($x);
  foreach my $y (@y) {
    my $v = $matrix->{$x}{$y};
    # Test for definedness rather than truth so that a legitimate value of
    # 0 is printed as 0 and not replaced by the missing-data token.
    push(@values, defined $v ? $v : $CONF{missing});
  }
  @values = map {sprintf("%$CONF{width}s",$_)} @values;
  print join($CONF{outdelim},@values),"\n";
}

# Fill in defaults for any option that was not set on the command line or in
# a configuration file, and translate the symbolic delimiter name "tab".
# Works on the global %CONF; takes no arguments.
sub validateconfiguration {
  # These options count as unset only when undefined or empty, so that an
  # explicit "0" (e.g. -title 0) is honoured instead of being replaced.
  my %default = (
    delim    => "\\s",
    title    => "table",
    outdelim => " ",
  );
  for my $key (keys %default) {
    next if defined $CONF{$key} && length $CONF{$key};
    $CONF{$key} = $default{$key};
  }
  # -delim tab (any letter case) selects a literal tab character
  $CONF{delim}    = "\t" if $CONF{delim} =~ /^tab$/i;
  $CONF{missing}  = "-" if ! defined $CONF{missing};
  # an empty width turns the "%<width>s" output format into a plain "%s"
  $CONF{width}    ||= "";
}

################################################################
#
# *** Do NOT EDIT BELOW THIS LINE ***
#
################################################################
################################################################
################################################################
################################################################


# Copy every command-line option from %OPT into %CONF, overriding any value
# of the same name that is already there.
sub populateconfiguration {
  # keys and values walk the hash in the same order, so the slice pairs
  # each option name with its own value
  @CONF{keys %OPT} = values %OPT;
}

# Locate a configuration file and load it into %CONF.
#
# The argument is an optional path to a configuration file. The first
# existing, readable file among these candidates is used:
#
#   1. the path passed in
#   2. /home/$LOGNAME/.<scriptname>.conf
#   3. <script directory>/<scriptname>.conf
#   4. <script directory>/etc/<scriptname>.conf
#
# Returns undef, leaving %CONF and %OPT untouched, when none of them exists.
# Otherwise records the chosen path in $OPT{configfile} and replaces the
# contents of %CONF with the parsed file.
sub loadconfiguration {
  my $file = shift;
  my ($scriptname) = fileparse($0);
  my @candidates = (
    $file,
    "/home/$ENV{LOGNAME}/.$scriptname.conf",
    "$FindBin::RealBin/$scriptname.conf",
    "$FindBin::RealBin/etc/$scriptname.conf",
  );
  my $found;
  for my $candidate (@candidates) {
    # "_" reuses the stat information gathered by the preceding -e test
    if(-e $candidate && -r _) {
      $found = $candidate;
      last;
    }
  }
  return undef unless defined $found;
  $file = $found;
  $OPT{configfile} = $file;
  my $conf = Config::General->new(-ConfigFile=>$file,
                                  -AllowMultiOptions=>"yes",
                                  -LowerCaseNames=>1,
                                  -AutoTrue=>1);
  %CONF = $conf->getall;
}

# Print the arguments through printinfo() with the prefix "debug", but only
# when the debug setting in %CONF is true; otherwise print nothing.
sub printdebug {
  $CONF{debug} and printinfo("debug",@_);
}

# Print a message followed by a space-separated list of values and a newline.
#
# The first argument is the message; when it is true a single space is
# appended to separate it from the values. The remaining arguments are the
# values. A value that looks like a decimal number with five or more
# characters after the decimal point is shortened to four decimals, in
# exponent notation if it contains an "e" and fixed-point notation otherwise.
#
# Returns the result of printf.
sub printinfo {
  # Work on copies: formatting through the @_ aliases would change the
  # caller's variables and die on read-only arguments such as literals.
  my ($message, @fields) = @_;
  $message .= " " if $message;
  for my $field (@fields) {
    next unless $field =~ /^-?\d+\.[\de-]{5,}$/;
    my $format = $field =~ /e/ ? "%0.4e" : "%0.4f";
    $field = sprintf($format,$field);
  }
  printf("%s%s\n",$message,join(" ",@fields));
}


