#!/usr/bin/env perl

# Copyright 2013-2015, Derrick Wood, Jennifer Lu <jlu26@jhmi.edu>
#
# This file is part of the Kraken taxonomic sequence classification system.
#
# Kraken is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Kraken is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Kraken.  If not, see <http://www.gnu.org/licenses/>.

# Reports a summary of Kraken's results.

use strict;
use warnings;
use File::Basename;
use Getopt::Long;

my $PROG = basename $0;
my $KRAKEN_DIR = "#####=KRAKEN_DIR=#####";

# Test to see if the executables got moved, try to recover if we can
if (! -e "$KRAKEN_DIR/classify") {
  use Cwd 'abs_path';
  $KRAKEN_DIR = dirname abs_path($0);
}

require "$KRAKEN_DIR/krakenlib.pm";

my $show_zeros = 0;
my $db_prefix;

GetOptions(
  "help" => \&display_help,
  "version" => \&display_version,
  "show-zeros" => \$show_zeros,
  "db=s" => \$db_prefix,
);

eval { $db_prefix = krakenlib::find_db($db_prefix); };
if ($@) {
  die "$PROG: $@";
}

sub usage {
  my $exit_code = @_ ? shift : 64;
  print STDERR "Usage: $PROG [--db KRAKEN_DB_NAME] [--show-zeros] <kraken output file(s)>\n";
  my $default_db;
  eval { $default_db = krakenlib::find_db(); };
  if (defined $default_db) {
    print STDERR "\n   Default database is \"$default_db\"\n";
  }
  exit $exit_code;
}

sub display_help {
  usage(0);
}

sub display_version {
  print "Kraken version #####=VERSION=#####\n";
  print "Copyright 2013-2019, Derrick Wood, Jennifer Lu (jlu26\@jhmi.edu)\n";
  exit 0;
}

my (%child_lists, %name_map, %rank_map);
load_taxonomy($db_prefix);

my %taxo_counts;
my $seq_count = 0;
$taxo_counts{0} = 0;
while (<>) {
  my @fields = split;
  $taxo_counts{$fields[2]}++;
  $seq_count++;
}
my $classified_count = $seq_count - $taxo_counts{0};

my %clade_counts = %taxo_counts;
dfs_summation(1);

for (keys %name_map) {
  $clade_counts{$_} ||= 0;
}

my $unclassified_percent = 100;
if ($seq_count) {
  $unclassified_percent = $clade_counts{0} * 100 / $seq_count;
}

printf "%6.2f\t%d\t%d\t%s\t%d\t%s%s\n",
  $unclassified_percent,
  $clade_counts{0}, $taxo_counts{0}, "U",
  0, "", "unclassified";
dfs_report(1, 0);

sub dfs_report {
  my $node = shift;
  my $depth = shift;
  if (! $clade_counts{$node} && ! $show_zeros) {
    return;
  }
  printf "%6.2f\t%d\t%d\t%s\t%d\t%s%s\n",
    ($clade_counts{$node} || 0) * 100 / $seq_count,
    ($clade_counts{$node} || 0),
    ($taxo_counts{$node} || 0),
    rank_code($rank_map{$node}),
    $node,
    "  " x $depth,
    $name_map{$node};
  my $children = $child_lists{$node};
  if ($children) {
    my @sorted_children = sort { $clade_counts{$b} <=> $clade_counts{$a} } @$children;
    for my $child (@sorted_children) {
      dfs_report($child, $depth + 1);
    }
  }
}

sub rank_code {
  my $rank = shift;
  for ($rank) {
    $_ eq "species" and return "S";
    $_ eq "genus" and return "G";
    $_ eq "family" and return "F";
    $_ eq "order" and return "O";
    $_ eq "class" and return "C";
    $_ eq "phylum" and return "P";
    $_ eq "kingdom" and return "K";
    $_ eq "superkingdom" and return "D";
  }
  return "-";
}

sub dfs_summation {
  my $node = shift;
  my $children = $child_lists{$node};
  if ($children) {
    for my $child (@$children) {
      dfs_summation($child);
      $clade_counts{$node} += ($clade_counts{$child} || 0);
    }
  }
}

sub load_taxonomy {
  my $prefix = shift;

  open NAMES, "<", "$prefix/taxonomy/names.dmp"
    or die "$PROG: can't open names file: $!\n";
  while (<NAMES>) {
    chomp;
    s/\t\|$//;
    my @fields = split /\t\|\t/;
    my ($node_id, $name, $type) = @fields[0,1,3];
    if ($type eq "scientific name") {
      $name_map{$node_id} = $name;
    }
  }
  close NAMES;

  open NODES, "<", "$prefix/taxonomy/nodes.dmp"
    or die "$PROG: can't open nodes file: $!\n";
  while (<NODES>) {
    chomp;
    my @fields = split /\t\|\t/;
    my ($node_id, $parent_id, $rank) = @fields[0,1,2];
    if ($node_id == 1) {
      $parent_id = 0;
    }
    $child_lists{$parent_id} ||= [];
    push @{ $child_lists{$parent_id} }, $node_id;
    $rank_map{$node_id} = $rank;
  }
  close NODES;
}
