#include <stdlib.h>
#include <stdio.h>
#include <mm_malloc.h>
#include <sys/time.h>
#include <unistd.h>
#include <pthread.h>
#include <alloca.h>
#include "profile.h"
#include "sequence.h"

#ifdef BOTH
#undef USE_TRANSPOSE
#endif

/*
 * Scoring kernels defined outside this file.
 *
 * Both are called once per sequence, after the sequence has been passed
 * through TranslateCharToIndex(), and their unsigned result is stored in the
 * Scores array. heuristic() receives the profile itself; TransposeHeuristic()
 * receives the matrix returned by TransposeAndConvertMatchMatrix() together
 * with the profile's Alphabet_Length and Length.
 */
extern unsigned int heuristic(const struct Profile * const restrict prf, const PFSequence * const restrict Sequence);
extern unsigned int TransposeHeuristic(const int * const restrict TransposeMatch, const size_t Alphabet_Length,
                                const size_t Profile_Length, const PFSequence * const restrict Sequence);
#ifdef SMP
/*
 * Per-thread argument block handed to thread_fctA() / thread_fctB() through
 * the void* parameter of pthread_create(). main() fills one entry per thread.
 */
struct ThreadData {
  /* Profile given to heuristic(); also supplies Alphabet_Mapping,
   * Alphabet_Length and Length to the worker. */
  struct Profile * prf;
  /* Matrix forwarded as first argument of TransposeHeuristic(); main() sets it
   * to the result of TransposeAndConvertMatchMatrix(). */
  int * TransposeMatch;
  /* FASTA index: workers read MaxSequenceSize and DataPtr from it. */
  FASTAStructure * FASTA;
  /* Path of the sequence file; every worker opens it with its own fopen(). */
  char * SequenceFileName;
  /* Output scores, indexed by sequence index. With BOTH defined each entry has
   * two slots: [0] is written by thread_fctA, [1] by thread_fctB. */
#ifdef BOTH
  unsigned int (*Scores)[2];
#else 
  unsigned int * Scores;
#endif
  /* Half-open range [start, stop) of sequence indices this worker processes. */
  size_t start;
  size_t stop;
  /* Written by the worker just before it returns; the workers in this file
   * always store 0. */
  size_t counter; 
};

/*
 * Worker thread entry point: scores sequences with heuristic().
 *
 * _Data points to a struct ThreadData. For every sequence index i in
 * [start, stop) the sequence is read from the thread's own FILE handle,
 * translated with the profile's alphabet mapping and scored; the result is
 * stored in Scores[i] (Scores[i][0] when BOTH is defined).
 *
 * Returns NULL on success and (void*) 1 when the sequence buffer cannot be
 * allocated or the sequence file cannot be opened. On success the counter
 * field of the argument block is set to 0 (no sequence is counted here).
 */
static void *thread_fctA( void * _Data)
{
  struct ThreadData * const Args = (struct ThreadData*) _Data;
  const struct Profile * const restrict prf   = Args->prf;
  const FASTAStructure * const restrict FASTA = Args->FASTA;
#ifdef BOTH
  unsigned int (* const restrict Scores)[2]   = Args->Scores;
#else
  unsigned int * const restrict Scores        = Args->Scores;
#endif
  const size_t Start = Args->start;
  const size_t Stop  = Args->stop;
  Sequence SeqData;

  /* Allocate memory to hold sequence */
  SeqData.Memory = malloc(FASTA->MaxSequenceSize*sizeof(unsigned char));
  if (SeqData.Memory == NULL) {
    fputs("Thread Cannot allocate memory for sequence.\n", stderr);
    return (void*) 1;
  }

  /* Open sequence file: each thread uses a private handle so that file
   * positions are not shared between threads */
  FILE* inSequence = fopen(Args->SequenceFileName, "r");
  if (inSequence == NULL) {
    fprintf(stderr, "Thread cannot open sequence file %s.\n", Args->SequenceFileName);
    free(SeqData.Memory);
    return (void*) 1;
  }

  /* LOOPS ON SEQUENCES */
  for (size_t i=Start; i<Stop; ++i) {
    PFSequence * PFSeq = ReadSequenceIndex(&SeqData, i, inSequence, FASTA->DataPtr);

    /* Translate sequence characters into profile alphabet indices */
    PFSeq = TranslateCharToIndex(PFSeq, prf->Alphabet_Mapping);
#ifdef BOTH
    Scores[i][0] = heuristic(prf, PFSeq);
#else
    Scores[i] = heuristic(prf, PFSeq);
#endif
  }

  /* close sequence file */
  fclose(inSequence);

  /* Free Memory */
  free(SeqData.Memory);

  Args->counter = 0;
  return 0;
}

/*
 * Worker thread entry point: scores sequences with TransposeHeuristic().
 *
 * _Data points to a struct ThreadData. For every sequence index i in
 * [start, stop) the sequence is read from the thread's own FILE handle,
 * translated with the profile's alphabet mapping and scored against the
 * transposed match matrix; the result is stored in Scores[i] (Scores[i][1]
 * when BOTH is defined).
 *
 * Returns NULL on success and (void*) 1 when the sequence buffer cannot be
 * allocated or the sequence file cannot be opened. On success the counter
 * field of the argument block is set to 0 (no sequence is counted here).
 */
static void *thread_fctB( void * _Data)
{
  struct ThreadData * const Args = (struct ThreadData*) _Data;
  const struct Profile * const restrict prf   = Args->prf;
  const FASTAStructure * const restrict FASTA = Args->FASTA;
#ifdef BOTH
  unsigned int (* const restrict Scores)[2]   = Args->Scores;
#else
  unsigned int * const restrict Scores        = Args->Scores;
#endif
  const int * const restrict TransposeMatch   = Args->TransposeMatch;
  const size_t Start = Args->start;
  const size_t Stop  = Args->stop;
  Sequence SeqData;

  /* Allocate memory to hold sequence */
  SeqData.Memory = malloc(FASTA->MaxSequenceSize*sizeof(unsigned char));
  if (SeqData.Memory == NULL) {
    fputs("Thread Cannot allocate memory for sequence.\n", stderr);
    return (void*) 1;
  }

  /* Open sequence file: each thread uses a private handle so that file
   * positions are not shared between threads */
  FILE* inSequence = fopen(Args->SequenceFileName, "r");
  if (inSequence == NULL) {
    fprintf(stderr, "Thread cannot open sequence file %s.\n", Args->SequenceFileName);
    free(SeqData.Memory);
    return (void*) 1;
  }

  /* LOOPS ON SEQUENCES */
  for (size_t i=Start; i<Stop; ++i) {
    PFSequence * PFSeq = ReadSequenceIndex(&SeqData, i, inSequence, FASTA->DataPtr);

    /* Translate sequence characters into profile alphabet indices */
    PFSeq = TranslateCharToIndex(PFSeq, prf->Alphabet_Mapping);

#ifdef BOTH
    Scores[i][1] = TransposeHeuristic(TransposeMatch, prf->Alphabet_Length,
                                      prf->Length, PFSeq);
#else
    Scores[i]    = TransposeHeuristic(TransposeMatch, prf->Alphabet_Length,
                                      prf->Length, PFSeq);
#endif
  }

  /* close sequence file */
  fclose(inSequence);

  /* Free Memory */
  free(SeqData.Memory);

  Args->counter = 0;
  return 0;
}
#endif

/* Wall-clock seconds elapsed between two gettimeofday() samples. */
static double ElapsedSeconds(const struct timeval * const start, const struct timeval * const stop)
{
  return (double) (stop->tv_sec - start->tv_sec) + (double) (stop->tv_usec - start->tv_usec) * 0.000001;
}

#ifdef SMP
/*
 * Start one thread per entry of args running fct, then wait for all of them.
 * Threads that were started are always joined, even when a later
 * pthread_create() fails, because they reference memory owned by the caller.
 * Returns 0 when every thread was created, joined and returned NULL,
 * 1 otherwise.
 */
static int RunWorkers(pthread_t * const threads, struct ThreadData * const args,
                      const size_t count, void *(*fct)(void *))
{
  int failed = 0;
  size_t started = 0;

  while (started < count) {
    if (pthread_create(&threads[started], NULL, fct, (void*) &args[started]) != 0) {
      fputs("Unable to create thread.\n", stderr);
      failed = 1;
      break;
    }
    ++started;
  }

  for (size_t i=0; i<started; ++i) {
    void * status = NULL;
    if (pthread_join(threads[i], &status) != 0 || status != NULL) failed = 1;
  }
  return failed;
}
#endif

/*
 * Usage: <program> profile FASTA [threads]
 *
 * Reads the profile (argv[1]) and indexes the FASTA file (argv[2]), then
 * computes a heuristic score for every sequence.
 *
 * Without SMP the sequences are scored serially and nothing is printed on
 * stdout. With SMP the sequences are split between threads according to file
 * offsets, scored with heuristic() (or TransposeHeuristic() when
 * USE_TRANSPOSE is defined, or both in turn when BOTH is defined), and one
 * "name<TAB>score" line per sequence is written to stdout. The thread count
 * is argv[3] when exactly three arguments are given, otherwise the number of
 * configured processors.
 *
 * Returns 0 on success, 1 on any error.
 */
int main (int argc, char *argv[])
{
  struct Profile prf;
  FASTAStructure FASTA;
  struct timeval t0, t1;
  int rc = 1;

  if (argc < 3) { fputs("provide profile and FASTA file\n", stderr); return 1;}

  /* Read the profile */
  if (ReadProfile(argv[1], &prf) != 0) {
    fputs("Error found.\n", stderr);
    return 1;
  }

  /* Read the FASTA file */
  if (AnalyzeFASTAStructure(argv[2], &FASTA) != 0) {
    fputs("Error found.\n", stderr);
    FreeProfile(&prf);
    return 1;
  }

  /* Transpose Match Matrix for second heuristic.
   * NOTE(review): TIMatch is never released here because this file does not
   * show how TransposeAndConvertMatchMatrix() allocates it -- confirm. */
  gettimeofday(&t0,0);
  const int * TIMatch = TransposeAndConvertMatchMatrix(&(prf.Scores.Match), prf.Alphabet_Length, prf.Length);
  gettimeofday(&t1,0);
  fprintf(stderr,"Transposing Match matrix took %lf seconds.\n", ElapsedSeconds(&t0, &t1));

#ifdef BOTH
  unsigned int (* restrict const Scores)[2] = (unsigned int (*)[2])  _mm_malloc(2*FASTA.SequenceCount*sizeof(unsigned int), 64);
#else
  unsigned int * restrict const Scores = (unsigned int *)  _mm_malloc(FASTA.SequenceCount*sizeof(unsigned int), 64);
#endif
  /* A zero-byte request may legitimately yield NULL, so only treat NULL as an
   * error when there is something to store */
  if (Scores == NULL && FASTA.SequenceCount > 0) {
    fputs("Cannot allocate memory for scores.\n", stderr);
    FreeProfile(&prf);
    return 1;
  }

#ifndef SMP
  Sequence SeqData;
  FILE* inSequence = NULL;

#ifndef BOTH
  (void) TIMatch; /* only needed by the transpose heuristic */
#endif

  /* Allocate memory to hold sequence */
  SeqData.Memory = malloc(FASTA.MaxSequenceSize*sizeof(unsigned char));
  if (SeqData.Memory == NULL) {
    fputs("Cannot allocate menmory for sequence.\n", stderr);
    goto cleanup;
  }

  /* Open sequence file*/
  inSequence = fopen(argv[2], "r");
  if (inSequence == NULL) {
    fprintf(stderr, "Cannot open sequence file %s.\n", argv[2]);
    goto cleanup;
  }

  /* LOOPS ON SEQUENCES */
  for (size_t i=0; i<FASTA.SequenceCount; ++i) {
    PFSequence * PFSeq = ReadSequenceIndex(&SeqData, i, inSequence, FASTA.DataPtr);

    /* Translate sequence characters into profile alphabet indices */
    PFSeq = TranslateCharToIndex(PFSeq, prf.Alphabet_Mapping);

#ifdef BOTH
    /* Scores entries are two-slot arrays in this configuration */
    Scores[i][0] = heuristic(&prf, PFSeq);
    Scores[i][1] = TransposeHeuristic(TIMatch, prf.Alphabet_Length, prf.Length, PFSeq);
#else
    Scores[i] = heuristic(&prf, PFSeq);
#endif
  }

  rc = 0;

cleanup:
  if (inSequence != NULL) fclose(inSequence);
  free(SeqData.Memory);
#else
  size_t * shares = NULL;
  struct ThreadData * threads_arg = NULL;
  pthread_t * threads = NULL;
  FILE* inSequence = NULL;
  char Buffer[256] __attribute__((aligned(16)));
  size_t nCPUs;

  /* Retrieve number of threads: command line first, configured cores otherwise */
  if (argc == 4) {
    const long requested = strtol(argv[3], NULL, 10);
    if (requested <= 0) {
      fprintf(stderr, "Invalid number of threads: %s\n", argv[3]);
      goto cleanup;
    }
    nCPUs = (size_t) requested;
  } else {
    /* sysconf() returns -1 when the value is unavailable */
    const long configured = sysconf(_SC_NPROCESSORS_CONF);
    nCPUs = configured > 0 ? (size_t) configured : 1;
  }

  /* Heap rather than alloca(): the count may come straight from the user */
  shares      = calloc(nCPUs, sizeof *shares);
  threads_arg = calloc(nCPUs, sizeof *threads_arg);
  threads     = calloc(nCPUs, sizeof *threads);
  if (shares == NULL || threads_arg == NULL || threads == NULL) {
    fputs("Cannot allocate memory for threads.\n", stderr);
    goto cleanup;
  }

  /* Share according to file size: thread i starts at the first sequence whose
   * file offset reaches i*FileShare */
  {
    const size_t FileShare = (size_t) FASTA.FileSize / nCPUs;
    size_t counter = 0;
    shares[0] = 0;
    for (size_t i=1; i<nCPUs; ++i) {
      const size_t tmp = i*FileShare;
      /* Bounded by SequenceCount so the index array is never overrun when
       * the remaining sequences all start before the threshold */
      while (counter < FASTA.SequenceCount && (size_t) FASTA.DataPtr[counter].Offset < tmp) ++counter;
      shares[i] = counter;
    }
  }

  for (size_t i=0; i<nCPUs; ++i) {
    threads_arg[i].prf   = &prf;
    /* The workers only read the matrix; the struct field is not const-qualified */
    threads_arg[i].TransposeMatch = (int *) TIMatch;
    threads_arg[i].FASTA = &FASTA;
    threads_arg[i].SequenceFileName = argv[2];
    threads_arg[i].Scores = Scores;
    threads_arg[i].start = shares[i];
    /* The last thread takes everything up to the end of the file */
    threads_arg[i].stop  = (i+1 < nCPUs) ? shares[i+1] : FASTA.SequenceCount;
    threads_arg[i].counter = 0;
  }

  /* Inner loop on profile */
  gettimeofday(&t0,0);
#ifndef USE_TRANSPOSE
  if (RunWorkers(threads, threads_arg, nCPUs, thread_fctA) != 0) {
#else
  if (RunWorkers(threads, threads_arg, nCPUs, thread_fctB) != 0) {
#endif
    fputs("Worker thread failed.\n", stderr);
    goto cleanup;
  }
  gettimeofday(&t1,0);
#ifndef USE_TRANSPOSE
  fprintf(stderr,"heuristic took %lf seconds to treat on %zu cores.\n", ElapsedSeconds(&t0, &t1), nCPUs);
#else
  fprintf(stderr,"Transpose heuristic took %lf seconds to treat on %zu cores.\n", ElapsedSeconds(&t0, &t1), nCPUs);
#endif

  /* Inner loop on sequence */
#ifdef BOTH
  gettimeofday(&t0,0);
  if (RunWorkers(threads, threads_arg, nCPUs, thread_fctB) != 0) {
    fputs("Worker thread failed.\n", stderr);
    goto cleanup;
  }
  gettimeofday(&t1,0);
  fprintf(stderr,"Transpose heuristic took %lf seconds to treat on %zu cores.\n", ElapsedSeconds(&t0, &t1), nCPUs);
#endif

  /* Open sequence file to report one line per sequence */
  inSequence = fopen(argv[2], "r");
  if (inSequence == NULL) {
    fprintf(stderr, "Cannot open sequence file %s.\n", argv[2]);
    goto cleanup;
  }
  for (size_t i=0; i<FASTA.SequenceCount; ++i) {
    ReadSequenceNameIndex(Buffer, i, inSequence, FASTA.DataPtr);
#ifdef BOTH
    printf("%s\t%u\t%u\n", Buffer, Scores[i][0], Scores[i][1]);
#else
    printf("%s\t%u\n", Buffer, Scores[i]);
#endif
  }

  rc = 0;

cleanup:
  if (inSequence != NULL) fclose(inSequence);
  free(threads);
  free(threads_arg);
  free(shares);
#endif

  /* Free Memory */
  _mm_free(Scores);
  FreeProfile(&prf);
  return rc;
}
