#include <errno.h>
#include <limits.h>
#include <malloc.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <getopt.h>

#include <mpi.h>

// Largest message size accepted from the command line; main() refuses to
// run when the requested size exceeds it.
const int MAX_BUF_SIZE = 100000;
// Number of untimed warm-up iterations performed by latency() before the
// timer is started.
const int SKIP_NUM     = 100;
// Loop count used when the command line does not specify one.
const int DEF_LOOP_NUM = 200;
// Default message sizes (counted in MPI_CHAR elements) for the two tests.
const int DEF_MSG_SIZE_LATENCY   = 4;
const int DEF_MSG_SIZE_BANDWIDTH = 100000;

// Return the larger of the two integers (b when they compare equal).
int imax(int a, int b)
{
    if (a > b) {
        return a;
    }
    return b;
}

// Statistics accumulated over the results of all rank pairs.
// Filled in by get_global_statistics() and printed by main() on rank 0.
struct glob_stat_st {
    // Running sum of every result seen so far (used for the average).
    double   sum;

    // Smallest result seen so far and the two ranks it was measured between.
    double   min;
    int      min_rank1, min_rank2;

    // Largest result seen so far and the two ranks it was measured between.
    double   max;
    int      max_rank1, max_rank2;
} glob_stat;

/*
 * Check the return code of an MPI call.
 *
 * Does nothing when ierr is MPI_SUCCESS. Otherwise the MPI error text is
 * written to stderr, the whole job is aborted through MPI_Abort() and the
 * process exits with status 1.
 *
 * Declared "static inline": a plain "inline" definition without a matching
 * external definition is not guaranteed to link under C99/C11 rules when the
 * compiler decides not to inline a call.
 */
static inline void on_mpi_error_report_and_abort( int ierr) {
    char err_str[MPI_MAX_ERROR_STRING];
    int  err_str_len = 0;

    if( ierr == MPI_SUCCESS) {
        return;
    }

    MPI_Error_string( ierr, err_str, &err_str_len );
    fprintf( stderr, "%s\n", err_str);
    MPI_Abort( MPI_COMM_WORLD, ierr );
    /* Fallback in case MPI_Abort() returns. */
    exit( 1 );
}

/* Forward declarations of the reporting helpers defined after main(). */

/* Print the results of one rank in the currently selected output format. */
void    print_rank_results( int rank, double *results,
                            int numprocs, double limit);
/* Replace results[0..4] with min, min rank, max, max rank and average. */
void    calc_rank_statistics(double *results, int numprocs, int myid);
/* Print the header that matches the currently selected output format. */
void    print_result_title(int numprocs);
/* Fold the results of rank `id` into the global min/max/sum statistics. */
void    get_global_statistics(double           *results,
                           int                 numprocs,
                           struct glob_stat_st *glob_stat,
                           int                 id);

// Blocking factor: the maximum number of nonblocking requests the bandwidth
// loops keep outstanding at any time. It can be overridden at compile time by
// defining BLOCKING_FACTROR (note the spelling of the macro name).
#ifndef BLOCKING_FACTROR
// Suggested values (presumably neighbour counts of a 3-D decomposition):
// 6: up, down, left, right, front, rear
// 26: 6 + diagonal
#define BLOCKING_FACTROR 26
#endif
const int blocking_factor = BLOCKING_FACTROR;
// Request slots and status scratch space shared by all send/receive loops.
static MPI_Request request[BLOCKING_FACTROR];
static MPI_Status rcv_stat[BLOCKING_FACTROR];

/* Report formats selectable with the -o option. */
typedef enum {
   OUTPUT_TYPE_TABLE,       /* one row per rank, one column per partner   */
   OUTPUT_TYPE_STATISTICS,  /* min / max / average per rank               */
   OUTPUT_TYPE_PAIRS,       /* one line per rank pair                     */
} output_type_t;

/* Currently selected report format; per-pair output is the default. */
static output_type_t output_type   = OUTPUT_TYPE_PAIRS;

/* Non-zero: run the bandwidth test (default); zero: run the latency test. */
static int bandwidth_test=1;

/*
 * Return the smallest power of two that is greater than or equal to num.
 * Any num of 1 or less yields 1.
 */
static int calc_num_rounds(int num);

static int calc_num_rounds(int num)
{
  int power = 1;

  while (power < num)
  {
    power <<= 1;
  }

  return power;
}

/*
 * Print the command-line help to stderr.
 *
 * progname: program name shown in the synopsis line.
 * myid:     rank of the caller; only rank 0 prints, every other rank
 *           returns immediately so the text appears once per job.
 */
void usage( char *progname, int myid )
{
  if(myid != 0)
    return;

  fprintf(stderr, "Usage:\n");
  fprintf(stderr, "%s [-t=test_type] [-o=output_type] [-l=loop_num] [-s=msg_size] [limit] \n",
          progname ); 

  /* The three %d placeholders are filled from the compiled-in defaults. */
  fprintf(stderr,
          "    test_type:   b - bandwidth (default)\n"
          "                 l - latency\n"
          "    output_type: p - results for every rank pair (default) \n"
          "                 t - table\n"
          "                 s - statistics per rank - average, min, max\n" 
          "    loop_num:    number of loops per every round (default is %d)\n"
          "    msg_size:    message size  (default is %d for bandwidth, %d for latency)\n"
          "    limit:       if defined,\n"
          "                 only results that are worse than the limit are printed\n",
          DEF_LOOP_NUM,
          DEF_MSG_SIZE_BANDWIDTH,
          DEF_MSG_SIZE_LATENCY
 );

}

/*
 * Long-option table handed to getopt_long_only().
 * The position of each entry matters: read_options() dispatches on the
 * index reported by getopt (0 = t, 1 = o, 2 = l, 3 = s, 4 = h, 5 = help).
 */
static struct option option_table[] = {
    {"t", required_argument, 0, 0},   /* test type              */
    {"o", required_argument, 0, 0},   /* output type            */
    {"l", required_argument, 0, 0},   /* loops per round        */
    {"s", required_argument, 0, 0},   /* message size           */
    {"h", no_argument, 0, 0},         /* help                   */
    {"help",no_argument, 0, 0}, 
    {0, 0, 0, 0}                      /* terminator             */
};

/*
 * Parse a non-negative decimal integer.
 *
 * Returns 0 and stores the number in *value on success. Returns -1, leaving
 * *value untouched, when text is empty, has trailing characters, is negative
 * or does not fit in an int.
 */
static int parse_nonneg_int(const char *text, int *value)
{
  char *end = NULL;
  long  parsed;

  errno  = 0;
  parsed = strtol(text, &end, 10);
  if (end == text || *end != '\0' || errno == ERANGE ||
      parsed < 0 || parsed > INT_MAX)
    return -1;

  *value = (int)parsed;
  return 0;
}

/*
 * Parse the command line.
 *
 * argv, argc: the program arguments (note the unusual order).
 * myid:       rank of the caller, forwarded to usage() so that the help
 *             text is printed by rank 0 only.
 * buf_size:   in/out message size; a value of 0 after parsing is replaced
 *             by the default for the selected test.
 * loops:      in/out loop count; a value of 0 after parsing is replaced by
 *             DEF_LOOP_NUM.
 * limit:      set from the first non-option argument when one is present,
 *             otherwise left unchanged.
 *
 * Side effects: updates the file-scope settings bandwidth_test and
 * output_type.
 *
 * Returns 0 when the test should run and -1 when it should not (help was
 * requested or the command line was invalid).
 */
int read_options( char **argv, int argc, int  myid,
                  int *buf_size, int *loops, double *limit)
{
  int c, option_index;
  int retval = 0;

  do {
    c = getopt_long_only(argc, argv, "", option_table, &option_index);
    switch (c)
    {
    case '?':
    case ':':
      /* Unrecognised option or missing argument: refuse to run the test
         with a command line that was only partially understood. */
      usage(argv[0], myid);
      retval = -1;
      break;
    case EOF:
      break;
    case 0:
      /* option_index is the position of the match in option_table. */
      switch (option_index) 
      {
      case 0: /* -t: test type */
        if(!strcmp( optarg, "b"))
          bandwidth_test=1;
        else if(!strcmp( optarg, "l"))
          bandwidth_test=0;
        else 
        {
            fprintf(stderr, "Unknown test type\n");
            usage(argv[0], myid);
            retval = -1;
        }
        break;
      case 1: /* -o: output type */
        if(!strcmp( optarg, "t"))
          output_type   =OUTPUT_TYPE_TABLE;
        else if(!strcmp( optarg, "s"))
          output_type   =OUTPUT_TYPE_STATISTICS;
        else if(!strcmp( optarg, "p"))
          output_type   =OUTPUT_TYPE_PAIRS;
        else 
        {
            fprintf(stderr, "Unknown output type\n");
            usage(argv[0], myid);
            retval = -1;
        }
        break;
      case 2: /* -l: loops per round */
        if(parse_nonneg_int(optarg, loops) < 0)
        {
            fprintf(stderr, "Invalid loop number '%s'\n", optarg);
            usage(argv[0], myid);
            retval = -1;
        }
        break;
      case 3: /* -s: message size */
        if(parse_nonneg_int(optarg, buf_size) < 0)
        {
            fprintf(stderr, "Invalid message size '%s'\n", optarg);
            usage(argv[0], myid);
            retval = -1;
        }
        break;
      case 4: /* -h */
      case 5: /* -help */
        usage(argv[0], myid);
        retval = -1;
        break;
      default:
        fprintf(stderr, "Unknown option\n");
        usage(argv[0], myid);
        retval = -1;
        break;
      }
      break;
    default:
      fprintf(stderr, "Unreachable statement!\n");
      usage(argv[0], myid);
      retval = -1;
      break;
    }
  } while (c != EOF);

  if(retval)
    return retval;

  /* Zero means "not given": fall back to the compiled-in defaults. */
  if(*loops    == 0) *loops    = DEF_LOOP_NUM;
  if(*buf_size == 0) *buf_size = bandwidth_test?DEF_MSG_SIZE_BANDWIDTH:DEF_MSG_SIZE_LATENCY;

  /* The first non-option argument, if any, is the reporting limit. */
  if (argc - optind > 0)
  {
    char   *end    = NULL;
    double  parsed = strtod(argv[optind], &end);

    if (end == argv[optind])
    {
      fprintf(stderr, "Invalid limit '%s'\n", argv[optind]);
      usage(argv[0], myid);
      return -1;
    }
    *limit = parsed;
  }

  return retval;
}

/*
 * The send loop for the bandwidth calculation.
 *
 * After a 4-byte handshake with the partner (tag 2) the function sends
 * `loops` messages of `buf_size` chars (tag 1) using nonblocking sends,
 * keeping at most `blocking_factor` of them outstanding, and returns once
 * all of them have completed.
 *
 * myid:     rank of the caller (not used by the loop itself).
 * partner:  rank that runs the matching bandwidth_recvloop().
 * s_buf:    send buffer; must hold at least max(buf_size, 4) bytes.
 * r_buf:    receive buffer for the handshake; at least 4 bytes.
 * buf_size: size of every data message, in chars.
 * loops:    number of data messages to send.
 *
 * Any MPI error aborts the job.
 */
void  bandwidth_sendloop(int   myid,
                  int   partner,
                  void  *s_buf,
                  void  *r_buf,
                  int   buf_size,
                  int   loops) {
    int    i;
    int    ierr = 0;

    (void)myid;

    /*
     * First - synchronize - send and receive
     */
    ierr = MPI_Sendrecv(s_buf, 4, MPI_CHAR, partner, 2,
                        r_buf, 4, MPI_CHAR, partner, 2,
                        MPI_COMM_WORLD, rcv_stat);
    on_mpi_error_report_and_abort( ierr );

    // Mark every request slot as unused so that the final wait is safe
    // even when fewer than blocking_factor sends are posted.
    for ( i = 0; i < blocking_factor; i++ ) {
        request[i] = MPI_REQUEST_NULL;
    }

    // Fill the window with the first batch of sends.
    for( i = 0; i < blocking_factor && i < loops; i++ ) {
        ierr = MPI_Isend(s_buf, buf_size, MPI_CHAR, partner, 1, MPI_COMM_WORLD, request + i);
        on_mpi_error_report_and_abort( ierr );
    }

    /*
     * The main send loop: whenever one send completes, reuse its slot
     * for the next message.
     */
    for( i = blocking_factor; i < loops; i++ ) {
        int index;
        ierr = MPI_Waitany(blocking_factor, request, &index, rcv_stat);
        on_mpi_error_report_and_abort( ierr );
        ierr = MPI_Isend(s_buf, buf_size, MPI_CHAR, partner, 1, MPI_COMM_WORLD, request + index);
        on_mpi_error_report_and_abort( ierr );
    }

    // Drain the sends that are still in flight.
    ierr = MPI_Waitall(blocking_factor, request, rcv_stat);
    on_mpi_error_report_and_abort( ierr );
}


/*
 * The receive loop for the bandwidth calculation.
 *
 * After a 4-byte handshake with the partner (tag 2) the function receives
 * `loops` messages of `buf_size` chars (tag 1) using nonblocking receives,
 * keeping at most `blocking_factor` of them outstanding, and measures the
 * time from posting the first receive until the last one has completed.
 * All receives target the same buffer; the received data is not inspected.
 *
 * myid:     rank of the caller (not used by the loop itself).
 * partner:  rank that runs the matching bandwidth_sendloop().
 * s_buf:    send buffer for the handshake; at least 4 bytes.
 * r_buf:    receive buffer; must hold at least max(buf_size, 4) bytes.
 * buf_size: size of every data message, in chars.
 * loops:    number of data messages to receive.
 *
 * Returns buf_size * loops / 1e6 divided by the elapsed seconds, i.e. the
 * measured rate in units of 10^6 chars per second.
 * Any MPI error aborts the job.
 */
double  bandwidth_recvloop(int   myid,
                  int   partner,
                  void  *s_buf,
                  void  *r_buf,
                  int   buf_size,
                  int   loops)
{
    double      t_start=0.0, t_end=0.0, t=0.0;
    int         i;
    int         ierr = 0;

    (void)myid;

    /*
     * First - synchronize - send and receive
     */
    ierr = MPI_Sendrecv(s_buf, 4, MPI_CHAR, partner, 2,
                        r_buf, 4, MPI_CHAR, partner, 2,
                        MPI_COMM_WORLD, rcv_stat);
    on_mpi_error_report_and_abort( ierr );

    // Mark every request slot as unused so that the final wait is safe
    // even when fewer than blocking_factor receives are posted.
    for ( i = 0; i < blocking_factor; i++) {
        request[i] = MPI_REQUEST_NULL;
    }

    /*
     * The main receive loop (timed)
     */
    t_start=MPI_Wtime();

    // Fill the window with the first batch of receives.
    for( i = 0; i < blocking_factor && i < loops; i++ ) {
        ierr = MPI_Irecv(r_buf, buf_size, MPI_CHAR, partner, 1,
                         MPI_COMM_WORLD, request + i);
        on_mpi_error_report_and_abort( ierr );
    }

    // Whenever one receive completes, reuse its slot for the next one.
    for( i = blocking_factor; i < loops; i++ ) {
        int index;
        ierr = MPI_Waitany(blocking_factor, request, &index, rcv_stat);
        on_mpi_error_report_and_abort( ierr );
        ierr = MPI_Irecv(r_buf, buf_size, MPI_CHAR, partner, 1,
                         MPI_COMM_WORLD, request + index);
        on_mpi_error_report_and_abort( ierr );
    }

    // Drain the receives that are still in flight.
    ierr = MPI_Waitall(blocking_factor, request, rcv_stat);
    on_mpi_error_report_and_abort( ierr );

    t_end=MPI_Wtime();
    t = t_end - t_start;

    return ((buf_size*1.0)/1.0e6)*loops/t;
}

/*
 * The bandwidth test between this rank and `partner`.
 *
 * Each side runs one send loop and one receive loop. The rank with the
 * higher id receives first and sends second; the other rank does the
 * opposite, so a send loop always faces a receive loop.
 *
 * Returns the value measured by this rank's receive loop.
 */
double  bandwidth(int   myid,
                  int   partner,
                  void  *s_buf,
                  void  *r_buf,
                  int   buf_size,
                  int   loops)
{
  const int receive_first = (partner < myid);
  double    rate;

  if( !receive_first )
  {
    bandwidth_sendloop(myid, partner, s_buf, r_buf, buf_size, loops);
  }

  rate = bandwidth_recvloop(myid, partner, s_buf, r_buf, buf_size, loops);

  if( receive_first )
  {
    bandwidth_sendloop(myid, partner, s_buf, r_buf, buf_size, loops);
  }

  return rate;
}

/*
 * The latency test: a blocking ping-pong with `partner`.
 *
 * SKIP_NUM untimed warm-up exchanges are followed by `loops` timed ones.
 * The rank with the lower id sends first and then waits for the reply;
 * the other rank receives first and then replies.
 *
 * myid:     rank of the caller.
 * partner:  rank on the other side of the ping-pong.
 * s_buf:    send buffer, at least buf_size chars.
 * r_buf:    receive buffer, at least buf_size chars.
 * buf_size: message size in chars.
 * loops:    number of timed round trips.
 *
 * Returns the elapsed time of the timed part in microseconds divided by
 * 2 * loops, i.e. half of the average round-trip time.
 * Any MPI error aborts the job.
 */
double  latency(int   myid,
                int   partner,
                void  *s_buf,
                void  *r_buf,
                int   buf_size,
                int   loops)
{
  double      t_start=0.0, t_end=0.0;
  int         i;
  int         ierr;
  const int   skip = SKIP_NUM;

  for( i=0; i < loops+skip; i++)
  {
    /* Start the clock once the warm-up iterations are done. */
    if(i == skip) t_start=MPI_Wtime();

    if(partner >myid)
    {
      ierr = MPI_Send(s_buf, buf_size, MPI_CHAR, partner, 1, MPI_COMM_WORLD);
      on_mpi_error_report_and_abort( ierr );
      ierr = MPI_Recv(r_buf, buf_size, MPI_CHAR, partner, 1, MPI_COMM_WORLD, rcv_stat);
      on_mpi_error_report_and_abort( ierr );
    }
    else
    {
      ierr = MPI_Recv(r_buf, buf_size, MPI_CHAR, partner, 1, MPI_COMM_WORLD, rcv_stat);
      on_mpi_error_report_and_abort( ierr );
      ierr = MPI_Send(s_buf, buf_size, MPI_CHAR, partner, 1, MPI_COMM_WORLD);
      on_mpi_error_report_and_abort( ierr );
    }
  }
  t_end=MPI_Wtime();

  return (t_end-t_start)*1.0e6/(2.0*loops);
}



/*
 * Program entry point.
 *
 * Every rank parses the command line, allocates its buffers and then takes
 * part in numrounds-1 measurement rounds. In round i a rank is paired with
 * rank (i XOR myid); ranks whose partner does not exist sit the round out.
 * Afterwards every rank sends its per-partner results to rank 0, which
 * prints them together with the global minimum, maximum and average.
 *
 * Returns 0, also when the command line is rejected; allocation failures
 * abort the whole job.
 */
int  main(int argc, char **argv )
{
  /* The bandwidth loops always exchange this many bytes as a handshake,
     whatever the message size is, so the buffers must never be smaller. */
  const int sync_msg_size = 4;

  int myid, numprocs, i, partner;
  char name[MPI_MAX_PROCESSOR_NAME];
  int len;
  void        *r_buf, *s_buf;
  int         ierr = 0;
  double      *results;
  double      pair_count;
  size_t      alloc_size;

  int loops    = 0;
  int buf_size = 0;
  int numrounds;
  int numresults;
  double limit = 0;


  MPI_Init(&argc, &argv);
  MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
  MPI_Comm_rank(MPI_COMM_WORLD, &myid); 
  MPI_Get_processor_name(name, &len);


  if( read_options( argv, argc, myid, &buf_size, &loops, &limit) < 0 )
  {
      MPI_Finalize();
      return 0;
  }


  if(buf_size > MAX_BUF_SIZE){
           fprintf(stderr, "Maximum message size is %d\n", MAX_BUF_SIZE);
           MPI_Finalize();
           return 0;
  }

  /* Negative or zero values would otherwise reach malloc() and the
     measurement loops. */
  if(buf_size < 1 || loops < 1){
           fprintf(stderr, "Message size and loop number must be positive\n");
           MPI_Finalize();
           return 0;
  }

  alloc_size = (size_t)imax(buf_size, sync_msg_size);

  if( (r_buf = calloc(alloc_size, 1)) == NULL )
  {
          fprintf(stderr,"Memory allocation error\n");
          MPI_Abort(MPI_COMM_WORLD, MPI_ERR_INTERN );
          exit(1);
  }

  if( (s_buf = calloc(alloc_size, 1)) == NULL )
  {
          fprintf(stderr,"Memory allocation error\n");
          MPI_Abort(MPI_COMM_WORLD, MPI_ERR_INTERN );
          exit(1);
  }

  /* At least 5 entries: calc_rank_statistics() stores its five output
     values in results[0..4] regardless of the number of ranks. */
  if( (results = calloc( (size_t)imax( 5, numprocs), sizeof(*results))) == NULL )
  {
          fprintf(stderr,"Memory allocation error\n");
          MPI_Abort(MPI_COMM_WORLD, MPI_ERR_INTERN );
          exit(1);
  }

  /* Tag the payload with the sender; truncated when the message is short
     (the default latency message is only a few bytes long). */
  snprintf(s_buf, alloc_size, "From rank %d\n", myid);

  numrounds = calc_num_rounds(numprocs);

  if(myid == 0)
  {
    if(bandwidth_test)
      printf("\n****** Running bandwidth test ********\n");
    else
      printf("\n****** Running latency   test ********\n");
    printf("Total number of rounds:          %d\n", numrounds-1);
    printf("Total number of loops per round: %d\n", loops);
    printf("Message size:                    %d\n", buf_size);
    if(limit && output_type != OUTPUT_TYPE_TABLE)
      printf("Limit:                           %-8.2f\n", limit);
    printf("**************************************\n");
    /* Progress indicator; the trailing blanks are overwritten below. */
    fprintf(stderr, "Round number      ");
  }


  /* The main loop
   */ 
  for(i=1; i < numrounds; i++)
  {

    MPI_Barrier(MPI_COMM_WORLD);

    if(myid == 0)
    {
      /* Step back over the previous 5-character round number. */
      fprintf(stderr, "\b\b\b\b\b%5d", i);
    }

    partner = ((i)^myid);

    if( partner >= numprocs)
    {
      /* No such rank: this rank is idle in this round. */
    }
    else
    {
      if(bandwidth_test)
      {
        if( i == 1)
        {
           /* warmup */
           bandwidth(myid,
                     partner,
                     s_buf,
                     r_buf,
                     buf_size,
                     loops);
        }

       
        results[partner] = bandwidth(myid,
                                     partner,
                                     s_buf,
                                     r_buf,
                                     buf_size,
                                     loops);
      }
      else
      {
        results[partner] =   latency(myid,
                                     partner,
                                     s_buf,
                                     r_buf,
                                     buf_size,
                                     loops);
      }
    }
  } 

  MPI_Barrier(MPI_COMM_WORLD);

  numresults = numprocs;

  /*
   * Send the results to the head and print it.
   */
  if( myid != 0 )
  {
    /*
     * Send to the head rank
     */
    ierr = MPI_Send(results, numresults, MPI_DOUBLE, 0,  3,
                          MPI_COMM_WORLD);
    on_mpi_error_report_and_abort( ierr );
  }
  else
  {
    print_result_title(numprocs); 
    /*
     * On the head rank - read all results from all ranks and print it.
     * Rank 0's own results are already in place; the results of every
     * other rank are received into the same array in turn.
     */
    for( i=0; i<numprocs; i++)
    {
      if( i != 0 )
      {
        ierr = MPI_Recv(results, numresults, MPI_DOUBLE, i,  3,
                        MPI_COMM_WORLD, rcv_stat);
        on_mpi_error_report_and_abort( ierr );
      }

      get_global_statistics(results, numprocs, &glob_stat, i);

      if( output_type == OUTPUT_TYPE_STATISTICS)
      {
        /* Overwrites results[0..4]; must come after the global pass. */
        calc_rank_statistics(results, numprocs, i);
      }
      print_rank_results(i, results, numprocs, limit);
    }

    /* Number of ordered rank pairs; computed in double to avoid integer
       overflow and guarded against a single-process run. */
    pair_count = (double)numprocs * (numprocs - 1);

    printf("___________________________________________________________\n"
         "Global statistics:\n"
         " MIN     %7.2lf between %d and %d\n"
         " MAX     %7.2lf between %d and %d\n"
         " AVERAGE %7.2lf\n",
         glob_stat.min, glob_stat.min_rank1, glob_stat.min_rank2,
         glob_stat.max, glob_stat.max_rank1, glob_stat.max_rank2,
         pair_count > 0.0 ? glob_stat.sum/pair_count : 0.0 );
  } 

  free(results);
  free(s_buf);
  free(r_buf);

  MPI_Finalize();
  return 0;
}

/*
 * Fold the results of one rank into the global statistics.
 *
 * results:   per-partner results of rank `id`, indexed by partner rank.
 * numprocs:  number of entries in results.
 * glob_stat: accumulator to update.
 * id:        rank the results belong to. A call with id 0 first resets the
 *            accumulator and seeds min and max with results[1], attributed
 *            to the rank pair (1, 0).
 *
 * The entry results[id] (the rank paired with itself) is ignored.
 */
void get_global_statistics(double              *results,
                           int                 numprocs,
                           struct glob_stat_st *glob_stat,
                           int                 id )
{
  int peer;

  if( id == 0 )
  {
    glob_stat->sum = 0;

    glob_stat->min = glob_stat->max = results[1];
    glob_stat->min_rank1 = glob_stat->max_rank1 = 1;
    glob_stat->min_rank2 = glob_stat->max_rank2 = 0;
  }

  for( peer = 0; peer < numprocs; peer++ )
  {
    if( peer != id )
    {
      const double value = results[peer];

      if( value < glob_stat->min )
      {
        glob_stat->min       = value;
        glob_stat->min_rank1 = id;
        glob_stat->min_rank2 = peer;
      }

      if( value > glob_stat->max )
      {
        glob_stat->max       = value;
        glob_stat->max_rank1 = id;
        glob_stat->max_rank2 = peer;
      }

      glob_stat->sum += value;
    }
  }
}
/*
 * Calculate statistics for a rank.
 *
 * results:  on entry the per-partner results of rank `myid`, indexed by
 *           partner rank; the entry results[myid] is ignored.
 *           On return the first five entries are overwritten with
 *             [0] minimum      [1] rank of the minimum
 *             [2] maximum      [3] rank of the maximum
 *             [4] average over all partners
 *           so the array must have room for at least 5 doubles even when
 *           numprocs is smaller.
 * numprocs: number of ranks (valid input entries in results).
 * myid:     rank the results belong to.
 *
 * With fewer than two ranks there is no partner: the three values are
 * reported as 0 and both rank fields as myid.
 */
void calc_rank_statistics(double *results, int numprocs, int myid)
{
  double min, max;
  double sum = 0.0;   /* must start at zero: it is accumulated below */
  int    i;
  int    min_rank, max_rank;

  if( numprocs < 2 )
  {
    results[0]=0.0;
    results[1]=(double)myid;
    results[2]=0.0;
    results[3]=(double)myid;
    results[4]=0.0;
    return;
  }

  /* Seed min/max with the first partner that is not the rank itself. */
  min_rank=max_rank=(myid==0)?1:0;
  min=max=results[min_rank];

  for( i=0; i<numprocs; i++)
  {
    if(i == myid)
      continue;

    sum += results[i];

    if( results[i] < min )
    {
      min      = results[i];
      min_rank = i;
    }

    if( results[i] > max )
    {
      max      = results[i];
      max_rank = i;
    }

  }

  /* Write the outputs only after the loop: they alias the inputs. */
  results[0]=min;
  results[1]=(double)min_rank;
  results[2]=max;
  results[3]=(double)max_rank;
  results[4]=sum/(numprocs-1);
}
 
/*
 * Print the header of the result listing to stdout.
 *
 * A separator line is always printed; what follows depends on the selected
 * output type: a row of rank numbers plus an underline for the table, or a
 * fixed caption for the per-pair and per-rank statistics listings.
 */
void print_result_title(int numprocs)
{
  int col;

  printf("\n**************************************\n");

  switch( output_type )
  {
  case OUTPUT_TYPE_TABLE:
    /* Column captions: one per rank. */
    printf("   ");
    for( col = 0; col < numprocs; col++ )
    {
      printf(" %8d ", col );
    }
    printf("\n    ");
    /* Underline below every caption. */
    for( col = 0; col < numprocs; col++ )
    {
      printf("  _______ ");
    }
    printf("\n");
    break;

  case OUTPUT_TYPE_PAIRS:
    printf("RANKS    RESULTS\n________________\n");
    break;

  case OUTPUT_TYPE_STATISTICS:
    printf("RANK           MIN                MAX               AVERAGE\n"
           "           RESULT  RANK        RESULT  RANK\n"
           "___________________________________________________________\n");
    break;

  default:
    break;
  }
}

/*
 * Print all results for a rank in the selected output format.
 *
 * rank:     rank the results belong to.
 * results:  per-partner results, or - for the statistics format - the five
 *           values min, min rank, max, max rank, average in results[0..4].
 * numprocs: number of ranks.
 * limit:    when non-zero, the statistics and per-pair formats print only
 *           values worse than the limit (lower for the bandwidth test,
 *           higher for the latency test). The table format ignores it.
 */
void print_rank_results( int    rank,
                         double *results,
                         int    numprocs,
                         double limit )
{
  const int unlimited = !limit;
  int       peer;

  if( output_type == OUTPUT_TYPE_TABLE )
  {
    printf(" %d| ", rank );
    for( peer = 0; peer < numprocs; peer++ )
    {
      printf(" %8.2lf ", results[peer] );
    }
    printf("\n");
  }
  else if( output_type == OUTPUT_TYPE_STATISTICS )
  {
    const double average = results[4];
    const int    worse   = bandwidth_test ? (average < limit)
                                          : (average > limit);

    if( unlimited || worse )
    {
      printf(" %-7d %8.2lf   %-5d     %8.2lf   %-5d     %8.2lf\n",
               rank, results[0], (int)results[1], results[2],
               (int)results[3], results[4]);
    }
  }
  else
  {
    /* Per-pair listing; also used for any unexpected output type. */
    for( peer = 0; peer < numprocs; peer++ )
    {
      int worse;

      if( peer == rank )
        continue;

      worse = bandwidth_test ? (results[peer] < limit)
                             : (results[peer] > limit);
      if( unlimited || worse )
      {
        printf("%d -- %d  %8.2lf\n", rank, peer, results[peer] );
      }
    }
  }
}
