/*-----------------------------------------------------------------------*/
/* Program: STREAM                                                       */
/* Revision: $Id: stream.c,v 5.10 2013/01/17 16:01:06 mccalpin Exp mccalpin $ */
/* Original code developed by John D. McCalpin                           */
/* Programmers: John D. McCalpin                                         */
/*              Joe R. Zagar                                             */
/*                                                                       */
/* This program measures memory transfer rates in MB/s for simple        */
/* computational kernels coded in C.                                     */
/*-----------------------------------------------------------------------*/
/* Copyright 1991-2013: John D. McCalpin                                 */
/*-----------------------------------------------------------------------*/
/* License:                                                              */
/*  1. You are free to use this program and/or to redistribute           */
/*     this program.                                                     */
/*  2. You are free to modify this program for your own use,             */
/*     including commercial use, subject to the publication              */
/*     restrictions in item 3.                                           */
/*  3. You are free to publish results obtained from running this        */
/*     program, or from works that you derive from this program,         */
/*     with the following limitations:                                   */
/*     3a. In order to be referred to as "STREAM benchmark results",     */
/*         published results must be in conformance to the STREAM        */
/*         Run Rules, (briefly reviewed below) published at              */
/*         http://www.cs.virginia.edu/stream/ref.html                    */
/*         and incorporated herein by reference.                         */
/*         As the copyright holder, John McCalpin retains the            */
/*         right to determine conformity with the Run Rules.             */
/*     3b. Results based on modified source code or on runs not in       */
/*         accordance with the STREAM Run Rules must be clearly          */
/*         labelled whenever they are published.  Examples of            */
/*         proper labelling include:                                     */
/*           "tuned STREAM benchmark results"                            */
/*           "based on a variant of the STREAM benchmark code"           */
/*         Other comparable, clear, and reasonable labelling is          */
/*         acceptable.                                                   */
/*     3c. Submission of results to the STREAM benchmark web site        */
/*         is encouraged, but not required.                              */
/*  4. Use of this program or creation of derived works based on this    */
/*     program constitutes acceptance of these licensing restrictions.   */
/*  5. Absolutely no warranty is expressed or implied.                   */
/*-----------------------------------------------------------------------*/

/* clock_gettime(): _POSIX_C_SOURCE >= 199309L
   aligned_alloc(): _ISOC11_SOURCE */
#define _POSIX_C_SOURCE 199309L
#define _ISOC11_SOURCE
#include <features.h>

// perror()
#include <errno.h>
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <math.h>
#include <float.h>
#include <limits.h>
#include <time.h>
#ifdef _OPENMP
    #include <omp.h>
#else
    #error "OpenMP support required"
#endif

/*-----------------------------------------------------------------------
 * INSTRUCTIONS:
 *
 *	1) STREAM requires different amounts of memory to run on different
 *           systems, depending on both the system cache size(s) and the
 *           granularity of the system timer.
 *     You should adjust the value of 'STREAM_ARRAY_SIZE_proc' (below)
 *           to meet *both* of the following criteria:
 *       (a) Each array must be at least 4 times the size of the
 *           available cache memory. I don't worry about the difference
 *           between 10^6 and 2^20, so in practice the minimum array size
 *           is about 3.8 times the cache size.
 *           Example 1: One Xeon E3 with 8 MB L3 cache
 *               STREAM_ARRAY_SIZE_proc should be >= 4 million, giving
 *               an array size of 30.5 MB and a total memory requirement
 *               of 91.5 MB.  
 *           Example 2: Two Xeon E5's with 20 MB L3 cache each (using OpenMP)
 *               STREAM_ARRAY_SIZE_proc should be >= 20 million, giving
 *               an array size of 153 MB and a total memory requirement
 *               of 458 MB.  
 *       (b) The size should be large enough so that the 'timing calibration'
 *           output by the program is at least 20 clock-ticks.  
 *           Example: most versions of Windows have a 10 millisecond timer
 *               granularity.  20 "ticks" at 10 ms/tick is 200 milliseconds.
 *               If the chip is capable of 10 GB/s, it moves 2 GB in 200 msec.
 *               This means that each array must be at least 1 GB, or 128M elements.
 *
 *      Version 5.10 increases the default array size from 2 million
 *          elements to 10 million elements in response to the increasing
 *          size of L3 caches.  The new default size is large enough for caches
 *          up to 20 MB. 
 *      Version 5.10 changes the loop index variables from "register int"
 *          to "long int", which allows array indices >2^32 (4 billion)
 *          on properly configured 64-bit systems.  Additional compiler options
 *          (such as "-mcmodel=medium") may be required for large memory runs.
 *
 *      Array size can be set at run time, without modifying or recompiling
 *          the source code, using the "-n" command-line option.  E.g.,
 *                ./stream -n 100000000
 *          will override the default size of 10M with a new size of 100M elements
 *          per array.  The value is the total per array over all threads; it is
 *          rounded down to a multiple of the number of threads.
 */
/* Number of elements per array for the whole process (all threads together).
   main() overwrites it with the value of the "-n" option and then rounds it
   down to a multiple of the thread count. */
long int STREAM_ARRAY_SIZE_proc = 10000000;
/* Number of elements in each thread's private arrays; computed in main() as
   STREAM_ARRAY_SIZE_proc divided by the number of threads. */
long int STREAM_ARRAY_SIZE_thread;

/*  2) STREAM runs each kernel "NTIMES" times and reports the *best* result
 *         for any iteration after the first, therefore the minimum value
 *         for NTIMES is 2.
 *      There are no rules on maximum allowable values for NTIMES, but
 *         values larger than the default are unlikely to noticeably
 *         increase the reported performance.
 *      NTIMES can also be set at run time without changing the source
 *         code using, for example, the command-line option "-m 7".
 *         Values below 2 are replaced by the default of 10.
 */
/* Number of timed repetitions of each kernel.  main() replaces it with the
   value of the "-m" option and resets values <= 1 to 10. */
int NTIMES=10;

/*
 *	3) Compile the code with optimization.  Many compilers generate
 *       unreasonably bad code before the optimizer tightens things up.  
 *     If the results are unreasonably good, on the other hand, the
 *       optimizer might be too smart for me!
 *
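 *     This version requires OpenMP: compiling without it, e.g. with
 *            cc -O stream.c -o stream
 *     stops at the "#error" directive near the top of the file.  For a
 *       single-core run, build with OpenMP as shown below and set
 *       OMP_NUM_THREADS=1 when running the program.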
 *
 *     To use multiple cores, you need to tell the compiler to obey the OpenMP
 *       directives in the code.  This varies by compiler, but a common example is
 *            gcc -O -fopenmp stream.c -o stream_omp
 *       The environment variable OMP_NUM_THREADS allows runtime control of the 
 *         number of threads/cores used when the resulting "stream_omp" program
 *         is executed.
 *
 *     To run with single-precision variables and arithmetic, simply add
 *         -DSTREAM_TYPE=float
 *     to the compile line.
 *     Note that this changes the minimum array sizes required --- see (1) above.
 *
 *     The preprocessor directive "TUNED" of the original STREAM code is not
 *       used in this version: the kernels are always executed through the
 *       separate functions tuned_STREAM_Copy/Scale/Add/Triad.  The versions
 *       provided here are simple OpenMP loops -- they just provide predefined
 *       interfaces that can be replaced with tuned code.
 *
 *
 *	4) Optional: Mail the results to mccalpin@cs.virginia.edu
 *	   Be sure to include info that will help me understand:
 *		a) the computer hardware configuration (e.g., processor model, memory type)
 *		b) the compiler name/version and compilation flags
 *      c) any run-time information (such as OMP_NUM_THREADS)
 *		d) all of the output from the test case.
 *
 * Thanks!
 *
 *-----------------------------------------------------------------------*/

/* Horizontal rule printed between the sections of the report. */
# define HLINE "-------------------------------------------------------------\n"

/* qsort() comparator for doubles, ascending order.
   Returns 1 when *a is larger than *b, -1 when it is smaller and 0 otherwise
   (including when the difference is not an ordered number). */
int double_compare(const void *a, const void *b) {
    const double lhs = *(const double *) a;
    const double rhs = *(const double *) b;
    const double delta = lhs - rhs;

    /* Each comparison yields 0 or 1, so the difference is the sign of delta. */
    return (delta > 0) - (delta < 0);
}

/* Element type of the benchmark arrays.  Defaults to double; when STREAM_TYPE
   is defined on the compile line (e.g. -DSTREAM_TYPE=float) the typedef is
   skipped and the macro supplies the type instead. */
#ifndef STREAM_TYPE
typedef double STREAM_TYPE;
#endif

/* Row index of each kernel in the per-benchmark tables (times, bytes, labels).
   The enumerators count up from 0, so NUM_BENCHMARKS, listed last, equals
   the number of kernels. */
enum benchmark_enum {
    COPY,
    SCALE,
    ADD,
    TRIAD,
    NUM_BENCHMARKS
};

// Byte alignment requested from aligned_alloc() for every array and promised
// to the compiler in the "aligned" clauses of the SIMD loops.
// AVX/AVX2 alignment: 32
// MIC alignment: 64
// getconf LEVEL1_DCACHE_LINESIZE -> 64
// const unsigned int alignment_bytes = 64;
#define alignment_bytes 64

// The three benchmark arrays.  The pointers are threadprivate, i.e. every
// OpenMP thread has its own a, b and c; main() allocates them inside a
// parallel region, so each thread owns separate arrays of
// STREAM_ARRAY_SIZE_thread elements.
STREAM_TYPE *restrict a, *restrict b, *restrict c;
#pragma omp threadprivate(a, b, c)

// Forward declarations of the helpers defined after main().
// Timer resolution in microseconds.
int checktick();
// Current reading of the monotonic clock, in seconds.
double mysecond();
// Compares the array contents with the analytically expected values.
void checkSTREAMresults();
// The four measured kernels: c = a, b = scalar*c, c = a + b, a = b + scalar*c.
void tuned_STREAM_Copy();
void tuned_STREAM_Scale(STREAM_TYPE scalar);
void tuned_STREAM_Add();
void tuned_STREAM_Triad(STREAM_TYPE scalar);

int main(int argc, char *argv[]) {
    int			quantum;
    const int BytesPerWord = sizeof(STREAM_TYPE);
    int omp_num_threads_req;
    int omp_num_threads_count;
    STREAM_TYPE		scalar;
    double              t;

    int opt;
    while ((opt = getopt(argc, argv, "hn:m:")) != -1) {
        switch (opt) {
            case 'n':
                STREAM_ARRAY_SIZE_proc = atol(optarg);
                break;
            case 'm':
                NTIMES=atoi(optarg);
                if (NTIMES <= 1)
                    NTIMES = 10;
                break;
            case '?':
                exit(1);
                break;
        }
    }

    // Explicitly turn off dynamic threads
    omp_set_dynamic(0);

    #pragma omp parallel 
    #pragma omp master
    omp_num_threads_req = omp_get_num_threads();

    omp_num_threads_count = 0;
    #pragma omp parallel reduction(+: omp_num_threads_count)
    omp_num_threads_count++;

    // Array size per thread and per process
    STREAM_ARRAY_SIZE_thread = STREAM_ARRAY_SIZE_proc   / omp_num_threads_count; 
    STREAM_ARRAY_SIZE_proc   = STREAM_ARRAY_SIZE_thread * omp_num_threads_count;

    // Measured times
    double times[NUM_BENCHMARKS][NTIMES];

    // allocate aligned memory thread private
    #pragma omp parallel
    {
        a = (STREAM_TYPE*) aligned_alloc(alignment_bytes, STREAM_ARRAY_SIZE_thread * sizeof(STREAM_TYPE));
        if ( a == NULL ) {
            printf("thread %i: aligned_alloc a[%li] failed\n", omp_get_thread_num(), STREAM_ARRAY_SIZE_thread);
            exit(1);
        }
        b = (STREAM_TYPE*) aligned_alloc(alignment_bytes, STREAM_ARRAY_SIZE_thread * sizeof(STREAM_TYPE));
        if ( b == NULL ) {
            printf("thread %i: aligned_alloc b[%li] failed\n", omp_get_thread_num(), STREAM_ARRAY_SIZE_thread);
            exit(1);
        }
        c = (STREAM_TYPE*) aligned_alloc(alignment_bytes, STREAM_ARRAY_SIZE_thread * sizeof(STREAM_TYPE));
        if ( c == NULL ) {
            printf("thread %i: aligned_alloc c[%li] failed\n", omp_get_thread_num(), STREAM_ARRAY_SIZE_thread);
            exit(1);
        }
    }

    double bytes[NUM_BENCHMARKS] = {
        2 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE_proc, // Copy
        2 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE_proc, // Scale
        3 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE_proc, // Add
        3 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE_proc  // Triad
    };

    double mediantime[NUM_BENCHMARKS],
           maxtime[NUM_BENCHMARKS],
           mintime[NUM_BENCHMARKS];

    char *label[NUM_BENCHMARKS] = {"Copy:      ", "Scale:     ", "Add:       ", "Triad:     "};

    const double kByte = 1024.0;
    const double MByte = 1024.0 * kByte;
    const double GByte = 1024.0 * MByte;

    /* --- SETUP --- determine precision and check timing --- */

    printf(HLINE);
    printf("STREAM version $Revision: 5.10 $\n");
    printf(HLINE);
    printf("This system uses %d bytes per array element.\n",
	BytesPerWord);

    printf(HLINE);

    printf("Array size = %llu (elements) (elements)\n" , (unsigned long long) STREAM_ARRAY_SIZE_proc);
    printf("Memory per array = %.1f MiB (= %.1f GiB).\n", 
	BytesPerWord * ( (double) STREAM_ARRAY_SIZE_proc / MByte ),
	BytesPerWord * ( (double) STREAM_ARRAY_SIZE_proc / GByte));
    printf("Total memory required = %.1f MiB (= %.1f GiB).\n",
	(3.0 * BytesPerWord) * ( (double) STREAM_ARRAY_SIZE_proc / MByte),
	(3.0 * BytesPerWord) * ( (double) STREAM_ARRAY_SIZE_proc / GByte));
    printf("Each kernel will be executed %d times.\n", NTIMES);
    printf(" The *best* time for each kernel (excluding the first iteration)\n"); 
    printf(" will be used to compute the reported bandwidth.\n");

    printf(HLINE);
    printf ("Number of Threads requested = %i\n", omp_num_threads_req);
    printf ("Number of Threads counted = %i\n", omp_num_threads_count);

    /* Get initial value for system clock. */
    #pragma omp parallel
    #pragma omp simd aligned (a : alignment_bytes)
    for (long int j = 0; j < STREAM_ARRAY_SIZE_thread; j++)
        a[j] = 1.0;

    #pragma omp parallel
    #pragma omp simd aligned (b : alignment_bytes)
    for (long int j = 0; j < STREAM_ARRAY_SIZE_thread; j++)
	b[j] = 2.0;

    #pragma omp parallel
    #pragma omp simd aligned (c : alignment_bytes)
    for (long int j = 0; j < STREAM_ARRAY_SIZE_thread; j++)
	c[j] = 0.0;

    printf(HLINE);

    if  ( (quantum = checktick()) >= 1) 
	printf("Your clock granularity/precision appears to be "
	    "%d microseconds.\n", quantum);
    else {
	printf("Your clock granularity appears to be "
	    "less than one microsecond.\n");
	quantum = 1;
    }

    t = mysecond();
    #pragma omp parallel
    #pragma omp simd aligned (a : alignment_bytes)
    for (long int j = 0; j < STREAM_ARRAY_SIZE_thread; j++)
        a[j] *= 2.0;
    t = 1.0E6 * (mysecond() - t);

    printf("Each test below will take on the order"
	" of %d microseconds.\n", (int) t  );
    printf("   (= %d clock ticks)\n", (int) (t/quantum) );
    printf("Increase the size of the arrays if this shows that\n");
    printf("you are not getting at least 20 clock ticks per test.\n");

    printf(HLINE);

    printf("WARNING -- The above is only a rough guideline.\n");
    printf("For best results, please be sure you know the\n");
    printf("precision of your system timer.\n");
    printf(HLINE);
    
    /*	--- MAIN LOOP --- repeat test cases NTIMES times --- */

    scalar = 3.0;

    /* note -- skip first iteration */
    tuned_STREAM_Copy();
    tuned_STREAM_Scale(scalar);
    tuned_STREAM_Add();
    tuned_STREAM_Triad(scalar);

    for (int NTIMES_count = 0; NTIMES_count < NTIMES; NTIMES_count++) {
        times[COPY][NTIMES_count] = mysecond();
        tuned_STREAM_Copy();
        times[COPY][NTIMES_count] = mysecond() - times[COPY][NTIMES_count];

        times[SCALE][NTIMES_count] = mysecond();
        tuned_STREAM_Scale(scalar);
        times[SCALE][NTIMES_count] = mysecond() - times[SCALE][NTIMES_count];
	
        times[ADD][NTIMES_count] = mysecond();
        tuned_STREAM_Add();
        times[ADD][NTIMES_count] = mysecond() - times[ADD][NTIMES_count];
	
        times[TRIAD][NTIMES_count] = mysecond();
        tuned_STREAM_Triad(scalar);
        times[TRIAD][NTIMES_count] = mysecond() - times[TRIAD][NTIMES_count];
    }

    for (int j = 0; j < NUM_BENCHMARKS; j++) {
        // Sort times
        qsort( times[j], NTIMES, sizeof(double), double_compare );

        // median
        if ( NTIMES % 2 == 0 )
            mediantime[j] = ( times[j][NTIMES/2] + times[j][NTIMES/2 - 1] ) / 2.0;
        else
            mediantime[j] = times[j][NTIMES/2];

        // min and max time
        mintime[j] = times[j][0];
        maxtime[j] = times[j][NTIMES - 1];
     }
    /*	--- SUMMARY --- */
    
    printf("Function    Best Rate MB/s  Med time     Min time     Max time\n");
    for (int j = 0; j < NUM_BENCHMARKS; j++) {
	printf("%s%12.1f  %11.6f  %11.6f  %11.6f\n", label[j],
	       1.0E-06 * bytes[j]/mintime[j],
	       mediantime[j],
	       mintime[j],
	       maxtime[j]);
    }
    printf(HLINE);

    /* --- Check Results --- */
    checkSTREAMresults();
    printf(HLINE);

    return 0;
}

/*
 * Resolution of CLOCK_MONOTONIC_RAW in whole microseconds.
 *
 * Returns 0 when the resolution is finer than one microsecond, so that the
 * caller can report "less than one microsecond"; otherwise the resolution
 * rounded up to the next microsecond (saturated at INT_MAX).
 * Prints a diagnostic and exits with status 1 if clock_getres() fails.
 */
int checktick() {
    struct timespec res;
    if (clock_getres(CLOCK_MONOTONIC_RAW, &res) != 0) {
        perror("checktick(): clock_getres() failed");
        exit(1);
    }

    /* Work in integer nanoseconds: exact, and no libm call is needed. */
    const long long res_nsec = (long long) res.tv_sec * 1000000000LL + (long long) res.tv_nsec;
    if (res_nsec < 1000)
        return 0;

    const long long res_usec = (res_nsec + 999) / 1000;
    return res_usec > INT_MAX ? INT_MAX : (int) res_usec;
}

/* Wall-clock timer: reads CLOCK_MONOTONIC_RAW through clock_gettime() and
   returns the reading in seconds as a double.  Only differences between two
   readings are meaningful.  Prints a diagnostic and exits with status 1 if
   the clock cannot be read.  */
double mysecond() {
    struct timespec now;

    if (clock_gettime(CLOCK_MONOTONIC_RAW, &now) != 0) {
        perror("mysecond(): clock_gettime() failed");
        exit(1);
    }

    const double whole_seconds = (double) now.tv_sec;
    const double nanoseconds   = (double) now.tv_nsec;
    return whole_seconds + nanoseconds * 1.e-9;
}

#ifndef abs
#define abs(a) ((a) >= 0 ? (a) : -(a))
#endif
void checkSTREAMresults ()
{
	STREAM_TYPE aj, bj, cj, scalar;
	STREAM_TYPE aSumErr, bSumErr, cSumErr;
	STREAM_TYPE aAvgErr, bAvgErr, cAvgErr;
	double epsilon;
	int	ierr, err;

    /* reproduce initialization */
	aj = 1.0;
	bj = 2.0;
	cj = 0.0;
    /* a[] is modified during timing check */
	aj = 2.0E0 * aj;
    /* now execute timing loop */
	scalar = 3.0;
	for (int NTIMES_count = 0; NTIMES_count < NTIMES + 1; NTIMES_count++) {
            cj = aj;
            bj = scalar * cj;
            cj = aj + bj;
            aj = bj + scalar * cj;
        }

    /* accumulate deltas between observed and expected results */
	aSumErr = 0.0;
	bSumErr = 0.0;
	cSumErr = 0.0;
        #pragma omp parallel reduction(+: aSumErr, bSumErr, cSumErr, aAvgErr, bAvgErr, cAvgErr)
        {
            for (long int j = 0; j < STREAM_ARRAY_SIZE_thread; j++) {
                aSumErr += abs(a[j] - aj);
                bSumErr += abs(b[j] - bj);
                cSumErr += abs(c[j] - cj);
            }
            aAvgErr = aSumErr / (STREAM_TYPE) STREAM_ARRAY_SIZE_thread;
            bAvgErr = bSumErr / (STREAM_TYPE) STREAM_ARRAY_SIZE_thread;
            cAvgErr = cSumErr / (STREAM_TYPE) STREAM_ARRAY_SIZE_thread;
        }

        if      (sizeof(STREAM_TYPE) == 4) {
            // Epsilon for single precision
            epsilon = 1.e-6;
        }
	else if (sizeof(STREAM_TYPE) == 8) {
            // Epsilon for double precision
            epsilon = 1.e-13;
	}
	else {
            // Epsilon for unknown precision
            printf("WEIRD: sizeof(STREAM_TYPE) = %lu\n",sizeof(STREAM_TYPE));
            epsilon = 1.e-6;
	}

	err = 0;
	if (abs(aAvgErr/aj) > epsilon) {
		err++;
		printf ("Failed Validation on array a[], AvgRelAbsErr > epsilon (%e)\n",epsilon);
		printf ("     Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",aj,aAvgErr,abs(aAvgErr)/aj);
		ierr = 0;
                #pragma omp parallel reduction(+: ierr)
                for (long int j = 0; j < STREAM_ARRAY_SIZE_thread; j++) {
                    if (abs(a[j] / aj - 1.0) > epsilon) {
                        ierr++;
                    }
                }
		printf("     For array a[], %d errors were found.\n",ierr);
	}
	if (abs(bAvgErr/bj) > epsilon) {
		err++;
		printf ("Failed Validation on array b[], AvgRelAbsErr > epsilon (%e)\n",epsilon);
		printf ("     Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",bj,bAvgErr,abs(bAvgErr)/bj);
		printf ("     AvgRelAbsErr > Epsilon (%e)\n",epsilon);
		ierr = 0;
                #pragma omp parallel reduction(+: ierr)
                for (long int j = 0; j < STREAM_ARRAY_SIZE_thread; j++) {
                    if (abs(b[j]/bj-1.0) > epsilon) {
                        ierr++;
                    }
                }
		printf("     For array b[], %d errors were found.\n",ierr);
	}
	if (abs(cAvgErr/cj) > epsilon) {
		err++;
		printf ("Failed Validation on array c[], AvgRelAbsErr > epsilon (%e)\n",epsilon);
		printf ("     Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",cj,cAvgErr,abs(cAvgErr)/cj);
		printf ("     AvgRelAbsErr > Epsilon (%e)\n",epsilon);
		ierr = 0;
                #pragma omp parallel reduction(+: ierr)
                for (long int j = 0; j < STREAM_ARRAY_SIZE_thread; j++) {
                    if (abs(c[j]/cj-1.0) > epsilon) {
                        ierr++;
                    }
                }
		printf("     For array c[], %d errors were found.\n",ierr);
	}
	if (err == 0) {
		printf ("Solution Validates: avg error less than %e on all three arrays\n",epsilon);
	}
}

/* Copy kernel: c[j] = a[j].
   There is no work-sharing construct inside the parallel region, so every
   thread runs the complete loop over its own threadprivate arrays of
   STREAM_ARRAY_SIZE_thread elements.  The "aligned" clause tells the compiler
   that the arrays are aligned to alignment_bytes. */
void inline tuned_STREAM_Copy() {
    #pragma omp parallel
    {
        #ifdef __INTEL_COMPILER
            // Instructs the compiler to use non-temporal (that is, streaming) stores
            #pragma vector nontemporal
        #endif
        #pragma omp simd aligned (a, c : alignment_bytes)
        for (long int j = 0; j < STREAM_ARRAY_SIZE_thread; j++)
           c[j] = a[j];
    }
}

/* Scale kernel: b[j] = scalar * c[j].
   `scalar` is shared by all threads of the parallel region; every thread runs
   the complete loop over its own threadprivate arrays of
   STREAM_ARRAY_SIZE_thread elements. */
void inline tuned_STREAM_Scale(STREAM_TYPE scalar) {
    #pragma omp parallel shared(scalar)
    {
        #ifdef __INTEL_COMPILER
            // Instructs the compiler to use non-temporal (that is, streaming) stores
            #pragma vector nontemporal
        #endif
        #pragma omp simd aligned (b, c : alignment_bytes)
        for (long int j = 0; j < STREAM_ARRAY_SIZE_thread; j++)
            b[j] = scalar*c[j];
    }
}

/* Add kernel: c[j] = a[j] + b[j].
   Every thread of the parallel region runs the complete loop over its own
   threadprivate arrays of STREAM_ARRAY_SIZE_thread elements. */
void inline tuned_STREAM_Add() {
    #pragma omp parallel
    {
        #ifdef __INTEL_COMPILER
            // Instructs the compiler to use non-temporal (that is, streaming) stores
            #pragma vector nontemporal
        #endif
        #pragma omp simd aligned (a, b, c : alignment_bytes)
        for (long int j = 0; j < STREAM_ARRAY_SIZE_thread; j++)
            c[j] = a[j] + b[j];
    }
}

/* Triad kernel: a[j] = b[j] + scalar * c[j].
   `scalar` is shared by all threads of the parallel region; every thread runs
   the complete loop over its own threadprivate arrays of
   STREAM_ARRAY_SIZE_thread elements. */
void inline tuned_STREAM_Triad(STREAM_TYPE scalar) {
    #pragma omp parallel shared(scalar)
    {
        #ifdef __INTEL_COMPILER
            // Instructs the compiler to use non-temporal (that is, streaming) stores
            #pragma vector nontemporal
        #endif
        #pragma omp simd aligned (a, b, c : alignment_bytes)
        for (long int j = 0; j < STREAM_ARRAY_SIZE_thread; j++)
            a[j] = b[j] + scalar * c[j];
    }
}
