# OpenCL (Open Computing Language)

* OpenCL is a *low-level* programming framework 
* Full control over data placement and code execution
* Support for multiple heterogeneous types of execution resources
* Host code is written in C or C++; device (kernel) code is written in OpenCL C (based on C99)
* Open standard maintained by the non-profit technology consortium Khronos Group
* Links to external resources:
  * [OpenCL Homepage](https://www.khronos.org/opencl/) (Specification and Introduction)

## Supported Compilers

* Any C or C++ compiler (the OpenCL headers and library must be available)

## Hardware portability

* CPUs
* AMD GPUs
* Intel GPUs
* NVIDIA GPUs

## Example Code

* Source code available in [exampleOpenCL.c](../src/exampleOpenCL.c)

* Include OpenCL support in your code

  ```c
  #define CL_TARGET_OPENCL_VERSION 300
  #include <CL/opencl.h>
  #include <err.h>
  #include <stdio.h>
  #include <stdlib.h>

  /*
   * Minimal OpenCL host skeleton: selects the first platform and its default
   * device, creates a context and a command queue on that device, and releases
   * everything again. Any OpenCL failure terminates the program via errx().
   *
   * Returns 0 on success.
   */
  int main(void) {
      cl_int errCode;

      // Obtain the first available platform.
      // The reported count is checked as well, so that platformID is never
      // used while still NULL.
      cl_platform_id platformID = NULL;
      cl_uint numPlatforms = 0;
      errCode = clGetPlatformIDs(1, &platformID, &numPlatforms);
      if (errCode != CL_SUCCESS || numPlatforms == 0) {
          errx(1, "clGetPlatformIDs() failed");
      }

      // Obtain the first available device on the platform
      cl_device_id deviceID = NULL;
      cl_uint numDevices = 0;
      errCode = clGetDeviceIDs(platformID, CL_DEVICE_TYPE_DEFAULT, 1,
                               &deviceID, &numDevices);
      if (errCode != CL_SUCCESS) {
          errx(1, "clGetDeviceIDs() failed");
      }

      // Create an OpenCL context
      // Contexts are used by the OpenCL runtime for managing objects such as
      // command-queues, memory, program and kernel objects
      cl_context context =
          clCreateContext(NULL, 1, &deviceID, NULL, NULL, &errCode);
      if (errCode != CL_SUCCESS) {
          errx(1, "clCreateContext() failed");
      }

      // Create a command queue (NULL properties: default queue settings)
      cl_command_queue commandQueue =
          clCreateCommandQueueWithProperties(context, deviceID, NULL, &errCode);
      if (errCode != CL_SUCCESS) {
          errx(1, "clCreateCommandQueueWithProperties() failed");
      }
      // ...

      // Release the command queue
      errCode = clReleaseCommandQueue(commandQueue);
      if (errCode != CL_SUCCESS) {
          errx(1, "clReleaseCommandQueue() failed");
      }

      // Release the context
      errCode = clReleaseContext(context);
      if (errCode != CL_SUCCESS) {
          errx(1, "clReleaseContext() failed");
      }

      // Release the device
      errCode = clReleaseDevice(deviceID);
      if (errCode != CL_SUCCESS) {
          errx(1, "clReleaseDevice() failed");
      }

      return 0;
  }
  ```

* Retrieve platform information

  ```c
  // Get size requirement for platform name
  size_t infoSize;
  clGetPlatformInfo(platformID, CL_PLATFORM_NAME, 0, NULL, &infoSize);

  // Get platform name
  char *platformName = (char *) malloc(infoSize);
  clGetPlatformInfo(platformID, CL_PLATFORM_NAME, infoSize, platformName, NULL);

  printf("OpenCL Platform name: %s\n", platformName);

  // Get size requirement for device name
  clGetDeviceInfo(deviceID, CL_DEVICE_NAME, 0, NULL, &infoSize);

  // Get device name
  char *deviceName = (char *) malloc(infoSize);
  clGetDeviceInfo(deviceID, CL_DEVICE_NAME, infoSize, deviceName, NULL);

  printf("OpenCL Device name: %s\n", deviceName);
  ```  

* Allocate memory on the host

  ```c
  // Host-side array of `size` doubles; abort if the allocation fails.
  double *a = (double *) malloc(sizeof(double) * size);
  if (!a) {
      errx(1, "malloc a[] failed");
  }
  ```

* Pre-process / initialize data on the host
  e.g. read data from storage

  ```c
  for (unsigned int i = 0; i < size; i++) {
      a[i] = 1.;
  }
  ```

* Allocate memory on the device

  ```c
  // Create a device buffer large enough for `size` doubles.
  cl_mem device_a = clCreateBuffer(context,
                                   CL_MEM_READ_WRITE,      // kernels may read and write it
                                   size * sizeof(double),  // buffer size in bytes
                                   NULL,                   // no host pointer supplied
                                   &errCode);
  if (errCode != CL_SUCCESS) {
      errx(1, "clCreateBuffer() failed");
  }
  ```

* Copy data from the host to the device

  ```c
  // Copy the host array `a` into the device buffer.
  errCode = clEnqueueWriteBuffer(commandQueue, device_a,
                                 CL_TRUE,                // blocking: returns after the copy
                                 0,                      // byte offset into the buffer
                                 size * sizeof(double),  // number of bytes to copy
                                 a,                      // host source
                                 0, NULL, NULL);         // no wait list, no event
  if (errCode != CL_SUCCESS) {
      errx(1, "clEnqueueWriteBuffer() failed");
  }
  ```

* Compute on the device

  ```c
  // OpenCL C source of the kernel: every work-item increments one element of
  // device_a. The `i < size` test masks out work-items beyond the end of the
  // array (the global size below is padded to a multiple of the work-group size).
  const char *incSource =
      "\n"
      "__kernel void inc(                      \n"
      "   __global double* device_a,           \n"
      "   const unsigned int size) {           \n"
      "   int i = get_global_id(0);            \n"
      "   if(i < size)                         \n"
      "       device_a[i] = device_a[i] + 1.0; \n"
      "}                                       \n";


  // Creates a program object for a context, and loads source code specified by text strings into the program object
  cl_program program =
      clCreateProgramWithSource(context, 1, &incSource, NULL, &errCode);
  if (errCode != CL_SUCCESS) {
      errx(1, "clCreateProgramWithSource() failed");
  }

  // Builds (compiles and links) a program executable from the program source
  errCode = clBuildProgram(program, 1, &deviceID, NULL, NULL, NULL);
  if (errCode != CL_SUCCESS) {
      // Query the log length first so that a log of any size can be shown;
      // a fixed-size buffer is left unwritten when the log does not fit.
      size_t logSize = 0;
      clGetProgramBuildInfo(program, deviceID, CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);
      char *buildLog = (char *) malloc(logSize + 1);
      if (buildLog == NULL ||
          clGetProgramBuildInfo(program, deviceID, CL_PROGRAM_BUILD_LOG,
                                logSize, buildLog, NULL) != CL_SUCCESS) {
          errx(1, "clBuildProgram() failed (build log unavailable)");
      }
      buildLog[logSize] = '\0';
      // errx() terminates the process, so buildLog is not freed explicitly.
      errx(1, "clBuildProgram() failed:\n%s", buildLog);
  }

  // Creates a kernel object
  cl_kernel kernel =
      clCreateKernel(program, "inc", &errCode);
  if (errCode != CL_SUCCESS) {
      errx(1, "clCreateKernel() failed");
  }

  // Set the argument value for a specific argument of a kernel
  errCode = clSetKernelArg(kernel, 0, sizeof(cl_mem), &device_a);
  if (errCode != CL_SUCCESS) {
      errx(1, "clSetKernelArg() failed");
  }
  errCode = clSetKernelArg(kernel, 1, sizeof(unsigned int), &size);
  if (errCode != CL_SUCCESS) {
      errx(1, "clSetKernelArg() failed");
  }

  // Query the maximum workgroup size
  size_t local;
  errCode = clGetKernelWorkGroupInfo(kernel, deviceID, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL);
  if (errCode != CL_SUCCESS) {
      errx(1, "clGetKernelWorkGroupInfo() failed");
  }

  // Enqueues a command to execute a kernel on a device.
  // The global size is rounded up to the next multiple of the work-group
  // size: without non-uniform work-group support, a global size that is not
  // divisible by the local size is rejected with CL_INVALID_WORK_GROUP_SIZE.
  // The padding work-items do nothing thanks to the `i < size` test in the kernel.
  size_t global = ((size + local - 1) / local) * local;
  errCode = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
  if (errCode != CL_SUCCESS) {
      errx(1, "clEnqueueNDRangeKernel() failed");
  }

  // Wait for command completion
  errCode = clFinish(commandQueue);
  if (errCode != CL_SUCCESS) {
      errx(1, "clFinish() failed");
  }

  // Release the kernel object
  errCode = clReleaseKernel(kernel);
  if (errCode != CL_SUCCESS) {
      errx(1, "clReleaseKernel() failed");
  }

  // Release the program object
  errCode = clReleaseProgram(program);
  if (errCode != CL_SUCCESS) {
      errx(1, "clReleaseProgram() failed");
  }
  ```

* Transfer data back from the device to the host

  ```c
  // Copy the device buffer back into the host array `a`.
  errCode = clEnqueueReadBuffer(commandQueue, device_a,
                                CL_TRUE,                // blocking: returns after the copy
                                0,                      // byte offset into the buffer
                                size * sizeof(double),  // number of bytes to copy
                                a,                      // host destination
                                0, NULL, NULL);         // no wait list, no event
  if (errCode != CL_SUCCESS) {
      errx(1, "clEnqueueReadBuffer() failed");
  }
  ```

* Delete data on the device

  ```c
  // Release the device buffer device_a; the program aborts if the release
  // call reports an error.
  errCode = clReleaseMemObject(device_a);
  if (errCode != CL_SUCCESS) {
      errx(1, "clReleaseMemObject() failed");
  }
  ```

* Post-process data on the host
  e.g. write data to storage

  ```c
  for (unsigned int i = 0; i < size; i++) {
      if (a[i] != 2.) {
          errx(2, "Computation on GPU failed");
      }
  }
  ```

* Free memory on the host

  ```c
  // Return the host array to the allocator; `a` must not be used afterwards.
  free(a);
  ```

### Compilation

In [None]:
#!/usr/bin/bash
# LLVM compiler
# Reset the module environment, load the CUDA module (presumably the provider
# of the OpenCL headers and library on this system -- confirm for your site),
# then build the example with clang and link it against libOpenCL.
! module purge; \
  module add devel/cuda/12.9; \
  clang \
    -O2 -march=native -flto -Wall -Wextra \
    "../src/exampleOpenCL.c" -o "../bin/exampleOpenCL" -lOpenCL

In [None]:
#!/usr/bin/bash
# GNU compiler
# Reset the module environment, load the CUDA module (presumably the provider
# of the OpenCL headers and library on this system -- confirm for your site),
# then build the example with gcc and link it against libOpenCL.
! module purge; \
  module add devel/cuda/12.9; \
  gcc \
    -O2 -march=native -flto -Wall -Wextra \
    "../src/exampleOpenCL.c" -o "../bin/exampleOpenCL" -lOpenCL

### Execution

In [5]:
#!/usr/bin/bash
# Run the compiled example with the same module loaded as for the build.
! module purge; \
  module add devel/cuda/12.9; \
  ../bin/exampleOpenCL

OpenCL Platform name: NVIDIA CUDA
OpenCL Device name: NVIDIA H100
* Allocate memory on the host
* Pre-process / initialize data on the host
  e.g. read data from storage
* Allocate memory on the device
* Copy data from the host to the device
* Compute on the device
* Transfer data back from the device to the host
* Delete data on the device
* Post-process data on the host
  e.g. write data to storage
* Free memory on the host
