OpenCL runs programs on a GPU. It does other things, but this is what I’m interested in. On macOS, the OpenCL API is provided by including <OpenCL/opencl.h>
, and you link with the OpenCL implemention using -framework OpenCL
. The following is an example program using OpenCL; it’s borrowed from this official example. I’ve stripped out all error checking to make the code more digestible.
#include <stdio.h>
#include <OpenCL/opencl.h>
#define DATA_SIZE (1024)
const char *KernelSource =
"__kernel void square(__global float* input, __global float* output, const unsigned int count) { \n" \
" int i = get_global_id(0); \n" \
" if(i < count) { output[i] = input[i] * input[i]; } \n" \
"}";
int main(void) {
int err;
cl_device_id device_id;
clGetDeviceIDs(NULL, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
cl_context context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
cl_command_queue commands = clCreateCommandQueue(context, device_id, 0, &err);
cl_program program = clCreateProgramWithSource(context, 1, (const char **) & KernelSource, NULL, &err);
clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
cl_kernel kernel = clCreateKernel(program, "square", &err);
cl_mem input = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * DATA_SIZE, NULL, NULL);
cl_mem output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * DATA_SIZE, NULL, NULL);
float data[DATA_SIZE];
for (int i = 0; i < DATA_SIZE; i++) { data[i] = i; }
err = clEnqueueWriteBuffer(commands, input, CL_TRUE, 0, sizeof(float) * DATA_SIZE, data, 0, NULL, NULL);
clSetKernelArg(kernel, 0, sizeof(cl_mem), &input);
clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
unsigned int count = DATA_SIZE;
clSetKernelArg(kernel, 2, sizeof(unsigned int), &count);
size_t local;
clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL);
size_t global = count;
clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
clFinish(commands);
float results[DATA_SIZE];
clEnqueueReadBuffer(commands, output, CL_TRUE, 0, sizeof(float) * count, results, 0, NULL, NULL);
unsigned int correct = 0;
for (int i = 0; i < count; i++) {
if (results[i] == data[i] * data[i]) { correct++; }
}
printf("Computed '%d/%d' correct values!\n", correct, count);
clReleaseMemObject(input);
clReleaseMemObject(output);
clReleaseProgram(program);
clReleaseKernel(kernel);
clReleaseCommandQueue(commands);
clReleaseContext(context);
return 0;
}
Compile and run this with:
$ clang -framework OpenCL main.c
$ ./a.out
Computed '1024/1024' correct values!
The big ideas in this program are that:
I’ll dissect the pieces of this program in a future post.
I wrote this because I felt like it. This post is my own, and not associated with my employer.
Jim. Public speaking. Friends. Vidrio.