はじめに
OpenCLを簡単に利用できるようにするフレームワークがなかったので作ってみた。 最新版はここから。
コンパイル方法
$ clang ocl.c -o ocl `pkg-config --libs --cflags OpenCL` (Linuxの場合)
$ clang ocl.c -o ocl -framework OpenCL (Macの場合)
OpenCLフレームワーク+サンプル
vadd.cl
// Element-wise vector addition kernel: c[i] = a[i] + b[i] for 0 <= i < n.
// Work-items whose global id is n or larger do nothing.
OCLSTRINGIFY(
__kernel void vadd(global const float *a, global const float *b, global float *c, int n)
{
    const int gid = get_global_id(0);
    if (gid >= n) {
        return;
    }
    c[gid] = a[gid] + b[gid];
}
);
ocl.c
#include <stdio.h>
#include <stdlib.h>
#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif
/* Turns its unquoted arguments into a single string literal, so kernel
 * source can be written as plain code inside OCLSTRINGIFY( ... ). */
#define OCLSTRINGIFY(...) #__VA_ARGS__
/* Kernel source text. vadd.cl supplies the initializer: it expands to a
 * string literal followed by the terminating ';'. */
char kernel_code[] =
#include "vadd.cl"
/* Capacities of the platform and device ID arrays handed to
 * clGetPlatformIDs / clGetDeviceIDs. */
#define MAX_PLATFORMS 10
#define MAX_DEVICES 10
/*
 * One entry of a kernel argument table. A table is an array of these,
 * terminated by an entry whose size is 0.
 */
typedef struct {
    int type;   /* > 0: flags passed to clCreateBuffer (buffer argument);
                   otherwise the entry is passed to the kernel by value */
    int size;   /* size in bytes; 0 marks the end of the table */
    void *p;    /* buffer entry: address of the cl_mem handle;
                   by-value entry: address of the value itself */
    void *s;    /* host memory used as source/destination of transfers */
    int write;  /* non-zero: oclKernelArgsWrite copies s to the device buffer */
    int read;   /* non-zero: oclKernelArgsRead copies the device buffer to s */
} args_t;
/* Index into device_id[] of the device selected by oclSetup(). */
int ocl_device;
/* Device IDs of the selected platform, filled in by oclSetup(). */
cl_device_id device_id[MAX_DEVICES];
/* Context and command queue created by oclSetup() and released by oclFinish(). */
cl_context context;
cl_command_queue command_queue;
void oclSetup(int platform, int device)
{
cl_platform_id platform_id[MAX_PLATFORMS];
cl_uint num_devices;
cl_uint num_platforms;
cl_int ret;
ocl_device = device;
ret = clGetPlatformIDs(MAX_PLATFORMS, platform_id, &num_platforms);
ret = clGetDeviceIDs(platform_id[platform], CL_DEVICE_TYPE_ALL, MAX_DEVICES, device_id, &num_devices);
// device name (option)
// clGetDeviceInfo(device_id[device], CL_DEVICE_NAME, sizeof(str), str, &ret_size);
// sprintf(msg, "%s (platform = %d, device = %d)", str, platform, device);
context = clCreateContext(NULL, 1, &device_id[device], NULL, NULL, &ret);
command_queue = clCreateCommandQueue(context, device_id[device], 0, &ret);
}
/*
 * Builds a program from kernel_code, creates the kernel named k from it and
 * allocates a device buffer for every buffer entry (type > 0) of args.
 *
 * k:           name of the kernel function inside kernel_code
 * opt:         build options passed to clBuildProgram, or NULL for none
 * kernel_code: NUL-terminated OpenCL C source
 * args:        argument table terminated by an entry with size 0; for each
 *              buffer entry the created cl_mem is stored through args->p
 *
 * Returns the kernel, or NULL if the program could not be created or built or
 * the kernel could not be created. On a build failure the build log is
 * printed. The buffers are created even when NULL is returned, so the caller
 * can always pair this call with oclReleaseKernel().
 */
cl_kernel oclKernel(char *k, char *opt, char *kernel_code, args_t *args)
{
    cl_int ret;
    cl_kernel kernel = NULL;
    const char *src[1] = { kernel_code };

    cl_program program = clCreateProgramWithSource(context, 1, src, NULL, &ret);
    if (ret != CL_SUCCESS) {
        fprintf(stderr, "oclKernel: clCreateProgramWithSource failed (error %d)\n", (int)ret);
    } else {
        ret = clBuildProgram(program, 1, &device_id[ocl_device], opt, NULL, NULL);
        if (ret != CL_SUCCESS) {
            size_t len = 0;
            clGetProgramBuildInfo(program, device_id[ocl_device], CL_PROGRAM_BUILD_LOG, 0, NULL, &len);
            /* One extra zeroed byte keeps the log terminated even when len is 0. */
            char *buffer = calloc(len + 1, sizeof(char));
            if (buffer != NULL) {
                clGetProgramBuildInfo(program, device_id[ocl_device], CL_PROGRAM_BUILD_LOG, len, buffer, NULL);
                printf("\n%s\n", buffer);
                free(buffer);
            }
            fprintf(stderr, "oclKernel: clBuildProgram failed (error %d)\n", (int)ret);
        } else {
            kernel = clCreateKernel(program, k, &ret);
            if (ret != CL_SUCCESS) {
                fprintf(stderr, "oclKernel: clCreateKernel(\"%s\") failed (error %d)\n", k, (int)ret);
            }
        }
        /* The kernel holds its own reference to the program. */
        clReleaseProgram(program);
    }

    for (; args->size; args++) {
        if (args->type > 0) {
            *(cl_mem *)(args->p) = clCreateBuffer(context, (cl_mem_flags)args->type, (size_t)args->size, NULL, &ret);
            if (ret != CL_SUCCESS) {
                fprintf(stderr, "oclKernel: clCreateBuffer(%d bytes) failed (error %d)\n", args->size, (int)ret);
            }
        }
    }
    return kernel;
}
/*
 * Copies host data to the device for every buffer entry of args whose write
 * flag is set (blocking write of args->size bytes from args->s).
 *
 * Entries that are not buffers (type <= 0) are skipped even if the flag is
 * set, since their p does not point at a cl_mem handle. Failed transfers are
 * reported on stderr.
 */
void oclKernelArgsWrite(args_t *args)
{
    for (; args->size; args++) {
        if (!args->write || args->type <= 0) {
            continue;
        }
        cl_int ret = clEnqueueWriteBuffer(command_queue, *(cl_mem *)(args->p), CL_TRUE, 0,
                                          (size_t)args->size, args->s, 0, NULL, NULL);
        if (ret != CL_SUCCESS) {
            fprintf(stderr, "oclKernelArgsWrite: clEnqueueWriteBuffer failed (error %d)\n", (int)ret);
        }
    }
}
/*
 * Copies device data back to the host for every buffer entry of args whose
 * read flag is set (blocking read of args->size bytes into args->s).
 *
 * Entries that are not buffers (type <= 0) are skipped even if the flag is
 * set, since their p does not point at a cl_mem handle. Failed transfers are
 * reported on stderr.
 */
void oclKernelArgsRead(args_t *args)
{
    for (; args->size; args++) {
        if (!args->read || args->type <= 0) {
            continue;
        }
        cl_int ret = clEnqueueReadBuffer(command_queue, *(cl_mem *)(args->p), CL_TRUE, 0,
                                         (size_t)args->size, args->s, 0, NULL, NULL);
        if (ret != CL_SUCCESS) {
            fprintf(stderr, "oclKernelArgsRead: clEnqueueReadBuffer failed (error %d)\n", (int)ret);
        }
    }
}
/*
 * Binds the entries of args to the kernel arguments, in table order, and
 * enqueues one 1-dimensional run of the kernel.
 *
 * kernel:           kernel returned by oclKernel()
 * args:             argument table terminated by an entry with size 0
 * global_work_size: total number of work-items
 * local_work_size:  work-group size; 0 lets the OpenCL runtime choose one
 *
 * Buffer entries (type > 0) are passed as cl_mem; every other entry is
 * passed by value using its own size, so scalars other than int work too.
 * Failures are reported on stderr.
 */
void oclRun(cl_kernel kernel, args_t *args, size_t global_work_size, size_t local_work_size)
{
    cl_uint index = 0;
    cl_int ret;

    for (; args->size; args++, index++) {
        /* For buffer entries p already points at the cl_mem handle, which is
           the "pointer to the argument value" that clSetKernelArg expects. */
        size_t arg_size = (args->type > 0) ? sizeof(cl_mem) : (size_t)args->size;
        ret = clSetKernelArg(kernel, index, arg_size, args->p);
        if (ret != CL_SUCCESS) {
            fprintf(stderr, "oclRun: clSetKernelArg(%u) failed (error %d)\n", (unsigned)index, (int)ret);
        }
    }

    ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_work_size,
                                 local_work_size ? &local_work_size : NULL, 0, NULL, NULL);
    if (ret != CL_SUCCESS) {
        fprintf(stderr, "oclRun: clEnqueueNDRangeKernel failed (error %d)\n", (int)ret);
    }
}
/*
 * Releases the device buffer of every buffer entry (type > 0) in args,
 * then releases the kernel itself. The table ends at the first entry
 * whose size is 0.
 */
void oclReleaseKernel(cl_kernel kernel, args_t *args)
{
    for (args_t *entry = args; entry->size != 0; ++entry) {
        if (entry->type <= 0) {
            continue;
        }
        cl_mem *handle = (cl_mem *)entry->p;
        clReleaseMemObject(*handle);
    }
    clReleaseKernel(kernel);
}
/*
 * Waits for all queued work to complete, then releases the global command
 * queue and context.
 *
 * The globals are reset to NULL afterwards so that stale handles are not
 * reused and a second call is harmless. Handles that are already NULL
 * (nothing created, or already released) are skipped.
 */
void oclFinish(void)
{
    if (command_queue != NULL) {
        clFlush(command_queue);
        clFinish(command_queue);
        clReleaseCommandQueue(command_queue);
        command_queue = NULL;
    }
    if (context != NULL) {
        clReleaseContext(context);
        context = NULL;
    }
}
#include <time.h>
/*
 * Sample program: adds two vectors of N floats on platform 0 / device 0,
 * running the kernel nloop times, and prints the sum of the result next to
 * the expected value N * (N + 1) together with the elapsed CPU time.
 *
 * Returns 0 on success, EXIT_FAILURE if host memory or the kernel could not
 * be obtained.
 */
int main(void)
{
    cl_mem d_A = NULL, d_B = NULL, d_C = NULL;
    int N = 100; /* vector length; the original notes 10000000 as an alternative */
    int nloop = 1000;
    size_t local_item_size = 256;
    /* Round the global size up to a multiple of the work-group size. */
    size_t global_item_size = ((N + local_item_size - 1) / local_item_size) * local_item_size;
    int size = (int)(N * sizeof(float));

    float *A = malloc((size_t)size);
    float *B = malloc((size_t)size);
    /* Zeroed so the checksum below is well defined even if the read-back fails. */
    float *C = calloc((size_t)N, sizeof *C);
    if (A == NULL || B == NULL || C == NULL) {
        fprintf(stderr, "out of memory\n");
        free(A);
        free(B);
        free(C);
        return EXIT_FAILURE;
    }
    for (int i = 0; i < N; i++) {
        A[i] = (float)(1 + i);
        B[i] = (float)(1 + i);
    }

    /* Kernel arguments in declaration order: a, b, c, n. */
    args_t args[] = {
        { CL_MEM_READ_WRITE, size, &d_A, A, 1, 0 },
        { CL_MEM_READ_WRITE, size, &d_B, B, 1, 0 },
        { CL_MEM_READ_WRITE, size, &d_C, C, 0, 1 },
        { 0, sizeof(int), &N, 0, 0, 0 },
        { 0, 0, 0, 0, 0, 0 },
    };

    oclSetup(0, 0);
    cl_kernel kernel = oclKernel("vadd", 0, kernel_code, args);
    if (kernel == NULL) {
        fprintf(stderr, "failed to create kernel \"vadd\"\n");
        oclReleaseKernel(kernel, args);
        oclFinish();
        free(A);
        free(B);
        free(C);
        return EXIT_FAILURE;
    }

    clock_t t0 = clock();
    oclKernelArgsWrite(args);
    for (int i = 0; i < nloop; i++) {
        oclRun(kernel, args, global_item_size, local_item_size);
    }
    oclKernelArgsRead(args);
    clock_t t1 = clock();

    /* Accumulate in double so the checksum stays accurate for large N. */
    double sum = 0;
    for (int i = 0; i < N; i++) {
        sum += C[i];
    }
    double exact = N * (N + 1.0);
    double cpu = (double)(t1 - t0) / CLOCKS_PER_SEC;
    printf("g:%zu l:%zu\n", global_item_size, local_item_size);
    printf("n=%d nloop=%d %e(%.6e) cpu[sec]=%.3f\n", N, nloop, sum, exact, cpu);

    oclReleaseKernel(kernel, args);
    oclFinish();
    free(A);
    free(B);
    free(C);
    return 0;
}