はじめに

OpenCLを簡単に利用できるようにするフレームワークがなかったので作ってみた。 最新版はここから。

コンパイル方法

$ clang ocl.c -o ocl `pkg-config --libs --cflags OpenCL` (Linuxの場合)
$ clang ocl.c -o ocl -framework opencl (Macの場合)

OpenCLフレームワーク+サンプル

vadd.cl

OCLSTRINGIFY(

__kernel void vadd(global const float *a, global const float *b, global float *c, int n)
{
	int i = get_global_id(0);

	if (i < n) {
		c[i] = a[i] + b[i];
	}
}

);

ocl.c

#include <stdio.h>
#include <stdlib.h>

#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif

#define OCLSTRINGIFY(...) #__VA_ARGS__
char kernel_code[] = 
#include "vadd.cl"

#define MAX_PLATFORMS	10
#define MAX_DEVICES	10

typedef struct {
	int type;
	int size;
	void *p;
	void *s;
	int write, read;
} args_t;

int ocl_device;
cl_device_id device_id[MAX_DEVICES];
cl_context context;
cl_command_queue command_queue;
void oclSetup(int platform, int device)
{
	cl_platform_id platform_id[MAX_PLATFORMS];
	cl_uint num_devices;
	cl_uint num_platforms;
	cl_int ret;

	ocl_device = device;

	ret = clGetPlatformIDs(MAX_PLATFORMS, platform_id, &num_platforms);
	ret = clGetDeviceIDs(platform_id[platform], CL_DEVICE_TYPE_ALL, MAX_DEVICES, device_id, &num_devices);

	// device name (option)
//	clGetDeviceInfo(device_id[device], CL_DEVICE_NAME, sizeof(str), str, &ret_size);
//	sprintf(msg, "%s (platform = %d, device = %d)", str, platform, device);

	context = clCreateContext(NULL, 1, &device_id[device], NULL, NULL, &ret);
	command_queue = clCreateCommandQueue(context, device_id[device], 0, &ret);
}
cl_kernel oclKernel(char *k, char *opt, char *kernel_code, args_t *args)
{
	cl_int ret;
	const char* src[1] = { kernel_code };

	cl_program program = clCreateProgramWithSource(context, 1, (const char **)&src, 0, &ret);
	ret = clBuildProgram(program, 1, &device_id[ocl_device], NULL, NULL, NULL);
	if (ret) {
		size_t len = 0;
		cl_int ret = CL_SUCCESS;
		ret = clGetProgramBuildInfo(program, device_id[ocl_device], CL_PROGRAM_BUILD_LOG, 0, NULL, &len);
		char *buffer = calloc(len, sizeof(char));
		ret = clGetProgramBuildInfo(program, device_id[ocl_device], CL_PROGRAM_BUILD_LOG, len, buffer, NULL);
		printf("\n%s\n", buffer);
	}
	cl_kernel kernel = clCreateKernel(program, k, &ret);
	clReleaseProgram(program);

	while (args->size) {
		if (args->type>0) *(cl_mem*)(args->p) = clCreateBuffer(context, args->type, args->size, NULL, &ret);
		args++;
	}

	return kernel;
}
void oclKernelArgsWrite(args_t *args)
{
	while (args->size) {
		if (args->write) {
			clEnqueueWriteBuffer(command_queue, *(cl_mem*)(args->p), CL_TRUE, 0, args->size, args->s, 0, 0, 0);
		}
		args++;
	}
}
void oclKernelArgsRead(args_t *args)
{
	while (args->size) {
		if (args->read) {
			clEnqueueReadBuffer(command_queue, *(cl_mem*)(args->p), CL_TRUE, 0, args->size, args->s, 0, 0, 0);
		}
		args++;
	}
}
void oclRun(cl_kernel kernel, args_t *args, size_t global_work_size, size_t local_work_size)
{
	int n = 0;
	while (args->size) {
		if (args->type>0) clSetKernelArg(kernel, n++, sizeof(cl_mem), (void*)args->p);
		else clSetKernelArg(kernel, n++, sizeof(int), (void*)args->p);
		args++;
	}

	clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL);
}
void oclReleaseKernel(cl_kernel kernel, args_t *args)
{
	while (args->size) {
		if (args->type>0) clReleaseMemObject(*(cl_mem*)(args->p));
		args++;
	}
	clReleaseKernel(kernel);
}
void oclFinish()
{
	clFlush(command_queue);
	clFinish(command_queue);
	clReleaseCommandQueue(command_queue);
	clReleaseContext(context);
}

#include <time.h>
int main()
{
	clock_t t0, t1;
	float *A, *B, *C;
	cl_mem d_A, d_B, d_C;
	int N = 100;//10000000;
	int nloop = 1000;

	size_t local_item_size = 256;
	size_t global_item_size = ((N + local_item_size - 1) / local_item_size) * local_item_size;

	int size = N * sizeof(float);
	A = (float *)malloc(size);
	B = (float *)malloc(size);
	C = (float *)malloc(size);
	for (int i=0; i<N; i++) {
		A[i] = (float)(1 + i);
		B[i] = (float)(1 + i);
	}

	args_t args[] = {
		{ CL_MEM_READ_WRITE, size, &d_A, A, 1, 0 },
		{ CL_MEM_READ_WRITE, size, &d_B, B, 1, 0 },
		{ CL_MEM_READ_WRITE, size, &d_C, C, 0, 1 },
		{ 0, sizeof(int), &N, 0, 0, 0 },
		{ 0, 0, 0, 0, 0, 0 },
	};

	oclSetup(0, 0);
	cl_kernel kernel = oclKernel("vadd", 0, kernel_code, args);

	t0 = clock();

	oclKernelArgsWrite(args);
	for (int i=0; i<nloop; i++) {
		oclRun(kernel, args, global_item_size, local_item_size);
	}
	oclKernelArgsRead(args);

	t1 = clock();

	float sum = 0;
	for (int i=0; i<N; i++) {
		sum += C[i];
	}
	double exact = N * (N + 1.0);
	double cpu = (double)(t1 - t0) / CLOCKS_PER_SEC;
	printf("g:%d l:%d\n", global_item_size, local_item_size);
	printf("n=%d nloop=%d %e(%.6e) cpu[sec]=%.3f\n", N, nloop, sum, exact, cpu);

	oclReleaseKernel(kernel, args);
	oclFinish();

	free(A);
	free(B);
	free(C);

	return 0;
}