Я новичок в параллельных вычислениях и OpenCL. Я последовал за книгой OpenCLProgramming Guide. В части реализации свертки.OpenCL реализовал алгоритмы медленнее, чем обычный цикл
Мой main.cpp:
#include <iostream>
#include <sstream>
#include <fstream>
#include <string>
#include <OpenCL/OpenCL.h>
using namespace std;
const unsigned int inputSignalWidth = 8;
const unsigned int inputSignalHeight = 8;
cl_uint inputSignal[inputSignalWidth][inputSignalHeight] =
{
{3, 1, 1, 4, 8, 2, 1, 3},
{4, 2, 1, 1, 2, 1, 2, 3},
{4, 4, 4, 4, 3, 2, 2, 2},
{9, 8, 3, 8, 9, 0, 0, 0},
{9, 3, 3, 9, 0, 0, 0, 0},
{0, 9, 0, 8, 0, 0, 0, 0},
{3, 0, 8, 8, 9, 4, 4, 4},
{5, 9, 8 ,1 ,8, 1, 1, 1}
};
const unsigned int outputSignalWidth = 6;
const unsigned int outputSignalHeight = 6;
cl_uint outputSignal[outputSignalWidth][outputSignalHeight];
const unsigned int maskWidth = 3;
const unsigned int maskHeight = 3;
cl_uint mask[maskWidth][maskHeight] =
{
{1, 1, 1}, {1, 0, 1}, {1, 1, 1}
};
inline void checkErr(cl_int err, const char* name)
{
if (err != CL_SUCCESS)
{
cerr << "Error: " << name << endl;
exit(EXIT_FAILURE);
}
}
void CL_CALLBACK contextCallback(const char * errInfo,
const void * private_info,
size_t cb,
void * user_data)
{
cout << "Error occurred during contxt use: " << errInfo << endl;
exit(EXIT_FAILURE);
}
int main(int argc, const char * argv[])
{
cl_int errNum;
cl_uint numPlatforms;
cl_uint numDevices;
cl_platform_id * platformIDs;
cl_device_id * deviceIDs;
cl_context context = NULL;
cl_command_queue queue;
cl_program program;
cl_kernel kernel;
cl_mem inputSignalBuffer;
cl_mem outputSignalBuffer;
cl_mem maskBuffer;
errNum = clGetPlatformIDs(0, NULL, &numPlatforms);
checkErr((errNum != CL_SUCCESS)? errNum : (numPlatforms <= 0 ? -1 : CL_SUCCESS), "clGetPlatformIDs");
platformIDs = (cl_platform_id*) alloca(sizeof(cl_platform_id) * numPlatforms);
errNum = clGetPlatformIDs(numPlatforms, platformIDs, NULL);
checkErr((errNum != CL_SUCCESS)? errNum : (numPlatforms <= 0 ? -1 : CL_SUCCESS), "clGetPlatFormIDs");
deviceIDs = NULL;
cl_uint i;
for (i = 0; i < numPlatforms; i++)
{
errNum = clGetDeviceIDs(platformIDs[i], CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices);
if (errNum != CL_SUCCESS && errNum != CL_DEVICE_NOT_FOUND)
{
checkErr(errNum, "clGetDeviceIDs");
} else if (numDevices > 0)
{
deviceIDs = (cl_device_id *) alloca(sizeof(cl_device_id) * numDevices);
errNum = clGetDeviceIDs(platformIDs[i], CL_DEVICE_TYPE_GPU, numDevices, &deviceIDs[0], NULL);
checkErr(errNum, "clGetDeviceIDs");
break;
}
}
if (deviceIDs == NULL)
{
cout << "No CPU devices found." << endl;
exit(-1);
}
cl_context_properties contextProperties[] =
{
CL_CONTEXT_PLATFORM, (cl_context_properties) platformIDs[i], 0
};
context = clCreateContext(contextProperties, numDevices, deviceIDs, &contextCallback, NULL, &errNum);
checkErr(errNum, "clCreateContext");
ifstream srcFile("Convolution.cl");
checkErr(srcFile.is_open()?CL_SUCCESS:-1, "reading Convolution.cl");
string srcProg(istreambuf_iterator<char>(srcFile),
(istreambuf_iterator<char>()));
const char* src = srcProg.c_str();
size_t length = srcProg.length();
program = clCreateProgramWithSource(context, 1, &src, &length, &errNum);
checkErr(errNum, "clCreateProgramWithSource");
cout << "Device count: " << sizeof(deviceIDs)/sizeof(cl_device_id) << endl;
errNum = clBuildProgram(program, numDevices, deviceIDs, NULL, NULL, NULL);
checkErr(errNum, "clBuildProgram");
kernel = clCreateKernel(program, "convolve", &errNum);
checkErr(errNum, "clCreateKernel");
inputSignalBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, sizeof(cl_uint) * inputSignalHeight*inputSignalWidth, static_cast<void*>(inputSignal), &errNum);
checkErr(errNum, "clCreateBuffer(inputSignal)");
maskBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, sizeof(cl_uint) * maskHeight * maskWidth, static_cast<void*>(mask), &errNum);
checkErr(errNum, "clCreateBuffer(mask)");
outputSignalBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_uint) * outputSignalHeight * outputSignalWidth, NULL, &errNum);
checkErr(errNum, "clCreateBuffer(outputSignal)");
queue = clCreateCommandQueue(context, deviceIDs[0], 0, &errNum);
checkErr(errNum, "clCreateCommandQueue");
errNum = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputSignalBuffer);
errNum |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &maskBuffer);
errNum |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &outputSignalBuffer);
errNum |= clSetKernelArg(kernel, 3, sizeof(cl_uint), &inputSignalWidth);
errNum |= clSetKernelArg(kernel, 4, sizeof(cl_uint), &maskWidth);
checkErr(errNum, "clSetKernelArg");
const size_t globalWorkSize[1] =
{
outputSignalWidth * outputSignalWidth
};
const size_t localWorkSize[1] =
{
1
};
clock_t start, end;
clFinish(queue);
start = clock();
errNum = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);
clFinish(queue);
end = clock();
cout << "time for calculation: " << (float)(end - start) << endl;
checkErr(errNum, "clEnequeueNDRangeKernel");
errNum = clEnqueueReadBuffer(queue, outputSignalBuffer, CL_TRUE, 0, sizeof(cl_uint) * outputSignalHeight * outputSignalWidth, outputSignal, 0, NULL, NULL);
checkErr(errNum, "clEnqueueReadBuffer");
clFinish(queue);
start = clock();
for (int y = 0; y < outputSignalHeight; y++)
{
for (int x = 0; x < outputSignalHeight; x++)
{
uint sum = 0;
for (int r = 0; r < maskWidth; r++)
{
for (int c =0; c < maskWidth; c++)
{
sum += inputSignal[y+r][x+c]*mask[r][c];
}
}
outputSignal[y][x] = sum;
}
}
end = clock();
cout << "Loop version time: " << (float)(end - start) << endl;
return 0;
}
и Convolution.cl:
__kernel void convolve(const __global uint * const input,
__constant uint * const mask,
__global uint * const output,
const int inputWidth,
const int maskWidth)
{
const int x = get_global_id(0);
const int y = get_global_id(1);
uint sum = 0;
for (int r = 0; r < maskWidth; r++)
{
const int idxIntmp = (y + r) * inputWidth + x;
for (int c =0; c < maskWidth; c++)
{
sum+= mask[r * maskWidth + c] * input[idxIntmp + c];
}
}
output[y * get_global_id(0) + x] = sum;
}
Платформа MacOS 10.9 и AMD 6750M. Не имеет смысла, что версия CL намного медленнее, чем для версии цикла (примерно в 10 раз медленнее). Не могли бы вы, ребята, помочь мне указать, что не так с кодом?
OpenCL не всегда стоит: связь между процессором и графическим процессором имеет некоторые издержки, и если эта стоимость не пренебрежимо мала, вы обязательно проиграете. Возможно, попробуйте с гораздо большими данными. –
Не должно быть: 'output [y * get_global_size (0) + x] = sum;' вместо 'output [y * get_global_id (0) + x] = sum;'? – DarkZeros