OpenCL - результаты не совпадают с версией CPU

Я хочу использовать openCl для сравнения значений пикселей между двумя изображениями. Одно изображение должно быть «преобразовано» матрицей преобразования.OpenCL - результаты не совпадают с версией CPU

1) У меня возникает одна проблема: результат версии openCL отличается от версии CPU. Разница значений пикселей в моем примере изображений (изображение A: все пиксели 5, imagesB: все пиксели 6) всегда один, поэтому в общей сложности с 1000 * 1000 пикселей он должен быть 1000000. Версия ЦП всегда правильная, но версия openCL всегда немного неточна и также время от времени отличается (например, 998895 или 998829).

2) Другая проблема, с которой я столкнулась, - это время выполнения, поскольку добавление разности двух сравниваемых пикселей в переменную результата занимает много времени. Но мои чувства говорят, что это можно решить с помощью другого макета памяти.

Любые идеи для проблем, которые у меня есть? Может быть, и способ использования двумерного workingset приводит к ошибкам?

Спасибо и наилучшими пожеланиями Хендрик

Вот ядро: В основном он получает два изображения и 700 матриц преобразования (в данный момент все представляют личность).

__kernel void compliance(
     __read_only image2d_t imageA, 
     __read_only image2d_t imageB, 
     __constant float *matrix, 
     __global int *result 
    ) 
{ 


    for (int i = 0; i < 700; i++) 
    { 
     size_t x = get_global_id(0); 
     size_t y = get_global_id(1); 

     float t1 = matrix[0 + i * 6]; 
     float t2 = matrix[1 + i * 6]; 
     float t3 = matrix[2 + i * 6]; 
     float t4 = matrix[3 + i * 6]; 
     float t5 = matrix[4 + i * 6]; 
     float t6 = matrix[5 + i * 6]; 

     //calculate the other coords of the comparing pixel 
     int x_new = x * t1 + y * t2 + 1 * t3; 
     int y_new = x * t4 + y * t5 + 1 * t6; 

     int a = (read_imagei(imageA, (int2)(x, y)).x); 
     int b = (read_imagei(imageB, (int2)(x_new, y_new)).x); 
     int diff = b - a; 

     //add every different of two compared pixels to the result 
     result[i] += diff; 
    } 
}

Вот мой код хоста:

#define __CL_ENABLE_EXCEPTIONS 
#include <CL/cl.hpp> 
#include <utility> 
#include <iostream> 
#include <fstream> 
#include <string> 
#include <chrono> 
#include <opencv2\core.hpp> 
#include <opencv2\imgproc.hpp> 
#include <opencv2\highgui.hpp> 


using namespace std; 




int main(int argc, char** argv) { 
    //700 transformation matrices 
    int numberMatrices = 700; 
    bool opencl = true; 
    //iamge width 
    int width = 1000; 
    //image height 
    int height = 1000; 
    //total number of pixels of one image 
    int size = width*height; 


    // Create two example images 
    const int LIST_SIZE = size; 
    int *imageA = new int[LIST_SIZE]; 
    int *imageB = new int[LIST_SIZE]; 
    for (int i = 0; i < LIST_SIZE; i++) { 
     //every pixel value of imageA is 5 
     imageA[i] = 5; 
     //every pixel value of imageA is 6 
     imageB[i] = 6; 
    } 

    //creation of n transformation matrices 
    const int MATRIX_SIZE = 6* numberMatrices; 
    float *indi = new float[MATRIX_SIZE]; 
    //all the matrices are the same 
    for (int i = 0; i < numberMatrices; i++) 
    { 
     //identity matrix 
     indi[0 + i * 6] = 1; 
     indi[1 + i * 6] = 0; 
     indi[2 + i * 6] = 0; 
     indi[3 + i * 6] = 0; 
     indi[4 + i * 6] = 1; 
     indi[5 + i * 6] = 0; 
    } 

    //array to save the results of the comparison 
    const int RESULT_SIZE = numberMatrices; 
    int *result = new int[RESULT_SIZE]; 





    if (opencl) 
    { 
     try { 
      // Get available platforms 
      vector<cl::Platform> platforms; 
      cl::Platform::get(&platforms); 
      std::cerr << "Platform number is: " << platforms.size() << std::endl; 
      std::string platformVendor; 
      platforms[0].getInfo((cl_platform_info)CL_PLATFORM_VENDOR, &platformVendor); 
      std::cerr << "Platform is by: " << platformVendor << "\n"; 

      // Select the default platform and create a context using this platform and the GPU 
      cl_context_properties cps[3] = { 
       CL_CONTEXT_PLATFORM, 
       (cl_context_properties)(platforms[0])(), 
       0 
      }; 
      cl::Context context(CL_DEVICE_TYPE_CPU, cps); 

      vector<cl::ImageFormat> format; 
      context.getSupportedImageFormats(CL_MEM_READ_ONLY, CL_MEM_OBJECT_IMAGE2D, &format); 
      /* for (int i = 0; i < format.size(); i++) 
      { 
      cout << "Channel Data Type: " << format.at(i).image_channel_data_type 
      << " Channel order: " << format.at(i).image_channel_order << endl; 
      }*/ 



      // Get a list of devices on this platform 
      vector<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>(); 
      for (int i = 0; i < devices.size(); i++) 
      { 

       cout << "Device: " << devices.at(i).getInfo<CL_DEVICE_NAME>() << endl; 
       cout << "DOUBLE FP: " << devices.at(i).getInfo<CL_DEVICE_DOUBLE_FP_CONFIG>() << endl; 
       cout << "Image Max Height: " << devices.at(i).getInfo<CL_DEVICE_IMAGE2D_MAX_HEIGHT>() << endl; 
       cout << "Image Support: " << devices.at(i).getInfo<CL_DEVICE_IMAGE_SUPPORT>() << endl; 
       cout << "Local Memory Size: " << devices.at(i).getInfo<CL_DEVICE_LOCAL_MEM_SIZE>() << endl; 
       cout << "Clock Frequency: " << devices.at(i).getInfo<CL_DEVICE_MAX_CLOCK_FREQUENCY>() << endl; 
       cout << "CUs: " << devices.at(i).getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>() << endl; 
       cout << "Driver: " << devices.at(i).getInfo<CL_DRIVER_VERSION>() << endl; 
       cout << "Version: " << devices.at(i).getInfo<CL_DEVICE_VERSION>() << endl; 
       cout << "Work Group: " << devices.at(i).getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>() << endl; 
       cout << "Items: " << devices.at(i).getInfo<CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS>(); 
       cout << endl; 
      } 


      //Create opencl image 
      cl::Image2D clImage_A = cl::Image2D(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, cl::ImageFormat(CL_RGBA, CL_UNSIGNED_INT8), (size_t)width, (size_t)height, 0, imageA); 
      cl::Image2D clImage_B = cl::Image2D(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, cl::ImageFormat(CL_RGBA, CL_UNSIGNED_INT8), (size_t)width, (size_t)height, 0, imageB); 


      // Create a command queue and use the first device 
      cl::CommandQueue queue = cl::CommandQueue(context, devices[0]); 

      // Read kernel source file 
      std::ifstream sourceFile("difference.cl"); 
      std::string sourceCode(
       std::istreambuf_iterator<char>(sourceFile), 
       (std::istreambuf_iterator<char>())); 
      cl::Program::Sources source(1, std::make_pair(sourceCode.c_str(), sourceCode.length() + 1)); 

      // Make program of the source code in the context 
      cl::Program program = cl::Program(context, source); 

      // Build program for these specific devices 
      program.build(devices); 

      // Make kernel 
      cl::Kernel kernel(program, "compliance"); 


      // Create memory buffers 
      cl::Buffer buffer_matrix = cl::Buffer(context, CL_MEM_READ_ONLY, LIST_SIZE * sizeof(float)); 
      cl::Buffer buffer_result = cl::Buffer(context, CL_MEM_READ_WRITE, RESULT_SIZE * sizeof(int)); 

      // Copy list of results to the memory buffers 
      queue.enqueueWriteBuffer(buffer_matrix, CL_TRUE, 0, MATRIX_SIZE * sizeof(float), indi); 


      // Set arguments to kernel 
      kernel.setArg(0, clImage_A); 
      kernel.setArg(1, clImage_B); 
      kernel.setArg(2, buffer_matrix); 
      kernel.setArg(3, buffer_result); 


      cl::Event event; 

      std::cout << "Start OpenCL processing.." << endl; 
      chrono::high_resolution_clock::time_point t1 = chrono::high_resolution_clock::now(); 

      // Run the kernel n-times on specific ND range 
      for (int i = 0; i < 1; i++) 
      { 
       queue.enqueueNDRangeKernel(
        kernel, 
        cl::NullRange, 
        cl::NDRange((size_t)width, (size_t)height), 
        cl::NDRange(1, 1), 
        NULL, 
        &event); 
       cout << i << " "; 
       event.wait(); 
      } 

      chrono::high_resolution_clock::time_point t2 = chrono::high_resolution_clock::now(); 

      auto duration_opencl = std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1).count(); 


      std::cout << "OpenCL processing done.." << endl; 
      std::cout << "Start CPU Processing.." << endl; 


      // Read buffer_result into result 
      queue.enqueueReadBuffer(buffer_result, CL_TRUE, 0, RESULT_SIZE * sizeof(int), result); 


      //cpu version to calculate the difference between the two arryays 
      t1 = chrono::high_resolution_clock::now(); 
      int different = 0; 
      int x_new; 
      int x; 
      for (int i = 0; i < numberMatrices; i++) 
      { 
       different = 0; 
       for (int n = 0; n < LIST_SIZE; n++) 
       { 
        x = imageA[n]; 
        x_new = x;; 
        int a = imageA[x]; 
        int b = imageB[x_new]; 
        int diff = imageB[x_new] - imageA[x]; 

        different += diff; 
       } 

      } 

      t2 = chrono::high_resolution_clock::now(); 
      auto duration_cpu = std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1).count(); 
      std::cout << "CPU processing done.." << endl; 


      //output of the results 
      std::cout << "opencl: diff " << result[0] << endl; 
      std::cout << "Runtime opencl: " << duration_opencl << endl; 

      std::cout << "CPU: diff " << different << endl; 
      std::cout << "Runtime CPU: " << duration_cpu << endl; 

      double times = (double)duration_cpu/(double)duration_opencl; 
      std::cout << "OpenCL is " << times << " times faster!!!" << endl; 

      char c; 
      std::cin >> c; 

     } 
     catch (cl::Error error) { 
      std::cout << error.what() << "(" << error.err() << ")" << std::endl; 
      char c; 
      std::cin >> c; 
     } 
    } 


    return 0; 
}

источник

2016-01-22 funkyfallk

один срок я есть сокращение. Здесь ссылка на ppt. Я не работал над этим, но, возможно, у кого-то такая же проблема: http://www.fz-juelich.de/SharedDocs/Downloads/IAS/JSC/EN/slides/advanced-gpu/adv-gpu-opencl-reduction .pdf? __ blob = publicationFile – funkyfallk

Не думаете ли вы, что у вас есть состояние гонки здесь дело в этой строке result[i] += diff; вашего кода OpenCL программа делать это в каждом рабочем пункте одновременно? Возможно, это может быть проблемой.

источник

2016-01-22 06:54:19 segevara

Да, это может быть проблемой. Это также является источником медленной работы. Знаете ли вы другой способ сохранить сумму отличий каждой пары пикселей? – funkyfallk

вы можете использовать разные методы, поскольку я знаю, что OpenCL имеет [атомные функции] (https://www.khronos.org/registry/cl/sdk/1.2/docs/man/xhtml/atomicFunctions.html): 'atomic_add (result [ i], diff) ', и вы можете [использовать барьеры] (https://www.khronos.org/registry/cl/sdk/1.0/docs/man/xhtml/barrier.html), чтобы предотвратить одновременную запись в одну и ту же память место – segevara

спасибо ... хотя я еще не успешно реализовал atom_add. проблема упоминается как ** параллельное сокращение ** – funkyfallk

Арифметика с плавающей точкой отличается на платформах. Вы, скорее всего, видите эффект аппаратной оптимизации MAD, выполняемой компилятором OpenCL. Насколько мне известно, отключение оптимизации с помощью -cl-opt-disable в этом случае не поможет.

источник

2016-01-27 14:15:21 l33t

OpenCL - результаты не совпадают с версией CPU

ответ

Смежные вопросы