Чтение неправильных данных в OpenCL

Я столкнулся с проблемой, что ядро записывает данные в неправильном месте или хост неправильно считывает данные. Я пишу те же данные (индекс, в котором я пишу данные), на два глобальных массива с разными типами. Чтобы гарантировать, что индекс является корневым, используется глобальный счетчик, который увеличивается с помощью atom_inc. Проблема возникает, когда данные считываются из второго массива на хосте. Например:Чтение неправильных данных в OpenCL

..... 
output array index: 442: (output1 value:442.0000  output2 value:442) 
output array index: 443: (output1 value:443.0000  output2 value:443) 
output array index: 444: (output1 value:444.0000  output2 value:444) 
output array index: 445: (output1 value:445.0000  output2 value:445) 
output array index: 446: (output1 value:446.0000  output2 value:1152892928) 
output array index: 447: (output1 value:447.0000  output2 value:447) 
output array index: 448: (output1 value:448.0000  output2 value:1152909312) 
output array index: 449: (output1 value:449.0000  output2 value:1152917504) 
output array index: 450: (output1 value:450.0000  output2 value:1152925696) 
......

Как вы можете видеть на indicies 446, 448, 449 и 450+ output2 содержит неверные значения. Какова может быть причина?

устройств: ATI Radeon HD5750

Пример кода:

#include <stdio.h> 
#include <math.h> 
#include <OpenCL/OpenCL.h> 

// wtf example 
const char *programSource = 
"__kernel void kernel1(__global uint *counter,\n" \ 
"__global float *weights,\n" \ 
"__global uint *weights_pos)\n" \ 
"{\n"\ 
"const uint global_size = get_global_size(0);\n" \ 
"const uint global_id = get_global_id(0);\n" \ 
"uint local_id = get_local_id(0);\n" \ 

"if(global_id == 0) {\n" \ 
"counter[5] = 0; // set index of pos in weights to zero\n" \ 
"}\n" \ 

"uint insert_index = atom_inc(&counter[5]);\n" \ 
"weights[insert_index] = insert_index;\n" \ 
"weights_pos[insert_index] = insert_index;\n" \ 
"}"; 

void art_process_sinogram(const char* tiff_filename, 
          const float *angles2, 
          const unsigned int n_angles2, 
          const unsigned int n_ray2s, 
          const float distanc2e) 
{ 
    /****************************** 
    * OPENCL ENVIRONMENT 
    */ 
    cl_int status; 
    cl_uint numPlatforms = 0; 
    cl_platform_id *platforms = NULL; 
    cl_device_id device_id; 

    //discover platforms 
    status = clGetPlatformIDs(0, NULL, &numPlatforms); 
    platforms = (cl_platform_id*)malloc(numPlatforms * sizeof(cl_platform_id)); 
    status = clGetPlatformIDs(numPlatforms, platforms, NULL); 

    //discover devices 
    cl_uint numDevices = 0; 
    cl_device_id *devices = NULL; 

    status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices); 
    devices = (cl_device_id*)malloc(numDevices * sizeof(cl_device_id)); 
    status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, numDevices, devices, NULL); 
    device_id = devices[1]; 
    //create context 
    cl_context context = NULL; 
    context = clCreateContext(NULL, numDevices, devices, NULL, NULL, &status); 

    cl_program program = clCreateProgramWithSource(context, 1, (const char **)&programSource, NULL, &status); 
    clBuildProgram(program, 0, NULL, NULL, NULL, NULL); 
    cl_kernel kernel_weights = clCreateKernel(program, "kernel1", &status); 

    //create queue 
    cl_command_queue command_queue1 = clCreateCommandQueue(context, device_id, 0, &status); 

    /****************************** 
    * HARDWARE PARAMETERS 
    */ 
    cl_uint wavefronts_per_SIMD = 7; 
    size_t global_work_size; 
    size_t local_work_size = 64; 

    cl_uint max_compute_units; 

    clGetDeviceInfo(device_id, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &max_compute_units, NULL); 

    size_t wg_count = max_compute_units * wavefronts_per_SIMD; 
    global_work_size = wg_count * local_work_size; 

    /**************************** DATA PART *************************************/ 

    size_t w_portion_size = 768 * sizeof(cl_float); 
    size_t w_pos_portion_size = 768 * sizeof(cl_uint); 

    size_t counters_data_size = 6 * sizeof(cl_uint); 
    cl_uint counters_data[6]; 
    counters_data[0] = 1; 
    counters_data[1] = 2; // max number of the cells intersected by the ray 
    counters_data[2] = 3; 
    counters_data[3] = 4; 
    counters_data[4] = 5; // same to the number of rays 
    counters_data[5] = 0; // counter inside kernel 

    /***************** 
    * Main buffers 
    */ 
    cl_mem weights1_buffer = clCreateBuffer(context, 
              CL_MEM_READ_WRITE, 
              w_portion_size, 
              NULL, 
              NULL); 

    cl_mem weights_pos1_buffer = clCreateBuffer(context, 
               CL_MEM_READ_WRITE, 
               w_pos_portion_size, 
               NULL, 
               NULL); 
    /***************** 
    * Supplement buffers (constant) 
    */ 
    cl_mem counters_data_buffer = clCreateBuffer(context, 
               CL_MEM_READ_ONLY, 
               counters_data_size, 
               NULL, 
               &status); 


    cl_event supplement_buffer_ready[1]; 

    status = clEnqueueWriteBuffer(command_queue1, 
           counters_data_buffer, 
           CL_FALSE, 
           0, 
           counters_data_size, 
           counters_data, 
           0, 
           NULL, 
           &supplement_buffer_ready[0]); 

    status = clSetKernelArg(kernel_weights, 0, sizeof(void *), (void *)&counters_data_buffer); 
    status = clSetKernelArg(kernel_weights, 1, sizeof(void *), (void *)&weights1_buffer); 
    status = clSetKernelArg(kernel_weights, 2, sizeof(void *), (void *)&weights_pos1_buffer); 

    status = clEnqueueNDRangeKernel(command_queue1, 
            kernel_weights, 
            1, // work dimensional 1D, 2D, 3D 
            NULL, // offset 
            &global_work_size, // total number of WI 
            &local_work_size, // nomber of WI in WG 
            1, // num events in wait list 
            supplement_buffer_ready, // event wait list 
            NULL); // event 

    clFinish(command_queue1); 
    cl_float *output1 = (cl_float *) clEnqueueMapBuffer(command_queue1, 
                 weights1_buffer,//*pmain_weights_buffer, 
                 CL_TRUE, 
                 CL_MAP_READ, 
                 0, 
                 w_portion_size, 
                 0, NULL, NULL, NULL); 
    cl_uint *output2 = malloc(w_portion_size); 
    status = clEnqueueReadBuffer(command_queue1, weights_pos1_buffer, 
           CL_TRUE, 0, w_pos_portion_size, output2, 
           0, NULL, NULL); 

    clFinish(command_queue1); 
    for(int i = 0; i < 790; ++i) { 
    printf("output array index: %d: (output1 value:%.4f \t output2 value:%d) \n", i, output1[i], output2[i]); 
    } 
}

РЕШЕНИЕ:

Ядро должно быть выглядит как (необходимость проверки индекса):

__kernel void k_1(__global uint *counter, 
        __global uint *weights, 
        __global uint2 *weights_pos) 
{ 
    const uint global_size = get_global_size(0); 
    const uint global_id = get_global_id(0); 
    uint local_id = get_local_id(0); 

    uint insert_index = atom_inc(&counter[5]); 
    if(insert_index < 768) { 
     weights[insert_index]= insert_index; 
     weights_pos[insert_index].x = insert_index; 
     weights_pos[insert_index].y = insert_index; 
    } 
}

источник

2012-05-19 Roman Shkarin

сообщение какой-то реальный код, а не просто ссылку, адресат вы удалите все равно, как только вы получили ответ на свой вопрос. –

@Christian Rau, Да, вы правы –

Я проверил ваш код на реализацию intel, и я получаю каждое поврежденное значение. Твой код мне кажется прекрасным. – sbabbi

Ядро должно быть похоже (необходимость проверки индекса):

__kernel void k_1(__global uint *counter, 
       __global uint *weights, 
       __global uint2 *weights_pos) 
{ 
    const uint global_size = get_global_size(0); 
    const uint global_id = get_global_id(0); 
    uint local_id = get_local_id(0); 

    uint insert_index = atom_inc(&counter[5]); 

    if(insert_index < 768) { 
    weights[insert_index]= insert_index; 
    weights_pos[insert_index].x = insert_index; 
    weights_pos[insert_index].y = insert_index; 
    } 
}

источник

2012-05-20 03:52:20

Вы смешиваете размеры буфера.

1) Ваши буфера содержат 768 элементов, каждый (см инициализации w_portion_size и w_pos_portion_size)

2) размер Workgroup на моей машине 896 (см инициализации wg_count)

3) При печати из 790 значений ,

Помимо этого, одна концептуальная ошибка здесь:

if(global_id == 0) { 
    counter[5] = 0; // set index of pos in weights to zero 
} 
//atomic increments on counter[5]

Вы не можете предположить, что первый виртуальный процессор будет выполнять эту строку перед другими. Вы должны полностью удалить эту строку, поскольку вы инициализируете counter[5] со стороны хоста. (Я считаю, что это причина вашей проблемы, но я не могу воспроизвести это).

После устранения этих проблем ваш код, кажется, работает нормально (реализация intel).

источник

2012-05-19 16:23:21 sbabbi

Спасибо вам за усилия. Я согласен с вашими заявлениями, но в любом случае исправление не принесло никаких положительных результатов. –

Чтение неправильных данных в OpenCL

ответ

Смежные вопросы