OpenCl: sample float4 program - Ошибка сегментации (core dumped)

Простая программа, которая считывает два вектора float4 из файлов, а затем вычисляет сумму противоположных чисел. я не мог найти проблему: MAIN файл:OpenCl: sample float4 program - Ошибка сегментации (core dumped)

#include <stdio.h> 
#include <stdlib.h> 
#include <iostream> 
#include <iomanip> 
#include <array> 
#include <fstream> 
#include <sstream> 
#include <string> 
#include <algorithm> 
#include <iterator> 

#ifdef __APPLE__ 
#include <OpenCL/opencl.h> 
#else 
#include <CL/cl.h> 
#include <time.h> 
#endif 



const int number_of_points = 16; // number of points in Both A and B files (number of rows) 
const int number_of_axis = 4;  // number of points axis in Both A and B files (number of Columns) 


using namespace std; 

void checkError(cl_int err, const char *operation) 
{ 
    if (err != CL_SUCCESS) 
    { 
    fprintf(stderr, "Error during operation '%s': %d\n", operation, err); 
    exit(1); 
    } 
} 

int main(int argc, char *argv[]) { 
    clock_t tStart = clock(); 
    // Create the two input vectors 
    // working variables 
    int i; 
    ifstream input_fileA, input_fileB; // input files 
    string line; // transfer row from file to array 
    float x;  // transfer word from file to array 
    int row = 0; // number of rows of file A,B (= array) 
    int col = 0; // number of rows of file A,B (= array) 

    // working arrays 

    // working arrays 
// int mem_size_TempA = number_of_points * number_of_axis * sizeof(cl_float); 
// int mem_size_TempB = number_of_points * number_of_axis * sizeof(cl_float); 

    float tempAArray[number_of_points][number_of_axis]={{0}}; // array contains file A data 
    float tempBArray[number_of_points][number_of_axis]={{0}}; // array contains file B data 



    int mem_size_InputA = number_of_points * number_of_axis ; 
    int mem_size_InputB = number_of_points * number_of_axis ; 
    int mem_size_Output = number_of_points * number_of_axis ; 

    float *inputAArray = (float*) malloc(number_of_points*sizeof(cl_float4)); // array contains file A data 
    float *inputBArray = (float*) malloc(number_of_points*sizeof(cl_float4)); // array contains file B data 
    float *outputArray = (float*) malloc(number_of_points*sizeof(cl_float4)); // array contains file B data 


    // import input files 
    input_fileA.open(argv[1]); 
    input_fileB.open(argv[2]); 


    // transfer input files data to array 
    // input file A to arrayA 
    row = 0; 
    while (getline(input_fileA, line)) 
    { 

     istringstream streamA(line); 
     col = 0; 
     while(streamA >> x){ 
      tempAArray[row][col] = x; 
      col++; 
     } 
     row++; 
    } 

    // input file B to arrayB 
    row = 0; 
    while (getline(input_fileB, line)) 
    { 

     istringstream streamB(line); 
     col = 0; 
     while(streamB >> x){ 
      tempBArray[row][col] = x; 
      col++; 
     } 
     row++; 
    } 

    // switch columns of B array 
    for(int row_of_arrayB = 0; row_of_arrayB < number_of_points; row_of_arrayB++) 
    { 
     float temporary = tempBArray[row_of_arrayB][2]; 
     tempBArray[row_of_arrayB][2] = tempBArray[row_of_arrayB][1]; 
     tempBArray[row_of_arrayB][1] = temporary; 
    } 

    // from Array to 3d vectors 
// for (int row_of_array = 0; row_of_array<number_of_points; row_of_array++) 
// { 
//  inputAArray[row_of_array] = (tempAArray[row_of_array][0], tempAArray[row_of_array][1], tempAArray[row_of_array][2],0); 
//  inputBArray[row_of_array] = (tempBArray[row_of_array][0], tempBArray[row_of_array][1], tempBArray[row_of_array][2],0); 
// } 

    for (int row_of_array=0; row_of_array < number_of_points; row_of_array++) 
    { 

     inputAArray[row_of_array*number_of_points+0] = tempAArray[row_of_array][0]; 
     inputAArray[row_of_array*number_of_points+1] = tempAArray[row_of_array][1]; 
     inputAArray[row_of_array*number_of_points+2] = tempAArray[row_of_array][2]; 
     inputAArray[row_of_array*number_of_points+3] = 0.0f; 

     inputBArray[row_of_array*number_of_points+0] = tempBArray[row_of_array][0]; 
     inputBArray[row_of_array*number_of_points+1] = tempBArray[row_of_array][1]; 
     inputBArray[row_of_array*number_of_points+2] = tempBArray[row_of_array][2]; 
     inputBArray[row_of_array*number_of_points+3] = tempBArray[row_of_array][3]; 

     outputArray[row_of_array*number_of_points+0] = 0.0f; 
     outputArray[row_of_array*number_of_points+1] = 0.0f; 
     outputArray[row_of_array*number_of_points+2] = 0.0f; 
     outputArray[row_of_array*number_of_points+3] = 0.0f; 
//  inputBArray[row_of_array] = (tempBArray[row_of_array][0], tempBArray[row_of_array][1], tempBArray[row_of_array][2],0); 

    } 
// for (int row_of_array=0; row_of_array < number_of_points; row_of_array++) 
// { 
//  printf("0: %f, 1: %f, 2: %f, 3: %f \n", inputAArray[row_of_array*number_of_points+0], inputAArray[row_of_array*number_of_points+1], 
//    inputAArray[row_of_array*number_of_points+2], inputAArray[row_of_array*number_of_points+3]); 
// } 
    // close input files 
    input_fileA.close(); 
    input_fileB.close(); 




    // Load the kernel source code into the array source_str 
    FILE *fp; 
    char *source_str; 
    size_t source_size; 

    fp = fopen("calculate_bottom_SNM_kernel.cl", "r"); 
    if (!fp) { 
     fprintf(stderr, "Failed to load kernel.\n"); 
     exit(1); 
    } 

    fseek(fp, 0, SEEK_END); 
    size_t programLength = ftell(fp); 
    rewind(fp); 

    source_str = (char*)malloc(programLength+1); 
    source_size = fread(source_str, 1, programLength, fp); 
    source_str[programLength] = '\0'; 
    fclose(fp); 

    // Get platform and device information 
    cl_platform_id platform_id = NULL; 
    cl_device_id device_id = NULL; 
    cl_uint ret_num_devices; 
    cl_uint ret_num_platforms; 
    cl_int ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms); 
    ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_ALL, 1, 
      &device_id, &ret_num_devices); 

    // Create an OpenCL context 
    cl_context context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret); 

    // Create a command queue 
    cl_command_queue command_queue = clCreateCommandQueue(context, device_id, 0, &ret); 

    // Create memory buffers on the device for each vector 
    cl_mem inputa_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, 
      mem_size_InputA*sizeof(cl_float4) , NULL, &ret); 
    cl_mem inputb_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, 
      mem_size_InputB*sizeof(cl_float4), NULL, &ret); 

    cl_mem output_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY, 
      mem_size_Output*sizeof(cl_float4), NULL, &ret); 


    // Copy the lists A and B to their respective memory buffers 
    ret = clEnqueueWriteBuffer(command_queue, inputa_mem_obj, CL_TRUE, 0, 
      mem_size_InputA*sizeof(cl_float4), inputAArray, 0, NULL, NULL); 
    ret = clEnqueueWriteBuffer(command_queue, inputb_mem_obj, CL_TRUE, 0, 
      mem_size_InputB*sizeof(cl_float4), inputBArray, 0, NULL, NULL); 


    // Create a program from the kernel source 
    cl_program program = clCreateProgramWithSource(context, 1, 
      (const char **)&source_str, (const size_t *)&source_size, &ret); 

    // Build the program 

    ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL); 
    if (ret == CL_BUILD_PROGRAM_FAILURE) 
     { 
     // Get size of build log 
     size_t logSize; 
     ret = clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 
            0, NULL, &logSize); 
     checkError(ret, "getting build log size"); 

     // Get build log 
     char log[logSize]; 
     ret = clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 
            logSize, log, NULL); 
     checkError(ret, "getting build log"); 

     printf("OpenCL program build log:\n%s\n", log); 
     exit(1); 
     } 


    // Create the OpenCL kernel 
    cl_kernel kernel = clCreateKernel(program, "calculate_bottom_SNM", &ret); 

    // Set the arguments of the kernel 
    ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&inputa_mem_obj); 
    ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&inputb_mem_obj); 
    ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&output_mem_obj); 

    // Execute the OpenCL kernel on the list 
    size_t global_item_size = number_of_points; // Process the entire lists 
    size_t local_item_size = 4; // Process in groups of 64 

    ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, 
      &global_item_size, &local_item_size, 0, NULL, NULL); 

    // Read the memory buffer C on the device to the local variable C 
// int *C = (int*)malloc(sizeof(int)*number_of_points); 


// float *C = (float*)malloc(sizeof(float)*number_of_points); 
    clEnqueueReadBuffer(command_queue, output_mem_obj, CL_TRUE, 0, 
      mem_size_Output, outputArray, 0, NULL, NULL); 


    // Display the result to the screen 
// float buttomSNM = 0; 
// for(i = 0; i < number_of_points; i++) 
// { 
//  for (int t=0; t<4; t++) 
//  { 
//   cout << "h" ; 
////   printf("%f, \n", outputArray[i*number_of_points+t]); 
//  } 
// } 

    // Clean up 
    ret = clFlush(command_queue); 
    ret = clFinish(command_queue); 
    ret = clReleaseKernel(kernel); 
    ret = clReleaseProgram(program); 
    ret = clReleaseMemObject(inputa_mem_obj); 
    ret = clReleaseMemObject(inputb_mem_obj); 
    ret = clReleaseMemObject(output_mem_obj); 
    ret = clReleaseCommandQueue(command_queue); 
    ret = clReleaseContext(context); 
    free (inputAArray); 
    free (inputBArray); 
    free (outputArray); 
printf("ALL Time taken: %.2fs\n", (double)(clock() - tStart)/CLOCKS_PER_SEC); 
    return 0; 
}

Kernel:

__kernel void calculate_bottom_SNM(__global float4 *inputAArray, __global float4 *inputBArray, 
         __global float4 *outputArray) { 

    // Get the index of the current element 
    int i = get_global_id(0); 
    int number_of_points = 16; 


    outputArray[i*number_of_points+0] = inputAArray[i*number_of_points+0] + inputBArray[i*number_of_points+0]; 
    outputArray[i*number_of_points+1] = inputAArray[i*number_of_points+1] + inputBArray[i*number_of_points+1]; 
    outputArray[i*number_of_points+2] = inputAArray[i*number_of_points+2] + inputBArray[i*number_of_points+2]; 
    outputArray[i*number_of_points+3] = inputAArray[i*number_of_points+3] + inputBArray[i*number_of_points+3]; 


}

Первые входные файлы: a.txt

0 0.000000e+00 9.998994e-01  
1 1.000000e-03 9.998981e-01  
2 2.000000e-03 9.998967e-01  
3 3.000000e-03 9.998953e-01  
4 4.000000e-03 9.998939e-01  
5 5.000000e-03 9.998925e-01  
6 6.000000e-03 9.998911e-01  
7 7.000000e-03 9.998896e-01  
8 8.000000e-03 9.998881e-01  
9 9.000000e-03 9.998865e-01  
10 1.000000e-02 9.998850e-01  
11 1.100000e-02 9.998834e-01  
12 1.200000e-02 9.998817e-01  
13 1.300000e-02 9.998800e-01  
14 1.400000e-02 9.998783e-01  
15 1.500000e-02 9.998766e-01

Второй входной файл B:

0 0.000000e+00 9.998966e-01  
1 1.000000e-03 9.998953e-01  
2 2.000000e-03 9.998939e-01  
3 3.000000e-03 9.998925e-01  
4 4.000000e-03 9.998911e-01  
5 5.000000e-03 9.998896e-01  
6 6.000000e-03 9.998881e-01  
7 7.000000e-03 9.998866e-01  
8 8.000000e-03 9.998850e-01  
9 9.000000e-03 9.998834e-01  
10 1.000000e-02 9.998818e-01  
11 1.100000e-02 9.998801e-01  
12 1.200000e-02 9.998785e-01  
13 1.300000e-02 9.998767e-01  
14 1.400000e-02 9.998750e-01  
15 1.500000e-02 9.998732e-01

Заранее спасибо

источник

2015-03-31 Rami Aqqad

Вы вычисляя свои индексы массива в ядре в довольно странным образом:

i*number_of_points+0 
i*number_of_points+1 
i*number_of_points+2 
i*number_of_points+3

Подумайте о том, что это на самом деле переводится для различных значений i (при условии, number_of_points=16):

i  array indices (i*16 + (0,1,2,3)) 
-------------------------------------- 
0  0, 1, 2, 3 
1  16, 17, 18, 19 
2  32, 33, 34, 35 
... 
etc

Это, безусловно, не то, что вы хотели! Кажется, что ваш пример кода просто пытается выполнить векторное добавление вектора. Если это так, то ваш код ядра просто должен выглядеть примерно так:

__kernel void vecadd(__global float4 *inputA, 
        __global float4 *inputB, 
        __global float4 *output) 
{ 
    int i = get_global_id(0); 
    output[i] = inputA[i] + inputB[i]; 
}

Это работает, потому что, были, выполняют ту же операцию для каждого элемента вектора. Если у вас есть ядро, которое должно использовать эти элементы отдельно, вы должны написать код следующим образом:

float4 valueA = inputA[i]; 
float4 valueB = inputB[i]; 

float4 result; 
result.x = valueA.x + valueB.x; // Do something with first component 
result.y = valueA.y * valueB.y; // Do something with second component 
result.z = valueA.z/valueB.z; // Do something with third component 
result.w = valueA.w - valueB.w; // Do something with fourth component

источник

2015-03-31 15:55:51 jprice

Спасибо @jprice, я сделал это, но когда я запустил его, было найдено 14 ошибок. Следующее последнее: «/tmp/OCL3559T1.cl», строка 45: ошибка: выражение должно иметь структуру или тип объединения \t outputArray.z = inputAArray.z/inputBArray.z; // Делаем что-то с третьим компонентом \t^ 14 ошибок, обнаруженных в компиляции «/tmp/OCL3559T1.cl». Фронтальная фаза не была скомпилирована. –

вы не должны компилировать файл CL во время компиляции своего приложения, это задача компилятора времени выполнения. – DarkZeros

@RamiAqqad Похоже, вы пытаетесь использовать '.z' в массиве векторов, а не в одном векторном типе. Обратите внимание, что в моем коде я загружаю значение 'float4' из массива во временное, а затем используя' .x', '.y' и т. Д. – jprice

OpenCl: sample float4 program - Ошибка сегментации (core dumped)

ответ

Смежные вопросы