Cuda - значения устройства 0 после выполнения ядра

-3

По какой-то причине, когда я выполняю свою программу, переменные устройства имеют нулевые значения. Перед выполнением ядра cuda переменные устройства имеют правильные значения. Выходное изображение просто черное от исходного размера изображения. Все распределения памяти и копирование на хост и из него кажутся правильными.Cuda - значения устройства 0 после выполнения ядра

Спасибо за помощь!

// Includes, system 
#include <stdlib.h> 
#include <stdio.h> 
#include <string.h> 
#include <math.h> 

#ifdef _WIN32 
# define WINDOWS_LEAN_AND_MEAN 
# define NOMINMAX 
# include <windows.h> 
#endif 

#define Image_Size 512 
#define Kernel_Size 3 

// Includes CUDA 
#include <cuda_runtime.h> 

// Utilities and timing functions 
#include "./inc/helper_functions.h" // includes cuda.h and cuda_runtime_api.h 

// CUDA helper functions 
#include "./inc/helper_cuda.h"   // helper functions for CUDA error check 

const char *imageFilename = "lena_bw.pgm"; 

const char *sampleName = "simpleTexture"; 

#define C_PI 3.141592653589793238462643383279502884197169399375 

void __global__ SwirlCu(int width, int height, int stride, float *pRawBitmapOrig, float *pBitmapCopy, double factor) 
{ 
    // This function effectively swirls an image 
    // This CUDA kernel is basically the exact same code as the CPU-only, except it has a slightly different setup 
    // Each thread on the GPU will process exactly one pixel 
    // Before doing anything, we need to determine the current pixel we are calculating in this thread 
    // Original code used i as y, and j as x. We will do the same so we can just re-use CPU code in the CUDA kernel 

    int i = blockIdx.y * blockDim.y + threadIdx.y; 
    int j = blockIdx.x * blockDim.x + threadIdx.x; 
    // Test to see if we're testing a valid pixel 
    if (i >= height || j >= width) return; // Don't bother doing the calculation. We're not in a valid pixel location 

    double cX = (double)width/2.0f; 
    double cY = (double)height/2.0f; 
    double relY = cY-i; 
    double relX = j-cX; 
    // relX and relY are points in our UV space 
    // Calculate the angle our points are relative to UV origin. Everything is in radians. 
    double originalAngle; 
    if (relX != 0) 
    { 
     originalAngle = atan(abs(relY)/abs(relX)); 
     if (relX > 0 && relY < 0) originalAngle = 2.0f*C_PI - originalAngle; 
     else if (relX <= 0 && relY >=0) originalAngle = C_PI-originalAngle; 
     else if (relX <=0 && relY <0) originalAngle += C_PI; 
    } 
    else 
    { 
     // Take care of rare special case 
     if (relY >= 0) originalAngle = 0.5f * C_PI; 
     else originalAngle = 1.5f * C_PI; 
    } 
    // Calculate the distance from the center of the UV using pythagorean distance 
    double radius = sqrt(relX*relX + relY*relY); 
    // Use any equation we want to determine how much to rotate image by 
    //double newAngle = originalAngle + factor*radius; // a progressive twist 
    double newAngle = originalAngle + 1/(factor*radius+(4.0f/C_PI)); 
    // Transform source UV coordinates back into bitmap coordinates 
    int srcX = (int)(floor(radius * cos(newAngle)+0.5f)); 
    int srcY = (int)(floor(radius * sin(newAngle)+0.5f)); 
    srcX += cX; 
    srcY += cY; 
    srcY = height - srcY; 
    // Clamp the source to legal image pixel 
    if (srcX < 0) srcX = 0; 
    else if (srcX >= width) srcX = width-1; 
    if (srcY < 0) srcY = 0; 
    else if (srcY >= height) srcY = height-1; 
    // Set the pixel color 
    // Since each thread writes to exactly 1 unique pixel, we don't have to do anything special here 
    pRawBitmapOrig[i*stride/4 + j] = pBitmapCopy[srcY*stride/4 + srcX]; 
} 




//////////////////////////////////////////////////////////////////////////////// 
// Declaration, forward 
void runTest(int argc, char **argv); 

//////////////////////////////////////////////////////////////////////////////// 
// Program main 
//////////////////////////////////////////////////////////////////////////////// 
int main(int argc, char **argv) 
{ 
    printf("%s starting...\n", sampleName); 

    // Process command-line arguments 
    if (argc > 1) 
    { 
     if (checkCmdLineFlag(argc, (const char **) argv, "input")) 
     { 
      getCmdLineArgumentString(argc,(const char **) argv,"input",(char **) &imageFilename); 
     } 
     else if (checkCmdLineFlag(argc, (const char **) argv, "reference")) 
     { 
      printf("-reference flag should be used with -input flag"); 
      exit(EXIT_FAILURE); 
     } 
    } 

    runTest(argc, argv); 

    cudaDeviceReset(); 
    printf("%s completed", 
      sampleName); 
    //exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE); 
} 

//////////////////////////////////////////////////////////////////////////////// 
//! Run a simple test for CUDA 
//////////////////////////////////////////////////////////////////////////////// 
void runTest(int argc, char **argv) 
{ 
    int devID = findCudaDevice(argc, (const char **) argv); 
    unsigned int kernel_bytes = Kernel_Size * Kernel_Size * sizeof(float); 
    // load image from disk 
    float *hData = NULL; 
    float *host_array_kernel = 0; 

    float *device_array_Image = 0; 
    float *device_array_kernel = 0; 
    float *device_array_Result = 0; 


    unsigned int width, height; 
    char *imagePath = sdkFindFilePath(imageFilename, argv[0]); 

    if (imagePath == NULL) 
    { 
     printf("Unable to source image file: %s\n", imageFilename); 
     exit(EXIT_FAILURE); 
    } 

    sdkLoadPGM(imagePath, &hData, &width, &height); 

    unsigned int size = width * height * sizeof(float); 
    printf("Loaded '%s', %d x %d pixels\n", imageFilename, width, height); 

    // Allocation of device arrays using CudaMalloc 
    cudaMalloc((void**)&device_array_Image, size); 
    cudaMalloc((void**)&device_array_kernel, kernel_bytes); 
    cudaMalloc((void**)&device_array_Result, size); 


    host_array_kernel = (float*)malloc(kernel_bytes); // kernel 


    // Allocate mem for the result on host side 
    float *hOutputDataSharp = (float *) malloc(size); 

    GenerateKernel (host_array_kernel); 


// copy arrays and kernel from host to device 
    checkCudaErrors(cudaMemcpy(device_array_Image, hData, size, cudaMemcpyHostToDevice)); 
    checkCudaErrors(cudaMemcpy(device_array_kernel, host_array_kernel, kernel_bytes, cudaMemcpyHostToDevice)); 



    dim3 dimBlock(16, 16, 1); 
    dim3 dimGrid(width/dimBlock.x, height/dimBlock.y, 1); 

    //Do the Convolution 
    printf("DImage : '%.8f'\n",device_array_Image); 
    printf("DKernel : '%.8f'\n",device_array_kernel); 
    //serialConvolution(hData, host_array_kernel ,hOutputDataSharp); 


    SwirlCu<<<512, 512>>>(width, height, width*4, device_array_Image,device_array_Result, 0.005f); 
    printf("DResult : '%.8f'\n",device_array_Result); 
    checkCudaErrors(cudaDeviceSynchronize()); 
    cudaMemcpy(hOutputDataSharp,device_array_Result, size, cudaMemcpyDeviceToHost); 
    printf("HResult : '%.8f'\n",hOutputDataSharp); 
    // Write result to file 
    char outputSharp[1024]; 

    strcpy(outputSharp, imagePath); 
    strcpy(outputSharp, "data/serial_sharptest.pgm"); 
    sdkSavePGM(outputSharp, hOutputDataSharp, width, height); 

    cudaFree(device_array_Result); 
    cudaFree(device_array_Image); 
    cudaFree(device_array_kernel); 
    free(hData); 
    free(imagePath); 
    //free(host_array_Image); 
    free(host_array_kernel); 
    free(hOutputDataSharp); 
    //free(hOutputImage); 
    //free(hOutputKernel); 
}

источник

2016-04-28 Alpha

проверка ошибок отсутствует ?? –

Не вызывает ошибок. Созданное изображение просто черное, а не обработанное изображение, которое оно должно быть. Я действительно новичок в cuda, по сути, начал несколько дней назад, поэтому я все еще собираюсь понять это. – Alpha

@Alpha: Откуда вы знаете, что он не создает ошибок, если в коде нет ошибок? – talonmies

Ваш код пишет в исходном изображении:.

pRawBitmapOrig[i*stride/4 + j] = pBitmapCopy[srcY*stride/4 + srcX];

, который пишет в device_array_Image, который является источником, а не цель, которую вы ожидаете результатов в

Кроме того, я очень любопытный на выходе printf("DResult : '%.8f'\n",device_array_Result); as device_array_Result в адресном пространстве и адрес, адрес cudaMalloc. На каком устройстве вы работаете?

источник

2016-04-28 20:48:12

Cuda - значения устройства 0 после выполнения ядра

ответ

Смежные вопросы