18赞

内核启动失败：参数无效，CUDA运行时错误

作者：周扒pi | 2023-09-10 20:03

如何解决《内核启动失败：参数无效，CUDA运行时错误》经验，为你挑选了1个好方法。

我正在尝试在随附的代码中启动内核。我收到消息“内核启动失败：参数无效”。

// System includes
#include 
#include 

// CUDA runtime
#include 

// Helper functions and utilities to work with CUDA
#include 

// This will output the proper CUDA error strings in the event that a CUDA host call returns an error
#define checkCudaErrors(err)  __checkCudaErrors (err, __FILE__, __LINE__)

inline void __checkCudaErrors(cudaError err, const char *file, const int line )
{
    if(cudaSuccess != err)
    {
        fprintf(stderr, "%s(%i) : CUDA Runtime API error %d: %s.\n",file, line, (int)err, cudaGetErrorString( err ) );
        exit(-1);
    }
}

static const int MAX_FILTER_WIDTH = 7;

char *image_filename = "lena_bw_big.pgm";
char *out_filename = "lena_bw.out.pgm";
char *results_filename = "results.log";

// Loads filter configuration parameters from the command line
void load_filter(int argc, char** argv, int* filt_width, float* factor, float* bias, float* coefs, bool* use_shared)
{
    //forward declaration of a function that is being used here
    void parse_coefs(const char* coefs_txt, int radius, float* coefs);

    char* coefs_txt;

    if (argv==NULL || filt_| factor==NULL || bias==NULL || coefs==NULL)
    {
        printf("Error: Bad params to load_coefs\n");
        exit(-1);
    }

    if (checkCmdLineFlag(argc, (const char **)argv, "filter_width"))
    {
        *filt_width = getCmdLineArgumentInt(argc, (const char **)argv, "filter_width");
        if (*filt_width < 1 || *filt_width >  MAX_FILTER_WIDTH || (*filt_width % 2) != 1)
        {
            printf("Error: Invalid filter width (%d)\n",*filt_width);
            exit(-1);
        }
    }
    else
    {
        printf("Error: Filter width is not specified\n");
        exit(-1);
    }

    if (checkCmdLineFlag(argc, (const char **)argv, "bias"))
        *bias = getCmdLineArgumentFloat(argc, (const char **)argv, "bias");
    else
    {
        printf("Error: Bias is not specified\n");
        exit(-1);
    }

    if (checkCmdLineFlag(argc, (const char **)argv, "factor"))
        *factor = getCmdLineArgumentFloat(argc, (const char **)argv, "factor");
    else
    {
        printf("Error: Factor is not specified\n");
        exit(-1);
    }

    if (checkCmdLineFlag(argc, (const char **)argv, "coefs"))
        getCmdLineArgumentString(argc, (const char **)argv, "coefs",&coefs_txt);

    parse_coefs(coefs_txt,*filt_width,coefs);

    if (checkCmdLineFlag(argc, (const char **)argv, "shared"))
        *use_shared = true;
    else
        *use_shared = false;
}

// Parse filter coefficients from string. The number of coefficients should be radius*radius.
void parse_coefs(const char* coefs_txt, int filt_width, float* coefs)
{
    const char* ptxt = coefs_txt;
    int skip_chars;

    memset(coefs,0,MAX_FILTER_WIDTH*MAX_FILTER_WIDTH*sizeof(float));

    for (int i = filt_width - 1; i >= 0; i--)
    {
        for (int j = filt_width - 1; j >= 0; j--)
        {
            if (sscanf(ptxt,"%f%n", &coefs[i*MAX_FILTER_WIDTH+j], &skip_chars) != 1)
            {
                printf("Error: Not enough coefficients. Read %d/%d coefficients.\n",i*filt_width+j,filt_width*filt_width);
                exit(-1);
            }
            ptxt += skip_chars+1;
        }
    }
}

__global__ void convolution2D_kernel(
        unsigned char* inputImage,
        unsigned char* outputImage,
        float* filter,
        int imageWidth,
        int imageHeight,
        int imagePitch,
        int filterWidth,
        float hfactor,
        float hbias
        )
{/*
    int idx=blockDim.x*blockIdx.x+threadIdx.x;
    int idy=blockDim.y*blockIdx.y+threadIdx.y;
    if(0=0 && imageX < imageWidth && imageY >=0 && imageY < imageHeight) {
                sum += inputImage[imageX+imageWidth*imageY] * filter[filterX + filterY*filterWidth];
            }
            //sum*=hfactor;
            //sum+=hbias;
            //sum=
                                                                                                                                                                                                                                                                                                                                                                                                                                                        //truncate values smaller than zero and larger than 255
            outputImage[idx+imageWidth*idy] = fminf(fmaxf(int(hfactor * sum + hbias), 0), 255);

        }
    }*/
}

__global__ void convolution2DShared_kernel(
        unsigned char* inputImage,
        unsigned char* outputImage,
        int imageWidth,
        int imageHeight,
        int imagePitch,
        int filterWidth
        )
{

}

void convolution2D(unsigned char* input_img, unsigned char* output_img, float* hfilter, int width, int height,
                int hfilt_width, float hfactor, float hbias, float* hcoefs, bool use_shared)
{
    // Allocate device memory
    unsigned char *d_in=NULL, *d_out=NULL;
    float *d_filter=NULL;

    int imgSize=sizeof(float)*width*height;
    int filterSize=sizeof(float)*hfilt_width*hfilt_width;

    int blockWidth=32;
    int gridx=width/blockWidth;
    if(width%blockWidth!=0)
        gridx++;
    printf("gridx size is %d\n",gridx);
    int gridy=height/blockWidth;
    if(height%blockWidth!=0)
        gridy++;
    printf("gridy size is %d\n",gridy);
    printf("blockWidth size is %d\n",blockWidth);

    // measure execution time
    cudaEvent_t start,stop;
    const int iters = 10;

    checkCudaErrors(cudaEventCreate(&start));
    checkCudaErrors(cudaEventCreate(&stop));

    cudaEventRecord(start, NULL);

    printf("allocating mem\n");
    cudaMalloc((void **) d_in, imgSize);
    cudaMalloc((void **) d_out, imgSize);
    cudaMalloc((void **) &d_filter, filterSize);


    cudaMemcpy(d_in,input_img,imgSize,cudaMemcpyHostToDevice);
    cudaMemcpy(d_filter,hfilter,filterSize,cudaMemcpyHostToDevice);

    // Setup execution parameters
    dim3 threads(blockWidth, blockWidth);       
    dim3 grid(gridx,gridy);
    printf("kernel starts\n");
    // calculate execution time average over iters iterations
    for (int i=0; i>>(d_in, d_out, d_filter, width, height, width, hfilt_width, hfactor, hbias);
        else
            convolution2DShared_kernel<<>>(d_in, d_out, width, height, width, hfilt_width);
    }

    checkCudaErrors(cudaEventRecord(stop, NULL));
    checkCudaErrors(cudaEventSynchronize(stop));

    // check for errors during kernel launch
    cudaError_t err;
    if ((err = cudaGetLastError()) != cudaSuccess)
    {
        printf("Kernel launch failed: %s",cudaGetErrorString(err));
        exit(1);
    }

    float msec = 0.0f;
    checkCudaErrors(cudaEventElapsedTime(&msec, start, stop));

    printf("Applying %dx%d filter on image of size %dx%d %s using shared memory took %f ms\n",
            hfilt_width,hfilt_width,width,height,(use_shared?"with":"without"),msec/iters);

    // write results to results file
    unsigned long long result_values[] = {hfilt_width,hfilt_width,width,height,use_shared,msec/iters*1000};
    if (true != sdkWriteFile(results_filename,result_values,6,0,false,true))
    {
        printf("Error: Writing results file failed.");
        exit(1);
    }

    cudaFree(d_in);
    cudaFree(d_out);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
}

void convolution_cpu(unsigned char* input_img, unsigned char* output_img, int width, int height,
        int hfilt_width, float hfactor, float hbias, float* hcoefs)
{
    for(int x = 0; x < width; x++)
        for(int y = 0; y < height; y++)
        {
            float sum = 0.f;

            //multiply every value of the filter with corresponding image pixel
            for(int filterX = 0; filterX < hfilt_width; filterX++)
            for(int filterY = 0; filterY < hfilt_width; filterY++)
            {
                int imageX = x - hfilt_width / 2 + filterX;
                int imageY = y - hfilt_width / 2 + filterY;
                if (imageX >=0 && imageX < width && imageY >=0 && imageY < height) {
                    sum += input_img[imageX+width*imageY] * hcoefs[filterX + filterY*MAX_FILTER_WIDTH];
                }
            }

            //truncate values smaller than zero and larger than 255
            output_img[x+width*y] = std::min(std::max(int(hfactor * sum + hbias), 0), 255);
        }
}


/**
* Program main
*/
int main(int argc, char **argv)
{
    unsigned char* h_inimg = NULL;
    unsigned char* h_outimg = NULL;
    unsigned char* h_refimg = NULL;
    unsigned int width, height;
    int hfilt_width = -1;
    float hfactor = 1.f, hbias = 0.f;
    float hcoefs[MAX_FILTER_WIDTH * MAX_FILTER_WIDTH];
    bool use_shared = false;

    // load parameters of filter
    if (argc > 1)
        load_filter(argc,argv,&hfilt_width,&hfactor,&hbias,hcoefs,&use_shared);
    else {
        hfilt_width = 5;
        hfactor = 1.0f / 13.0f;
        hbias = 0.0f;
        parse_coefs(
            "0,0,1,0,0,"
            "0,1,1,1,0,"
            "1,1,1,1,1,"
            "0,1,1,1,0,"
            "0,0,1,0,0,",
            hfilt_width,hcoefs);
    }


    char* image_path = sdkFindFilePath(image_filename, argv[0]);
    if (image_path == NULL) {
        printf("Unable to source image file: %s\n", image_filename);
        exit(-1);
    }
    // Load image from disk
    sdkLoadPGM(image_path, &h_inimg, &width, &height);
    h_outimg = (unsigned char*)malloc(width * height);

    printf("Starting convolution\n");
    convolution2D(h_inimg,h_outimg,hcoefs,width,height,hfilt_width,hfactor,hbias,hcoefs,use_shared);

    printf("Validating...\n");
    h_refimg = (unsigned char*)malloc(width * height);
    convolution_cpu(h_inimg,h_refimg,width,height,hfilt_width,hfactor,hbias,hcoefs);
    int err_cnt = 0;
    for (int r=0; r 4)
                {
                    printf("Terminating...\n");
                    exit(1);
                }
            }
    if (0 == err_cnt)
        printf("OK\n");

    // Save image
    sdkSavePGM(out_filename,h_outimg,width,height);

    free(h_inimg);
    free(h_outimg);
}

如果我在第191行中添加注释，则一切运行正常且正常（内核中没有数据）。

谁能指出将数据传送到内核的正确方法？

1> Robert Crove..：

首先，您没有完成正确的cuda错误检查的工作。您应该检查每个 CUDA API调用的返回值。

如果这样做，您将发现“无效参数”错误与内核启动无关，但是由于这是您检查错误的唯一位置，因此该错误在那里得到报告。

实际错误发生在这些行上：

cudaMalloc((void **) d_in, imgSize);
cudaMalloc((void **) d_out, imgSize);
cudaMalloc((void **) &d_filter, filterSize);

您可以通过添加必要的＆来修复它：

cudaMalloc((void **) &d_in, imgSize);
cudaMalloc((void **) &d_out, imgSize);
cudaMalloc((void **) &d_filter, filterSize);

修复该错误后，您将发现下一个错误是某个cudaMemcpy操作的段错误：

cudaMemcpy(d_in,input_img,imgSize,cudaMemcpyHostToDevice);

根本原因在这里：

int imgSize=sizeof(float)*width*height;
            ^^^^^^^^^^^^^

由于你的d_in就是unsigned char你的input_img就是unsigned char，我不知道为什么你认为你应该乘以图像尺寸sizeof(float)。无论如何，将该行更改为此：

int imgSize=width*height;

将修复段错误。进行这些更改后，您的代码可以在没有任何CUDA错误的情况下运行。显然，结果是虚假的，因为您的内核什么都不做。

推荐阅读

程序员
当我尝试对列表进行排序时,我得到一个错误'dict'对象没有属性

如何解决《当我尝试对列表进行排序时,我得到一个错误'dict'对象没有属性》经验，为你挑选了1个好方法。 ... [详细]
程序员
如何在Windows 10 UWP应用程序中播放(MIDI)声音？

如何解决《如何在Windows10UWP应用程序中播放(MIDI)声音？》经验，为你挑选了0个好方法。 ... [详细]
程序员
要字符串的字符串列表

如何解决《要字符串的字符串列表》经验，为你挑选了1个好方法。 ... [详细]
程序员
JUnit assertEquals失败

如何解决《JUnitassertEquals失败》经验，为你挑选了1个好方法。 ... [详细]
程序员
在spark中访问嵌套数据

如何解决《在spark中访问嵌套数据》经验，为你挑选了1个好方法。 ... [详细]
程序员
删除&害羞; 元素中的(软连字符)实体

如何解决《删除&害羞;元素中的(软连字符)实体》经验，为你挑选了1个好方法。 ... [详细]
程序员
如何多次选择相同的输入[selectize.js]？

如何解决《如何多次选择相同的输入[selectize.js]？》经验，为你挑选了0个好方法。 ... [详细]
程序员
RecyclerView滚动到smoothScrollToPosition(0)后结束

如何解决《RecyclerView滚动到smoothScrollToPosition(0)后结束》经验，为你挑选了0个好方法。 ... [详细]
程序员
父控制器上的调用方法？

如何解决《父控制器上的调用方法？》经验，为你挑选了1个好方法。 ... [详细]
程序员
如何从锚链接中删除click事件？

如何解决《如何从锚链接中删除click事件？》经验，为你挑选了1个好方法。 ... [详细]
程序员
C#等效的PHP http_build_query

如何解决《C#等效的PHPhttp_build_query》经验，为你挑选了1个好方法。 ... [详细]
程序员
MVC DropDownList OnChange更新其他表单字段

如何解决《MVCDropDownListOnChange更新其他表单字段》经验，为你挑选了1个好方法。 ... [详细]
程序员
将背景过滤器应用于svg path元素

如何解决《将背景过滤器应用于svgpath元素》经验，为你挑选了1个好方法。 ... [详细]
程序员
ggplot2:更改条形图中每个构面的颜色

如何解决《ggplot2:更改条形图中每个构面的颜色》经验，为你挑选了1个好方法。 ... [详细]
程序员
Golang:如何在Linux上使用syscall.Syscall？

如何解决《Golang:如何在Linux上使用syscall.Syscall？》经验，为你挑选了0个好方法。 ... [详细]
程序员
如何从django rest框架的基于函数的视图中分页响应？

如何解决《如何从djangorest框架的基于函数的视图中分页响应？》经验，为你挑选了1个好方法。 ... [详细]
程序员
解析JSON响应的最简单方法

如何解决《解析JSON响应的最简单方法》经验，为你挑选了3个好方法。 ... [详细]
程序员
Google Feeds API无法正常工作-是否已关闭？

如何解决《GoogleFeedsAPI无法正常工作-是否已关闭？》经验，为你挑选了1个好方法。 ... [详细]
程序员
MATLAB - 获取当前工作空间

如何解决《MATLAB-获取当前工作空间》经验，为你挑选了1个好方法。 ... [详细]
程序员
为什么减法超过30天导致错误答案

如何解决《为什么减法超过30天导致错误答案》经验，为你挑选了1个好方法。 ... [详细]

周扒pi

这个屌丝很懒，什么也没留下！

关注作者

Tags | 热门标签

RankList | 热门文章