我正在尝试在随附的代码中启动内核。我收到消息“内核启动失败:参数无效”。
// System includes #include#include // CUDA runtime #include // Helper functions and utilities to work with CUDA #include // This will output the proper CUDA error strings in the event that a CUDA host call returns an error #define checkCudaErrors(err) __checkCudaErrors (err, __FILE__, __LINE__) inline void __checkCudaErrors(cudaError err, const char *file, const int line ) { if(cudaSuccess != err) { fprintf(stderr, "%s(%i) : CUDA Runtime API error %d: %s.\n",file, line, (int)err, cudaGetErrorString( err ) ); exit(-1); } } static const int MAX_FILTER_WIDTH = 7; char *image_filename = "lena_bw_big.pgm"; char *out_filename = "lena_bw.out.pgm"; char *results_filename = "results.log"; // Loads filter configuration parameters from the command line void load_filter(int argc, char** argv, int* filt_width, float* factor, float* bias, float* coefs, bool* use_shared) { //forward declaration of a function that is being used here void parse_coefs(const char* coefs_txt, int radius, float* coefs); char* coefs_txt; if (argv==NULL || filt_| factor==NULL || bias==NULL || coefs==NULL) { printf("Error: Bad params to load_coefs\n"); exit(-1); } if (checkCmdLineFlag(argc, (const char **)argv, "filter_width")) { *filt_width = getCmdLineArgumentInt(argc, (const char **)argv, "filter_width"); if (*filt_width < 1 || *filt_width > MAX_FILTER_WIDTH || (*filt_width % 2) != 1) { printf("Error: Invalid filter width (%d)\n",*filt_width); exit(-1); } } else { printf("Error: Filter width is not specified\n"); exit(-1); } if (checkCmdLineFlag(argc, (const char **)argv, "bias")) *bias = getCmdLineArgumentFloat(argc, (const char **)argv, "bias"); else { printf("Error: Bias is not specified\n"); exit(-1); } if (checkCmdLineFlag(argc, (const char **)argv, "factor")) *factor = getCmdLineArgumentFloat(argc, (const char **)argv, "factor"); else { printf("Error: Factor is not specified\n"); exit(-1); } if (checkCmdLineFlag(argc, (const char **)argv, "coefs")) getCmdLineArgumentString(argc, (const char **)argv, "coefs",&coefs_txt); parse_coefs(coefs_txt,*filt_width,coefs); if (checkCmdLineFlag(argc, (const char **)argv, "shared")) *use_shared = true; else *use_shared = false; } // Parse filter coefficients from string. The number of coefficients should be radius*radius. void parse_coefs(const char* coefs_txt, int filt_width, float* coefs) { const char* ptxt = coefs_txt; int skip_chars; memset(coefs,0,MAX_FILTER_WIDTH*MAX_FILTER_WIDTH*sizeof(float)); for (int i = filt_width - 1; i >= 0; i--) { for (int j = filt_width - 1; j >= 0; j--) { if (sscanf(ptxt,"%f%n", &coefs[i*MAX_FILTER_WIDTH+j], &skip_chars) != 1) { printf("Error: Not enough coefficients. Read %d/%d coefficients.\n",i*filt_width+j,filt_width*filt_width); exit(-1); } ptxt += skip_chars+1; } } } __global__ void convolution2D_kernel( unsigned char* inputImage, unsigned char* outputImage, float* filter, int imageWidth, int imageHeight, int imagePitch, int filterWidth, float hfactor, float hbias ) {/* int idx=blockDim.x*blockIdx.x+threadIdx.x; int idy=blockDim.y*blockIdx.y+threadIdx.y; if(0 =0 && imageX < imageWidth && imageY >=0 && imageY < imageHeight) { sum += inputImage[imageX+imageWidth*imageY] * filter[filterX + filterY*filterWidth]; } //sum*=hfactor; //sum+=hbias; //sum= //truncate values smaller than zero and larger than 255 outputImage[idx+imageWidth*idy] = fminf(fmaxf(int(hfactor * sum + hbias), 0), 255); } }*/ } __global__ void convolution2DShared_kernel( unsigned char* inputImage, unsigned char* outputImage, int imageWidth, int imageHeight, int imagePitch, int filterWidth ) { } void convolution2D(unsigned char* input_img, unsigned char* output_img, float* hfilter, int width, int height, int hfilt_width, float hfactor, float hbias, float* hcoefs, bool use_shared) { // Allocate device memory unsigned char *d_in=NULL, *d_out=NULL; float *d_filter=NULL; int imgSize=sizeof(float)*width*height; int filterSize=sizeof(float)*hfilt_width*hfilt_width; int blockWidth=32; int gridx=width/blockWidth; if(width%blockWidth!=0) gridx++; printf("gridx size is %d\n",gridx); int gridy=height/blockWidth; if(height%blockWidth!=0) gridy++; printf("gridy size is %d\n",gridy); printf("blockWidth size is %d\n",blockWidth); // measure execution time cudaEvent_t start,stop; const int iters = 10; checkCudaErrors(cudaEventCreate(&start)); checkCudaErrors(cudaEventCreate(&stop)); cudaEventRecord(start, NULL); printf("allocating mem\n"); cudaMalloc((void **) d_in, imgSize); cudaMalloc((void **) d_out, imgSize); cudaMalloc((void **) &d_filter, filterSize); cudaMemcpy(d_in,input_img,imgSize,cudaMemcpyHostToDevice); cudaMemcpy(d_filter,hfilter,filterSize,cudaMemcpyHostToDevice); // Setup execution parameters dim3 threads(blockWidth, blockWidth); dim3 grid(gridx,gridy); printf("kernel starts\n"); // calculate execution time average over iters iterations for (int i=0; i >>(d_in, d_out, d_filter, width, height, width, hfilt_width, hfactor, hbias); else convolution2DShared_kernel<< >>(d_in, d_out, width, height, width, hfilt_width); } checkCudaErrors(cudaEventRecord(stop, NULL)); checkCudaErrors(cudaEventSynchronize(stop)); // check for errors during kernel launch cudaError_t err; if ((err = cudaGetLastError()) != cudaSuccess) { printf("Kernel launch failed: %s",cudaGetErrorString(err)); exit(1); } float msec = 0.0f; checkCudaErrors(cudaEventElapsedTime(&msec, start, stop)); printf("Applying %dx%d filter on image of size %dx%d %s using shared memory took %f ms\n", hfilt_width,hfilt_width,width,height,(use_shared?"with":"without"),msec/iters); // write results to results file unsigned long long result_values[] = {hfilt_width,hfilt_width,width,height,use_shared,msec/iters*1000}; if (true != sdkWriteFile(results_filename,result_values,6,0,false,true)) { printf("Error: Writing results file failed."); exit(1); } cudaFree(d_in); cudaFree(d_out); cudaEventDestroy(start); cudaEventDestroy(stop); } void convolution_cpu(unsigned char* input_img, unsigned char* output_img, int width, int height, int hfilt_width, float hfactor, float hbias, float* hcoefs) { for(int x = 0; x < width; x++) for(int y = 0; y < height; y++) { float sum = 0.f; //multiply every value of the filter with corresponding image pixel for(int filterX = 0; filterX < hfilt_width; filterX++) for(int filterY = 0; filterY < hfilt_width; filterY++) { int imageX = x - hfilt_width / 2 + filterX; int imageY = y - hfilt_width / 2 + filterY; if (imageX >=0 && imageX < width && imageY >=0 && imageY < height) { sum += input_img[imageX+width*imageY] * hcoefs[filterX + filterY*MAX_FILTER_WIDTH]; } } //truncate values smaller than zero and larger than 255 output_img[x+width*y] = std::min(std::max(int(hfactor * sum + hbias), 0), 255); } } /** * Program main */ int main(int argc, char **argv) { unsigned char* h_inimg = NULL; unsigned char* h_outimg = NULL; unsigned char* h_refimg = NULL; unsigned int width, height; int hfilt_width = -1; float hfactor = 1.f, hbias = 0.f; float hcoefs[MAX_FILTER_WIDTH * MAX_FILTER_WIDTH]; bool use_shared = false; // load parameters of filter if (argc > 1) load_filter(argc,argv,&hfilt_width,&hfactor,&hbias,hcoefs,&use_shared); else { hfilt_width = 5; hfactor = 1.0f / 13.0f; hbias = 0.0f; parse_coefs( "0,0,1,0,0," "0,1,1,1,0," "1,1,1,1,1," "0,1,1,1,0," "0,0,1,0,0,", hfilt_width,hcoefs); } char* image_path = sdkFindFilePath(image_filename, argv[0]); if (image_path == NULL) { printf("Unable to source image file: %s\n", image_filename); exit(-1); } // Load image from disk sdkLoadPGM(image_path, &h_inimg, &width, &height); h_outimg = (unsigned char*)malloc(width * height); printf("Starting convolution\n"); convolution2D(h_inimg,h_outimg,hcoefs,width,height,hfilt_width,hfactor,hbias,hcoefs,use_shared); printf("Validating...\n"); h_refimg = (unsigned char*)malloc(width * height); convolution_cpu(h_inimg,h_refimg,width,height,hfilt_width,hfactor,hbias,hcoefs); int err_cnt = 0; for (int r=0; r 4) { printf("Terminating...\n"); exit(1); } } if (0 == err_cnt) printf("OK\n"); // Save image sdkSavePGM(out_filename,h_outimg,width,height); free(h_inimg); free(h_outimg); }
如果我在第191行中添加注释,则一切运行正常且正常(内核中没有数据)。
谁能指出将数据传送到内核的正确方法?
首先,您没有完成正确的cuda错误检查的工作。您应该检查每个 CUDA API调用的返回值。
如果这样做,您将发现“无效参数”错误与内核启动无关,但是由于这是您检查错误的唯一位置,因此该错误在那里得到报告。
实际错误发生在这些行上:
cudaMalloc((void **) d_in, imgSize); cudaMalloc((void **) d_out, imgSize); cudaMalloc((void **) &d_filter, filterSize);
您可以通过添加必要的&来修复它:
cudaMalloc((void **) &d_in, imgSize); cudaMalloc((void **) &d_out, imgSize); cudaMalloc((void **) &d_filter, filterSize);
修复该错误后,您将发现下一个错误是某个cudaMemcpy
操作的段错误:
cudaMemcpy(d_in,input_img,imgSize,cudaMemcpyHostToDevice);
根本原因在这里:
int imgSize=sizeof(float)*width*height; ^^^^^^^^^^^^^
由于你的d_in
就是unsigned char
你的input_img
就是unsigned char
,我不知道为什么你认为你应该乘以图像尺寸sizeof(float)
。无论如何,将该行更改为此:
int imgSize=width*height;
将修复段错误。进行这些更改后,您的代码可以在没有任何CUDA错误的情况下运行。显然,结果是虚假的,因为您的内核什么都不做。