错误处理
运行时 API 错误码
调用 CUDA 运行时 API 时,接口返回错误码。
1
| __host__ __device__ cudaError_t cudaGetDeviceCount ( int* count );
|
错误检查
| __host__ __device__ const char* cudaGetErrorName ( cudaError_t error );
| __host__ __device__ const char* cudaGetErrorString ( cudaError_t error );
定义错误检查函数
// Demo: query the device in use, then deliberately trigger a runtime error.
// cudaSetDevice(999) fails with an invalid-device-ordinal error, which
// error_check reports with file/line information.
__host__ void error_check_entry() {
    int device_id_in_use;
    error_check(cudaGetDevice(&device_id_in_use), __FILE__, __LINE__);
    error_check(cudaSetDevice(999), __FILE__, __LINE__);
    // Bug fix: the synchronize result was previously discarded; check it so
    // asynchronous (execution-time) errors are reported as well.
    error_check(cudaDeviceSynchronize(), __FILE__, __LINE__);
}
核函数中的异常
核函数的返回值必须是 void。
1
| __host__ __device__ cudaError_t cudaGetLastError ( void );
|
// Demo: catch a kernel-launch configuration error. 2048 threads per block
// exceeds the 1024-thread-per-block limit, so the launch fails and the
// error is surfaced by cudaGetLastError().
// NOTE(review): the original declared this __global__, but it launches a
// kernel and checks the launch error exactly like host code; __host__ is
// almost certainly what was intended (a __global__ caller would require
// dynamic parallelism) — confirm against the accompanying source.
__host__ void kernel_error_entry() {
    dim3 block(2048);  // intentionally invalid: > 1024 threads per block
    print_build_in_vars<<<2, block>>>();
    error_check(cudaGetLastError(), __FILE__, __LINE__);
}
性能评估
事件计时
| __host__ cudaError_t cudaEventCreate ( cudaEvent_t* event );
| __host__ __device__ cudaError_t cudaEventRecord ( cudaEvent_t event, cudaStream_t stream = 0 );
| __host__ cudaError_t cudaEventSynchronize ( cudaEvent_t event );
| __host__ cudaError_t cudaEventElapsedTime ( float* ms, cudaEvent_t start, cudaEvent_t end );
| __host__ __device__ cudaError_t cudaEventDestroy ( cudaEvent_t event );
示例。
// Time a region of GPU work with CUDA events.
// Consistency fix: the original mixed error_check(..., __FILE__, __LINE__)
// and ERROR_CHECK(...); every checked call now uses the ERROR_CHECK macro.
cudaEvent_t start, end;
ERROR_CHECK(cudaEventCreate(&start));
ERROR_CHECK(cudaEventCreate(&end));
ERROR_CHECK(cudaEventRecord(start));
// cudaEventQuery may legitimately return cudaErrorNotReady, so its result
// is intentionally not routed through ERROR_CHECK.
cudaEventQuery(start);
// ... the GPU work being timed goes here ...
ERROR_CHECK(cudaEventRecord(end));
ERROR_CHECK(cudaEventSynchronize(end));  // block until `end` has completed
float elapsed_time_ms;
ERROR_CHECK(cudaEventElapsedTime(&elapsed_time_ms, start, end));
printf("elapsed time: %f ms\n", elapsed_time_ms);
ERROR_CHECK(cudaEventDestroy(start));
ERROR_CHECK(cudaEventDestroy(end));
error_check。
// Report a CUDA runtime error without aborting.
// When err != cudaSuccess, prints the numeric code, its name, its
// human-readable description, and the call site (file/line); always
// returns err unchanged so the caller can still act on it.
// Usable from both host and device code (device printf is for debugging).
__host__ __device__ cudaError_t error_check(cudaError_t err, const char *fn, int line) {
    if (err != cudaSuccess) {
        printf("CUDA error:\n\tcode=%d, name=%s, description=%s, \n\tfile=%s, line=%d\n",
               err, cudaGetErrorName(err), cudaGetErrorString(err), fn, line);
    }
    return err;
}

// Convenience wrapper that fills in the call site automatically.
#define ERROR_CHECK(exp) error_check(exp, __FILE__, __LINE__)
nvprof
nvprof 是评估 CUDA 程序性能的工具。不过它目前已经过时,不适用于 compute capability >= 8.0 的设备;新设备应使用 nsys 替代。
nsys
| $ nsys profile {cuda-program}
| $ nsys analyze {nsys-rep}
获取 GPU 信息
运行时 API
1
| __host__ cudaError_t cudaGetDeviceProperties ( cudaDeviceProp* prop, int device )
|
// Print a summary of every CUDA device visible to the runtime: SM count,
// per-block shared memory, and thread/block occupancy limits.
__host__ void PrintDeviceInfo() {
    int deviceCount;
    cudaGetDeviceCount(&deviceCount);
    std::cout << "GPU device count: " << deviceCount << std::endl;
    for (int i = 0; i < deviceCount; ++i) {
        cudaDeviceProp dp{};
        cudaGetDeviceProperties(&dp, i);
        // Bug fix: the header previously printed the literal "device.0 "
        // for every device instead of the loop index.
        std::cout << "device." << i << std::endl;
        std::cout << " sm count: \t\t\t\t" << dp.multiProcessorCount << std::endl;
        std::cout << " shared memory per block: \t\t" << dp.sharedMemPerBlock / 1024 << "KB" << std::endl;
        std::cout << " max threads per block:\t\t" << dp.maxThreadsPerBlock << std::endl;
        std::cout << " max threads per multi processor:\t" << dp.maxThreadsPerMultiProcessor << std::endl;
        // Bug fix: maxThreadsPerMultiProcessor / 32 is the warp count per SM
        // (32 threads per warp), not a thread count; the label previously
        // said "max threads per sm", which was misleading.
        std::cout << " max warps per multi processor:\t" << dp.maxThreadsPerMultiProcessor / 32 << std::endl;
        std::cout << " max blocks per multi processor:\t" << dp.maxBlocksPerMultiProcessor << std::endl;
    }
}