当进行调试的时候可以把核函数设置成单线程:
kernelName<<<1,1>>>(argument list)
调整网格和线程块大小可以得到不同的性能。
建议在CUDA开发时加上错误信息提示处理,提高排错效率,在release版本可以去除这部分,例如添加如下宏:
// CHECK(call): evaluates a CUDA runtime call exactly once. If the returned
// cudaError_t is not cudaSuccess, it prints the source file and line, the
// numeric error code and the text from cudaGetErrorString(), then terminates
// the process with exit(1).
// Every line of the definition must end in a line-continuation backslash with
// nothing after it, otherwise the macro body is cut short.
// The do { ... } while (0) wrapper makes `CHECK(x);` a single statement, so it
// can be used safely inside an unbraced if/else.
#define CHECK(call)                                                          \
    do {                                                                     \
        const cudaError_t error = (call);                                    \
        if (error != cudaSuccess) {                                          \
            printf("ERROR: %s:%d,", __FILE__, __LINE__);                     \
            printf("code:%d,reason:%s\n", error, cudaGetErrorString(error)); \
            exit(1);                                                         \
        }                                                                    \
    } while (0)
// Usage example:
CHECK(cudaMalloc((float**)&a, size));
kernelName<<<grid, block>>>(argument list)
//一个Grid中有3x2x1=6个Block,在(x,y,z)三个方向上的排布方式分别是3、2、1
//一个Block中有4x3x1=12个Thread,在(x,y,z)三个方向上的排布方式分别是4、3、1
dim3 grid(3,2,1), block(4,3,1);
kernelName<<<grid, block>>>(...);
//一个Grid中有5个Block,在(x,y,z)三个方向上的排布方式分别是5、1、1
//一个Block中有8个Thread,在(x,y,z)三个方向上的排布方式分别是8、1、1
kernelName<<<5,8>>>(...);
注:grid(3,2)表示第一维度有3个索引值,第二维度有2个索引值,即2行3列
//各维度索引计算公式
//一维Grid 一维Block
int blockId = blockIdx.x;
int threadId = blockIdx.x *blockDim.x + threadIdx.x;
//一维Grid 二维Block
int blockId = blockIdx.x;
int threadId = blockIdx.x * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
//一维Grid 三维Block
int blockId = blockIdx.x;
int threadId = blockIdx.x * blockDim.x * blockDim.y * blockDim.z + threadIdx.z * blockDim.y * blockDim.x + threadIdx.y * blockDim.x + threadIdx.x;
//二维Grid 一维Block
int blockId = blockIdx.y * gridDim.x + blockIdx.x;
int threadId = blockId * blockDim.x + threadIdx.x;
//二维Grid 二维Block
int blockId = blockIdx.x + blockIdx.y * gridDim.x;
int threadId = blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x;
//二维Grid 三维Block
int blockId = blockIdx.x + blockIdx.y * gridDim.x;
int threadId = blockId * (blockDim.x * blockDim.y * blockDim.z) + (threadIdx.z * (blockDim.x * blockDim.y)) + (threadIdx.y * blockDim.x) + threadIdx.x;
//三维Grid 一维Block
int blockId = blockIdx.x + blockIdx.y * gridDim.x + gridDim.x * gridDim.y * blockIdx.z;
int threadId = blockId * blockDim.x + threadIdx.x;
//三维Grid 二维Block
int blockId = blockIdx.x + blockIdx.y * gridDim.x + gridDim.x * gridDim.y * blockIdx.z;
int threadId = blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x;
//三维Grid 三维Block
int blockId = blockIdx.x + blockIdx.y * gridDim.x + gridDim.x * gridDim.y * blockIdx.z;
int threadId = blockId * (blockDim.x * blockDim.y * blockDim.z) + (threadIdx.z * (blockDim.x * blockDim.y)) + (threadIdx.y * blockDim.x) + threadIdx.x;
//二维
int blockId = blockIdx.x + blockIdx.y * gridDim.x;
int threadId = blockId * (blockDim.x * blockDim.y) + (threadIdx.y *blockDim.x) + threadIdx.x;
//三维
int blockId = blockIdx.x + blockIdx.y * gridDim.x + gridDim.x * gridDim.y * blockIdx.z;
int threadId = blockId * (blockDim.x * blockDim.y * blockDim.z) + (threadIdx.z * (blockDim.x * blockDim.y)) + (threadIdx.y * blockDim.x) + threadIdx.x;
下一篇:五分钟搞懂 POM 设计模式