Let's implement matrix multiplication C = A x B in CUDA (matrix dimensions: A: M x K, B: K x N, C: M x N). You can of course write your own kernel, but it is hard to match the efficiency of cuBLAS, the BLAS library that ships with CUDA.
The only real pitfall when using cuBLAS is storage order: on the CPU, matrix data is stored row-major, while cuBLAS assumes column-major storage. Handing a row-major buffer to cuBLAS is therefore equivalent to transposing the matrix. Since transposing a matrix twice gives back the original, the correct computation on the GPU is T(C) = T(A x B) = T(B) x T(A), where T
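To spell out why this works (a short derivation of the identity above):

\[
C = AB \iff C^{T} = (AB)^{T} = B^{T} A^{T}
\]

cuBLAS reads our row-major buffer for A as the column-major matrix A^T (likewise for B). Asking it to multiply the B buffer by the A buffer therefore yields B^T A^T = C^T in column-major order, and that result buffer, read back row-major, is exactly C. No data ever needs to be physically transposed or copied.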
denotes the matrix transpose. The underlying principle is explained very clearly and intuitively in this blog post: https://blog.csdn.net/HaoBBNuanMM/article/details/103054357. I did not understand this difference at first, kept getting wrong results, and assumed my data copies were broken, until I found that post and everything clicked. Here is the implementation (matrix_multiply_cublas.cu):
#include <cublas_v2.h>
#include <cuda_runtime.h>

#include <algorithm>
#include <cstdlib>
#include <iostream>
#include <numeric>
#include <random>
#include <vector>
#include <chrono> // For units of time such as h, min, s, ms, us, ns
using namespace std::literals::chrono_literals;

namespace {
constexpr size_t M = 400;
constexpr size_t N = 500;
constexpr size_t K = 600;
} // namespace

int main() {
  // Initialize cuBLAS.
  cublasHandle_t handle;
  cublasStatus_t status = cublasCreate(&handle);
  if (status != CUBLAS_STATUS_SUCCESS) {
    std::cerr << "!!!! CUBLAS initialization error\n";
    return EXIT_FAILURE;
  }

  // Initialize the host input matrices.
  std::vector<float> mat_a(M * K, 0.0f);
  std::vector<float> mat_b(K * N, 0.0f);

  // Fill the matrices with random numbers.
  std::random_device rd;
  std::mt19937 gen(rd());
  std::uniform_int_distribution<int> dis(0, 100000);
  auto rand_num = [&dis, &gen]() { return dis(gen); };
  std::generate(mat_a.begin(), mat_a.end(), rand_num);
  std::generate(mat_b.begin(), mat_b.end(), rand_num);

  // Show the test matrix data.
  if (M == 2 && N == 4 && K == 3) {
    //     1 2 3
    // A =
    //     4 5 6
    std::iota(mat_a.begin(), mat_a.end(), 1.0f);
    //     1  2  3  4
    // B = 5  6  7  8
    //     9 10 11 12
    std::iota(mat_b.begin(), mat_b.end(), 1.0f);
    //          38 44  50  56
    // C = AB =
    //          83 98 113 128
    std::cout << "A = \n";
    for (size_t i = 0; i < M * K; ++i) {
      std::cout << mat_a[i] << '\t';
      if ((i + 1) % K == 0) {
        std::cout << '\n';
      }
    }
    std::cout << "B = \n";
    for (size_t i = 0; i < K * N; ++i) {
      std::cout << mat_b[i] << '\t';
      if ((i + 1) % N == 0) {
        std::cout << '\n';
      }
    }
  }

  // Allocate device memory for the matrices.
  float *device_mat_a = nullptr;
  float *device_mat_b = nullptr;
  float *device_mat_c = nullptr;
  if (cudaMalloc(reinterpret_cast<void **>(&device_mat_a),
                 M * K * sizeof(float)) != cudaSuccess) {
    std::cerr << "!!!! device memory allocation error (allocate A)\n";
    return EXIT_FAILURE;
  }
  if (cudaMalloc(reinterpret_cast<void **>(&device_mat_b),
                 K * N * sizeof(float)) != cudaSuccess) {
    std::cerr << "!!!! device memory allocation error (allocate B)\n";
    cudaFree(device_mat_a);
    return EXIT_FAILURE;
  }
  if (cudaMalloc(reinterpret_cast<void **>(&device_mat_c),
                 M * N * sizeof(float)) != cudaSuccess) {
    std::cerr << "!!!! device memory allocation error (allocate C)\n";
    cudaFree(device_mat_a);
    cudaFree(device_mat_b);
    return EXIT_FAILURE;
  }

  // Initialize the device matrices with the host matrices.
  cudaMemcpy(device_mat_a, mat_a.data(), M * K * sizeof(float),
             cudaMemcpyHostToDevice);
  cudaMemcpy(device_mat_b, mat_b.data(), K * N * sizeof(float),
             cudaMemcpyHostToDevice);

  // Free up host memory.
  mat_a.clear();
  mat_a.shrink_to_fit();
  mat_b.clear();
  mat_b.shrink_to_fit();

  // Perform the operation using cuBLAS:
  // C = alpha * transa(A) * transb(B) + beta * C
  // `transa` indicates whether the matrix A is transposed or not.
  // `transb` indicates whether the matrix B is transposed or not.
  // A: m x k
  // B: k x n
  // C: m x n
  // lda, ldb, ldc are the leading dimensions of the three matrices,
  // respectively.
  // To compute C = A x B, use alpha = 1.0, beta = 0.0,
  // transa = CUBLAS_OP_N, transb = CUBLAS_OP_N.
  // cublasStatus_t cublasSgemm(cublasHandle_t handle,
  //                            cublasOperation_t transa,
  //                            cublasOperation_t transb,
  //                            int m, int n, int k, const float *alpha,
  //                            const float *A, int lda,
  //                            const float *B, int ldb,
  //                            const float *beta,
  //                            float *C, int ldc);
  float alpha = 1.0f;
  float beta = 0.0f;
  // Note that cuBLAS is column-major! We need to swap the operand order:
  // on the CPU C = A x B, but on the GPU CT = (A x B)T = BT x AT,
  // where T means transpose.
  status =
      cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, M, K, &alpha,
                  device_mat_b, N, device_mat_a, K, &beta, device_mat_c, N);
  if (status != CUBLAS_STATUS_SUCCESS) {
    std::cerr << "!!!! cublas kernel execution error.\n";
    cudaFree(device_mat_a);
    cudaFree(device_mat_b);
    cudaFree(device_mat_c);
    return EXIT_FAILURE;
  }

  // Read back the result from device memory.
  std::vector<float> mat_c(M * N, 0.0f);
  cudaMemcpy(mat_c.data(), device_mat_c, M * N * sizeof(float),
             cudaMemcpyDeviceToHost);

  // Show the test results.
  if (M == 2 && N == 4 && K == 3) {
    std::cout << "C = AB = \n";
    for (size_t i = 0; i < M * N; ++i) {
      std::cout << mat_c[i] << '\t';
      if ((i + 1) % N == 0) {
        std::cout << '\n';
      }
    }
  }

  // Memory clean-up.
  cudaFree(device_mat_a);
  cudaFree(device_mat_b);
  cudaFree(device_mat_c);

  // Shut down cuBLAS.
  cublasDestroy(handle);
  return 0;
}
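One thing the listing glosses over is that the cudaMemcpy calls are never checked. A minimal sketch of an error-checking macro you could wrap them in (the CHECK_CUDA name is my own, not part of the CUDA toolkit):

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Abort with file/line context when a CUDA runtime call fails.
#define CHECK_CUDA(call)                                            \
  do {                                                              \
    cudaError_t err = (call);                                       \
    if (err != cudaSuccess) {                                       \
      std::fprintf(stderr, "CUDA error %s at %s:%d\n",              \
                   cudaGetErrorString(err), __FILE__, __LINE__);    \
      std::exit(EXIT_FAILURE);                                      \
    }                                                               \
  } while (0)

// Usage:
// CHECK_CUDA(cudaMemcpy(device_mat_a, mat_a.data(),
//                       M * K * sizeof(float), cudaMemcpyHostToDevice));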
In the code above, I generate the random input data the C++11 way:
std::random_device rd;
std::mt19937 gen(rd());
std::uniform_int_distribution<int> dis(0, 100000);
auto rand_num = [&dis, &gen]() { return dis(gen); };
std::generate(mat_a.begin(), mat_a.end(), rand_num);
std::generate(mat_b.begin(), mat_b.end(), rand_num);
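Note that this draws integers and implicitly converts them to float. If you prefer real-valued entries, a minimal variation using the same <random> machinery (only the distribution changes):

std::random_device rd;
std::mt19937 gen(rd());
// Draw floats uniformly from [0, 1) instead of integers from [0, 100000].
std::uniform_real_distribution<float> dis(0.0f, 1.0f);
auto rand_num = [&dis, &gen]() { return dis(gen); };
std::generate(mat_a.begin(), mat_a.end(), rand_num);
std::generate(mat_b.begin(), mat_b.end(), rand_num);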
For the small fixed test case, I fill in the matrix values with the STL algorithm std::iota:
std::iota(mat_a.begin(), mat_a.end(), 1.0f);
std::iota(mat_b.begin(), mat_b.end(), 1.0f);
For host memory I use std::vector directly rather than allocating with the C function malloc; the host buffers are released like this:
mat_a.clear();
mat_a.shrink_to_fit();
mat_b.clear();
mat_b.shrink_to_fit();
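clear() alone does not release the capacity, which is why shrink_to_fit() follows it (strictly speaking, shrink_to_fit is a non-binding request). An equivalent classic idiom is to swap with an empty temporary:

// Swapping with a temporary empty vector releases the storage immediately.
std::vector<float>().swap(mat_a);
std::vector<float>().swap(mat_b);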
I encourage you to get familiar with this C++ style of coding.
The CMake build file CMakeLists.txt is shown below:
cmake_minimum_required(VERSION 3.0.0)
# Set the project name and its version
project(matrix_multiply VERSION 0.1.0)
# Set the C++17 standard
set(CMAKE_CXX_STANDARD 17)
# Set the -O3 optimization level with debug information
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -O3 -g")
# Silence the CMP0074 policy warning
cmake_policy(SET CMP0074 NEW)
# Find CUDA
find_package(CUDA REQUIRED)
# These flags embed debugging information for both host and device code
set(CUDA_NVCC_FLAGS -G -g)
# Generate a CUDA executable file.
cuda_add_executable(${PROJECT_NAME} ${PROJECT_NAME}.cu)
target_link_libraries(${PROJECT_NAME} ${CUDA_cublas_LIBRARY})
The inline comments in the configuration file above explain what each directive does.
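One caveat: the FindCUDA module (find_package(CUDA) and cuda_add_executable) has been deprecated since CMake 3.10. On newer CMake versions, a minimal sketch of an equivalent configuration, assuming CMake 3.18+ with the CUDAToolkit package and its CUDA::cublas imported target:

cmake_minimum_required(VERSION 3.18)
project(matrix_multiply VERSION 0.1.0 LANGUAGES CXX CUDA)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CUDA_STANDARD 17)
# -G/-g embed debugging information for device and host code
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G -g")
# Provides imported targets such as CUDA::cublas
find_package(CUDAToolkit REQUIRED)
add_executable(${PROJECT_NAME} matrix_multiply_cublas.cu)
target_link_libraries(${PROJECT_NAME} PRIVATE CUDA::cublas)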
Build instructions:
mkdir build && cd build
# Generate the Makefile
cmake ..
# Build the target; alternatively, just run: make
cmake --build .
You can also compile directly with nvcc:
nvcc -O3 -G -g -std=c++17 matrix_multiply_cublas.cu -lcublas -o matrix_multiply_cublas
The options in this command mean: -O3 enables O3-level optimization, -G embeds debugging information in the device code, -g embeds debugging information in the host code, -std=c++17 selects the C++17 standard, and -lcublas links against the cublas library.
Setting the constants to M = 2, N = 4, K = 3 produces the following test output:
./matrix_multiply_cublas
A =
1 2 3
4 5 6
B =
1 2 3 4
5 6 7 8
9 10 11 12
C = AB =
38 44 50 56
83 98 113 128
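For this small case the result is easy to check by eye, but for larger runs you may want a CPU reference check. A minimal sketch (naive triple loop; it assumes the host copies of mat_a and mat_b are still alive, i.e. you skip the early clear()/shrink_to_fit()):

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// Naive row-major reference GEMM for validating the cuBLAS result.
bool VerifyResult(const std::vector<float> &a, const std::vector<float> &b,
                  const std::vector<float> &c, size_t m, size_t n, size_t k,
                  float eps = 1e-3f) {
  for (size_t i = 0; i < m; ++i) {
    for (size_t j = 0; j < n; ++j) {
      float sum = 0.0f;
      for (size_t p = 0; p < k; ++p) {
        sum += a[i * k + p] * b[p * n + j];
      }
      // Compare with a relative tolerance: float accumulation orders differ
      // between the CPU loop and the GPU kernel.
      if (std::fabs(sum - c[i * n + j]) >
          eps * std::max(1.0f, std::fabs(sum))) {
        return false;
      }
    }
  }
  return true;
}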
Setting the constants to M = 400, N = 500, K = 600, the nvprof performance profile looks like this:
nvprof ./matrix_multiply_cublas
==216602== NVPROF is profiling process 216602, command: ./matrix_multiply
==216602== Profiling application: ./matrix_multiply
==216602== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   42.78%  184.93us         2  92.463us  75.519us  109.41us  [CUDA memcpy HtoD]
                   42.44%  183.49us         1  183.49us  183.49us  183.49us  volta_sgemm_128x32_nn
                   14.50%  62.687us         1  62.687us  62.687us  62.687us  [CUDA memcpy DtoH]
                    0.28%  1.2160us         1  1.2160us  1.2160us  1.2160us  [CUDA memset]
      API calls:   99.77%  670.35ms         8  83.794ms  2.8420us  352.22ms  cudaFree
                    0.08%  547.51us         3  182.50us  58.392us  347.88us  cudaMemcpy
                    0.06%  406.58us       297  1.3680us      81ns  98.280us  cuDeviceGetAttribute
                    0.05%  315.62us         6  52.602us  3.5290us  117.89us  cudaMalloc
                    0.02%  113.83us       751     151ns      88ns     772ns  cuGetProcAddress
                    0.01%  92.881us         3  30.960us  15.184us  60.794us  cuDeviceGetName
                    0.00%  14.385us         1  14.385us  14.385us  14.385us  cudaLaunchKernel
                    0.00%  11.389us        18     632ns     269ns  5.8650us  cudaEventCreateWithFlags
                    0.00%  10.808us         1  10.808us  10.808us  10.808us  cuDeviceGetPCIBusId
                    0.00%  6.9020us         2  3.4510us     524ns  6.3780us  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
                    0.00%  6.3610us         4  1.5900us     628ns  2.7490us  cudaDeviceSynchronize
                    0.00%  5.6670us         1  5.6670us  5.6670us  5.6670us  cudaMemsetAsync
                    0.00%  5.5610us        18     308ns     226ns     947ns  cudaEventDestroy
                    0.00%  5.3820us         3  1.7940us     321ns  3.5870us  cudaGetDevice
                    0.00%  4.2940us         5     858ns     202ns  2.6630us  cuDeviceGetCount
                    0.00%  4.0470us        14     289ns     171ns     960ns  cudaDeviceGetAttribute
                    0.00%  2.6340us         2  1.3170us  1.2010us  1.4330us  cuInit
                    0.00%  2.1070us         1  2.1070us  2.1070us  2.1070us  cudaEventRecord
                    0.00%  1.9590us         1  1.9590us  1.9590us  1.9590us  cudaEventQuery
                    0.00%  1.7620us         3     587ns     197ns  1.1320us  cudaStreamGetCaptureInfo
                    0.00%  1.7510us         4     437ns     101ns  1.2370us  cuDeviceGet
                    0.00%  1.2910us         3     430ns     219ns     777ns  cuDeviceTotalMem
                    0.00%  1.0030us         1  1.0030us  1.0030us  1.0030us  cudaGetSymbolAddress
                    0.00%     944ns         3     314ns     133ns     649ns  cuDeviceGetUuid
                    0.00%     282ns         2     141ns     127ns     155ns  cuDriverGetVersion
                    0.00%     236ns         1     236ns     236ns     236ns  cudaGetLastError
                    0.00%     100ns         1     100ns     100ns     100ns  cuModuleGetLoadingMode
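nvprof attributes 183.49us to the volta_sgemm_128x32_nn kernel. If you want to time the GEMM from inside the program rather than with an external profiler, a minimal sketch using CUDA events, dropped around the same cublasSgemm call as in the listing above:

// Time the cuBLAS GEMM with CUDA events (device-side timestamps).
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);
cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, M, K, &alpha,
            device_mat_b, N, device_mat_a, K, &beta, device_mat_c, N);
cudaEventRecord(stop);
cudaEventSynchronize(stop); // Wait until the GEMM has finished.
float elapsed_ms = 0.0f;
cudaEventElapsedTime(&elapsed_ms, start, stop);
std::cout << "cublasSgemm took " << elapsed_ms << " ms\n";
cudaEventDestroy(start);
cudaEventDestroy(stop);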