CUDA 笔记（二）：CUDA C++ 概述

一个向量加法 kernel：

cpp
#include <cuda_runtime_api.h>
#include <memory.h>
#include <cstdlib>
#include <ctime>
#include <stdio.h>
#include <cuda/cmath>

// Element-wise vector addition kernel: C[i] = A[i] + B[i] for 0 <= i < vectorLength.
//
// Only the x components of threadIdx/blockIdx/blockDim are used, so this is
// written for a 1-D launch. Each thread handles at most one element, so the
// launch must provide at least vectorLength threads in total.
//
// A, B         input vectors (read only by this kernel)
// C            output vector
// vectorLength number of elements to process
__global__ void vecAdd(float* A, float* B, float* C, int vectorLength)
{
    // Global index of this thread along x: offset inside the block plus the
    // offset of the block itself.
    int workIndex = threadIdx.x + blockIdx.x*blockDim.x;
    // Threads whose index falls past the end of the vectors do nothing.
    if(workIndex < vectorLength)
    {
        C[workIndex] = A[workIndex] + B[workIndex];
    }
}

// Fills A[0..length) with pseudo-random floats in [0, 1].
//
// A      destination buffer holding at least `length` floats
// length number of elements to write; nothing is written when it is <= 0
//
// The generator is seeded exactly once, on the first call. Seeding on every
// call with the current time (one-second resolution) would restart the same
// sequence, so two arrays initialized back to back would come out identical.
void initArray(float* A, int length)
{
    // Function-local static: its initializer (and therefore srand) runs only once.
    static const bool seeded = (std::srand(static_cast<unsigned int>(std::time(nullptr))), true);
    (void)seeded;

    for(int i=0; i<length; i++)
    {
        A[i] = rand() / (float)RAND_MAX;
    }
}

// CPU reference implementation of the vector addition: writes A[i] + B[i]
// into C[i] for every i in [0, length). Nothing is written when length <= 0.
void serialVecAdd(float* A, float* B, float* C,  int length)
{
    int idx = 0;
    while(idx < length)
    {
        const float sum = A[idx] + B[idx];
        C[idx] = sum;
        ++idx;
    }
}

// Compares two float arrays element by element with an absolute tolerance.
//
// A, B    arrays holding at least `length` floats each
// length  number of elements to compare; a length <= 0 compares nothing
// epsilon largest absolute difference still treated as equal
//
// Returns true when every pair differs by at most epsilon. On the first
// mismatch the index and both values are printed to stdout and false is
// returned.
bool vectorApproximatelyEqual(float* A, float* B, int length, float epsilon=0.00001)
{
    for(int i=0; i<length; i++)
    {
        // Written as !(diff <= epsilon) rather than diff > epsilon: every
        // ordered comparison with NaN is false, so this form reports a NaN
        // difference as a mismatch instead of silently accepting it.
        if(!(fabs(A[i] -B[i]) <= epsilon))
        {
            printf("Index %d mismatch: %f != %f\n", i, A[i], B[i]);
            return false;
        }
    }
    return true;
}

//explicit-memory-begin
void explicitMemExample(int vectorLength)
{
    // Pointers for host memory
    float* A = nullptr;
    float* B = nullptr;
    float* C = nullptr;
    float* comparisonResult = (float*)malloc(vectorLength*sizeof(float));
    
    // Pointers for device memory
    float* devA = nullptr;
    float* devB = nullptr;
    float* devC = nullptr;

    //Allocate Host Memory using cudaMallocHost API. This is best practice
    // when buffers will be used for copies between CPU and GPU memory
    cudaMallocHost(&A, vectorLength*sizeof(float));
    cudaMallocHost(&B, vectorLength*sizeof(float));
    cudaMallocHost(&C, vectorLength*sizeof(float));

    // Initialize vectors on the host
    initArray(A, vectorLength);
    initArray(B, vectorLength);

    // start-allocate-and-copy
    // Allocate memory on the GPU
    cudaMalloc(&devA, vectorLength*sizeof(float));
    cudaMalloc(&devB, vectorLength*sizeof(float));
    cudaMalloc(&devC, vectorLength*sizeof(float));

    // Copy data to the GPU
    cudaMemcpy(devA, A, vectorLength*sizeof(float), cudaMemcpyDefault);
    cudaMemcpy(devB, B, vectorLength*sizeof(float), cudaMemcpyDefault);
    cudaMemset(devC, 0, vectorLength*sizeof(float));
    // end-allocate-and-copy

    // Launch the kernel
    int threads = 256;
    int blocks = cuda::ceil_div(vectorLength, threads);
    vecAdd<<<blocks, threads>>>(devA, devB, devC, vectorLength);
    // wait for kernel execution to complete
    cudaDeviceSynchronize();

    // Copy results back to host
    cudaMemcpy(C, devC, vectorLength*sizeof(float), cudaMemcpyDefault);

    // Perform computation serially on CPU for comparison
    serialVecAdd(A, B, comparisonResult, vectorLength);

    // Confirm that CPU and GPU got the same answer
    if(vectorApproximatelyEqual(C, comparisonResult, vectorLength))
    {
        printf("Explicit Memory: CPU and GPU answers match\n");
    }
    else
    {
        printf("Explicit Memory: Error - CPU and GPU answers to not match\n");
    }

    // clean up
    cudaFree(devA);
    cudaFree(devB);
    cudaFree(devC);
    cudaFreeHost(A);
    cudaFreeHost(B);
    cudaFreeHost(C);
    free(comparisonResult);
}
//explicit-memory-end


// Entry point: runs the explicit-memory vector addition example.
//
// argv[1] (optional) is the vector length as a positive decimal integer;
// the default is 1024. Input that is not a number, has trailing characters,
// is not positive, or does not fit in an int is rejected with a usage
// message and a failing exit status.
int main(int argc, char** argv)
{
    int vectorLength = 1024;
    if(argc >=2)
    {
        char* end = nullptr;
        const long requested = std::strtol(argv[1], &end, 10);
        // Converting to int and comparing back detects values that an int cannot hold.
        const int narrowed = static_cast<int>(requested);
        if(end == argv[1] || *end != '\0' || requested <= 0 || narrowed != requested)
        {
            fprintf(stderr, "Usage: %s [vectorLength > 0]\n", argv[0]);
            return EXIT_FAILURE;
        }
        vectorLength = narrowed;
    }
    explicitMemExample(vectorLength);
    return 0;
}

编译运行：

nvcc vecAdd.cu -o vecAdd
./vecAdd 4096

以下分段拆解。

一、Kernel#

CUDA 内核（在 GPU 执行的函数）使用 __global__ 修饰符声明，表示编译成 GPU 代码，并且在 kernel launch 时调用。通常从 CPU 代码调用内核。

cpp
// Element-wise vector addition kernel: C[i] = A[i] + B[i] for 0 <= i < vectorLength.
//
// Only the x components of threadIdx/blockIdx/blockDim are used, so this is
// written for a 1-D launch. Each thread handles at most one element, so the
// launch must provide at least vectorLength threads in total.
//
// A, B         input vectors (read only by this kernel)
// C            output vector
// vectorLength number of elements to process
__global__ void vecAdd(float* A, float* B, float* C, int vectorLength)
{
    // Global index of this thread along x: offset inside the block plus the
    // offset of the block itself.
    int workIndex = threadIdx.x + blockIdx.x*blockDim.x;
    // Threads whose index falls past the end of the vectors do nothing.
    if(workIndex < vectorLength)
    {
        C[workIndex] = A[workIndex] + B[workIndex];
    }
}

kernel 的返回类型必须是 void。

可以通过 <<< >>> 来启动 kernel：

cpp
// 1-D launch: 1 block of 256 threads.
// vecAdd takes the element count as its fourth argument.
vecAdd<<<1, 256>>>(A, B, C, vectorLength);

// 2-D launch: a grid of 16*16 blocks, each block holding 8*8 threads
dim3 grid(16,16);
dim3 block(8,8);
MatAdd<<<grid, block>>>(A, B, C);

值得注意的是，一个 block 最多只能有 1024 个线程。例如 block 大小为 dim3 block(32, 32, 2) 就不合法。

kernel launch 是非阻塞的，CPU 代码会继续执行，因此需要显式同步：

cpp
cudaDeviceSynchronize();

在 kernel 中，线程可以访问到以下参数：

threadIdx：thread 在 block 内的索引（block 内每个线程唯一）
blockDim：每个 block 的维度
blockIdx：block 在 grid 内的索引
gridDim：grid 的维度

其中 blockDim 和 gridDim 由 kernel 启动时的执行配置指定，threadIdx 和 blockIdx 则由运行时分配给每个线程。它们都是长度为 3 的向量，分别用 .x、.y、.z 来访问各维度的值。

threadIdx.x 的取值范围是 0, 1, ..., blockDim.x-1
blockIdx.x 的取值范围是 0, 1, ..., gridDim.x-1

运行时没有指定 grid 和 block 的维度时，默认都是 1。例如 vecAdd<<<1, 256>>> 等价于 gridDim = (1, 1, 1) 和 blockDim = (256, 1, 1)。

实际上线程在整个 grid 中某个维度的全局索引可以通过以下方式计算：

cpp
int workIndex = threadIdx.x + blockIdx.x*blockDim.x;

在 vecAdd 的例子中，vectorLength 不一定是 256 的倍数，所以线程需要检查 workIndex 是否越界：

cpp
if(workIndex < vectorLength)
{
    C[workIndex] = A[workIndex] + B[workIndex];
}

对于最后一个 block 里面那些越界的 workIndex，线程会退出而不执行任何操作。

所需要的 block 数量可以通过 vectorLength / 256 向上取整来计算：

cpp
int threads = 256;
// Round vectorLength / threads up so that a partially filled last block is
// still launched; for positive operands this gives the same value as
// blocks = (vectorLength + threads - 1) / threads
int blocks = cuda::ceil_div(vectorLength, threads);
vecAdd<<<blocks, threads>>>(devA, devB, devC, vectorLength);

二、Memory#

1. 显式内存管理#

流程：

在 CPU/GPU 上分别分配内存：cudaMallocHost / new（主机）和 cudaMalloc（设备）
初始化数据
将数据从 CPU 复制到 GPU：cudaMemcpy
启动 kernel 进行计算
将结果从 GPU 复制回 CPU：cudaMemcpy
在 CPU 上验证结果
释放内存：cudaFreeHost / delete（主机）和 cudaFree（设备）

注意：

cudaMemcpy 的第四个参数指定了复制的方向，常用的值有 cudaMemcpyHostToDevice、cudaMemcpyDeviceToHost 和 cudaMemcpyDeviceToDevice。也可以使用 cudaMemcpyDefault 让 CUDA 根据指针地址自动推导。
cudaMemcpy 是一个同步操作，调用后 CPU 线程会等待 GPU 上的复制完成。异步复制需要使用 cudaMemcpyAsync。
使用 cudaMallocHost 分配的主机内存是页锁定（page-locked）内存，可以提供更高的复制性能。要让 cudaMemcpyAsync 真正异步执行，主机端缓冲区必须是页锁定内存；对可分页（pageable）内存的复制可能退化为同步操作。最佳实践是在需要与 GPU 收发数据的内存上使用 cudaMallocHost。

cpp
void explicitMemExample(int vectorLength)
{
    // Pointers for host memory
    float* A = nullptr;
    float* B = nullptr;
    float* C = nullptr;
    float* comparisonResult = (float*)malloc(vectorLength*sizeof(float));
    
    // Pointers for device memory
    float* devA = nullptr;
    float* devB = nullptr;
    float* devC = nullptr;

    //Allocate Host Memory using cudaMallocHost API. This is best practice
    // when buffers will be used for copies between CPU and GPU memory
    cudaMallocHost(&A, vectorLength*sizeof(float));
    cudaMallocHost(&B, vectorLength*sizeof(float));
    cudaMallocHost(&C, vectorLength*sizeof(float));

    // Initialize vectors on the host
    initArray(A, vectorLength);
    initArray(B, vectorLength);

    // start-allocate-and-copy
    // Allocate memory on the GPU
    cudaMalloc(&devA, vectorLength*sizeof(float));
    cudaMalloc(&devB, vectorLength*sizeof(float));
    cudaMalloc(&devC, vectorLength*sizeof(float));

    // Copy data to the GPU
    cudaMemcpy(devA, A, vectorLength*sizeof(float), cudaMemcpyDefault);
    cudaMemcpy(devB, B, vectorLength*sizeof(float), cudaMemcpyDefault);
    cudaMemset(devC, 0, vectorLength*sizeof(float));
    // end-allocate-and-copy

    // Launch the kernel
    int threads = 256;
    int blocks = cuda::ceil_div(vectorLength, threads);
    vecAdd<<<blocks, threads>>>(devA, devB, devC, vectorLength);
    // wait for kernel execution to complete
    cudaDeviceSynchronize();

    // Copy results back to host
    cudaMemcpy(C, devC, vectorLength*sizeof(float), cudaMemcpyDefault);

    // Perform computation serially on CPU for comparison
    serialVecAdd(A, B, comparisonResult, vectorLength);

    // Confirm that CPU and GPU got the same answer
    if(vectorApproximatelyEqual(C, comparisonResult, vectorLength))
    {
        printf("Explicit Memory: CPU and GPU answers match\n");
    }
    else
    {
        printf("Explicit Memory: Error - CPU and GPU answers to not match\n");
    }

    // clean up
    cudaFree(devA);
    cudaFree(devB);
    cudaFree(devC);
    cudaFreeHost(A);
    cudaFreeHost(B);
    cudaFreeHost(C);
    free(comparisonResult);
}

2. 统一内存#

使用 unified memory 更简单，不需要显式地在 CPU 和 GPU 之间复制数据，驱动会确保数据在 CPU 和 GPU 之间都可访问，并且在需要时自动迁移。

缺点是性能上限可能不如显式内存管理高（因为显式内存管理可以手动让数据传输和计算重叠）。

cpp
void unifiedMemExample(int vectorLength)
{
    // Pointers to memory vectors
    float* A = nullptr;
    float* B = nullptr;
    float* C = nullptr;
    float* comparisonResult = (float*)malloc(vectorLength*sizeof(float));

    // Use unified memory to allocate buffers
    cudaMallocManaged(&A, vectorLength*sizeof(float));
    cudaMallocManaged(&B, vectorLength*sizeof(float));
    cudaMallocManaged(&C, vectorLength*sizeof(float));

    // Initialize vectors on the host
    initArray(A, vectorLength);
    initArray(B, vectorLength);

    // Launch the kernel. Unified memory will make sure A, B, and C are
    // accessible to the GPU
    int threads = 256;
    int blocks = cuda::ceil_div(vectorLength, threads);
    vecAdd<<<blocks, threads>>>(A, B, C, vectorLength);
    // Wait for the kernel to complete execution
    cudaDeviceSynchronize();

    // Perform computation serially on CPU for comparison
    serialVecAdd(A, B, comparisonResult, vectorLength);

    // Confirm that CPU and GPU got the same answer
    if(vectorApproximatelyEqual(C, comparisonResult, vectorLength))
    {
        printf("Unified Memory: CPU and GPU answers match\n");
    }
    else
    {
        printf("Unified Memory: Error - CPU and GPU answers do not match\n");
    }

    // Clean Up
    cudaFree(A);
    cudaFree(B);
    cudaFree(C);
    free(comparisonResult);
}

三、异常处理#

每一个 CUDA API 都会返回一个 cudaError_t 类型的错误码，值为 cudaSuccess 表示成功。可以通过 cudaGetErrorString 来获取错误码对应的字符串描述。

cpp
// CUDA_CHECK(expr): evaluates a CUDA runtime call exactly once and, when the
// returned cudaError_t is not cudaSuccess, prints the source location, the
// numeric error code and its description to stderr. It only reports: the
// program keeps running after a failure.
//
// The argument is parenthesized so any expression can be passed, and the
// temporary has a macro-specific name so it cannot collide with a variable
// used inside the checked expression.
#define CUDA_CHECK(expr_to_check) do {                        \
    const cudaError_t cuda_check_result_ = (expr_to_check);   \
    if(cuda_check_result_ != cudaSuccess)                     \
    {                                                         \
        fprintf(stderr,                                       \
                "CUDA Runtime Error: %s:%i:%d = %s\n",        \
                __FILE__,                                     \
                __LINE__,                                     \
                static_cast<int>(cuda_check_result_),         \
                cudaGetErrorString(cuda_check_result_));      \
    }                                                         \
} while(0)

// Usage example
CUDA_CHECK(cudaMalloc(&devA, vectorLength*sizeof(float)));
CUDA_CHECK(cudaMalloc(&devB, vectorLength*sizeof(float)));
CUDA_CHECK(cudaMalloc(&devC, vectorLength*sizeof(float)));

使用 triple chevron notation <<<>>> 启动的 kernel 不会直接返回错误码，而是需要在 kernel launch 后立即调用 cudaGetLastError 来检查是否有错误发生。此时返回 cudaSuccess 并不代表 kernel 已经成功执行（甚至不代表它已经开始执行），只说明 kernel launch 的参数和配置没有触发错误，并且当前错误状态中没有启动之前遗留的错误或更早的异步错误。

对于异步错误（比如 kernel 执行期间的错误）可以这样做：

cpp
// vecAdd takes the element count as its fourth argument
vecAdd<<<blocks, threads>>>(devA, devB, devC, vectorLength);
// Check whether the launch itself failed (for example, an invalid configuration)
CUDA_CHECK(cudaGetLastError());
// Wait for the kernel to finish;
// CUDA_CHECK then reports any error raised while the kernel was executing
CUDA_CHECK(cudaDeviceSynchronize());

使用上述 macro 来检测和报告错误时，设置 CUDA_LOG_FILE 环境变量可以将更详细的错误信息输出到指定的日志文件中。

四、修饰符#

函数有以下修饰符：

__global__：编译成 GPU 代码，通常从 host 调用，也可以从另一个 kernel 调用（dynamic parallelism）
__device__：编译成 GPU 代码，可以从另一个 __global__ 或 __device__ 函数调用
__host__：编译成 CPU 代码，在 CPU 内部调用（默认）

变量有以下修饰符：

__device__：在 GPU global memory 上分配内存，所有线程共享
__constant__：在 GPU constant memory 上分配内存，所有线程共享；在 device 代码中只读，由 host 通过 cudaMemcpyToSymbol 等 API 写入
__managed__：在 unified memory 上分配内存，CUDA 会自动管理数据迁移
__shared__：在 GPU shared memory 上分配内存，同一个 block 内的线程共享

五、Thread Block Clusters#

同一个 cluster 中的线程块保证被调度到同一个 GPU Processing Cluster (GPC) 上。出于可移植性考虑，一个 cluster 中至多有 8 个 block。

cluster 中的线程使用 distributed shared memory 进行通信。

使用 cooperative groups API cluster.sync() 进行硬件支持的 block 之间的同步。

以下是一个带有 cluster 配置的 kernel launch：

cpp
// Example kernel whose thread block cluster shape is fixed at compile time
// through the __cluster_dims__ attribute. The body is elided in this snippet.
// NOTE(review): thread block clusters presumably require compute capability
// 9.0 or newer hardware; confirm against the build target.
// Compile time cluster size 2 in X-dimension and 1 in Y and Z dimension
__global__ void __cluster_dims__(2, 1, 1) cluster_kernel(float *input, float* output)
{
    // ...
}

// Launches cluster_kernel with the ordinary <<<grid, block>>> syntax; the
// cluster shape comes from the kernel's compile-time attribute, not from the
// launch itself.
// NOTE(review): this is an illustrative fragment. N is not defined in this
// snippet and input/output are never allocated here; both must be provided
// before the code can be built and run.
int main()
{
    float *input, *output;
    // Kernel invocation with compile time cluster size
    dim3 threadsPerBlock(16, 16);
    // Integer division: N is expected to be a multiple of the block edge (16),
    // otherwise the remainder is not covered by the grid.
    dim3 numBlocks(N / threadsPerBlock.x, N / threadsPerBlock.y);

    // The grid dimension is not affected by cluster launch, and is still enumerated
    // using number of blocks.
    // The grid dimension must be a multiple of cluster size.
    cluster_kernel<<<numBlocks, threadsPerBlock>>>(input, output);
}