CUDA编程共享内存

冬至子 2023-05-19 1213

电子说

1.3w人已加入

描述

Shared Memory

共享内存是使用__shared__内存空间说明符分配的 。

共享内存预期要比全局内存快得多 。 它可以用作临时存储器(或软件管理缓存)，以最小化来自CUDA block 的全局内存访问 ，如下面的矩阵乘法示例所示。

下面的代码示例是一个简单的矩阵乘法实现，它不利用共享内存。每个线程读取A的一行和B的一列，并计算C的相应元素，如图1所示。因此， A从全局内存中读取B的width次数，B从全局内存中读取A的height次数 。

从左到右是x的方向，从上到下是y的方向。 (x,y) x是0-dim，y是1-dim，和正常的 shape 表示是反着的。

存储器

图1 Matrix Multiplication without Shared Memory

// Matrices are stored in row-major order:
// M(row, col) = *(M.elements + row * M.width + col)
typedef struct {
    int width;
    int height;
    float* elements;
} Matrix;

// Thread block size
#define BLOCK_SIZE 16

// Forward declaration of the matrix multiplication kernel
__global__ void MatMulKernel(const Matrix, const Matrix, Matrix);

// Matrix multiplication - Host code
// Matrix dimensions are assumed to be multiples of BLOCK_SIZE
void MatMul(const Matrix A, const Matrix B, Matrix C)
{
    // Load A and B to device memory
    Matrix d_A;
    d_A.width = A.width; d_A.height = A.height;
    size_t size = A.width * A.height * sizeof(float);
    cudaMalloc(&d_A.elements, size);
    cudaMemcpy(d_A.elements, A.elements, size,
               cudaMemcpyHostToDevice);
    Matrix d_B;
    d_B.width = B.width; d_B.height = B.height;
    size = B.width * B.height * sizeof(float);
    cudaMalloc(&d_B.elements, size);
    cudaMemcpy(d_B.elements, B.elements, size,
               cudaMemcpyHostToDevice);

    // Allocate C in device memory
    Matrix d_C;
    d_C.width = C.width; d_C.height = C.height;
    size = C.width * C.height * sizeof(float);
    cudaMalloc(&d_C.elements, size);

    // Invoke kernel
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
    dim3 dimGrid(B.width / dimBlock.x, A.height / dimBlock.y);
    MatMulKernel<<

下面的代码示例是一个利用共享内存的矩阵乘法的实现。在这个实现中， 每个线程块负责计算C的一个方阵子矩阵Csub，块中的每个线程负责计算Csub中的一个元素 。如图2所示， Csub等于两个矩形矩阵的乘积:一个是与Csub具有相同行索引的维数(A.width, block_size)的子矩阵，另一个是与Csub具有相同列索引的维数(block_size, A.width)的子矩阵。为了适应设备的资源，这两个矩形矩阵根据需要被分成多个尺寸为block_size的方阵，Csub被计算为这些方阵乘积的和。每一个乘积都是这样执行的:首先将两个对应的方阵从全局内存加载到共享内存，由一个线程加载每个矩阵的一个元素，然后让每个线程计算乘积的一个元素。每个线程将每个产品的结果累积到一个寄存器中，并将结果写入全局内存。

存储器

图2 Matrix Multiplication with Shared Memory

通过这种方式阻塞计算，我们利用了快速共享内存的优势，并节省了大量全局内存带宽， 因为A只从全局内存读取(B.width / block_size)次，而B是读取(a.height / block_size)次 。

前面代码示例中的Matrix类型使用stride字段进行了扩充，以便子矩阵可以有效地用相同的类型表示 。__device__函数用于获取和设置元素，并从矩阵中构建任何子矩阵。

// Matrices are stored in row-major order:
// M(row, col) = *(M.elements + row * M.stride + col)
typedef struct {
    int width;
    int height;
    int stride; 
    float* elements;
} Matrix;

// Get a matrix element
__device__ float GetElement(const Matrix A, int row, int col)
{
    return A.elements[row * A.stride + col];
}

// Set a matrix element
__device__ void SetElement(Matrix A, int row, int col,
                           float value)
{
    A.elements[row * A.stride + col] = value;
}

// Get the BLOCK_SIZExBLOCK_SIZE sub-matrix Asub of A that is
// located col sub-matrices to the right and row sub-matrices down
// from the upper-left corner of A
 __device__ Matrix GetSubMatrix(Matrix A, int row, int col) 
{
    Matrix Asub;
    Asub.width    = BLOCK_SIZE;
    Asub.height   = BLOCK_SIZE;
    Asub.stride   = A.stride;
    Asub.elements = &A.elements[A.stride * BLOCK_SIZE * row
                                         + BLOCK_SIZE * col];
    return Asub;
}

// Thread block size
#define BLOCK_SIZE 16

// Forward declaration of the matrix multiplication kernel
__global__ void MatMulKernel(const Matrix, const Matrix, Matrix);

// Matrix multiplication - Host code
// Matrix dimensions are assumed to be multiples of BLOCK_SIZE
void MatMul(const Matrix A, const Matrix B, Matrix C)
{
    // Load A and B to device memory
    Matrix d_A;
    d_A.width = d_A.stride = A.width; d_A.height = A.height;
    size_t size = A.width * A.height * sizeof(float);
    cudaMalloc(&d_A.elements, size);
    cudaMemcpy(d_A.elements, A.elements, size,
               cudaMemcpyHostToDevice);
    Matrix d_B;
    d_B.width = d_B.stride = B.width; d_B.height = B.height;
    size = B.width * B.height * sizeof(float);
    cudaMalloc(&d_B.elements, size);
    cudaMemcpy(d_B.elements, B.elements, size,
    cudaMemcpyHostToDevice);

    // Allocate C in device memory
    Matrix d_C;
    d_C.width = d_C.stride = C.width; d_C.height = C.height;
    size = C.width * C.height * sizeof(float);
    cudaMalloc(&d_C.elements, size);

    // Invoke kernel
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
    dim3 dimGrid(B.width / dimBlock.x, A.height / dimBlock.y);
    MatMulKernel<<

打开APP阅读更多精彩内容