cudaMatrixMultiplication

cuda

Publish Date: 2023-11-27

Read Count:

平铺矩阵乘法
这里一个线程控制了A的一行，B的一列; 但是是在for循环中分批次处理这一行和一列；因为要用共享内存
for i=0的时候，处理第一个patch；注意看 tx是列，ty是行

// Load this thread's element of the i-th tile of A into sharedM.
// The tile index i advances along A's columns; positions outside A are
// stored as 0 so they contribute nothing when the tiles are multiplied.
if (i*BLOCK_SIZE + tx < numAColumns && row < numARows)
    sharedM[ty][tx] = A[row*numAColumns + i * BLOCK_SIZE + tx];
else
    sharedM[ty][tx] = 0.0;

// Load this thread's element of the i-th tile of B into sharedN.
// Here the tile index i advances along B's rows; out-of-range positions are 0.
if (i*BLOCK_SIZE + ty < numBRows && col < numBColumns)
    sharedN[ty][tx] = B[(i*BLOCK_SIZE + ty)*numBColumns + col];
else
    sharedN[ty][tx] = 0.0;
// Barrier: every thread of the block must finish writing its tile elements
// before any thread starts reading sharedM / sharedN.
__syncthreads();

比如C矩阵的第一个block：i=0时，block内的每个线程先把A和B的第一个patch中自己负责的元素填充到共享内存，然后在__syncthreads()处等待block内所有线程都填充完毕，再把sharedM的一行和sharedN的一列对应相乘并累加。接着处理i=1，注意：现在还是第一个block，这里仍然是一个线程处理A的一行、B的一列，只是现在处理的是这一行和这一列的第二个patch；sharedM和sharedN被重新填充，用的还是同一个block的共享内存
这里是以A的列进行划分patch，因为B的行数和A的列数相同
A[row*numAColumns + i * BLOCK_SIZE + tx] 取A对应patch的值，这里i代表的是A的列，就是在列上，第几个patch
B[(i*BLOCK_SIZE + ty)*numBColumns + col] 这里不同是因为i代表了B矩阵的行，就是在行上，第几个patch

/**
 * Tiled matrix multiplication C = A * B using shared memory.
 *
 * A, B and C are indexed as row-major arrays (element (r, c) of A is
 * A[r * numAColumns + c], and likewise for B and C).
 *
 * Launch layout: the kernel indexes BLOCK_SIZE x BLOCK_SIZE shared tiles with
 * threadIdx.y / threadIdx.x, so it must be launched with
 * blockDim = (BLOCK_SIZE, BLOCK_SIZE). Each thread produces the single
 * element C[row][col], with row derived from blockIdx.y and col from
 * blockIdx.x, so the grid has to cover all of C.
 *
 * Shared memory: two static BLOCK_SIZE x BLOCK_SIZE float tiles per block.
 *
 * @param A            left operand, numARows x numAColumns
 * @param B            right operand, numBRows x numBColumns
 * @param C            output, numCRows x numCColumns
 * @param numARows     rows of A
 * @param numAColumns  columns of A (the dimension that is split into tiles)
 * @param numBRows     rows of B
 * @param numBColumns  columns of B
 * @param numCRows     rows of C
 * @param numCColumns  columns of C
 */
__global__ void matrixMultiplyShared(float *A, float *B, float *C,
    int numARows, int numAColumns, int numBRows, int numBColumns, int numCRows, int numCColumns)
{
    __shared__ float sharedM[BLOCK_SIZE][BLOCK_SIZE];
    __shared__ float sharedN[BLOCK_SIZE][BLOCK_SIZE];

    const int bx = blockIdx.x;
    const int by = blockIdx.y;
    const int tx = threadIdx.x;   // column inside the tile
    const int ty = threadIdx.y;   // row inside the tile

    // Element of C this thread is responsible for.
    const int row = by * BLOCK_SIZE + ty;
    const int col = bx * BLOCK_SIZE + tx;

    // Number of tiles along the shared dimension. Integer ceil-div computed
    // once: avoids a float round-trip (inexact above 2^24) and a ceil() call
    // on every loop iteration.
    const int numTiles = (numAColumns + BLOCK_SIZE - 1) / BLOCK_SIZE;

    float Csub = 0.0f;

    for (int i = 0; i < numTiles; i++)
    {
        const int aCol = i * BLOCK_SIZE + tx;   // column of A loaded by this thread
        const int bRow = i * BLOCK_SIZE + ty;   // row of B loaded by this thread

        // Stage one element of A's i-th tile; pad with 0 outside the matrix.
        // The flat index is widened before multiplying so it cannot overflow int.
        if (aCol < numAColumns && row < numARows)
            sharedM[ty][tx] = A[(long long)row * numAColumns + aCol];
        else
            sharedM[ty][tx] = 0.0f;

        // Stage one element of B's i-th tile; pad with 0 outside the matrix.
        if (bRow < numBRows && col < numBColumns)
            sharedN[ty][tx] = B[(long long)bRow * numBColumns + col];
        else
            sharedN[ty][tx] = 0.0f;

        // All threads must finish writing both tiles before anyone reads them.
        __syncthreads();

        // Partial dot product of this thread's row of A with its column of B.
        for (int j = 0; j < BLOCK_SIZE; j++)
            Csub += sharedM[ty][j] * sharedN[j][tx];

        // All threads must finish reading before the next iteration overwrites the tiles.
        __syncthreads();
    }

    // Threads that fall outside C only helped load tiles; they write nothing.
    if (row < numCRows && col < numCColumns)
        C[(long long)row * numCColumns + col] = Csub;
}

Moule Lin

https://moulelin.github.io/2023/11/27/cudaMatrixMultiplication/