cuda matrix tiled multiply


假设A为3x4,B为4x3

physical structure

A[0,1,2,...,11];B[0,...,11]

logical structure

A[0,1,2,3]

A[4,5,6,7]

A[8,9,10,11]

B[0,1,2]

B[3,4,5]

B[6,7,8]

B[9,10,11]


impl v1

ph=0

threadx=0,thready=0.

Mds[0][0] = A[0]
Nds[0][0]=B[0]

threadx=1,thready=0.

Mds[0][1] = A[1]
Nds[0][1] = B[1]

threadx=0,thready=1

Mds[1][0] = A[4]
Nds[1][0] =B[3]

threadx=1,thready=1

Mds[1][1]=A[5]
Nds[1][1] = B[4]

在phase=0阶段,A[0,1,4,5]完成加载,B[0,1,3,4]完成加载

A
[A[0],A[1]]
[A[4],A[5]]
B
[B[0],B[1]]
[B[3],B[4]]

ph=1
threadx=0,thready=0
Mds[0][0]=A[2]
Nds[0][0]=B[6]

threadx=1,thready=0
Mds[0][1]=A[3]
Nds[0][1]=B[7]

threadx=0,thready=1
Mds[1][0]=A[4+2+0]=A[6]
Nds[1][0]=B[(2+1)*3+0]=B[9]

threadx=1,thready=1
Mds[1][1]=A[7]
Nds[1][1]=B[10]

phase=1阶段,A[2,3,6,7]完成加载,B[6,7,9,10]完成加载

A
[A[2],A[3]]
[A[6],A[7]]
B
[B[6],B[7]]
[B[9],B[10]]

以上均符合矩阵乘法所需元素


impl v1

// Tiled matrix multiplication: P = M * N, where M, N and P are
// Width x Width matrices stored row-major in flat arrays.
//
// Each thread computes one element P[Row][Col]. The threads of a block
// cooperatively stage one TILE_WIDTH x TILE_WIDTH tile of M and one of N
// into shared memory per phase, then accumulate the partial dot product
// from those tiles.
//
// Preconditions:
//   - TILE_WIDTH is a compile-time constant defined elsewhere.
//   - Expected launch: blockDim = (TILE_WIDTH, TILE_WIDTH) and a grid of
//     ceil(Width / TILE_WIDTH) blocks in both x and y, because the shared
//     tiles are indexed directly with threadIdx.
//   - Static shared memory used: 2 * TILE_WIDTH * TILE_WIDTH * sizeof(float).
//
// Width does not have to be a multiple of TILE_WIDTH: out-of-range tile
// entries are loaded as 0 and out-of-range threads do not write to P.
__global__ void MatrixMulKernel(float* M, float* N, float* P,
      int Width) {
  __shared__ float Mds[TILE_WIDTH][TILE_WIDTH];
  __shared__ float Nds[TILE_WIDTH][TILE_WIDTH];

  int bx = blockIdx.x; int by = blockIdx.y;
  int tx = threadIdx.x; int ty = threadIdx.y;

  // Identify the row and column of the P element to work on
  int Row = by * TILE_WIDTH + ty;
  int Col = bx * TILE_WIDTH + tx;

  float Pvalue = 0.0f;

  // Ceil-div so the last, partially filled tile is not skipped.
  int numPhases = (Width + TILE_WIDTH - 1) / TILE_WIDTH;

  // Loop over the M and N tiles required to compute the P element
  for (int ph = 0; ph < numPhases; ++ph) {
    int mCol = ph * TILE_WIDTH + tx;  // column of M this thread loads
    int nRow = ph * TILE_WIDTH + ty;  // row of N this thread loads

    // Collaborative loading of M and N tiles into shared memory.
    // Out-of-range entries become 0 so they add nothing to the sum.
    Mds[ty][tx] = (Row < Width && mCol < Width) ? M[Row * Width + mCol] : 0.0f;
    Nds[ty][tx] = (nRow < Width && Col < Width) ? N[nRow * Width + Col] : 0.0f;

    // Every thread of the block reaches both barriers (no divergent
    // branch around them): the first makes the tiles visible before use,
    // the second keeps the next phase from overwriting tiles still in use.
    __syncthreads();

    for (int k = 0; k < TILE_WIDTH; ++k) {
      Pvalue += Mds[ty][k] * Nds[k][tx];
    }

    __syncthreads();
  }

  // Only threads that map to a real element of P store a result.
  if (Row < Width && Col < Width) {
    P[Row * Width + Col] = Pvalue;
  }
}

impl v2

phase=0

threadx=0,thready=0

Ads[0][0]=A[0]
Bds[0][0]=B[0]

threadx=1,thready=0

Ads[0][1]=A[1]
Bds[1][0]=B[1*3]=B[3]

threadx=0,thready=1

Ads[1][0] = A[1*4]=A[4]
Bds[0][1] = B[1]

threadx=1,thready=1

Ads[1][1]=A[1*4+1]=A[5]
Bds[1][1]=B[1*3+1]=B[4]

phase=0后加载A[0,1,4,5] & B[0,3,1,4]

A
[A[0],A[1]]
[A[4],A[5]]
B
[B[0],B[1]]
[B[3],B[4]]

phase=1

threadx=0,thready=0

Ads[0][0]=A[2]
Bds[0][0]=B[2*3]=B[6]

threadx=1,thready=0

Ads[0][1]=A[2+1]=A[3]
Bds[1][0]=B[(2+1)*3]=B[9]

threadx=0,thready=1

Ads[1][0]=A[1*4+2]=A[6]
Bds[0][1]=B[(2)*3+1]=B[7]

threadx=1,thready=1

Ads[1][1]=A[4+2+1]=A[7]
Bds[1][1]=B[(2+1)*3+1]=B[10]

phase=1后加载A[2,3,6,7] & B[6,9,7,10]

A
[A[2],A[3]]
[A[6],A[7]]
B
[B[6],B[7]]
[B[9],B[10]]

impl v2

    //@@ You have to use shared memory for this MP
    __shared__ float Ads[TILE_WIDTH][TILE_WIDTH];
    __shared__ float Bds[TILE_WIDTH][TILE_WIDTH];

    // blockDim.x = blockDim.y = TILE_WIDTH
    int row = blockIdx.y * TILE_WIDTH + threadIdx.y;
    int col = blockIdx.x * TILE_WIDTH + threadIdx.x;

    // XXX 
    float p_value = 0;
    for (int phase = 0; phase < (numAColumns/TILE_WIDTH) ; phase++) {
        Ads[threadIdx.y][threadIdx.x] = A[row*numAColumns + phase*TILE_WIDTH + threadIdx.x];
        Bds[threadIdx.x][threadIdx.y] = B[(phase*TILE_WIDTH+threadIdx.x)*numBColumns + row];
        __syncthreads(); 
        for (int k=0; k

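block(0,0) 的推演没有问题,那为什么最后所有测试数据没有一个能通过?需要从两个方面继续检查:1. 检查 blockIdx 不为 0(例如 blockIdx=1)的情况,加载的元素是否依然正确;2. 检查求和结果是否相同、写入内存的位置是否相同。

注:impl v2 加载 Bds 时,B 的列下标用的是 row(由 blockIdx.y 和 threadIdx.y 计算),而不是 col(由 blockIdx.x 和 threadIdx.x 计算)。在 block(0,0) 中 row 与 col 的取值范围恰好相同,所以上面的推演看不出问题;一旦 blockIdx.x != blockIdx.y,加载的就是 B 中错误的列。另外,循环次数 numAColumns/TILE_WIDTH 是整数除法,并且加载时没有边界检查:当矩阵尺寸不是 TILE_WIDTH 的整数倍时(例如本例中 A 只有 3 行),会漏掉末尾的 tile 或发生越界访问。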