chundoong-lab-ta/SHPC2022/hw6_answer/matmul/matmul.cu

#include "matmul.h"
#include "util.h"

#include <cuda_runtime.h>
#include <mpi.h>

// Evaluates the CUDA runtime expression `f` exactly once. If the result is not
// cudaSuccess, prints the source file, line, numeric error code and the
// cudaGetErrorString() text to stderr and terminates the process with status 1.
//
// The body is wrapped in do { ... } while (0) so that `CUDA_CALL(x);` expands
// to a single statement and remains correct inside an unbraced if/else. The
// temporary has a macro-private name so it cannot shadow a caller variable
// that appears inside `f`.
#define CUDA_CALL(f)                                                           \
  do {                                                                         \
    cudaError_t cuda_call_err_ = (f);                                          \
    if (cuda_call_err_ != cudaSuccess) {                                       \
      fprintf(stderr, "CUDA error at [%s:%d] %d %s\n", __FILE__, __LINE__,     \
              (int)cuda_call_err_, cudaGetErrorString(cuda_call_err_));        \
      exit(1);                                                                 \
    }                                                                          \
  } while (0)

// Capacity of the per-GPU tables below (device buffers, row ranges, streams).
#define MAX_NUM_GPU 4
// Number of CUDA devices on this rank; filled in by cudaGetDeviceCount() in
// matmul_initialize(). Not static, so it has external linkage.
int num_devices = 0;

// This process's rank and the size of MPI_COMM_WORLD, set in
// matmul_initialize().
static int mpi_rank, mpi_world_size;
// Per-rank element counts and element offsets handed to MPI_Scatterv (for A)
// and MPI_Gatherv (for C). Indexed by rank.
// NOTE: these tables hold 4 entries, so at most 4 MPI ranks fit.
static int Asendcounts[4];
static int Adispls[4];
static int Crecvcounts[4];
static int Cdispls[4];

// Per-GPU device buffers: a_d[i] holds GPU i's row slice of A, b_d[i] a full
// copy of B, and c_d[i] GPU i's row slice of C.
static float *a_d[MAX_NUM_GPU];
static float *b_d[MAX_NUM_GPU];
static float *c_d[MAX_NUM_GPU];
// Half-open row range [Mbegin[i], Mend[i]) of this rank's slice that GPU i
// processes.
static int Mbegin[MAX_NUM_GPU], Mend[MAX_NUM_GPU];

// One stream per GPU; copies and the kernel launch for GPU i are issued on
// streams[i]. Not static, so it has external linkage.
cudaStream_t streams[MAX_NUM_GPU];

// Edge length of the square shared-memory tile in matmul_kernel, and of the
// thread block used to launch it.
#define BLOCK_SIZE 32
// Generic minimum; not referenced in the visible part of this file.
#define MIN(a, b) (((a) < (b)) ? (a) : (b))
// Shared-memory tiled matrix multiply: C = A * B with row-major A (M x K),
// B (K x N) and C (M x N).
//
// Written for a 2D launch with blockDim = (BLOCK_SIZE, BLOCK_SIZE), where
// blockIdx.x selects a tile column of C and blockIdx.y a tile row. Each block
// uses two static BLOCK_SIZE x BLOCK_SIZE float tiles of shared memory.
// Elements outside the matrices are loaded as 0, so M, N and K need not be
// multiples of BLOCK_SIZE.
__global__ void matmul_kernel(float *A, float *B, float *C, int M, int N, int K) {
  const int tile_col = blockIdx.x;
  const int tile_row = blockIdx.y;

  // Drop blocks whose tile lies entirely outside C. The test only involves
  // blockIdx, so all threads of a block agree and nobody is left waiting at
  // the barriers below.
  if (tile_row * BLOCK_SIZE >= M || tile_col * BLOCK_SIZE >= N) return;

  const int tx = threadIdx.x;
  const int ty = threadIdx.y;

  // Coordinates of the C element this thread produces.
  const int out_col = blockIdx.x * blockDim.x + tx;
  const int out_row = blockIdx.y * blockDim.y + ty;

  __shared__ float a_tile[BLOCK_SIZE][BLOCK_SIZE];
  __shared__ float b_tile[BLOCK_SIZE][BLOCK_SIZE];

  // Row of A and column of B that this thread stages into shared memory.
  const int a_row = tile_row * BLOCK_SIZE + ty;
  const int b_col = tile_col * BLOCK_SIZE + tx;

  float acc = 0.f;

  // Walk the K dimension one tile at a time.
  for (int k0 = 0; k0 < K; k0 += BLOCK_SIZE) {
    const int a_col = k0 + tx;
    const int b_row = k0 + ty;

    // Stage one element of each operand, zero-padding past the matrix edges.
    float a_val = 0.f;
    if (a_row < M && a_col < K) {
      a_val = A[a_row * K + a_col];
    }
    a_tile[ty][tx] = a_val;

    float b_val = 0.f;
    if (b_row < K && b_col < N) {
      b_val = B[b_row * N + b_col];
    }
    b_tile[ty][tx] = b_val;

    // Both tiles must be fully written before any thread reads them.
    __syncthreads();

    for (int kk = 0; kk < BLOCK_SIZE; ++kk) {
      acc += a_tile[ty][kk] * b_tile[kk][tx];
    }

    // Keep the tiles intact until every thread has finished reading them.
    __syncthreads();
  }

  if (out_row < M && out_col < N) {
    C[out_row * N + out_col] = acc;
  }
}

// Distributed C = A * B across all MPI ranks and all GPUs of each rank.
//
// Collective: every rank in MPI_COMM_WORLD must call this, after
// matmul_initialize() has been called with the same problem size.
//
//   A  (rank 0: input, M x K row-major) scattered by rows; on other ranks the
//      buffer is overwritten with that rank's row slice (const is cast away).
//   B  (rank 0: input, K x N row-major) broadcast; overwritten on other ranks.
//   C  (rank 0: output, M x N row-major) gathered from every rank's slice.
//   M  not read here; the row partition was fixed by matmul_initialize().
//
// NOTE(review): assumes the host buffers on every rank are large enough for
// the slices written into them -- confirm against the caller.
void matmul(const float *A, const float *B, float *C, int M, int N, int K) {
  const bool is_root = (mpi_rank == 0);

  // MPI does not allow the send and receive buffers of a collective to alias.
  // Rank 0's slice already sits at Adispls[0] inside A, so it receives
  // "in place" instead of scattering A onto itself.
  MPI_Scatterv(A, Asendcounts, Adispls, MPI_FLOAT,
      is_root ? MPI_IN_PLACE : (void*)A, Asendcounts[mpi_rank], MPI_FLOAT,
      0, MPI_COMM_WORLD);
  MPI_Bcast((void*)B, K * N, MPI_FLOAT, 0, MPI_COMM_WORLD);

  // Per GPU: upload its rows of A and all of B, run the kernel, download its
  // rows of C. Everything is queued on that GPU's own stream.
#pragma omp parallel for
  for (int i = 0; i < num_devices; i++) {
    const int rows = Mend[i] - Mbegin[i];

    // A grid with a zero dimension is an invalid launch configuration, and an
    // empty slice has nothing to compute or copy anyway.
    if (rows <= 0 || N <= 0) continue;

    CUDA_CALL(cudaSetDevice(i));

    // Widen to size_t before multiplying so large slices cannot overflow int.
    const size_t a_bytes = (size_t)rows * K * sizeof(float);
    const size_t b_bytes = (size_t)K * N * sizeof(float);
    const size_t c_bytes = (size_t)rows * N * sizeof(float);

    CUDA_CALL(cudaMemcpyAsync(a_d[i], A + (size_t)Mbegin[i] * K, a_bytes,
          cudaMemcpyHostToDevice, streams[i]));
    CUDA_CALL(cudaMemcpyAsync(b_d[i], B, b_bytes,
          cudaMemcpyHostToDevice, streams[i]));

    dim3 block(BLOCK_SIZE, BLOCK_SIZE);
    dim3 grid((N + BLOCK_SIZE - 1) / BLOCK_SIZE,
              (rows + BLOCK_SIZE - 1) / BLOCK_SIZE);

    matmul_kernel<<<grid, block, 0, streams[i]>>>(a_d[i], b_d[i], c_d[i], rows, N, K);
    // A launch does not return a status; launch-configuration errors are only
    // visible through cudaGetLastError().
    CUDA_CALL(cudaGetLastError());

    CUDA_CALL(cudaMemcpyAsync(C + (size_t)Mbegin[i] * N, c_d[i], c_bytes,
          cudaMemcpyDeviceToHost, streams[i]));
  }

  // Wait for every GPU's queued work so C is complete on the host.
#pragma omp parallel for
  for (int i = 0; i < num_devices; i++) {
    CUDA_CALL(cudaSetDevice(i));
    CUDA_CALL(cudaStreamSynchronize(streams[i]));
  }

  // Rank 0's rows are already at Cdispls[0] inside C, so it contributes them
  // in place; other ranks send from the start of their local C buffer.
  MPI_Gatherv(is_root ? MPI_IN_PLACE : (void*)C, Crecvcounts[mpi_rank], MPI_FLOAT,
      C, Crecvcounts, Cdispls, MPI_FLOAT,
      0, MPI_COMM_WORLD);
}

// One-time setup for matmul(). Collective: every rank in MPI_COMM_WORLD must
// call it with the same M, N, K (A is M x K, B is K x N, C is M x N).
//
// It records this process's rank and the communicator size, builds the
// per-rank Scatterv/Gatherv tables (rows are split evenly across ranks and
// the last rank absorbs the remainder), prints the visible GPUs, splits this
// rank's rows across its GPUs the same way, and allocates the device buffers
// and one stream per GPU.
//
// Aborts the MPI job if more ranks were launched than the tables can hold or
// if this rank has no CUDA device.
void matmul_initialize(int M, int N, int K) {
  MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
  MPI_Comm_size(MPI_COMM_WORLD, &mpi_world_size);

  // The per-rank tables have a fixed capacity; refuse to run rather than
  // write past their end. MPI_Abort is used so that peer ranks are torn down
  // too instead of blocking in a later collective.
  const int max_ranks = (int)(sizeof(Asendcounts) / sizeof(Asendcounts[0]));
  if (mpi_world_size > max_ranks) {
    fprintf(stderr,
            "[rank %d] %d MPI ranks launched but at most %d are supported. Aborting\n",
            mpi_rank, mpi_world_size, max_ranks);
    MPI_Abort(MPI_COMM_WORLD, 1);
  }

  // Even row split across ranks; the last rank takes whatever is left over.
  const int last_rank = mpi_world_size - 1;
  const int rows_per_rank = M / mpi_world_size;
  for (int i = 0; i < mpi_world_size; i++) {
    Adispls[i] = (rows_per_rank * K) * i;
    Asendcounts[i] = (rows_per_rank * K);
    Cdispls[i] = (rows_per_rank * N) * i;
    Crecvcounts[i] = (rows_per_rank * N);
  }
  Asendcounts[last_rank] = M * K - Adispls[last_rank];
  Crecvcounts[last_rank] = M * N - Cdispls[last_rank];

  // Every rank queries its own devices. Use no more GPUs than the per-GPU
  // tables (a_d, b_d, c_d, Mbegin, Mend, streams) can describe.
  CUDA_CALL(cudaGetDeviceCount(&num_devices));
  if (num_devices > MAX_NUM_GPU) {
    num_devices = MAX_NUM_GPU;
  }

  // NOTE(review): the job-wide total is reduced onto rank 0 but is not used
  // afterwards; the message below reports rank 0's local device count.
  int num_global_devices = 0;
  MPI_Reduce(&num_devices, (void*)&num_global_devices, 1, MPI_INT,
      MPI_SUM, 0, MPI_COMM_WORLD);

  if (mpi_rank == 0) {
    printf("Using %d devices\n", num_devices);
  }
  MPI_Barrier(MPI_COMM_WORLD);

  // Ranks take turns (one per barrier) listing their GPUs.
  for (int j = 0; j < mpi_world_size; ++j) {
    if (mpi_rank == j) {
      for (int i = 0; i < num_devices; i++) {
        cudaDeviceProp prop;
        CUDA_CALL(cudaGetDeviceProperties(&prop, i));

        // Try printing more detailed information here
        printf("[rank %d] GPU %d: %s\n", mpi_rank, i, prop.name);
      }
    }
    MPI_Barrier(MPI_COMM_WORLD);
  }

  if (num_devices <= 0) {
    printf("[rank %d] No CUDA device found. Aborting\n", mpi_rank);
    MPI_Abort(MPI_COMM_WORLD, 1);
  }

  // Rows owned by this rank, taken from the row split itself. This equals
  // Asendcounts[mpi_rank] / K for K > 0 without dividing by K, so K == 0
  // cannot cause a division by zero.
  const int local_rows = (mpi_rank == last_rank)
      ? M - rows_per_rank * last_rank
      : rows_per_rank;

  // Setup problem size for each GPU: even split, last GPU takes the remainder.
  const int rows_per_gpu = local_rows / num_devices;
  for (int i = 0; i < num_devices; i++) {
    Mbegin[i] = rows_per_gpu * i;
    Mend[i] = rows_per_gpu * (i + 1);
  }
  Mend[num_devices - 1] = local_rows;

  // Allocate device memory and a stream for each GPU. Sizes are widened to
  // size_t before multiplying so they cannot overflow int.
  for (int i = 0; i < num_devices; i++) {
    const size_t rows = (size_t)(Mend[i] - Mbegin[i]);

    CUDA_CALL(cudaSetDevice(i));
    CUDA_CALL(cudaMalloc(&a_d[i], rows * K * sizeof(float)));
    CUDA_CALL(cudaMalloc(&b_d[i], (size_t)K * N * sizeof(float)));
    CUDA_CALL(cudaMalloc(&c_d[i], rows * N * sizeof(float)));

    CUDA_CALL(cudaStreamCreate(&streams[i]));
  }
}

// Releases what matmul_initialize() created: the three device buffers and the
// stream of every GPU in use on this rank. Any CUDA failure terminates the
// process through CUDA_CALL.
void matmul_finalize() {
  for (int i = 0; i < num_devices; i++) {
    // Make the owning GPU current before releasing its resources, the same
    // way the allocation and compute paths select it before touching them.
    CUDA_CALL(cudaSetDevice(i));

    CUDA_CALL(cudaFree(a_d[i]));
    CUDA_CALL(cudaFree(b_d[i]));
    CUDA_CALL(cudaFree(c_d[i]));

    CUDA_CALL(cudaStreamDestroy(streams[i]));
  }
}
hw6_answer 2022-11-24 20:51:04 +09:00			`#include "matmul.h"`
			`#include "util.h"`

			`#include <cuda_runtime.h>`
			`#include <mpi.h>`

			`#define CUDA_CALL(f) \`
			`{ \`
			`cudaError_t err = (f); \`
			`if (err != cudaSuccess) { \`
			`fprintf(stderr, "CUDA error at [%s:%d] %d %s\n", __FILE__, __LINE__, \`
			`err, cudaGetErrorString(err)); \`
			`exit(1); \`
			`} \`
			`}`

			`#define MAX_NUM_GPU 4`
			`int num_devices = 0;`

			`static int mpi_rank, mpi_world_size;`
			`static int Asendcounts[4];`
			`static int Adispls[4];`
			`static int Crecvcounts[4];`
			`static int Cdispls[4];`

			`// Array of device (GPU) pointers`
			`static float *a_d[MAX_NUM_GPU];`
			`static float *b_d[MAX_NUM_GPU];`
			`static float *c_d[MAX_NUM_GPU];`
			`static int Mbegin[MAX_NUM_GPU], Mend[MAX_NUM_GPU];`

			`cudaStream_t streams[MAX_NUM_GPU];`

			`#define BLOCK_SIZE 32`
			`#define MIN(a, b) (((a) < (b)) ? (a) : (b))`
			`__global__ void matmul_kernel(float *A, float *B, float *C, int M, int N, int K) {`
			`int j = blockIdx.x * blockDim.x + threadIdx.x;`
			`int i = blockIdx.y * blockDim.y + threadIdx.y;`

			`int gj = blockIdx.x;`
			`int gi = blockIdx.y;`

			`if (gi * BLOCK_SIZE >= M \|\| gj * BLOCK_SIZE >= N) return; // boundary check`

			`int lj = threadIdx.x;`
			`int li = threadIdx.y;`

			`__shared__ float Alocal[BLOCK_SIZE][BLOCK_SIZE];`
			`__shared__ float Blocal[BLOCK_SIZE][BLOCK_SIZE];`

			`float c = 0.f;`

			`int A_row_index = (gi * BLOCK_SIZE + li);`
			`int B_col_index = (gj * BLOCK_SIZE + lj);`

			`for (int bk = 0; bk < K; bk += BLOCK_SIZE) {`
			`int A_col_index = bk + lj;`
			`Alocal[li][lj] = (A_row_index < M && A_col_index < K) ?`
			`A[A_row_index * K + A_col_index] :`
			`0.f;`

			`int B_row_index = bk + li;`
			`Blocal[li][lj] = (B_row_index < K && B_col_index < N) ?`
			`B[B_row_index * N + B_col_index] :`
			`0.f;`

			`__syncthreads();`

			`for (int lk = 0; lk < BLOCK_SIZE; ++lk) {`
			`c += Alocal[li][lk] * Blocal[lk][lj];`
			`}`
			`__syncthreads();`
			`}`

			`if (i < M && j < N)`
			`C[i * N + j] = c;`
			`}`

			`void matmul(const float *A, const float *B, float *C, int M, int N, int K) {`

			`MPI_Scatterv(A, Asendcounts, Adispls,`
			`MPI_FLOAT, (void*)A, Asendcounts[mpi_rank], MPI_FLOAT,`
			`0, MPI_COMM_WORLD);`
			`MPI_Bcast((void*)B, K * N, MPI_FLOAT, 0, MPI_COMM_WORLD);`

			`// Upload A and B matrix to every GPU`
			`#pragma omp parallel for`
			`for (int i = 0; i < num_devices; i++) {`
			`CUDA_CALL(cudaSetDevice(i));`

			`CUDA_CALL(cudaMemcpyAsync(`
			`a_d[i], A + Mbegin[i] * K,`
			`(Mend[i] - Mbegin[i]) * K * sizeof(float),`
			`cudaMemcpyHostToDevice,`
			`streams[i]));`
			`CUDA_CALL(cudaMemcpyAsync(b_d[i], B, K * N * sizeof(float),`
			`cudaMemcpyHostToDevice, streams[i]));`

			`dim3 blockDim(BLOCK_SIZE, BLOCK_SIZE);`
			`dim3 gridDim((N + BLOCK_SIZE - 1) / BLOCK_SIZE, (Mend[i] - Mbegin[i] + BLOCK_SIZE-1) / BLOCK_SIZE);`

			`matmul_kernel<<<gridDim, blockDim, 0, streams[i]>>>(a_d[i], b_d[i], c_d[i], Mend[i] - Mbegin[i], N, K);`

			`CUDA_CALL(cudaMemcpyAsync(C + Mbegin[i] * N, c_d[i],`
			`(Mend[i] - Mbegin[i]) * N * sizeof(float),`
			`cudaMemcpyDeviceToHost, streams[i]));`
			`}`

			`#pragma omp parallel for`
			`for (int i = 0; i < num_devices; i++) {`
			`CUDA_CALL(cudaSetDevice(i));`
			`CUDA_CALL(cudaStreamSynchronize(streams[i]));`
			`}`

			`MPI_Gatherv(C, Crecvcounts[mpi_rank], MPI_FLOAT,`
			`C, Crecvcounts, Cdispls, MPI_FLOAT,`
			`0, MPI_COMM_WORLD);`
			`}`

			`void matmul_initialize(int M, int N, int K) {`
			`MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);`
			`MPI_Comm_size(MPI_COMM_WORLD, &mpi_world_size);`

			`for (int i = 0; i < mpi_world_size; i++) {`
			`Adispls[i] = ((M / mpi_world_size) * K) * i;`
			`Asendcounts[i] = ((M / mpi_world_size) * K);`
			`Cdispls[i] = ((M / mpi_world_size) * N) * i;`
			`Crecvcounts[i] = ((M / mpi_world_size) * N);`
			`}`
			`Asendcounts[mpi_world_size - 1] = M*K - Adispls[mpi_world_size-1];`
			`Crecvcounts[mpi_world_size - 1] = M*N - Cdispls[mpi_world_size-1];`

			`// Only root process do something`
			`CUDA_CALL(cudaGetDeviceCount(&num_devices));`

			`int num_global_devices = 0;`
			`MPI_Reduce(&num_devices, (void*)&num_global_devices, 1, MPI_INT,`
			`MPI_SUM, 0, MPI_COMM_WORLD);`

			`if (mpi_rank == 0) {`
			`printf("Using %d devices\n", num_devices);`
			`}`
			`MPI_Barrier(MPI_COMM_WORLD);`

			`for (int j = 0; j < mpi_world_size; ++j) {`
			`if (mpi_rank == j) {`
			`for (int i = 0; i < num_devices; i++) {`
			`cudaDeviceProp prop;`
			`CUDA_CALL(cudaGetDeviceProperties(&prop, i));`

			`// Try printing more detailed information here`
			`printf("[rank %d] GPU %d: %s\n", mpi_rank, i, prop.name);`
			`}`
			`}`
			`MPI_Barrier(MPI_COMM_WORLD);`
			`}`

			`if (num_devices <= 0) {`
			`printf("[rank %d] No CUDA device found. Aborting\n", mpi_rank);`
			`exit(1);`
			`}`

			`// Setup problem size for each GPU`
			`for (int i = 0; i < num_devices; i++) {`
			`Mbegin[i] = ((Asendcounts[mpi_rank] / K) / num_devices) * i;`
			`Mend[i] = ((Asendcounts[mpi_rank] / K) / num_devices) * (i + 1);`
			`}`
			`Mend[num_devices - 1] = (Asendcounts[mpi_rank] / K);`

			`// Allocate device memory for each GPU`
			`for (int i = 0; i < num_devices; i++) {`
			`CUDA_CALL(cudaSetDevice(i));`
			`CUDA_CALL(cudaMalloc(&a_d[i], (Mend[i] - Mbegin[i]) * K * sizeof(float)));`
			`CUDA_CALL(cudaMalloc(&b_d[i], K * N * sizeof(float)));`
			`CUDA_CALL(cudaMalloc(&c_d[i], (Mend[i] - Mbegin[i]) * N * sizeof(float)));`

			`CUDA_CALL(cudaStreamCreate(&streams[i]));`
			`}`
			`}`

			`void matmul_finalize() {`
			`for (int i = 0; i < num_devices; i++) {`
			`CUDA_CALL(cudaFree(a_d[i]));`
			`CUDA_CALL(cudaFree(b_d[i]));`
			`CUDA_CALL(cudaFree(c_d[i]));`

			`CUDA_CALL(cudaStreamDestroy(streams[i]));`
			`}`
			`}`