Jinpyo Kim 2023-08-15 17:23:08 +00:00
commit fe46ababde
18 changed files with 1122 additions and 23 deletions

View File

@ -0,0 +1,143 @@
#include <cstdio>
#include <nccl.h>
#include <sys/time.h>
#define CHECK_CUDA(call) \
do { \
cudaError_t status_ = call; \
if (status_ != cudaSuccess) { \
fprintf(stderr, "CUDA error (%s:%d): %s:%s\n", __FILE__, __LINE__, \
cudaGetErrorName(status_), cudaGetErrorString(status_)); \
exit(EXIT_FAILURE); \
} \
} while (0)
#define CHECK_NCCL(call) \
do { \
ncclResult_t status_ = call; \
if (status_ != ncclSuccess && status_ != ncclInProgress) { \
fprintf(stderr, "NCCL error (%s:%d): %s\n", __FILE__, __LINE__, \
ncclGetErrorString(status_)); \
exit(EXIT_FAILURE); \
} \
} while (0)
const int NUM_GPU = 4;
const int NITER = 3;
const size_t nbytes = 256 * 1024 * 1024; // 256MiB
int* sendbuf[NUM_GPU];
int* recvbuf[NUM_GPU];
cudaStream_t streams[NUM_GPU];
double SyncAllGPUsAndGetTime() {
for (int i = 0; i < NUM_GPU; ++i) {
CHECK_CUDA(cudaSetDevice(i));
CHECK_CUDA(cudaDeviceSynchronize());
}
struct timeval tv;
gettimeofday(&tv, 0);
return tv.tv_sec + tv.tv_usec * 1e-6;
}
__global__ void FillBuffer(int* buf, size_t nbytes, size_t offset) {
size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
size_t stride = blockDim.x * gridDim.x;
for (size_t i = idx; i < nbytes / sizeof(int); i += stride) {
buf[i] = i * sizeof(int) + offset;
}
}
void InitBuffers() {
for (int i = 0; i < NUM_GPU; ++i) {
CHECK_CUDA(cudaSetDevice(i));
FillBuffer<<<1, 1024>>>(sendbuf[i], nbytes, i * nbytes);
CHECK_CUDA(cudaMemset(recvbuf[i], 0, nbytes * NUM_GPU));
}
}
void CheckBuffers() {
int* buf = (int*)malloc(nbytes * NUM_GPU);
for (int i = 0; i < NUM_GPU; ++i) {
CHECK_CUDA(cudaSetDevice(i));
CHECK_CUDA(cudaMemcpy(buf, recvbuf[i], nbytes * NUM_GPU, cudaMemcpyDeviceToHost));
for (size_t j = 0; j < nbytes * NUM_GPU / sizeof(int); ++j) {
if (buf[j] != j * sizeof(int)) {
printf("Incorrect! buf[%zu] should be %zu, but %d found\n", j, j * sizeof(int), buf[j]);
goto end;
}
}
}
printf("Correct!\n");
end: free(buf);
}
void AllGatherWithNCCL() {
printf("[AllGather with NCCL]\n");
ncclComm_t comms[NUM_GPU];
int devlist[NUM_GPU];
for (int i = 0; i < NUM_GPU; ++i) {
devlist[i] = i;
}
CHECK_NCCL(ncclCommInitAll(comms, NUM_GPU, devlist));
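// ncclCommInitAll builds one communicator per listed device inside this single
// process; a multi-process setup would use ncclCommInitRank instead.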
for (int iter = 0; iter < NITER; ++iter) {
double st = SyncAllGPUsAndGetTime();
CHECK_NCCL(ncclGroupStart()); // group semantics: required when one thread issues collectives on multiple communicators
for (int i = 0; i < NUM_GPU; ++i) {
CHECK_NCCL(ncclAllGather(sendbuf[i], recvbuf[i], nbytes / sizeof(int), ncclInt, comms[i], streams[i]));
}
CHECK_NCCL(ncclGroupEnd());
double et = SyncAllGPUsAndGetTime();
double gbps = nbytes * (NUM_GPU - 1) / (et - st) / 1e9;
printf("[Iter %d] %f sec (Effective bandwidth %f GB/s)\n", iter, et - st, gbps);
}
for (int i = 0; i < NUM_GPU; ++i) {
CHECK_NCCL(ncclCommDestroy(comms[i]));
}
}
void AllGatherWithMemcpy() {
printf("[AllGather with Memcpy]\n");
int canAccessPeer;
for (int i = 0; i < NUM_GPU; ++i) {
for (int j = 0; j < NUM_GPU; ++j) {
CHECK_CUDA(cudaDeviceCanAccessPeer(&canAccessPeer, i, j));
if (canAccessPeer == 1) {
CHECK_CUDA(cudaSetDevice(i));
CHECK_CUDA(cudaDeviceEnablePeerAccess(j, 0));
}
}
}
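// With peer access enabled, cudaMemcpyAsync with cudaMemcpyDefault moves data
// directly between GPUs (P2P over NVLink/PCIe); without it, CUDA stages the
// copy through host memory, which is much slower.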
for (int iter = 0; iter < NITER; ++iter) {
double st = SyncAllGPUsAndGetTime();
for (int i = 0; i < NUM_GPU; ++i) {
for (int j = 0; j < NUM_GPU; ++j) {
CHECK_CUDA(cudaMemcpyAsync(recvbuf[j] + i * (nbytes / sizeof(int)), sendbuf[i], nbytes, cudaMemcpyDefault, streams[i]));
}
}
double et = SyncAllGPUsAndGetTime();
double gbps = nbytes * (NUM_GPU - 1) / (et - st) / 1e9;
printf("[Iter %d] %f sec (Effective bandwidth %f GB/s)\n", iter, et - st, gbps);
}
}
int main(int argc, char **argv) {
for (int i = 0; i < NUM_GPU; ++i) {
CHECK_CUDA(cudaSetDevice(i));
CHECK_CUDA(cudaMalloc(&sendbuf[i], nbytes));
CHECK_CUDA(cudaMalloc(&recvbuf[i], nbytes * NUM_GPU));
CHECK_CUDA(cudaStreamCreate(&streams[i]));
}
InitBuffers();
AllGatherWithNCCL();
CheckBuffers();
InitBuffers();
AllGatherWithMemcpy();
CheckBuffers();
return 0;
}

View File

@ -0,0 +1,114 @@
#include <cstdio>
#include <mma.h>
#include "matmul.h"
using namespace nvcuda;
#define CHECK_CUDA(call) \
do { \
cudaError_t status_ = call; \
if (status_ != cudaSuccess) { \
fprintf(stderr, "CUDA error (%s:%d): %s\n", __FILE__, __LINE__, \
cudaGetErrorString(status_)); \
exit(EXIT_FAILURE); \
} \
} while (0)
#define BLOCK_SIZE 32
#define WMMA_M 16
#define WMMA_N 16
#define WMMA_K 16
#define WARP_SIZE 32
#define NUM_WARP ((WMMA_M * WMMA_N) / (WARP_SIZE))
#define C_LAYOUT wmma::mem_row_major
static __global__ void matmul_kernel(half *A, half *B, float *C, int M, int N,
int K) {
int gj = blockIdx.x;
int gi = blockIdx.y;
if (gi * BLOCK_SIZE >= M || gj * BLOCK_SIZE >= N) return; // boundary check
int lj = threadIdx.x;
int li = threadIdx.y;
int warpId = li;
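// blockDim is (32, 4): threadIdx.x spans one full warp, so threadIdx.y selects
// one of 4 warps. Each warp computes one 16x16 WMMA tile of the 32x32 output
// block (a 2x2 grid of tiles; see aRow/bCol below). NUM_WARP (= 8) doubles as
// the number of rows each thread copies into the shared tiles (BLOCK_SIZE / blockDim.y).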
__shared__ half Alocal[BLOCK_SIZE * BLOCK_SIZE];
__shared__ half Blocal[BLOCK_SIZE * BLOCK_SIZE];
// Declare the fragments
wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_N, WMMA_K, half, wmma::row_major> a_frag;
wmma::fragment<wmma::matrix_b, WMMA_M, WMMA_N, WMMA_K, half, wmma::row_major> b_frag;
wmma::fragment<wmma::accumulator, WMMA_M, WMMA_N, WMMA_K, float> c_frag;
wmma::fill_fragment(c_frag, 0.0f);
int A_row_index = (gi * BLOCK_SIZE + li);
int B_col_index = (gj * BLOCK_SIZE + lj);
for (int bk = 0; bk < K; bk += BLOCK_SIZE) {
for(int offset = 0 ; offset < NUM_WARP ; ++offset){
int A_col_index = bk + lj;
Alocal[(li + offset * blockDim.y) * BLOCK_SIZE + lj] =
((A_row_index + offset * blockDim.y) < M && A_col_index < K)
? A[(A_row_index + offset * blockDim.y) * K + A_col_index]
: (half)(0.0);
int B_row_index = bk + li + (offset * blockDim.y);
Blocal[(li + offset * blockDim.y) * BLOCK_SIZE + lj] =
(B_row_index < K && B_col_index < N)
? B[B_row_index * N + B_col_index]
: (half)(0.0);
}
__syncthreads();
for (int i = 0; i < BLOCK_SIZE; i += WMMA_K) {
int aCol = i;
int aRow = (warpId / 2) * WMMA_M;
int bCol = (warpId % 2) * WMMA_N;
int bRow = i;
wmma::load_matrix_sync(a_frag, Alocal + aCol + aRow * BLOCK_SIZE, BLOCK_SIZE);
wmma::load_matrix_sync(b_frag, Blocal + bCol + bRow * BLOCK_SIZE, BLOCK_SIZE);
wmma::mma_sync(c_frag, a_frag, b_frag, c_frag);
}
__syncthreads();
}
int cRow = (warpId / 2) * WMMA_M + blockIdx.y * blockDim.y * NUM_WARP;
int cCol = (warpId % 2) * WMMA_N + blockIdx.x * blockDim.x;
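// Only full 16x16 tiles are stored; edge tiles are skipped, so this kernel
// assumes M and N are multiples of WMMA_M and WMMA_N.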
if(cRow + WMMA_M <= M && cCol + WMMA_N <= N){
wmma::store_matrix_sync(C + cCol + cRow * N, c_frag, N, C_LAYOUT);
}
}
static half *A_gpu, *B_gpu;
static float *C_gpu;
void matmul(half *_A, half *_B, float *_C, int M, int N, int K) {
CHECK_CUDA(
cudaMemcpy(A_gpu, _A, M * K * sizeof(half), cudaMemcpyHostToDevice));
CHECK_CUDA(
cudaMemcpy(B_gpu, _B, K * N * sizeof(half), cudaMemcpyHostToDevice));
dim3 blockDim(BLOCK_SIZE, 4);
dim3 gridDim((N + BLOCK_SIZE - 1) / BLOCK_SIZE,
(M + BLOCK_SIZE - 1) / BLOCK_SIZE);
matmul_kernel<<<gridDim, blockDim>>>(A_gpu, B_gpu, C_gpu, M, N, K);
CHECK_CUDA(cudaGetLastError());
CHECK_CUDA(
cudaMemcpy(_C, C_gpu, M * N * sizeof(float), cudaMemcpyDeviceToHost));
}
void matmul_init(int M, int N, int K) {
CHECK_CUDA(cudaMalloc(&A_gpu, M * K * sizeof(half)));
CHECK_CUDA(cudaMalloc(&B_gpu, K * N * sizeof(half)));
CHECK_CUDA(cudaMalloc(&C_gpu, M * N * sizeof(float)));
}
void matmul_cleanup(half *_A, half *_B, float *_C, int M, int N, int K) {
CHECK_CUDA(cudaFree(A_gpu));
CHECK_CUDA(cudaFree(B_gpu));
CHECK_CUDA(cudaFree(C_gpu));
}

View File

@ -0,0 +1,215 @@
#include <cstdlib>
#include <cstdio>
#include <mma.h>
#include "convolution.cuh"
using namespace nvcuda;
#define CHECK_CUDA(call) \
do { \
cudaError_t status_ = call; \
if (status_ != cudaSuccess) { \
fprintf(stderr, "CUDA error (%s:%d): %s:%s\n", __FILE__, __LINE__, \
cudaGetErrorName(status_), cudaGetErrorString(status_)); \
exit(EXIT_FAILURE); \
} \
} while (0)
#define MAX_BLOCK 1024
#define NUM_SM 80
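// NUM_SM assumes a V100-class GPU (80 SMs; the Makefile targets -arch=sm_70).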
#define WARP_SIZE 32
#define BLOCK_PER_WARP (MAX_BLOCK / WARP_SIZE)
#define BLOCK_SIZE 32
#define WMMA_M 16
#define WMMA_N 16
#define WMMA_K 16
#define NUM_WARP ((WMMA_M * WMMA_N) / (WARP_SIZE))
#define C_LAYOUT wmma::mem_row_major
// Device(GPU) pointers
static half *I_gpu, *F_gpu, *O_gpu, *BUF1_gpu, *BUF2_gpu;
__global__ void im2col_kernel(half *_I, half *workspace, int N, int C, int H, int W,
int R, int S, int pad_h, int pad_w, int stride_h,
int stride_w, int dilation_h, int dilation_w, int OH, int OW){
int tidx = threadIdx.x + blockIdx.x * blockDim.x;
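// im2col layout: one row per (c, r, s) filter coordinate and one column per
// (n, oh, ow) output position, i.e. a [C*R*S x N*OH*OW] matrix, so the
// convolution becomes the GEMM F[K x C*R*S] * workspace[C*R*S x N*OH*OW].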
for(int crs = 0 ; crs < C * R * S ; ++crs){
int row_offset = crs * N * OH * OW;
const int c = crs / (R * S);
const int r = (crs / S) % R;
const int s = crs % S;
for(int nhw = tidx ; nhw < N * OH * OW ; nhw += MAX_BLOCK){
const int n = nhw / (OH * OW);
const int oh = (nhw / OW) % OH;
const int ow = nhw % OW;
const int h = oh * stride_h - pad_h + r * dilation_h;
const int w = ow * stride_w - pad_w + s * dilation_w;
// Padded positions must be written as zero: workspace comes from cudaMalloc
// and is not pre-zeroed, so skipping them would leave garbage in the matrix.
workspace[row_offset + nhw] =
(h < 0 || h >= H || w < 0 || w >= W)
? (half)(0.0)
: _I[n * C * H * W + c * H * W + h * W + w];
}
}
}
static __global__ void matmul_kernel(half *A, half *B, half *C, int M, int N,
int K) {
int gj = blockIdx.x;
int gi = blockIdx.y;
if (gi * BLOCK_SIZE >= M || gj * BLOCK_SIZE >= N) return; // boundary check
int lj = threadIdx.x;
int li = threadIdx.y;
int warpId = li;
__shared__ half Alocal[BLOCK_SIZE * BLOCK_SIZE];
__shared__ half Blocal[BLOCK_SIZE * BLOCK_SIZE];
// Declare the fragments
wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_N, WMMA_K, half, wmma::row_major> a_frag;
wmma::fragment<wmma::matrix_b, WMMA_M, WMMA_N, WMMA_K, half, wmma::row_major> b_frag;
wmma::fragment<wmma::accumulator, WMMA_M, WMMA_N, WMMA_K, half> c_frag;
wmma::fill_fragment(c_frag, 0.0f);
int A_row_index = (gi * BLOCK_SIZE + li);
int B_col_index = (gj * BLOCK_SIZE + lj);
for (int bk = 0; bk < K; bk += BLOCK_SIZE) {
for(int offset = 0 ; offset < NUM_WARP ; ++offset){
int A_col_index = bk + lj;
Alocal[(li + offset * blockDim.y) * BLOCK_SIZE + lj] =
((A_row_index + offset * blockDim.y) < M && A_col_index < K)
? A[(A_row_index + offset * blockDim.y) * K + A_col_index]
: (half)(0.0);
int B_row_index = bk + li + (offset * blockDim.y);
Blocal[(li + offset * blockDim.y) * BLOCK_SIZE + lj] =
(B_row_index < K && B_col_index < N)
? B[B_row_index * N + B_col_index]
: (half)(0.0);
}
__syncthreads();
for (int i = 0; i < BLOCK_SIZE; i += WMMA_K) {
int aCol = i;
int aRow = (warpId / 2) * WMMA_M;
int bCol = (warpId % 2) * WMMA_N;
int bRow = i;
wmma::load_matrix_sync(a_frag, Alocal + aCol + aRow * BLOCK_SIZE, BLOCK_SIZE);
wmma::load_matrix_sync(b_frag, Blocal + bCol + bRow * BLOCK_SIZE, BLOCK_SIZE);
wmma::mma_sync(c_frag, a_frag, b_frag, c_frag);
}
__syncthreads();
}
int cRow = (warpId / 2) * WMMA_M + blockIdx.y * blockDim.y * NUM_WARP;
int cCol = (warpId % 2) * WMMA_N + blockIdx.x * blockDim.x;
if(cRow + WMMA_M <= M && cCol + WMMA_N <= N){
wmma::store_matrix_sync(C + cCol + cRow * N, c_frag, N, C_LAYOUT);
}
}
__global__ void reshape_kernel(half *_src, half *_dst, int N, int K, int OH, int OW){
int bidx = blockIdx.x;
int widx = threadIdx.x / WARP_SIZE;
int lidx = threadIdx.x % WARP_SIZE;
for(int k = widx ; k < K ; k += BLOCK_PER_WARP){
for(int on = bidx ; on < N ; on += NUM_SM){
for(int hw = lidx ; hw < OH * OW ; hw += WARP_SIZE){
_dst[on * K * OH * OW + k * OH * OW + hw] =
_src[k * N * OH * OW + on * OH * OW + hw];
}
}
}
}
void convolution_im2col(half *_I, half *_F, half *_O, half *_BUF1,
half *_BUF2, int N, int C, int H, int W,
int K, int R, int S, int pad_h, int pad_w,
int stride_h, int stride_w, int dilation_h,
int dilation_w) {
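// Standard output size of a padded, strided, dilated convolution:
// OH = (H + 2*pad_h - dilation_h*(R-1) - 1) / stride_h + 1, and likewise OW.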
const int OH = 1 + (H + 2 * pad_h - (((R - 1) * dilation_h) + 1)) / stride_h;
const int OW = 1 + (W + 2 * pad_w - (((S - 1) * dilation_w) + 1)) / stride_w;
CHECK_CUDA(
cudaMemcpy(I_gpu, _I, sizeof(half) * N * C * H * W, cudaMemcpyHostToDevice));
CHECK_CUDA(
cudaMemcpy(F_gpu, _F, sizeof(half) * K * C * R * S, cudaMemcpyHostToDevice));
dim3 griddim_im2col(NUM_SM);
dim3 blockdim_im2col(MAX_BLOCK);
im2col_kernel<<<griddim_im2col, blockdim_im2col>>>(I_gpu, BUF1_gpu, N, C, H, W, R, S,
pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w, OH, OW);
CHECK_CUDA(cudaGetLastError());
dim3 griddim_matmul(((N * OH * OW) + BLOCK_SIZE - 1) / BLOCK_SIZE,
(K + BLOCK_SIZE - 1) / BLOCK_SIZE);
dim3 blockDim_matmul(BLOCK_SIZE, 4);
matmul_kernel<<<griddim_matmul, blockDim_matmul>>>(F_gpu, BUF1_gpu, BUF2_gpu, K, N * OH * OW, C * R * S);
CHECK_CUDA(cudaGetLastError());
dim3 griddim_reshape(NUM_SM);
dim3 blockDim_reshape(MAX_BLOCK);
reshape_kernel<<<griddim_reshape, blockDim_reshape>>>(BUF2_gpu, O_gpu, N, K, OH, OW);
CHECK_CUDA(cudaGetLastError());
CHECK_CUDA(
cudaMemcpy(_O, O_gpu, sizeof(half) * N * K * OH * OW, cudaMemcpyDeviceToHost));
// DO NOT REMOVE; NEEDED FOR TIME MEASURE
CHECK_CUDA(cudaDeviceSynchronize());
}
void convolution(half *_I, half *_F, half *_O, half *_BUF1, half *_BUF2,
int N, int C, int H, int W, int K, int R, int S, int pad_h,
int pad_w, int stride_h, int stride_w, int dilation_h,
int dilation_w) {
// Dispatch to the im2col-based GPU implementation above.
convolution_im2col(_I, _F, _O, _BUF1, _BUF2, N, C, H, W, K, R, S,
pad_h, pad_w, stride_h, stride_w, dilation_h,
dilation_w);
}
void convolution_initialize(int N, int C, int H, int W, int K, int R, int S,
int pad_h, int pad_w, int stride_h, int stride_w,
int dilation_h, int dilation_w) {
const int OH = 1 + (H + 2 * pad_h - (((R - 1) * dilation_h) + 1)) / stride_h;
const int OW = 1 + (W + 2 * pad_w - (((S - 1) * dilation_w) + 1)) / stride_w;
size_t alloc = sizeof(half) * N * C * H * W + sizeof(half) * K * C * R * S + sizeof(half) * N * K * OH * OW +
sizeof(half) * C * R * S * N * OH * OW + sizeof(half) * N * K * OH * OW;
printf("Alloc Memory : %lf\n",(double)alloc/1e9);
CHECK_CUDA(cudaMalloc((void **) &I_gpu, sizeof(half) * N * C * H * W));
CHECK_CUDA(cudaMalloc((void **) &F_gpu, sizeof(half) * K * C * R * S));
CHECK_CUDA(cudaMalloc((void **) &O_gpu, sizeof(half) * N * K * OH * OW));
CHECK_CUDA(cudaMalloc((void **) &BUF1_gpu, sizeof(half) * C * R * S * N * OH * OW));
CHECK_CUDA(cudaMalloc((void **) &BUF2_gpu, sizeof(half) * N * K * OH * OW));
}
void convolution_cleanup(half *_I, half *_F, half *_O, int N, int C, int H,
int W, int K, int R, int S, int pad_h, int pad_w,
int stride_h, int stride_w, int dilation_h,
int dilation_w) {
CHECK_CUDA(cudaFree(I_gpu));
CHECK_CUDA(cudaFree(F_gpu));
CHECK_CUDA(cudaFree(O_gpu));
CHECK_CUDA(cudaFree(BUF1_gpu));
CHECK_CUDA(cudaFree(BUF2_gpu));
}

View File

@ -0,0 +1,176 @@
#include <cstdlib>
#include <cstdio>
#include "convolution.cuh"
#define CHECK_CUDA(call) \
do { \
cudaError_t status_ = call; \
if (status_ != cudaSuccess) { \
fprintf(stderr, "CUDA error (%s:%d): %s:%s\n", __FILE__, __LINE__, \
cudaGetErrorName(status_), cudaGetErrorString(status_)); \
exit(EXIT_FAILURE); \
} \
} while (0)
#define MAX_BLOCK 1024
#define NUM_SM 80
#define NUM_WARP 32
#define BLOCK_PER_WARP (MAX_BLOCK / NUM_WARP)
#define BLOCK_SIZE 32
// Device(GPU) pointers
static float *I_gpu, *F_gpu, *O_gpu, *BUF1_gpu, *BUF2_gpu;
__global__ void im2col_kernel(float *_I, float *workspace, int N, int C, int H, int W,
int R, int S, int pad_h, int pad_w, int stride_h,
int stride_w, int dilation_h, int dilation_w, int OH, int OW){
int tidx = threadIdx.x + blockIdx.x * blockDim.x;
for(int crs = 0 ; crs < C * R * S ; ++crs){
int row_offset = crs * N * OH * OW;
const int c = crs / (R * S);
const int r = (crs / S) % R;
const int s = crs % S;
for(int nhw = tidx ; nhw < N * OH * OW ; nhw += MAX_BLOCK){
const int n = nhw / (OH * OW);
const int oh = (nhw / OW) % OH;
const int ow = nhw % OW;
const int h = oh * stride_h - pad_h + r * dilation_h;
const int w = ow * stride_w - pad_w + s * dilation_w;
// Padded positions must be written as zero: workspace comes from cudaMalloc
// and is not pre-zeroed, so skipping them would leave garbage in the matrix.
workspace[row_offset + nhw] =
(h < 0 || h >= H || w < 0 || w >= W)
? 0.f
: _I[n * C * H * W + c * H * W + h * W + w];
}
}
}
static __global__ void matmul_kernel(float *A, float *B, float *C, int M, int N,
int K) {
int j = blockIdx.x * blockDim.x + threadIdx.x;
int i = blockIdx.y * blockDim.y + threadIdx.y;
int gj = blockIdx.x;
int gi = blockIdx.y;
if (gi * BLOCK_SIZE >= M || gj * BLOCK_SIZE >= N) return; // boundary check
int lj = threadIdx.x;
int li = threadIdx.y;
__shared__ float Alocal[BLOCK_SIZE][BLOCK_SIZE];
__shared__ float Blocal[BLOCK_SIZE][BLOCK_SIZE];
float c = 0.f;
int A_row_index = (gi * BLOCK_SIZE + li);
int B_col_index = (gj * BLOCK_SIZE + lj);
for (int bk = 0; bk < K; bk += BLOCK_SIZE) {
int A_col_index = bk + lj;
Alocal[li][lj] = (A_row_index < M && A_col_index < K)
? A[A_row_index * K + A_col_index]
: 0.f;
int B_row_index = bk + li;
Blocal[li][lj] = (B_row_index < K && B_col_index < N)
? B[B_row_index * N + B_col_index]
: 0.f;
__syncthreads();
for (int lk = 0; lk < BLOCK_SIZE; ++lk) {
c += Alocal[li][lk] * Blocal[lk][lj];
}
__syncthreads();
}
if (i < M && j < N) C[i * N + j] = c;
}
__global__ void reshape_kernel(float *_src, float *_dst, int N, int K, int OH, int OW){
int bidx = blockIdx.x;
int widx = threadIdx.x / NUM_WARP;
int lidx = threadIdx.x % NUM_WARP;
for(int k = widx ; k < K ; k += BLOCK_PER_WARP){
for(int on = bidx ; on < N ; on += NUM_SM){
for(int hw = lidx ; hw < OH * OW ; hw += NUM_WARP){
_dst[on * K * OH * OW + k * OH * OW + hw] =
_src[k * N * OH * OW + on * OH * OW + hw];
}
}
}
}
void convolution_im2col(float *_I, float *_F, float *_O, float *_BUF1,
float *_BUF2, int N, int C, int H, int W,
int K, int R, int S, int pad_h, int pad_w,
int stride_h, int stride_w, int dilation_h,
int dilation_w) {
const int OH = 1 + (H + 2 * pad_h - (((R - 1) * dilation_h) + 1)) / stride_h;
const int OW = 1 + (W + 2 * pad_w - (((S - 1) * dilation_w) + 1)) / stride_w;
CHECK_CUDA(
cudaMemcpy(I_gpu, _I, sizeof(float) * N * C * H * W, cudaMemcpyHostToDevice));
CHECK_CUDA(
cudaMemcpy(F_gpu, _F, sizeof(float) * K * C * R * S, cudaMemcpyHostToDevice));
dim3 griddim_im2col(NUM_SM);
dim3 blockdim_im2col(MAX_BLOCK);
im2col_kernel<<<griddim_im2col, blockdim_im2col>>>(I_gpu, BUF1_gpu, N, C, H, W, R, S,
pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w, OH, OW);
CHECK_CUDA(cudaGetLastError());
dim3 griddim_matmul(((N * OH * OW)+ BLOCK_SIZE - 1) / BLOCK_SIZE,
(K + BLOCK_SIZE - 1) / BLOCK_SIZE);
dim3 blockDim_matmul(BLOCK_SIZE, BLOCK_SIZE);
matmul_kernel<<<griddim_matmul, blockDim_matmul>>>(F_gpu, BUF1_gpu, BUF2_gpu, K, N * OH * OW, C * R * S);
CHECK_CUDA(cudaGetLastError());
dim3 griddim_reshape(NUM_SM);
dim3 blockDim_reshape(MAX_BLOCK);
reshape_kernel<<<griddim_reshape, blockDim_reshape>>>(BUF2_gpu, O_gpu, N, K, OH, OW);
CHECK_CUDA(cudaGetLastError());
CHECK_CUDA(
cudaMemcpy(_O, O_gpu, sizeof(float) * N * K * OH * OW, cudaMemcpyDeviceToHost));
// DO NOT REMOVE; NEEDED FOR TIME MEASURE
CHECK_CUDA(cudaDeviceSynchronize());
}
void convolution(float *_I, float *_F, float *_O, float *_BUF1, float *_BUF2,
int N, int C, int H, int W, int K, int R, int S, int pad_h,
int pad_w, int stride_h, int stride_w, int dilation_h,
int dilation_w) {
// Dispatch to the im2col-based GPU implementation above.
convolution_im2col(_I, _F, _O, _BUF1, _BUF2, N, C, H, W, K, R, S,
pad_h, pad_w, stride_h, stride_w, dilation_h,
dilation_w);
}
void convolution_initialize(int N, int C, int H, int W, int K, int R, int S,
int pad_h, int pad_w, int stride_h, int stride_w,
int dilation_h, int dilation_w) {
const int OH = 1 + (H + 2 * pad_h - (((R - 1) * dilation_h) + 1)) / stride_h;
const int OW = 1 + (W + 2 * pad_w - (((S - 1) * dilation_w) + 1)) / stride_w;
size_t alloc = sizeof(float) * N * C * H * W + sizeof(float) * K * C * R * S + sizeof(float) * N * K * OH * OW +
sizeof(float) * C * R * S * N * OH * OW + sizeof(float) * N * K * OH * OW;
printf("Alloc Memory : %lf\n",(double)alloc/1e9);
CHECK_CUDA(cudaMalloc((void **) &I_gpu, sizeof(float) * N * C * H * W));
CHECK_CUDA(cudaMalloc((void **) &F_gpu, sizeof(float) * K * C * R * S));
CHECK_CUDA(cudaMalloc((void **) &O_gpu, sizeof(float) * N * K * OH * OW));
CHECK_CUDA(cudaMalloc((void **) &BUF1_gpu, sizeof(float) * C * R * S * N * OH * OW));
CHECK_CUDA(cudaMalloc((void **) &BUF2_gpu, sizeof(float) * N * K * OH * OW));
}
void convolution_cleanup(float *_I, float *_F, float *_O, int N, int C, int H,
int W, int K, int R, int S, int pad_h, int pad_w,
int stride_h, int stride_w, int dilation_h,
int dilation_w) {
CHECK_CUDA(cudaFree(I_gpu));
CHECK_CUDA(cudaFree(F_gpu));
CHECK_CUDA(cudaFree(O_gpu));
CHECK_CUDA(cudaFree(BUF1_gpu));
CHECK_CUDA(cudaFree(BUF2_gpu));
}

View File

@ -2,7 +2,7 @@ TARGET=main
OBJECTS=main.o util.o convolution.o
CPPFLAGS=-std=c++14 -O3 -Wall -march=native -mavx2 -mfma -fopenmp -mno-avx512f -I/usr/local/cuda/include
CUDA_CFLAGS:=$(foreach option, $(CPPFLAGS),-Xcompiler=$(option))
CUDA_CFLAGS:=$(foreach option, $(CPPFLAGS),-Xcompiler=$(option) -arch=sm_70)
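# -arch=sm_70 targets Volta, the first architecture whose Tensor Cores are exposed through the WMMA API used by the kernels here.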
LDFLAGS=-L/usr/local/cuda/lib64
LDLIBS=-lstdc++ -lcudart -lm -lcudnn

View File

@ -15,7 +15,6 @@
void naive_cpu_im2col(half *_I, half *workspace, int N, int C, int H, int W,
int R, int S, int pad_h, int pad_w, int stride_h,
int stride_w, int dilation_h, int dilation_w) {
half *I = _I;
// Naive CPU im2col
const int ON = N;
@ -36,7 +35,7 @@ void naive_cpu_im2col(half *_I, half *workspace, int N, int C, int H, int W,
workspace[((c * R * S) + (r * S) + s) * (ON * OH * OW) +
(on * OH * OW + oh * OW + ow)] =
I[n * C * H * W + c * H * W + h * W + w];
_I[n * C * H * W + c * H * W + h * W + w];
}
}
}
@ -46,10 +45,11 @@ void naive_cpu_im2col(half *_I, half *workspace, int N, int C, int H, int W,
}
void naive_cpu_matmul_TC(half *_A, half *_B, half *_C, int M, int N, int K) {
for (int i = 0; i < M; i++) {
for (int k = 0; k < K; k++) {
for (int j = 0; j < N; j++) {
_C[i * N + j] += _A[i * K + k] * _B[k * N + j];
_C[i * N + j] = (half)((float)(_C[i * N + j]) + (float)_A[i * K + k] * (float)_B[k * N + j]);
}
}
}

View File

@ -1,9 +1,6 @@
#pragma once
#include "half.hpp"
using half_float::half;
using namespace half_float::literal;
#include <cuda_fp16.h>
void convolution(half *_I, half *_F, half *_O, half *_BUF1, half *_BUF2,
int N, int C, int H, int W, int K, int R, int S, int pad_h,

View File

@ -5,12 +5,9 @@
#include <string.h>
#include "convolution.cuh"
#include "half.hpp"
#include <cuda_fp16.h>
#include "util.h"
using half_float::half;
using namespace half_float::literal;
static bool print = false;
static bool validation = false;
static int N = 1;

View File

@ -22,7 +22,7 @@ half *alloc_tensor(int N, int C, int H, int W) {
void rand_tensor(half *m, int N, int C, int H, int W) {
int L = N * C * H * W;
for (int j = 0; j < L; j++) { m[j] = (half) rand() / RAND_MAX - 0.5; }
for (int j = 0; j < L; j++) { m[j] = (half) ((float)rand() / RAND_MAX - 0.5); }
}
void zero_tensor(half *m, int N, int C, int H, int W) {
@ -60,7 +60,7 @@ void check_convolution(half *I, half *F, half *O, int N, int C, int H, int W,
for (int oc = 0; oc < OC; ++oc) {
for (int oh = 0; oh < OH; ++oh) {
for (int ow = 0; ow < OW; ++ow) {
half sum = 0.0_h;
half sum = (half)0.0;
for (int c = 0; c < C; ++c) {
for (int r = 0; r < R; ++r) {
for (int s = 0; s < S; ++s) {
@ -69,8 +69,8 @@ void check_convolution(half *I, half *F, half *O, int N, int C, int H, int W,
const int w = ow * stride_w - pad_w + s * dilation_w;
const int k = oc;
if (h < 0 || h >= H || w < 0 || w >= W) continue;
sum += I[((n * C + c) * H + h) * W + w] *
F[((k * C + c) * R + r) * S + s];
sum = sum + (half)(I[((n * C + c) * H + h) * W + w] *
F[((k * C + c) * R + r) * S + s]);
}
}
}
@ -82,7 +82,7 @@ void check_convolution(half *I, half *F, half *O, int N, int C, int H, int W,
bool is_valid = true;
int cnt = 0, thr = 10;
half eps = 1e-3_h;
half eps = (half)1e-3;
for (int on = 0; on < ON; ++on) {
for (int oc = 0; oc < OC; ++oc) {
for (int oh = 0; oh < OH; ++oh) {

View File

@ -1,9 +1,5 @@
#pragma once
#include "half.hpp"
using half_float::half;
using namespace half_float::literal;
#include <cuda_fp16.h>
double get_time();

View File

@ -0,0 +1,20 @@
TARGET=main
OBJECTS=util.o matmul.o main.o
CPPFLAGS=-std=c++11 -O3 -Wall -march=native -mavx2 -mno-avx512f -mfma -fopenmp
CPPFLAGS+= -I/usr/local/cuda/include/
LDFLAGS=-lm -lcudart -lcublas
LDFLAGS+=-L/usr/local/cuda/lib64
NVCC=/usr/local/cuda/bin/nvcc
all: $(TARGET)
$(TARGET): $(OBJECTS)
g++ $(CPPFLAGS) $^ -o $@ $(LDFLAGS)
matmul.o: matmul.cu
$(NVCC) -c -o $@ $^ -arch=sm_70
clean:
rm -rf $(TARGET) $(OBJECTS)

View File

@ -0,0 +1,103 @@
#include <stdio.h>
#include <getopt.h>
#include <stdbool.h>
#include <stdlib.h>
#include <cuda_fp16.h>
#include "util.h"
#include "matmul.h"
static void print_help(const char* prog_name) {
printf("Usage: %s [-pvh] [-n num_iterations] M N K\n", prog_name);
printf("Options:\n");
printf(" -p : print matrix data. (default: off)\n");
printf(" -v : validate matrix multiplication. (default: off)\n");
printf(" -h : print this page.\n");
printf(" -t : number of threads (default: 1)\n");
printf(" -n : number of iterations (default: 1)\n");
printf(" M : number of rows of matrix A and C. (default: 8)\n");
printf(" N : number of columns of matrix B and C. (default: 8)\n");
printf(" K : number of columns of matrix A and rows of B. (default: 8)\n");
}
static bool print_matrix = false;
static bool validation = false;
static int M = 8, N = 8, K = 8;
static int num_iterations = 1;
static void parse_opt(int argc, char **argv) {
int c;
while ((c = getopt(argc, argv, "pvhn:")) != -1) {
switch (c) {
case 'p':
print_matrix = true;
break;
case 'v':
validation = true;
break;
case 'n':
num_iterations = atoi(optarg);
break;
case 'h':
default:
print_help(argv[0]);
exit(0);
}
}
for (int i = optind, j = 0; i < argc; ++i, ++j) {
switch (j) {
case 0: M = atoi(argv[i]); break;
case 1: N = atoi(argv[i]); break;
case 2: K = atoi(argv[i]); break;
default: break;
}
}
printf("Options:\n");
printf(" Problem size: M = %d, N = %d, K = %d\n", M, N, K);
printf(" Number of iterations: %d\n", num_iterations);
printf(" Print matrix: %s\n", print_matrix ? "on" : "off");
printf(" Validation: %s\n", validation ? "on" : "off");
printf("\n");
}
int main(int argc, char **argv) {
parse_opt(argc, argv);
printf("Initializing... "); fflush(stdout);
half *A = alloc_mat(M, K);
half *B = alloc_mat(K, N);
float *C = alloc_mat_float(M, N);
rand_mat(A, M, K);
rand_mat(B, K, N);
matmul_init(M, N, K);
printf("done!\n"); fflush(stdout);
double elapsed_time_sum = 0;
for (int i = 0; i < num_iterations; ++i) {
printf("Calculating...(iter=%d) ", i); fflush(stdout);
zero_mat_float(C, M, N);
double start_time = get_time();
matmul(A, B, C, M, N, K);
double elapsed_time = get_time() - start_time;
printf("%f sec\n", elapsed_time);
elapsed_time_sum += elapsed_time;
}
if (print_matrix) {
printf("MATRIX A:\n"); print_mat(A, M, K);
printf("MATRIX B:\n"); print_mat(B, K, N);
printf("MATRIX C:\n"); print_mat_float(C, M, N);
}
matmul_cleanup(A, B, C, M, N, K);
if (validation) {
check_matmul(A, B, C, M, N, K);
}
double elapsed_time_avg = elapsed_time_sum / num_iterations;
printf("Avg. time: %f sec\n", elapsed_time_avg);
printf("Avg. throughput: %f GFLOPS\n", 2.0 * M * N * K / elapsed_time_avg / 1e9);
return 0;
}

View File

@ -0,0 +1,57 @@
#include <cstdio>
#include <mma.h>
#include "matmul.h"
using namespace nvcuda;
#define CHECK_CUDA(call) \
do { \
cudaError_t status_ = call; \
if (status_ != cudaSuccess) { \
fprintf(stderr, "CUDA error (%s:%d): %s:%s\n", __FILE__, __LINE__, \
cudaGetErrorName(status_), cudaGetErrorString(status_)); \
exit(EXIT_FAILURE); \
} \
} while (0)
// Device(GPU) pointers
static half *A_gpu, *B_gpu;
static float *C_gpu;
void naive_cpu_matmul(half *_A, half *_B, float *_C, int M, int N, int K) {
for (int i = 0; i < M; i++) {
for (int k = 0; k < K; k++) {
for (int j = 0; j < N; j++) {
_C[i * N + j] += (float)_A[i * K + k] * (float)_B[k * N + j];
}
}
}
}
void matmul(half *_A, half *_B, float *_C, int M, int N, int K) {
// Remove this line after you complete the matmul on GPU
naive_cpu_matmul(_A, _B, _C, M, N, K);
// (TODO) Upload A and B matrix to GPU
// (TODO) Launch kernel on a GPU
// (TODO) Download C matrix from GPU
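//
// A minimal sketch of those steps, mirroring the completed WMMA version
// earlier in this commit (kept as comments so the skeleton still builds;
// matmul_kernel is the GPU kernel you implement):
//
//   CHECK_CUDA(cudaMemcpy(A_gpu, _A, M * K * sizeof(half), cudaMemcpyHostToDevice));
//   CHECK_CUDA(cudaMemcpy(B_gpu, _B, K * N * sizeof(half), cudaMemcpyHostToDevice));
//   dim3 blockDim(32, 4);
//   dim3 gridDim((N + 31) / 32, (M + 31) / 32);
//   matmul_kernel<<<gridDim, blockDim>>>(A_gpu, B_gpu, C_gpu, M, N, K);
//   CHECK_CUDA(cudaGetLastError());
//   CHECK_CUDA(cudaMemcpy(_C, C_gpu, M * N * sizeof(float), cudaMemcpyDeviceToHost));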
// DO NOT REMOVE; NEEDED FOR TIME MEASURE
CHECK_CUDA(cudaDeviceSynchronize());
}
void matmul_init(int M, int N, int K) {
// (TODO) Allocate device memory
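// e.g., mirroring the completed version earlier in this commit:
//   CHECK_CUDA(cudaMalloc(&A_gpu, M * K * sizeof(half)));
//   CHECK_CUDA(cudaMalloc(&B_gpu, K * N * sizeof(half)));
//   CHECK_CUDA(cudaMalloc(&C_gpu, M * N * sizeof(float)));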
// DO NOT REMOVE; NEEDED FOR TIME MEASURE
CHECK_CUDA(cudaDeviceSynchronize());
}
void matmul_cleanup(half *_A, half *_B, float *_C, int M, int N, int K) {
// (TODO) Do any post-matmul cleanup work here.
// DO NOT REMOVE; NEEDED FOR TIME MEASURE
CHECK_CUDA(cudaDeviceSynchronize());
}

View File

@ -0,0 +1,7 @@
#pragma once
#include <cuda_fp16.h>
void matmul(half *_A, half *_B, float *_C, int M, int N, int K);
void matmul_init(int M, int N, int K);
void matmul_cleanup(half *_A, half *_B, float *_C, int M, int N, int K);

View File

@ -0,0 +1,96 @@
#include "util.h"
#include <omp.h>
#include <sys/time.h>
#include <cmath>
#include <cstdbool>
#include <cstdio>
#include <cstdlib>
#include <cstring>
double get_time() {
struct timeval tv;
gettimeofday(&tv, 0);
return tv.tv_sec + tv.tv_usec * 1e-6;
}
void check_matmul(half *A, half *B, float *C, int M, int N, int K) {
printf("Validating...\n");
float *C_ans = alloc_mat_float(M, N);
zero_mat_float(C_ans, M, N);
#pragma omp parallel for num_threads(20)
for (int i = 0; i < M; ++i) {
for (int k = 0; k < K; ++k) {
for (int j = 0; j < N; ++j) {
C_ans[i * N + j] += (float)A[i * K + k] * (float)B[k * N + j];
}
}
}
bool is_valid = true;
int cnt = 0, thr = 10;
float eps = 1e-3;
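// A value passes if either its absolute error or its relative error is within eps.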
for (int i = 0; i < M; ++i) {
for (int j = 0; j < N; ++j) {
float c = C[i * N + j];
float c_ans = C_ans[i * N + j];
if (fabsf(c - c_ans) > eps &&
(c_ans == 0 || fabsf((c - c_ans) / c_ans) > eps)) {
++cnt;
if (cnt <= thr)
printf("C[%d][%d] : correct_value = %f, your_value = %f\n", i, j,
(float)c_ans, (float)c);
if (cnt == thr + 1)
printf("Too many error, only first %d values are printed.\n", thr);
is_valid = false;
}
}
}
if (is_valid) {
printf("Result: VALID\n");
} else {
printf("Result: INVALID\n");
}
}
void print_mat(half *m, int R, int C) {
for (int i = 0; i < R; ++i) {
for (int j = 0; j < C; ++j) { printf("%+.3f ", (float)(m[i * C + j])); }
printf("\n");
}
}
void print_mat_float(float *m, int R, int C) {
for (int i = 0; i < R; ++i) {
for (int j = 0; j < C; ++j) { printf("%+.3f ", (float)(m[i * C + j])); }
printf("\n");
}
}
half *alloc_mat(int R, int C) {
// aligned_alloc requires the size to be a multiple of the alignment; round up.
size_t bytes = (sizeof(half) * R * C + 31) / 32 * 32;
half *m = (half *) aligned_alloc(32, bytes);
return m;
}
float *alloc_mat_float(int R, int C) {
size_t bytes = (sizeof(float) * R * C + 31) / 32 * 32;
float *m = (float *) aligned_alloc(32, bytes);
return m;
}
void rand_mat(half *m, int R, int C) {
for (int i = 0; i < R; i++) {
for (int j = 0; j < C; j++) {
m[i * C + j] = (half) ((float)rand() / RAND_MAX - 0.5);
}
}
}
void zero_mat(half *m, int R, int C) { memset(m, 0, sizeof(half) * R * C); }
void zero_mat_float(float *m, int R, int C) { memset(m, 0, sizeof(float) * R * C); }

View File

@ -0,0 +1,20 @@
#pragma once
#include <cuda_fp16.h>
double get_time();
void check_matmul(half *A, half *B, float *C, int M, int N, int K);
void print_mat(half *m, int R, int C);
void print_mat_float(float *m, int R, int C);
half* alloc_mat(int R, int C);
float* alloc_mat_float(int R, int C);
void rand_mat(half *m, int R, int C);
void zero_mat(half *m, int R, int C);
void zero_mat_float(float *m, int R, int C);

View File

@ -0,0 +1,11 @@
TARGET=main
NVCC=/usr/local/cuda/bin/nvcc
all: $(TARGET)
main: main.cu
$(NVCC) -o $@ $^ -lnccl
clean:
rm -rf $(TARGET)

View File

@ -0,0 +1,147 @@
#include <cstdio>
#include <nccl.h>
#include <sys/time.h>
#define CHECK_CUDA(call) \
do { \
cudaError_t status_ = call; \
if (status_ != cudaSuccess) { \
fprintf(stderr, "CUDA error (%s:%d): %s:%s\n", __FILE__, __LINE__, \
cudaGetErrorName(status_), cudaGetErrorString(status_)); \
exit(EXIT_FAILURE); \
} \
} while (0)
#define CHECK_NCCL(call) \
do { \
ncclResult_t status_ = call; \
if (status_ != ncclSuccess && status_ != ncclInProgress) { \
fprintf(stderr, "NCCL error (%s:%d): %s\n", __FILE__, __LINE__, \
ncclGetErrorString(status_)); \
exit(EXIT_FAILURE); \
} \
} while (0)
const int NUM_GPU = 4;
const int NITER = 3;
const size_t nbytes = 256 * 1024 * 1024; // 256MiB
int* sendbuf[NUM_GPU];
int* recvbuf[NUM_GPU];
cudaStream_t streams[NUM_GPU];
double SyncAllGPUsAndGetTime() {
for (int i = 0; i < NUM_GPU; ++i) {
CHECK_CUDA(cudaSetDevice(i));
CHECK_CUDA(cudaDeviceSynchronize());
}
struct timeval tv;
gettimeofday(&tv, 0);
return tv.tv_sec + tv.tv_usec * 1e-6;
}
__global__ void FillBuffer(int* buf, size_t nbytes, size_t offset) {
size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
size_t stride = blockDim.x * gridDim.x;
for (size_t i = idx; i < nbytes / sizeof(int); i += stride) {
buf[i] = i * sizeof(int) + offset;
}
}
void InitBuffers() {
for (int i = 0; i < NUM_GPU; ++i) {
CHECK_CUDA(cudaSetDevice(i));
FillBuffer<<<1, 1024>>>(sendbuf[i], nbytes, i * nbytes);
CHECK_CUDA(cudaMemset(recvbuf[i], 0, nbytes * NUM_GPU));
}
}
void CheckBuffers() {
int* buf = (int*)malloc(nbytes * NUM_GPU);
for (int i = 0; i < NUM_GPU; ++i) {
CHECK_CUDA(cudaSetDevice(i));
CHECK_CUDA(cudaMemcpy(buf, recvbuf[i], nbytes * NUM_GPU, cudaMemcpyDeviceToHost));
for (size_t j = 0; j < nbytes * NUM_GPU / sizeof(int); ++j) {
if (buf[j] != j * sizeof(int)) {
printf("Incorrect! buf[%zu] should be %zu, but %d found\n", j, j * sizeof(int), buf[j]);
goto end;
}
}
}
printf("Correct!\n");
end: free(buf);
}
void AllGatherWithNCCL() {
printf("[AllGather with NCCL]\n");
ncclComm_t comms[NUM_GPU];
int devlist[NUM_GPU];
for (int i = 0; i < NUM_GPU; ++i) {
devlist[i] = i;
}
CHECK_NCCL(ncclCommInitAll(comms, NUM_GPU, devlist));
for (int iter = 0; iter < NITER; ++iter) {
double st = SyncAllGPUsAndGetTime();
for (int i = 0; i < NUM_GPU; ++i) {
/*
* TODO
* Implement AllGather with NCCL here.
*/
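// A minimal sketch, mirroring the completed version earlier in this commit
// (each rank gathers every GPU's nbytes chunk into its recvbuf; wrap this
// loop in ncclGroupStart()/ncclGroupEnd() since one thread drives all
// communicators):
// CHECK_NCCL(ncclAllGather(sendbuf[i], recvbuf[i], nbytes / sizeof(int),
//                          ncclInt, comms[i], streams[i]));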
}
double et = SyncAllGPUsAndGetTime();
double gbps = nbytes * (NUM_GPU - 1) / (et - st) / 1e9;
printf("[Iter %d] %f sec (Effective bandwidth %f GB/s)\n", iter, et - st, gbps);
}
for (int i = 0; i < NUM_GPU; ++i) {
CHECK_NCCL(ncclCommDestroy(comms[i]));
}
}
void AllGatherWithMemcpy() {
printf("[AllGather with Memcpy]\n");
int canAccessPeer;
for (int i = 0; i < NUM_GPU; ++i) {
for (int j = 0; j < NUM_GPU; ++j) {
CHECK_CUDA(cudaDeviceCanAccessPeer(&canAccessPeer, i, j));
if (canAccessPeer == 1) {
CHECK_CUDA(cudaSetDevice(i));
CHECK_CUDA(cudaDeviceEnablePeerAccess(j, 0));
}
}
}
for (int iter = 0; iter < NITER; ++iter) {
double st = SyncAllGPUsAndGetTime();
for (int i = 0; i < NUM_GPU; ++i) {
/*
* TODO
* Implement AllGather using memcpy here.
*/
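// A minimal sketch, mirroring the completed version earlier in this commit:
// copy GPU i's send buffer into slot i of every GPU's receive buffer.
// for (int j = 0; j < NUM_GPU; ++j) {
//   CHECK_CUDA(cudaMemcpyAsync(recvbuf[j] + i * (nbytes / sizeof(int)),
//                              sendbuf[i], nbytes, cudaMemcpyDefault, streams[i]));
// }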
}
double et = SyncAllGPUsAndGetTime();
double gbps = nbytes * (NUM_GPU - 1) / (et - st) / 1e9;
printf("[Iter %d] %f sec (Effective bandwidth %f GB/s)\n", iter, et - st, gbps);
}
}
int main(int argc, char **argv) {
for (int i = 0; i < NUM_GPU; ++i) {
CHECK_CUDA(cudaSetDevice(i));
CHECK_CUDA(cudaMalloc(&sendbuf[i], nbytes));
CHECK_CUDA(cudaMalloc(&recvbuf[i], nbytes * NUM_GPU));
CHECK_CUDA(cudaStreamCreate(&streams[i]));
}
InitBuffers();
AllGatherWithNCCL();
CheckBuffers();
InitBuffers();
AllGatherWithMemcpy();
CheckBuffers();
return 0;
}