Jinpyo Kim 2023-08-15 17:23:08 +00:00
commit fe46ababde
18 changed files with 1122 additions and 23 deletions

View File

@ -0,0 +1,143 @@
#include <cstdio>
#include <nccl.h>
#include <sys/time.h>
#define CHECK_CUDA(call) \
do { \
cudaError_t status_ = call; \
if (status_ != cudaSuccess) { \
fprintf(stderr, "CUDA error (%s:%d): %s:%s\n", __FILE__, __LINE__, \
cudaGetErrorName(status_), cudaGetErrorString(status_)); \
exit(EXIT_FAILURE); \
} \
} while (0)
#define CHECK_NCCL(call) \
do { \
ncclResult_t status_ = call; \
if (status_ != ncclSuccess && status_ != ncclInProgress) { \
fprintf(stderr, "NCCL error (%s:%d): %s\n", __FILE__, __LINE__, \
ncclGetErrorString(status_)); \
exit(EXIT_FAILURE); \
} \
} while (0)
const int NUM_GPU = 4;
const int NITER = 3;
const size_t nbytes = 256 * 1024 * 1024; // 256MiB
int* sendbuf[NUM_GPU];
int* recvbuf[NUM_GPU];
cudaStream_t streams[NUM_GPU];
double SyncAllGPUsAndGetTime() {
for (int i = 0; i < NUM_GPU; ++i) {
CHECK_CUDA(cudaSetDevice(i));
CHECK_CUDA(cudaDeviceSynchronize());
}
struct timeval tv;
gettimeofday(&tv, 0);
return tv.tv_sec + tv.tv_usec * 1e-6;
}
__global__ void FillBuffer(int* buf, size_t nbytes, size_t offset) {
size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
size_t stride = blockDim.x * gridDim.x;
for (size_t i = idx; i < nbytes / sizeof(int); i += stride) {
buf[i] = i * sizeof(int) + offset;
}
}
void InitBuffers() {
for (int i = 0; i < NUM_GPU; ++i) {
CHECK_CUDA(cudaSetDevice(i));
FillBuffer<<<1, 1024>>>(sendbuf[i], nbytes, i * nbytes);
CHECK_CUDA(cudaMemset(recvbuf[i], 0, nbytes * NUM_GPU));
}
}
void CheckBuffers() {
int* buf = (int*)malloc(nbytes * NUM_GPU);
for (int i = 0; i < NUM_GPU; ++i) {
CHECK_CUDA(cudaSetDevice(i));
CHECK_CUDA(cudaMemcpy(buf, recvbuf[i], nbytes * NUM_GPU, cudaMemcpyDeviceToHost));
for (size_t j = 0; j < nbytes * NUM_GPU / sizeof(int); ++j) {
if (buf[j] != j * sizeof(int)) {
printf("Incorrect! buf[%zu] should be %zu, but %d found\n", j, j * sizeof(int), buf[j]);
goto end;
}
}
}
printf("Correct!\n");
end: free(buf);
}
void AllGatherWithNCCL() {
printf("[AllGather with NCCL]\n");
ncclComm_t comms[NUM_GPU];
int devlist[NUM_GPU];
for (int i = 0; i < NUM_GPU; ++i) {
devlist[i] = i;
}
CHECK_NCCL(ncclCommInitAll(comms, NUM_GPU, devlist));
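// ncclCommInitAll builds one communicator per listed device inside this single
// process; a multi-process setup would use ncclCommInitRank instead.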
for (int iter = 0; iter < NITER; ++iter) {
double st = SyncAllGPUsAndGetTime();
CHECK_NCCL(ncclGroupStart()); // group semantics: required when one thread issues collectives on multiple communicators
for (int i = 0; i < NUM_GPU; ++i) {
CHECK_NCCL(ncclAllGather(sendbuf[i], recvbuf[i], nbytes / sizeof(int), ncclInt, comms[i], streams[i]));
}
CHECK_NCCL(ncclGroupEnd());
double et = SyncAllGPUsAndGetTime();
double gbps = nbytes * (NUM_GPU - 1) / (et - st) / 1e9;
printf("[Iter %d] %f sec (Effective bandwidth %f GB/s)\n", iter, et - st, gbps);
}
for (int i = 0; i < NUM_GPU; ++i) {
CHECK_NCCL(ncclCommDestroy(comms[i]));
}
}
void AllGatherWithMemcpy() {
printf("[AllGather with Memcpy]\n");
int canAccessPeer;
for (int i = 0; i < NUM_GPU; ++i) {
for (int j = 0; j < NUM_GPU; ++j) {
CHECK_CUDA(cudaDeviceCanAccessPeer(&canAccessPeer, i, j));
if (canAccessPeer == 1) {
CHECK_CUDA(cudaSetDevice(i));
CHECK_CUDA(cudaDeviceEnablePeerAccess(j, 0));
}
}
}
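// With peer access enabled, cudaMemcpyAsync with cudaMemcpyDefault moves data
// directly between GPUs (P2P over NVLink/PCIe); without it, CUDA stages the
// copy through host memory, which is much slower.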
for (int iter = 0; iter < NITER; ++iter) {
double st = SyncAllGPUsAndGetTime();
for (int i = 0; i < NUM_GPU; ++i) {
for (int j = 0; j < NUM_GPU; ++j) {
CHECK_CUDA(cudaMemcpyAsync(recvbuf[j] + i * (nbytes / sizeof(int)), sendbuf[i], nbytes, cudaMemcpyDefault, streams[i]));
}
}
double et = SyncAllGPUsAndGetTime();
double gbps = nbytes * (NUM_GPU - 1) / (et - st) / 1e9;
printf("[Iter %d] %f sec (Effective bandwidth %f GB/s)\n", iter, et - st, gbps);
}
}
int main(int argc, char **argv) {
for (int i = 0; i < NUM_GPU; ++i) {
CHECK_CUDA(cudaSetDevice(i));
CHECK_CUDA(cudaMalloc(&sendbuf[i], nbytes));
CHECK_CUDA(cudaMalloc(&recvbuf[i], nbytes * NUM_GPU));
CHECK_CUDA(cudaStreamCreate(&streams[i]));
}
InitBuffers();
AllGatherWithNCCL();
CheckBuffers();
InitBuffers();
AllGatherWithMemcpy();
CheckBuffers();
return 0;
}

View File

@ -0,0 +1,114 @@
#include <cstdio>
#include <mma.h>
#include "matmul.h"
using namespace nvcuda;
#define CHECK_CUDA(call) \
do { \
cudaError_t status_ = call; \
if (status_ != cudaSuccess) { \
fprintf(stderr, "CUDA error (%s:%d): %s\n", __FILE__, __LINE__, \
cudaGetErrorString(status_)); \
exit(EXIT_FAILURE); \
} \
} while (0)
#define BLOCK_SIZE 32
#define WMMA_M 16
#define WMMA_N 16
#define WMMA_K 16
#define WARP_SIZE 32
#define NUM_WARP ((WMMA_M * WMMA_N) / (WARP_SIZE))
#define C_LAYOUT wmma::mem_row_major
static __global__ void matmul_kernel(half *A, half *B, float *C, int M, int N,
int K) {
int gj = blockIdx.x;
int gi = blockIdx.y;
if (gi * BLOCK_SIZE >= M || gj * BLOCK_SIZE >= N) return; // boundary check
int lj = threadIdx.x;
int li = threadIdx.y;
int warpId = li;
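// blockDim is (32, 4): threadIdx.x spans one full warp, so threadIdx.y selects
// one of 4 warps. Each warp computes one 16x16 WMMA tile of the 32x32 output
// block (a 2x2 grid of tiles; see aRow/bCol below). NUM_WARP (= 8) doubles as
// the number of rows each thread copies into the shared tiles (BLOCK_SIZE / blockDim.y).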
__shared__ half Alocal[BLOCK_SIZE * BLOCK_SIZE];
__shared__ half Blocal[BLOCK_SIZE * BLOCK_SIZE];
// Declare the fragments
wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_N, WMMA_K, half, wmma::row_major> a_frag;
wmma::fragment<wmma::matrix_b, WMMA_M, WMMA_N, WMMA_K, half, wmma::row_major> b_frag;
wmma::fragment<wmma::accumulator, WMMA_M, WMMA_N, WMMA_K, float> c_frag;
wmma::fill_fragment(c_frag, 0.0f);
int A_row_index = (gi * BLOCK_SIZE + li);
int B_col_index = (gj * BLOCK_SIZE + lj);
for (int bk = 0; bk < K; bk += BLOCK_SIZE) {
for(int offset = 0 ; offset < NUM_WARP ; ++offset){
int A_col_index = bk + lj;
Alocal[(li + offset * blockDim.y) * BLOCK_SIZE + lj] =
((A_row_index + offset * blockDim.y) < M && A_col_index < K)
? A[(A_row_index + offset * blockDim.y) * K + A_col_index]
: (half)(0.0);
int B_row_index = bk + li + (offset * blockDim.y);
Blocal[(li + offset * blockDim.y) * BLOCK_SIZE + lj] =
(B_row_index < K && B_col_index < N)
? B[B_row_index * N + B_col_index]
: (half)(0.0);
}
__syncthreads();
for (int i = 0; i < BLOCK_SIZE; i += WMMA_K) {
int aCol = i;
int aRow = (warpId / 2) * WMMA_M;
int bCol = (warpId % 2) * WMMA_N;
int bRow = i;
wmma::load_matrix_sync(a_frag, Alocal + aCol + aRow * BLOCK_SIZE, BLOCK_SIZE);
wmma::load_matrix_sync(b_frag, Blocal + bCol + bRow * BLOCK_SIZE, BLOCK_SIZE);
wmma::mma_sync(c_frag, a_frag, b_frag, c_frag);
}
__syncthreads();
}
int cRow = (warpId / 2) * WMMA_M + blockIdx.y * blockDim.y * NUM_WARP;
int cCol = (warpId % 2) * WMMA_N + blockIdx.x * blockDim.x;
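// Only full 16x16 tiles are stored; edge tiles are skipped, so this kernel
// assumes M and N are multiples of WMMA_M and WMMA_N.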
if(cRow + WMMA_M <= M && cCol + WMMA_N <= N){
wmma::store_matrix_sync(C + cCol + cRow * N, c_frag, N, C_LAYOUT);
}
}
static half *A_gpu, *B_gpu;
static float *C_gpu;
void matmul(half *_A, half *_B, float *_C, int M, int N, int K) {
CHECK_CUDA(
cudaMemcpy(A_gpu, _A, M * K * sizeof(half), cudaMemcpyHostToDevice));
CHECK_CUDA(
cudaMemcpy(B_gpu, _B, K * N * sizeof(half), cudaMemcpyHostToDevice));
dim3 blockDim(BLOCK_SIZE, 4);
dim3 gridDim((N + BLOCK_SIZE - 1) / BLOCK_SIZE,
(M + BLOCK_SIZE - 1) / BLOCK_SIZE);
matmul_kernel<<<gridDim, blockDim>>>(A_gpu, B_gpu, C_gpu, M, N, K);
CHECK_CUDA(cudaGetLastError());
CHECK_CUDA(
cudaMemcpy(_C, C_gpu, M * N * sizeof(float), cudaMemcpyDeviceToHost));
}
void matmul_init(int M, int N, int K) {
CHECK_CUDA(cudaMalloc(&A_gpu, M * K * sizeof(half)));
CHECK_CUDA(cudaMalloc(&B_gpu, K * N * sizeof(half)));
CHECK_CUDA(cudaMalloc(&C_gpu, M * N * sizeof(float)));
}
void matmul_cleanup(half *_A, half *_B, float *_C, int M, int N, int K) {
CHECK_CUDA(cudaFree(A_gpu));
CHECK_CUDA(cudaFree(B_gpu));
CHECK_CUDA(cudaFree(C_gpu));
}

View File

@ -0,0 +1,215 @@
#include <cstdlib>
#include <cstdio>
#include <mma.h>
#include "convolution.cuh"
using namespace nvcuda;
#define CHECK_CUDA(call) \
do { \
cudaError_t status_ = call; \
if (status_ != cudaSuccess) { \
fprintf(stderr, "CUDA error (%s:%d): %s:%s\n", __FILE__, __LINE__, \
cudaGetErrorName(status_), cudaGetErrorString(status_)); \
exit(EXIT_FAILURE); \
} \
} while (0)
#define MAX_BLOCK 1024
#define NUM_SM 80
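// NUM_SM assumes a V100-class GPU (80 SMs; the Makefile targets -arch=sm_70).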
#define WARP_SIZE 32
#define BLOCK_PER_WARP (MAX_BLOCK / WARP_SIZE)
#define BLOCK_SIZE 32
#define WMMA_M 16
#define WMMA_N 16
#define WMMA_K 16
#define NUM_WARP ((WMMA_M * WMMA_N) / (WARP_SIZE))
#define C_LAYOUT wmma::mem_row_major
// Device(GPU) pointers
static half *I_gpu, *F_gpu, *O_gpu, *BUF1_gpu, *BUF2_gpu;
__global__ void im2col_kernel(half *_I, half *workspace, int N, int C, int H, int W,
int R, int S, int pad_h, int pad_w, int stride_h,
int stride_w, int dilation_h, int dilation_w, int OH, int OW){
int tidx = threadIdx.x + blockIdx.x * blockDim.x;
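// im2col layout: one row per (c, r, s) filter coordinate and one column per
// (n, oh, ow) output position, i.e. a [C*R*S x N*OH*OW] matrix, so the
// convolution becomes the GEMM F[K x C*R*S] * workspace[C*R*S x N*OH*OW].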
for(int crs = 0 ; crs < C * R * S ; ++crs){
int row_offset = crs * N * OH * OW;
const int c = crs / (R * S);
const int r = (crs / S) % R;
const int s = crs % S;
for(int nhw = tidx ; nhw < N * OH * OW ; nhw += MAX_BLOCK){
const int n = nhw / (OH * OW);
const int oh = (nhw / OW) % OH;
const int ow = nhw % OW;
const int h = oh * stride_h - pad_h + r * dilation_h;
const int w = ow * stride_w - pad_w + s * dilation_w;
// Padded positions must be written as zero: workspace comes from cudaMalloc
// and is not pre-zeroed, so skipping them would leave garbage in the matrix.
workspace[row_offset + nhw] =
(h < 0 || h >= H || w < 0 || w >= W)
? (half)(0.0)
: _I[n * C * H * W + c * H * W + h * W + w];
}
}
}
static __global__ void matmul_kernel(half *A, half *B, half *C, int M, int N,
int K) {
int gj = blockIdx.x;
int gi = blockIdx.y;
if (gi * BLOCK_SIZE >= M || gj * BLOCK_SIZE >= N) return; // boundary check
int lj = threadIdx.x;
int li = threadIdx.y;
int warpId = li;
__shared__ half Alocal[BLOCK_SIZE * BLOCK_SIZE];
__shared__ half Blocal[BLOCK_SIZE * BLOCK_SIZE];
// Declare the fragments
wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_N, WMMA_K, half, wmma::row_major> a_frag;
wmma::fragment<wmma::matrix_b, WMMA_M, WMMA_N, WMMA_K, half, wmma::row_major> b_frag;
wmma::fragment<wmma::accumulator, WMMA_M, WMMA_N, WMMA_K, half> c_frag;
wmma::fill_fragment(c_frag, 0.0f);
int A_row_index = (gi * BLOCK_SIZE + li);
int B_col_index = (gj * BLOCK_SIZE + lj);
for (int bk = 0; bk < K; bk += BLOCK_SIZE) {
for(int offset = 0 ; offset < NUM_WARP ; ++offset){
int A_col_index = bk + lj;
Alocal[(li + offset * blockDim.y) * BLOCK_SIZE + lj] =
((A_row_index + offset * blockDim.y) < M && A_col_index < K)
? A[(A_row_index + offset * blockDim.y) * K + A_col_index]
: (half)(0.0);
int B_row_index = bk + li + (offset * blockDim.y);
Blocal[(li + offset * blockDim.y) * BLOCK_SIZE + lj] =
(B_row_index < K && B_col_index < N)
? B[B_row_index * N + B_col_index]
: (half)(0.0);
}
__syncthreads();
for (int i = 0; i < BLOCK_SIZE; i += WMMA_K) {
int aCol = i;
int aRow = (warpId / 2) * WMMA_M;
int bCol = (warpId % 2) * WMMA_N;
int bRow = i;
wmma::load_matrix_sync(a_frag, Alocal + aCol + aRow * BLOCK_SIZE, BLOCK_SIZE);
wmma::load_matrix_sync(b_frag, Blocal + bCol + bRow * BLOCK_SIZE, BLOCK_SIZE);
wmma::mma_sync(c_frag, a_frag, b_frag, c_frag);
}
__syncthreads();
}
int cRow = (warpId / 2) * WMMA_M + blockIdx.y * blockDim.y * NUM_WARP;
int cCol = (warpId % 2) * WMMA_N + blockIdx.x * blockDim.x;
if(cRow + WMMA_M <= M && cCol + WMMA_N <= N){
wmma::store_matrix_sync(C + cCol + cRow * N, c_frag, N, C_LAYOUT);
}
}
__global__ void reshape_kernel(half *_src, half *_dst, int N, int K, int OH, int OW){
int bidx = blockIdx.x;
int widx = threadIdx.x / WARP_SIZE;
int lidx = threadIdx.x % WARP_SIZE;
for(int k = widx ; k < K ; k += BLOCK_PER_WARP){
for(int on = bidx ; on < N ; on += NUM_SM){
for(int hw = lidx ; hw < OH * OW ; hw += WARP_SIZE){
_dst[on * K * OH * OW + k * OH * OW + hw] =
_src[k * N * OH * OW + on * OH * OW + hw];
}
}
}
}
void convolution_im2col(half *_I, half *_F, half *_O, half *_BUF1,
half *_BUF2, int N, int C, int H, int W,
int K, int R, int S, int pad_h, int pad_w,
int stride_h, int stride_w, int dilation_h,
int dilation_w) {
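// Standard output size of a padded, strided, dilated convolution:
// OH = (H + 2*pad_h - dilation_h*(R-1) - 1) / stride_h + 1, and likewise OW.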
const int OH = 1 + (H + 2 * pad_h - (((R - 1) * dilation_h) + 1)) / stride_h;
const int OW = 1 + (W + 2 * pad_w - (((S - 1) * dilation_w) + 1)) / stride_w;
CHECK_CUDA(
cudaMemcpy(I_gpu, _I, sizeof(half) * N * C * H * W, cudaMemcpyHostToDevice));
CHECK_CUDA(
cudaMemcpy(F_gpu, _F, sizeof(half) * K * C * R * S, cudaMemcpyHostToDevice));
dim3 griddim_im2col(NUM_SM);
dim3 blockdim_im2col(MAX_BLOCK);
im2col_kernel<<<griddim_im2col, blockdim_im2col>>>(I_gpu, BUF1_gpu, N, C, H, W, R, S,
pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w, OH, OW);
CHECK_CUDA(cudaGetLastError());
dim3 griddim_matmul(((N * OH * OW) + BLOCK_SIZE - 1) / BLOCK_SIZE,
(K + BLOCK_SIZE - 1) / BLOCK_SIZE);
dim3 blockDim_matmul(BLOCK_SIZE, 4);
matmul_kernel<<<griddim_matmul, blockDim_matmul>>>(F_gpu, BUF1_gpu, BUF2_gpu, K, N * OH * OW, C * R * S);
CHECK_CUDA(cudaGetLastError());
dim3 griddim_reshape(NUM_SM);
dim3 blockDim_reshape(MAX_BLOCK);
reshape_kernel<<<griddim_reshape, blockDim_reshape>>>(BUF2_gpu, O_gpu, N, K, OH, OW);
CHECK_CUDA(cudaGetLastError());
CHECK_CUDA(
cudaMemcpy(_O, O_gpu, sizeof(half) * N * K * OH * OW, cudaMemcpyDeviceToHost));
// DO NOT REMOVE; NEEDED FOR TIME MEASURE
CHECK_CUDA(cudaDeviceSynchronize());
}
void convolution(half *_I, half *_F, half *_O, half *_BUF1, half *_BUF2,
int N, int C, int H, int W, int K, int R, int S, int pad_h,
int pad_w, int stride_h, int stride_w, int dilation_h,
int dilation_w) {
// Dispatch to the im2col-based GPU implementation above.
convolution_im2col(_I, _F, _O, _BUF1, _BUF2, N, C, H, W, K, R, S,
pad_h, pad_w, stride_h, stride_w, dilation_h,
dilation_w);
}
void convolution_initialize(int N, int C, int H, int W, int K, int R, int S,
int pad_h, int pad_w, int stride_h, int stride_w,
int dilation_h, int dilation_w) {
const int OH = 1 + (H + 2 * pad_h - (((R - 1) * dilation_h) + 1)) / stride_h;
const int OW = 1 + (W + 2 * pad_w - (((S - 1) * dilation_w) + 1)) / stride_w;
size_t alloc = sizeof(half) * N * C * H * W + sizeof(half) * K * C * R * S + sizeof(half) * N * K * OH * OW +
sizeof(half) * C * R * S * N * OH * OW + sizeof(half) * N * K * OH * OW;
printf("Alloc Memory : %lf\n",(double)alloc/1e9);
CHECK_CUDA(cudaMalloc((void **) &I_gpu, sizeof(half) * N * C * H * W));
CHECK_CUDA(cudaMalloc((void **) &F_gpu, sizeof(half) * K * C * R * S));
CHECK_CUDA(cudaMalloc((void **) &O_gpu, sizeof(half) * N * K * OH * OW));
CHECK_CUDA(cudaMalloc((void **) &BUF1_gpu, sizeof(half) * C * R * S * N * OH * OW));
CHECK_CUDA(cudaMalloc((void **) &BUF2_gpu, sizeof(half) * N * K * OH * OW));
}
void convolution_cleanup(half *_I, half *_F, half *_O, int N, int C, int H,
int W, int K, int R, int S, int pad_h, int pad_w,
int stride_h, int stride_w, int dilation_h,
int dilation_w) {
CHECK_CUDA(cudaFree(I_gpu));
CHECK_CUDA(cudaFree(F_gpu));
CHECK_CUDA(cudaFree(O_gpu));
CHECK_CUDA(cudaFree(BUF1_gpu));
CHECK_CUDA(cudaFree(BUF2_gpu));
}

View File

@ -0,0 +1,176 @@
#include <cstdlib>
#include <cstdio>
#include "convolution.cuh"
#define CHECK_CUDA(call) \
do { \
cudaError_t status_ = call; \
if (status_ != cudaSuccess) { \
fprintf(stderr, "CUDA error (%s:%d): %s:%s\n", __FILE__, __LINE__, \
cudaGetErrorName(status_), cudaGetErrorString(status_)); \
exit(EXIT_FAILURE); \
} \
} while (0)
#define MAX_BLOCK 1024
#define NUM_SM 80
#define NUM_WARP 32
#define BLOCK_PER_WARP (MAX_BLOCK / NUM_WARP)
#define BLOCK_SIZE 32
// Device(GPU) pointers
static float *I_gpu, *F_gpu, *O_gpu, *BUF1_gpu, *BUF2_gpu;
__global__ void im2col_kernel(float *_I, float *workspace, int N, int C, int H, int W,
int R, int S, int pad_h, int pad_w, int stride_h,
int stride_w, int dilation_h, int dilation_w, int OH, int OW){
int tidx = threadIdx.x + blockIdx.x * blockDim.x;
for(int crs = 0 ; crs < C * R * S ; ++crs){
int row_offset = crs * N * OH * OW;
const int c = crs / (R * S);
const int r = (crs / S) % R;
const int s = crs % S;
for(int nhw = tidx ; nhw < N * OH * OW ; nhw += MAX_BLOCK){
const int n = nhw / (OH * OW);
const int oh = (nhw / OW) % OH;
const int ow = nhw % OW;
const int h = oh * stride_h - pad_h + r * dilation_h;
const int w = ow * stride_w - pad_w + s * dilation_w;
// Padded positions must be written as zero: workspace comes from cudaMalloc
// and is not pre-zeroed, so skipping them would leave garbage in the matrix.
workspace[row_offset + nhw] =
(h < 0 || h >= H || w < 0 || w >= W)
? 0.f
: _I[n * C * H * W + c * H * W + h * W + w];
}
}
}
static __global__ void matmul_kernel(float *A, float *B, float *C, int M, int N,
int K) {
int j = blockIdx.x * blockDim.x + threadIdx.x;
int i = blockIdx.y * blockDim.y + threadIdx.y;
int gj = blockIdx.x;
int gi = blockIdx.y;
if (gi * BLOCK_SIZE >= M || gj * BLOCK_SIZE >= N) return; // boundary check
int lj = threadIdx.x;
int li = threadIdx.y;
__shared__ float Alocal[BLOCK_SIZE][BLOCK_SIZE];
__shared__ float Blocal[BLOCK_SIZE][BLOCK_SIZE];
float c = 0.f;
int A_row_index = (gi * BLOCK_SIZE + li);
int B_col_index = (gj * BLOCK_SIZE + lj);
for (int bk = 0; bk < K; bk += BLOCK_SIZE) {
int A_col_index = bk + lj;
Alocal[li][lj] = (A_row_index < M && A_col_index < K)
? A[A_row_index * K + A_col_index]
: 0.f;
int B_row_index = bk + li;
Blocal[li][lj] = (B_row_index < K && B_col_index < N)
? B[B_row_index * N + B_col_index]
: 0.f;
__syncthreads();
for (int lk = 0; lk < BLOCK_SIZE; ++lk) {
c += Alocal[li][lk] * Blocal[lk][lj];
}
__syncthreads();
}
if (i < M && j < N) C[i * N + j] = c;
}
__global__ void reshape_kernel(float *_src, float *_dst, int N, int K, int OH, int OW){
int bidx = blockIdx.x;
int widx = threadIdx.x / NUM_WARP;
int lidx = threadIdx.x % NUM_WARP;
for(int k = widx ; k < K ; k += BLOCK_PER_WARP){
for(int on = bidx ; on < N ; on += NUM_SM){
for(int hw = lidx ; hw < OH * OW ; hw += NUM_WARP){
_dst[on * K * OH * OW + k * OH * OW + hw] =
_src[k * N * OH * OW + on * OH * OW + hw];
}
}
}
}
void convolution_im2col(float *_I, float *_F, float *_O, float *_BUF1,
float *_BUF2, int N, int C, int H, int W,
int K, int R, int S, int pad_h, int pad_w,
int stride_h, int stride_w, int dilation_h,
int dilation_w) {
const int OH = 1 + (H + 2 * pad_h - (((R - 1) * dilation_h) + 1)) / stride_h;
const int OW = 1 + (W + 2 * pad_w - (((S - 1) * dilation_w) + 1)) / stride_w;
CHECK_CUDA(
cudaMemcpy(I_gpu, _I, sizeof(float) * N * C * H * W, cudaMemcpyHostToDevice));
CHECK_CUDA(
cudaMemcpy(F_gpu, _F, sizeof(float) * K * C * R * S, cudaMemcpyHostToDevice));
dim3 griddim_im2col(NUM_SM);
dim3 blockdim_im2col(MAX_BLOCK);
im2col_kernel<<<griddim_im2col, blockdim_im2col>>>(I_gpu, BUF1_gpu, N, C, H, W, R, S,
pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w, OH, OW);
CHECK_CUDA(cudaGetLastError());
dim3 griddim_matmul(((N * OH * OW)+ BLOCK_SIZE - 1) / BLOCK_SIZE,
(K + BLOCK_SIZE - 1) / BLOCK_SIZE);
dim3 blockDim_matmul(BLOCK_SIZE, BLOCK_SIZE);
matmul_kernel<<<griddim_matmul, blockDim_matmul>>>(F_gpu, BUF1_gpu, BUF2_gpu, K, N * OH * OW, C * R * S);
CHECK_CUDA(cudaGetLastError());
dim3 griddim_reshape(NUM_SM);
dim3 blockDim_reshape(MAX_BLOCK);
reshape_kernel<<<griddim_reshape, blockDim_reshape>>>(BUF2_gpu, O_gpu, N, K, OH, OW);
CHECK_CUDA(cudaGetLastError());
CHECK_CUDA(
cudaMemcpy(_O, O_gpu, sizeof(float) * N * K * OH * OW, cudaMemcpyDeviceToHost));
// DO NOT REMOVE; NEEDED FOR TIME MEASURE
CHECK_CUDA(cudaDeviceSynchronize());
}
void convolution(float *_I, float *_F, float *_O, float *_BUF1, float *_BUF2,
int N, int C, int H, int W, int K, int R, int S, int pad_h,
int pad_w, int stride_h, int stride_w, int dilation_h,
int dilation_w) {
// Dispatch to the im2col-based GPU implementation above.
convolution_im2col(_I, _F, _O, _BUF1, _BUF2, N, C, H, W, K, R, S,
pad_h, pad_w, stride_h, stride_w, dilation_h,
dilation_w);
}
void convolution_initialize(int N, int C, int H, int W, int K, int R, int S,
int pad_h, int pad_w, int stride_h, int stride_w,
int dilation_h, int dilation_w) {
const int OH = 1 + (H + 2 * pad_h - (((R - 1) * dilation_h) + 1)) / stride_h;
const int OW = 1 + (W + 2 * pad_w - (((S - 1) * dilation_w) + 1)) / stride_w;
size_t alloc = sizeof(float) * N * C * H * W + sizeof(float) * K * C * R * S + sizeof(float) * N * K * OH * OW +
sizeof(float) * C * R * S * N * OH * OW + sizeof(float) * N * K * OH * OW;
printf("Alloc Memory : %lf\n",(double)alloc/1e9);
CHECK_CUDA(cudaMalloc((void **) &I_gpu, sizeof(float) * N * C * H * W));
CHECK_CUDA(cudaMalloc((void **) &F_gpu, sizeof(float) * K * C * R * S));
CHECK_CUDA(cudaMalloc((void **) &O_gpu, sizeof(float) * N * K * OH * OW));
CHECK_CUDA(cudaMalloc((void **) &BUF1_gpu, sizeof(float) * C * R * S * N * OH * OW));
CHECK_CUDA(cudaMalloc((void **) &BUF2_gpu, sizeof(float) * N * K * OH * OW));
}
void convolution_cleanup(float *_I, float *_F, float *_O, int N, int C, int H,
int W, int K, int R, int S, int pad_h, int pad_w,
int stride_h, int stride_w, int dilation_h,
int dilation_w) {
CHECK_CUDA(cudaFree(I_gpu));
CHECK_CUDA(cudaFree(F_gpu));
CHECK_CUDA(cudaFree(O_gpu));
CHECK_CUDA(cudaFree(BUF1_gpu));
CHECK_CUDA(cudaFree(BUF2_gpu));
}

View File

@ -2,7 +2,7 @@ TARGET=main
OBJECTS=main.o util.o convolution.o
CPPFLAGS=-std=c++14 -O3 -Wall -march=native -mavx2 -mfma -fopenmp -mno-avx512f -I/usr/local/cuda/include
CUDA_CFLAGS:=$(foreach option, $(CPPFLAGS),-Xcompiler=$(option))
CUDA_CFLAGS:=$(foreach option, $(CPPFLAGS),-Xcompiler=$(option) -arch=sm_70)
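# -arch=sm_70 targets Volta, the first architecture whose Tensor Cores are exposed through the WMMA API used by the kernels here.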
LDFLAGS=-L/usr/local/cuda/lib64
LDLIBS=-lstdc++ -lcudart -lm -lcudnn

View File

@ -15,7 +15,6 @@
void naive_cpu_im2col(half *_I, half *workspace, int N, int C, int H, int W,
int R, int S, int pad_h, int pad_w, int stride_h,
int stride_w, int dilation_h, int dilation_w) {
half *I = _I;
// Naive CPU im2col
const int ON = N;
@ -36,7 +35,7 @@ void naive_cpu_im2col(half *_I, half *workspace, int N, int C, int H, int W,
workspace[((c * R * S) + (r * S) + s) * (ON * OH * OW) +
(on * OH * OW + oh * OW + ow)] =
I[n * C * H * W + c * H * W + h * W + w];
_I[n * C * H * W + c * H * W + h * W + w];
}
}
}
@ -46,10 +45,11 @@ void naive_cpu_im2col(half *_I, half *workspace, int N, int C, int H, int W,
}
void naive_cpu_matmul_TC(half *_A, half *_B, half *_C, int M, int N, int K) {
for (int i = 0; i < M; i++) {
for (int k = 0; k < K; k++) {
for (int j = 0; j < N; j++) {
_C[i * N + j] += _A[i * K + k] * _B[k * N + j];
_C[i * N + j] = (half)((float)(_C[i * N + j]) + (float)_A[i * K + k] * (float)_B[k * N + j]);
}
}
}

View File

@ -1,9 +1,6 @@
#pragma once
#include "half.hpp"
using half_float::half;
using namespace half_float::literal;
#include <cuda_fp16.h>
void convolution(half *_I, half *_F, half *_O, half *_BUF1, half *_BUF2,
int N, int C, int H, int W, int K, int R, int S, int pad_h,

View File

@ -5,12 +5,9 @@
#include <string.h>
#include "convolution.cuh"
#include "half.hpp"
#include <cuda_fp16.h>
#include "util.h"
using half_float::half;
using namespace half_float::literal;
static bool print = false;
static bool validation = false;
static int N = 1;

View File

@ -22,7 +22,7 @@ half *alloc_tensor(int N, int C, int H, int W) {
void rand_tensor(half *m, int N, int C, int H, int W) {
int L = N * C * H * W;
for (int j = 0; j < L; j++) { m[j] = (half) rand() / RAND_MAX - 0.5; }
for (int j = 0; j < L; j++) { m[j] = (half) ((float)rand() / RAND_MAX - 0.5); }
}
void zero_tensor(half *m, int N, int C, int H, int W) {
@ -60,7 +60,7 @@ void check_convolution(half *I, half *F, half *O, int N, int C, int H, int W,
for (int oc = 0; oc < OC; ++oc) {
for (int oh = 0; oh < OH; ++oh) {
for (int ow = 0; ow < OW; ++ow) {
half sum = 0.0_h;
half sum = (half)0.0;
for (int c = 0; c < C; ++c) {
for (int r = 0; r < R; ++r) {
for (int s = 0; s < S; ++s) {
@ -69,8 +69,8 @@ void check_convolution(half *I, half *F, half *O, int N, int C, int H, int W,
const int w = ow * stride_w - pad_w + s * dilation_w;
const int k = oc;
if (h < 0 || h >= H || w < 0 || w >= W) continue;
sum += I[((n * C + c) * H + h) * W + w] *
F[((k * C + c) * R + r) * S + s];
sum = sum + (half)(I[((n * C + c) * H + h) * W + w] *
F[((k * C + c) * R + r) * S + s]);
}
}
}
@ -82,7 +82,7 @@ void check_convolution(half *I, half *F, half *O, int N, int C, int H, int W,
bool is_valid = true;
int cnt = 0, thr = 10;
half eps = 1e-3_h;
half eps = (half)1e-3;
for (int on = 0; on < ON; ++on) {
for (int oc = 0; oc < OC; ++oc) {
for (int oh = 0; oh < OH; ++oh) {

View File

@ -1,9 +1,5 @@
#pragma once
#include "half.hpp"
using half_float::half;
using namespace half_float::literal;
#include <cuda_fp16.h>
double get_time();

View File

@ -0,0 +1,20 @@
TARGET=main
OBJECTS=util.o matmul.o main.o
CPPFLAGS=-std=c++11 -O3 -Wall -march=native -mavx2 -mno-avx512f -mfma -fopenmp
CPPFLAGS+= -I/usr/local/cuda/include/
LDFLAGS=-lm -lcudart -lcublas
LDFLAGS+=-L/usr/local/cuda/lib64
NVCC=/usr/local/cuda/bin/nvcc
all: $(TARGET)
$(TARGET): $(OBJECTS)
g++ $(CPPFLAGS) $^ -o $@ $(LDFLAGS)
matmul.o: matmul.cu
$(NVCC) -c -o $@ $^ -arch=sm_70
clean:
rm -rf $(TARGET) $(OBJECTS)

View File

@ -0,0 +1,103 @@
#include <stdio.h>
#include <getopt.h>
#include <stdbool.h>
#include <stdlib.h>
#include <cuda_fp16.h>
#include "util.h"
#include "matmul.h"
static void print_help(const char* prog_name) {
printf("Usage: %s [-pvh] [-n num_iterations] M N K\n", prog_name);
printf("Options:\n");
printf(" -p : print matrix data. (default: off)\n");
printf(" -v : validate matrix multiplication. (default: off)\n");
printf(" -h : print this page.\n");
printf(" -t : number of threads (default: 1)\n");
printf(" -n : number of iterations (default: 1)\n");
printf(" M : number of rows of matrix A and C. (default: 8)\n");
printf(" N : number of columns of matrix B and C. (default: 8)\n");
printf(" K : number of columns of matrix A and rows of B. (default: 8)\n");
}
static bool print_matrix = false;
static bool validation = false;
static int M = 8, N = 8, K = 8;
static int num_iterations = 1;
static void parse_opt(int argc, char **argv) {
int c;
while ((c = getopt(argc, argv, "pvhn:")) != -1) {
switch (c) {
case 'p':
print_matrix = true;
break;
case 'v':
validation = true;
break;
case 'n':
num_iterations = atoi(optarg);
break;
case 'h':
default:
print_help(argv[0]);
exit(0);
}
}
for (int i = optind, j = 0; i < argc; ++i, ++j) {
switch (j) {
case 0: M = atoi(argv[i]); break;
case 1: N = atoi(argv[i]); break;
case 2: K = atoi(argv[i]); break;
default: break;
}
}
printf("Options:\n");
printf(" Problem size: M = %d, N = %d, K = %d\n", M, N, K);
printf(" Number of iterations: %d\n", num_iterations);
printf(" Print matrix: %s\n", print_matrix ? "on" : "off");
printf(" Validation: %s\n", validation ? "on" : "off");
printf("\n");
}
int main(int argc, char **argv) {
parse_opt(argc, argv);
printf("Initializing... "); fflush(stdout);
half *A = alloc_mat(M, K);
half *B = alloc_mat(K, N);
float *C = alloc_mat_float(M, N);
rand_mat(A, M, K);
rand_mat(B, K, N);
matmul_init(M, N, K);
printf("done!\n"); fflush(stdout);
double elapsed_time_sum = 0;
for (int i = 0; i < num_iterations; ++i) {
printf("Calculating...(iter=%d) ", i); fflush(stdout);
zero_mat_float(C, M, N);
double start_time = get_time();
matmul(A, B, C, M, N, K);
double elapsed_time = get_time() - start_time;
printf("%f sec\n", elapsed_time);
elapsed_time_sum += elapsed_time;
}
if (print_matrix) {
printf("MATRIX A:\n"); print_mat(A, M, K);
printf("MATRIX B:\n"); print_mat(B, K, N);
printf("MATRIX C:\n"); print_mat_float(C, M, N);
}
matmul_cleanup(A, B, C, M, N, K);
if (validation) {
check_matmul(A, B, C, M, N, K);
}
double elapsed_time_avg = elapsed_time_sum / num_iterations;
printf("Avg. time: %f sec\n", elapsed_time_avg);
printf("Avg. throughput: %f GFLOPS\n", 2.0 * M * N * K / elapsed_time_avg / 1e9);
return 0;
}

View File

@ -0,0 +1,57 @@
#include <cstdio>
#include <mma.h>
#include "matmul.h"
using namespace nvcuda;
#define CHECK_CUDA(call) \
do { \
cudaError_t status_ = call; \
if (status_ != cudaSuccess) { \
fprintf(stderr, "CUDA error (%s:%d): %s:%s\n", __FILE__, __LINE__, \
cudaGetErrorName(status_), cudaGetErrorString(status_)); \
exit(EXIT_FAILURE); \
} \
} while (0)
// Device(GPU) pointers
static half *A_gpu, *B_gpu;
static float *C_gpu;
void naive_cpu_matmul(half *_A, half *_B, float *_C, int M, int N, int K) {
for (int i = 0; i < M; i++) {
for (int k = 0; k < K; k++) {
for (int j = 0; j < N; j++) {
_C[i * N + j] += (float)_A[i * K + k] * (float)_B[k * N + j];
}
}
}
}
void matmul(half *_A, half *_B, float *_C, int M, int N, int K) {
// Remove this line after you complete the matmul on GPU
naive_cpu_matmul(_A, _B, _C, M, N, K);
// (TODO) Upload A and B matrix to GPU
// (TODO) Launch kernel on a GPU
// (TODO) Download C matrix from GPU
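//
// A minimal sketch of those steps, mirroring the completed WMMA version
// earlier in this commit (kept as comments so the skeleton still builds;
// matmul_kernel is the GPU kernel you implement):
//
//   CHECK_CUDA(cudaMemcpy(A_gpu, _A, M * K * sizeof(half), cudaMemcpyHostToDevice));
//   CHECK_CUDA(cudaMemcpy(B_gpu, _B, K * N * sizeof(half), cudaMemcpyHostToDevice));
//   dim3 blockDim(32, 4);
//   dim3 gridDim((N + 31) / 32, (M + 31) / 32);
//   matmul_kernel<<<gridDim, blockDim>>>(A_gpu, B_gpu, C_gpu, M, N, K);
//   CHECK_CUDA(cudaGetLastError());
//   CHECK_CUDA(cudaMemcpy(_C, C_gpu, M * N * sizeof(float), cudaMemcpyDeviceToHost));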
// DO NOT REMOVE; NEEDED FOR TIME MEASURE
CHECK_CUDA(cudaDeviceSynchronize());
}
void matmul_init(int M, int N, int K) {
// (TODO) Allocate device memory
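// e.g., mirroring the completed version earlier in this commit:
//   CHECK_CUDA(cudaMalloc(&A_gpu, M * K * sizeof(half)));
//   CHECK_CUDA(cudaMalloc(&B_gpu, K * N * sizeof(half)));
//   CHECK_CUDA(cudaMalloc(&C_gpu, M * N * sizeof(float)));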
// DO NOT REMOVE; NEEDED FOR TIME MEASURE
CHECK_CUDA(cudaDeviceSynchronize());
}
void matmul_cleanup(half *_A, half *_B, float *_C, int M, int N, int K) {
// (TODO) Do any post-matmul cleanup work here.
// DO NOT REMOVE; NEEDED FOR TIME MEASURE
CHECK_CUDA(cudaDeviceSynchronize());
}

View File

@ -0,0 +1,7 @@
#pragma once
#include <cuda_fp16.h>
void matmul(half *_A, half *_B, float *_C, int M, int N, int K);
void matmul_init(int M, int N, int K);
void matmul_cleanup(half *_A, half *_B, float *_C, int M, int N, int K);

View File

@ -0,0 +1,96 @@
#include "util.h"
#include <omp.h>
#include <sys/time.h>
#include <cmath>
#include <cstdbool>
#include <cstdio>
#include <cstdlib>
#include <cstring>
double get_time() {
struct timeval tv;
gettimeofday(&tv, 0);
return tv.tv_sec + tv.tv_usec * 1e-6;
}
void check_matmul(half *A, half *B, float *C, int M, int N, int K) {
printf("Validating...\n");
float *C_ans = alloc_mat_float(M, N);
zero_mat_float(C_ans, M, N);
#pragma omp parallel for num_threads(20)
for (int i = 0; i < M; ++i) {
for (int k = 0; k < K; ++k) {
for (int j = 0; j < N; ++j) {
C_ans[i * N + j] += (float)A[i * K + k] * (float)B[k * N + j];
}
}
}
bool is_valid = true;
int cnt = 0, thr = 10;
float eps = 1e-3;
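// A value passes if either its absolute error or its relative error is within eps.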
for (int i = 0; i < M; ++i) {
for (int j = 0; j < N; ++j) {
float c = C[i * N + j];
float c_ans = C_ans[i * N + j];
if (fabsf(c - c_ans) > eps &&
(c_ans == 0 || fabsf((c - c_ans) / c_ans) > eps)) {
++cnt;
if (cnt <= thr)
printf("C[%d][%d] : correct_value = %f, your_value = %f\n", i, j,
(float)c_ans, (float)c);
if (cnt == thr + 1)
printf("Too many error, only first %d values are printed.\n", thr);
is_valid = false;
}
}
}
if (is_valid) {
printf("Result: VALID\n");
} else {
printf("Result: INVALID\n");
}
}
void print_mat(half *m, int R, int C) {
for (int i = 0; i < R; ++i) {
for (int j = 0; j < C; ++j) { printf("%+.3f ", (float)(m[i * C + j])); }
printf("\n");
}
}
void print_mat_float(float *m, int R, int C) {
for (int i = 0; i < R; ++i) {
for (int j = 0; j < C; ++j) { printf("%+.3f ", (float)(m[i * C + j])); }
printf("\n");
}
}
half *alloc_mat(int R, int C) {
// aligned_alloc requires the size to be a multiple of the alignment; round up.
size_t bytes = (sizeof(half) * R * C + 31) / 32 * 32;
half *m = (half *) aligned_alloc(32, bytes);
return m;
}
float *alloc_mat_float(int R, int C) {
size_t bytes = (sizeof(float) * R * C + 31) / 32 * 32;
float *m = (float *) aligned_alloc(32, bytes);
return m;
}
void rand_mat(half *m, int R, int C) {
for (int i = 0; i < R; i++) {
for (int j = 0; j < C; j++) {
m[i * C + j] = (half) ((float)rand() / RAND_MAX - 0.5);
}
}
}
void zero_mat(half *m, int R, int C) { memset(m, 0, sizeof(half) * R * C); }
void zero_mat_float(float *m, int R, int C) { memset(m, 0, sizeof(float) * R * C); }

View File

@ -0,0 +1,20 @@
#pragma once
#include <cuda_fp16.h>
double get_time();
void check_matmul(half *A, half *B, float *C, int M, int N, int K);
void print_mat(half *m, int R, int C);
void print_mat_float(float *m, int R, int C);
half* alloc_mat(int R, int C);
float* alloc_mat_float(int R, int C);
void rand_mat(half *m, int R, int C);
void zero_mat(half *m, int R, int C);
void zero_mat_float(float *m, int R, int C);

View File

@ -0,0 +1,11 @@
TARGET=main
NVCC=/usr/local/cuda/bin/nvcc
all: $(TARGET)
main: main.cu
$(NVCC) -o $@ $^ -lnccl
clean:
rm -rf $(TARGET)

View File

@ -0,0 +1,147 @@
#include <cstdio>
#include <nccl.h>
#include <sys/time.h>
#define CHECK_CUDA(call) \
do { \
cudaError_t status_ = call; \
if (status_ != cudaSuccess) { \
fprintf(stderr, "CUDA error (%s:%d): %s:%s\n", __FILE__, __LINE__, \
cudaGetErrorName(status_), cudaGetErrorString(status_)); \
exit(EXIT_FAILURE); \
} \
} while (0)
#define CHECK_NCCL(call) \
do { \
ncclResult_t status_ = call; \
if (status_ != ncclSuccess && status_ != ncclInProgress) { \
fprintf(stderr, "NCCL error (%s:%d): %s\n", __FILE__, __LINE__, \
ncclGetErrorString(status_)); \
exit(EXIT_FAILURE); \
} \
} while (0)
const int NUM_GPU = 4;
const int NITER = 3;
const size_t nbytes = 256 * 1024 * 1024; // 256MiB
int* sendbuf[NUM_GPU];
int* recvbuf[NUM_GPU];
cudaStream_t streams[NUM_GPU];
double SyncAllGPUsAndGetTime() {
for (int i = 0; i < NUM_GPU; ++i) {
CHECK_CUDA(cudaSetDevice(i));
CHECK_CUDA(cudaDeviceSynchronize());
}
struct timeval tv;
gettimeofday(&tv, 0);
return tv.tv_sec + tv.tv_usec * 1e-6;
}
__global__ void FillBuffer(int* buf, size_t nbytes, size_t offset) {
size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
size_t stride = blockDim.x * gridDim.x;
for (size_t i = idx; i < nbytes / sizeof(int); i += stride) {
buf[i] = i * sizeof(int) + offset;
}
}
void InitBuffers() {
for (int i = 0; i < NUM_GPU; ++i) {
CHECK_CUDA(cudaSetDevice(i));
FillBuffer<<<1, 1024>>>(sendbuf[i], nbytes, i * nbytes);
CHECK_CUDA(cudaMemset(recvbuf[i], 0, nbytes * NUM_GPU));
}
}
void CheckBuffers() {
int* buf = (int*)malloc(nbytes * NUM_GPU);
for (int i = 0; i < NUM_GPU; ++i) {
CHECK_CUDA(cudaSetDevice(i));
CHECK_CUDA(cudaMemcpy(buf, recvbuf[i], nbytes * NUM_GPU, cudaMemcpyDeviceToHost));
for (size_t j = 0; j < nbytes * NUM_GPU / sizeof(int); ++j) {
if (buf[j] != j * sizeof(int)) {
printf("Incorrect! buf[%zu] should be %zu, but %d found\n", j, j * sizeof(int), buf[j]);
goto end;
}
}
}
printf("Correct!\n");
end: free(buf);
}
void AllGatherWithNCCL() {
printf("[AllGather with NCCL]\n");
ncclComm_t comms[NUM_GPU];
int devlist[NUM_GPU];
for (int i = 0; i < NUM_GPU; ++i) {
devlist[i] = i;
}
CHECK_NCCL(ncclCommInitAll(comms, NUM_GPU, devlist));
for (int iter = 0; iter < NITER; ++iter) {
double st = SyncAllGPUsAndGetTime();
for (int i = 0; i < NUM_GPU; ++i) {
/*
* TODO
* Implement AllGather with NCCL here.
*/
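// A minimal sketch, mirroring the completed version earlier in this commit
// (each rank gathers every GPU's nbytes chunk into its recvbuf; wrap this
// loop in ncclGroupStart()/ncclGroupEnd() since one thread drives all
// communicators):
// CHECK_NCCL(ncclAllGather(sendbuf[i], recvbuf[i], nbytes / sizeof(int),
//                          ncclInt, comms[i], streams[i]));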
}
double et = SyncAllGPUsAndGetTime();
double gbps = nbytes * (NUM_GPU - 1) / (et - st) / 1e9;
printf("[Iter %d] %f sec (Effective bandwidth %f GB/s)\n", iter, et - st, gbps);
}
for (int i = 0; i < NUM_GPU; ++i) {
CHECK_NCCL(ncclCommDestroy(comms[i]));
}
}
void AllGatherWithMemcpy() {
printf("[AllGather with Memcpy]\n");
int canAccessPeer;
for (int i = 0; i < NUM_GPU; ++i) {
for (int j = 0; j < NUM_GPU; ++j) {
CHECK_CUDA(cudaDeviceCanAccessPeer(&canAccessPeer, i, j));
if (canAccessPeer == 1) {
CHECK_CUDA(cudaSetDevice(i));
CHECK_CUDA(cudaDeviceEnablePeerAccess(j, 0));
}
}
}
for (int iter = 0; iter < NITER; ++iter) {
double st = SyncAllGPUsAndGetTime();
for (int i = 0; i < NUM_GPU; ++i) {
/*
* TODO
* Implement AllGather using memcpy here.
*/
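// A minimal sketch, mirroring the completed version earlier in this commit:
// copy GPU i's send buffer into slot i of every GPU's receive buffer.
// for (int j = 0; j < NUM_GPU; ++j) {
//   CHECK_CUDA(cudaMemcpyAsync(recvbuf[j] + i * (nbytes / sizeof(int)),
//                              sendbuf[i], nbytes, cudaMemcpyDefault, streams[i]));
// }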
}
double et = SyncAllGPUsAndGetTime();
double gbps = nbytes * (NUM_GPU - 1) / (et - st) / 1e9;
printf("[Iter %d] %f sec (Effective bandwidth %f GB/s)\n", iter, et - st, gbps);
}
}
int main(int argc, char **argv) {
for (int i = 0; i < NUM_GPU; ++i) {
CHECK_CUDA(cudaSetDevice(i));
CHECK_CUDA(cudaMalloc(&sendbuf[i], nbytes));
CHECK_CUDA(cudaMalloc(&recvbuf[i], nbytes * NUM_GPU));
CHECK_CUDA(cudaStreamCreate(&streams[i]));
}
InitBuffers();
AllGatherWithNCCL();
CheckBuffers();
InitBuffers();
AllGatherWithMemcpy();
CheckBuffers();
return 0;
}