add TC variants
This commit is contained in:
parent d90f62ac21
commit ea9d968314
@@ -0,0 +1,114 @@
#include <cstdio>
#include <mma.h>

#include "matmul.h"

using namespace nvcuda;

#define CHECK_CUDA(call)                                                 \
  do {                                                                   \
    cudaError_t status_ = call;                                          \
    if (status_ != cudaSuccess) {                                        \
      fprintf(stderr, "CUDA error (%s:%d): %s\n", __FILE__, __LINE__,    \
              cudaGetErrorString(status_));                              \
      exit(EXIT_FAILURE);                                                \
    }                                                                    \
  } while (0)

#define BLOCK_SIZE 32

#define WMMA_M 16
#define WMMA_N 16
#define WMMA_K 16
#define WARP_SIZE 32
#define NUM_WARP ((WMMA_M * WMMA_N) / (WARP_SIZE))  // evaluates to 8 = BLOCK_SIZE / blockDim.y
#define C_LAYOUT wmma::mem_row_major

static __global__ void matmul_kernel(half *A, half *B, float *C, int M, int N,
                                     int K) {
  int gj = blockIdx.x;
  int gi = blockIdx.y;
  if (gi * BLOCK_SIZE >= M || gj * BLOCK_SIZE >= N) return;  // boundary check
  int lj = threadIdx.x;
  int li = threadIdx.y;
  int warpId = li;

  __shared__ half Alocal[BLOCK_SIZE * BLOCK_SIZE];
  __shared__ half Blocal[BLOCK_SIZE * BLOCK_SIZE];

  // Declare the fragments
  wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_N, WMMA_K, half, wmma::row_major> a_frag;
  wmma::fragment<wmma::matrix_b, WMMA_M, WMMA_N, WMMA_K, half, wmma::row_major> b_frag;
  wmma::fragment<wmma::accumulator, WMMA_M, WMMA_N, WMMA_K, float> c_frag;
  wmma::fill_fragment(c_frag, 0.0f);

  int A_row_index = (gi * BLOCK_SIZE + li);
  int B_col_index = (gj * BLOCK_SIZE + lj);

  for (int bk = 0; bk < K; bk += BLOCK_SIZE) {
    // Stage the 32x32 tiles of A and B in shared memory, 4 rows per step
    for (int offset = 0; offset < NUM_WARP; ++offset) {
      int A_col_index = bk + lj;
      Alocal[(li + offset * blockDim.y) * BLOCK_SIZE + lj] =
          ((A_row_index + offset * blockDim.y) < M && A_col_index < K)
              ? A[(A_row_index + offset * blockDim.y) * K + A_col_index]
              : (half)(0.0);

      int B_row_index = bk + li + (offset * blockDim.y);
      Blocal[(li + offset * blockDim.y) * BLOCK_SIZE + lj] =
          (B_row_index < K && B_col_index < N)
              ? B[B_row_index * N + B_col_index]
              : (half)(0.0);
    }
    __syncthreads();

    // Each of the block's 4 warps (blockDim.y = 4) owns one 16x16 tile of
    // the 32x32 output block: warp (warpId/2, warpId%2) in a 2x2 layout.
    for (int i = 0; i < BLOCK_SIZE; i += WMMA_K) {
      int aCol = i;
      int aRow = (warpId / 2) * WMMA_M;
      int bCol = (warpId % 2) * WMMA_N;
      int bRow = i;

      wmma::load_matrix_sync(a_frag, Alocal + aCol + aRow * BLOCK_SIZE, BLOCK_SIZE);
      wmma::load_matrix_sync(b_frag, Blocal + bCol + bRow * BLOCK_SIZE, BLOCK_SIZE);

      wmma::mma_sync(c_frag, a_frag, b_frag, c_frag);
    }

    __syncthreads();
  }

  int cRow = (warpId / 2) * WMMA_M + blockIdx.y * blockDim.y * NUM_WARP;
  int cCol = (warpId % 2) * WMMA_N + blockIdx.x * blockDim.x;

  if (cRow + WMMA_M <= M && cCol + WMMA_N <= N) {
    wmma::store_matrix_sync(C + cCol + cRow * N, c_frag, N, C_LAYOUT);
  }
}

static half *A_gpu, *B_gpu;
static float *C_gpu;

void matmul(half *_A, half *_B, float *_C, int M, int N, int K) {
  CHECK_CUDA(
      cudaMemcpy(A_gpu, _A, M * K * sizeof(half), cudaMemcpyHostToDevice));
  CHECK_CUDA(
      cudaMemcpy(B_gpu, _B, K * N * sizeof(half), cudaMemcpyHostToDevice));
  dim3 blockDim(BLOCK_SIZE, 4);
  dim3 gridDim((N + BLOCK_SIZE - 1) / BLOCK_SIZE,
               (M + BLOCK_SIZE - 1) / BLOCK_SIZE);
  matmul_kernel<<<gridDim, blockDim>>>(A_gpu, B_gpu, C_gpu, M, N, K);
  CHECK_CUDA(cudaGetLastError());
  CHECK_CUDA(
      cudaMemcpy(_C, C_gpu, M * N * sizeof(float), cudaMemcpyDeviceToHost));
}

void matmul_init(int M, int N, int K) {
  CHECK_CUDA(cudaMalloc(&A_gpu, M * K * sizeof(half)));
  CHECK_CUDA(cudaMalloc(&B_gpu, K * N * sizeof(half)));
  CHECK_CUDA(cudaMalloc(&C_gpu, M * N * sizeof(float)));
}

void matmul_cleanup(half *_A, half *_B, float *_C, int M, int N, int K) {
  CHECK_CUDA(cudaFree(A_gpu));
  CHECK_CUDA(cudaFree(B_gpu));
  CHECK_CUDA(cudaFree(C_gpu));
}
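Two caveats on the kernel above: the store is guarded so only full 16x16 tiles are written, so a complete output assumes M and N are multiples of WMMA_M/WMMA_N, and WMMA itself needs a Tensor Core GPU of compute capability 7.0 or newer (the Makefile below builds with -arch=sm_70). A minimal, hypothetical guard one could run before matmul_init; the helper name is ours, not part of the commit:

#include <cuda_runtime.h>

// Hypothetical check: WMMA requires Tensor Cores (sm_70 / Volta or newer).
static bool tensor_cores_available(int device = 0) {
  cudaDeviceProp prop;
  if (cudaGetDeviceProperties(&prop, device) != cudaSuccess) return false;
  return prop.major >= 7;  // compute capability major version
}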
@@ -0,0 +1,215 @@
#include <cstdlib>
#include <cstdio>
#include <mma.h>

#include "convolution.cuh"

using namespace nvcuda;

#define CHECK_CUDA(call)                                                 \
  do {                                                                   \
    cudaError_t status_ = call;                                          \
    if (status_ != cudaSuccess) {                                        \
      fprintf(stderr, "CUDA error (%s:%d): %s:%s\n", __FILE__, __LINE__, \
              cudaGetErrorName(status_), cudaGetErrorString(status_));   \
      exit(EXIT_FAILURE);                                                \
    }                                                                    \
  } while (0)

#define MAX_BLOCK 1024
#define NUM_SM 80
#define WARP_SIZE 32
#define BLOCK_PER_WARP (MAX_BLOCK / WARP_SIZE)
#define BLOCK_SIZE 32

#define WMMA_M 16
#define WMMA_N 16
#define WMMA_K 16
#define NUM_WARP ((WMMA_M * WMMA_N) / (WARP_SIZE))  // evaluates to 8 = BLOCK_SIZE / blockDim.y
#define C_LAYOUT wmma::mem_row_major

// Device(GPU) pointers
static half *I_gpu, *F_gpu, *O_gpu, *BUF1_gpu, *BUF2_gpu;

__global__ void im2col_kernel(half *_I, half *workspace, int N, int C, int H, int W,
                              int R, int S, int pad_h, int pad_w, int stride_h,
                              int stride_w, int dilation_h, int dilation_w,
                              int OH, int OW) {
  int tidx = threadIdx.x + blockIdx.x * blockDim.x;

  // Note: the stride below is MAX_BLOCK rather than gridDim.x * blockDim.x,
  // so blocks cover overlapping index ranges; the writes are idempotent,
  // so the result is still correct.
  for (int crs = 0; crs < C * R * S; ++crs) {
    int row_offset = crs * N * OH * OW;
    const int c = crs / (R * S);
    const int r = (crs / S) % R;
    const int s = crs % S;

    for (int nhw = tidx; nhw < N * OH * OW; nhw += MAX_BLOCK) {
      const int n = nhw / (OH * OW);
      const int oh = (nhw / OW) % OH;
      const int ow = nhw % OW;

      const int h = oh * stride_h - pad_h + r * dilation_h;
      const int w = ow * stride_w - pad_w + s * dilation_w;

      // Zero-fill padded positions instead of skipping them: the workspace
      // comes from cudaMalloc and would otherwise hold stale data there.
      workspace[row_offset + nhw] =
          (h < 0 || h >= H || w < 0 || w >= W)
              ? (half)(0.0)
              : _I[n * C * H * W + c * H * W + h * W + w];
    }
  }
}

static __global__ void matmul_kernel(half *A, half *B, half *C, int M, int N,
                                     int K) {
  int gj = blockIdx.x;
  int gi = blockIdx.y;
  if (gi * BLOCK_SIZE >= M || gj * BLOCK_SIZE >= N) return;  // boundary check
  int lj = threadIdx.x;
  int li = threadIdx.y;
  int warpId = li;

  __shared__ half Alocal[BLOCK_SIZE * BLOCK_SIZE];
  __shared__ half Blocal[BLOCK_SIZE * BLOCK_SIZE];

  // Declare the fragments (half accumulator, unlike the float TC variant)
  wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_N, WMMA_K, half, wmma::row_major> a_frag;
  wmma::fragment<wmma::matrix_b, WMMA_M, WMMA_N, WMMA_K, half, wmma::row_major> b_frag;
  wmma::fragment<wmma::accumulator, WMMA_M, WMMA_N, WMMA_K, half> c_frag;
  wmma::fill_fragment(c_frag, 0.0f);

  int A_row_index = (gi * BLOCK_SIZE + li);
  int B_col_index = (gj * BLOCK_SIZE + lj);

  for (int bk = 0; bk < K; bk += BLOCK_SIZE) {
    for (int offset = 0; offset < NUM_WARP; ++offset) {
      int A_col_index = bk + lj;
      Alocal[(li + offset * blockDim.y) * BLOCK_SIZE + lj] =
          ((A_row_index + offset * blockDim.y) < M && A_col_index < K)
              ? A[(A_row_index + offset * blockDim.y) * K + A_col_index]
              : (half)(0.0);

      int B_row_index = bk + li + (offset * blockDim.y);
      Blocal[(li + offset * blockDim.y) * BLOCK_SIZE + lj] =
          (B_row_index < K && B_col_index < N)
              ? B[B_row_index * N + B_col_index]
              : (half)(0.0);
    }
    __syncthreads();

    for (int i = 0; i < BLOCK_SIZE; i += WMMA_K) {
      int aCol = i;
      int aRow = (warpId / 2) * WMMA_M;
      int bCol = (warpId % 2) * WMMA_N;
      int bRow = i;

      wmma::load_matrix_sync(a_frag, Alocal + aCol + aRow * BLOCK_SIZE, BLOCK_SIZE);
      wmma::load_matrix_sync(b_frag, Blocal + bCol + bRow * BLOCK_SIZE, BLOCK_SIZE);

      wmma::mma_sync(c_frag, a_frag, b_frag, c_frag);
    }

    __syncthreads();
  }

  int cRow = (warpId / 2) * WMMA_M + blockIdx.y * blockDim.y * NUM_WARP;
  int cCol = (warpId % 2) * WMMA_N + blockIdx.x * blockDim.x;

  if (cRow + WMMA_M <= M && cCol + WMMA_N <= N) {
    wmma::store_matrix_sync(C + cCol + cRow * N, c_frag, N, C_LAYOUT);
  }
}

__global__ void reshape_kernel(half *_src, half *_dst, int N, int K, int OH, int OW) {
  int bidx = blockIdx.x;
  int widx = threadIdx.x / WARP_SIZE;
  int lidx = threadIdx.x % WARP_SIZE;

  for (int k = widx; k < K; k += BLOCK_PER_WARP) {
    for (int on = bidx; on < N; on += NUM_SM) {
      for (int hw = lidx; hw < OH * OW; hw += WARP_SIZE) {
        _dst[on * K * OH * OW + k * OH * OW + hw] =
            _src[k * N * OH * OW + on * OH * OW + hw];
      }
    }
  }
}

void convolution_im2col(half *_I, half *_F, half *_O, half *_BUF1,
                        half *_BUF2, int N, int C, int H, int W,
                        int K, int R, int S, int pad_h, int pad_w,
                        int stride_h, int stride_w, int dilation_h,
                        int dilation_w) {
  const int OH = 1 + (H + 2 * pad_h - (((R - 1) * dilation_h) + 1)) / stride_h;
  const int OW = 1 + (W + 2 * pad_w - (((S - 1) * dilation_w) + 1)) / stride_w;

  CHECK_CUDA(
      cudaMemcpy(I_gpu, _I, sizeof(half) * N * C * H * W, cudaMemcpyHostToDevice));
  CHECK_CUDA(
      cudaMemcpy(F_gpu, _F, sizeof(half) * K * C * R * S, cudaMemcpyHostToDevice));

  dim3 griddim_im2col(NUM_SM);
  dim3 blockdim_im2col(MAX_BLOCK);
  im2col_kernel<<<griddim_im2col, blockdim_im2col>>>(I_gpu, BUF1_gpu, N, C, H, W, R, S,
                                                     pad_h, pad_w, stride_h, stride_w,
                                                     dilation_h, dilation_w, OH, OW);
  CHECK_CUDA(cudaGetLastError());

  // GEMM: F (K x C*R*S) times workspace (C*R*S x N*OH*OW) -> BUF2 (K x N*OH*OW)
  dim3 griddim_matmul(((N * OH * OW) + BLOCK_SIZE - 1) / BLOCK_SIZE,
                      (K + BLOCK_SIZE - 1) / BLOCK_SIZE);
  dim3 blockDim_matmul(BLOCK_SIZE, 4);
  matmul_kernel<<<griddim_matmul, blockDim_matmul>>>(F_gpu, BUF1_gpu, BUF2_gpu, K,
                                                     N * OH * OW, C * R * S);
  CHECK_CUDA(cudaGetLastError());

  dim3 griddim_reshape(NUM_SM);
  dim3 blockDim_reshape(MAX_BLOCK);
  reshape_kernel<<<griddim_reshape, blockDim_reshape>>>(BUF2_gpu, O_gpu, N, K, OH, OW);
  CHECK_CUDA(cudaGetLastError());

  CHECK_CUDA(
      cudaMemcpy(_O, O_gpu, sizeof(half) * N * K * OH * OW, cudaMemcpyDeviceToHost));

  // DO NOT REMOVE; NEEDED FOR TIME MEASURE
  CHECK_CUDA(cudaDeviceSynchronize());
}

void convolution(half *_I, half *_F, half *_O, half *_BUF1, half *_BUF2,
                 int N, int C, int H, int W, int K, int R, int S, int pad_h,
                 int pad_w, int stride_h, int stride_w, int dilation_h,
                 int dilation_w) {
  // Forward to the im2col-based GPU implementation above
  convolution_im2col(_I, _F, _O, _BUF1, _BUF2, N, C, H, W, K, R, S,
                     pad_h, pad_w, stride_h, stride_w, dilation_h,
                     dilation_w);
}

void convolution_initialize(int N, int C, int H, int W, int K, int R, int S,
                            int pad_h, int pad_w, int stride_h, int stride_w,
                            int dilation_h, int dilation_w) {
  const int OH = 1 + (H + 2 * pad_h - (((R - 1) * dilation_h) + 1)) / stride_h;
  const int OW = 1 + (W + 2 * pad_w - (((S - 1) * dilation_w) + 1)) / stride_w;

  size_t alloc = sizeof(half) * N * C * H * W + sizeof(half) * K * C * R * S +
                 sizeof(half) * N * K * OH * OW +
                 sizeof(half) * C * R * S * N * OH * OW +
                 sizeof(half) * N * K * OH * OW;
  printf("Alloc Memory : %lf GB\n", (double)alloc / 1e9);

  CHECK_CUDA(cudaMalloc((void **) &I_gpu, sizeof(half) * N * C * H * W));
  CHECK_CUDA(cudaMalloc((void **) &F_gpu, sizeof(half) * K * C * R * S));
  CHECK_CUDA(cudaMalloc((void **) &O_gpu, sizeof(half) * N * K * OH * OW));
  CHECK_CUDA(cudaMalloc((void **) &BUF1_gpu, sizeof(half) * C * R * S * N * OH * OW));
  CHECK_CUDA(cudaMalloc((void **) &BUF2_gpu, sizeof(half) * N * K * OH * OW));
}

void convolution_cleanup(half *_I, half *_F, half *_O, int N, int C, int H,
                         int W, int K, int R, int S, int pad_h, int pad_w,
                         int stride_h, int stride_w, int dilation_h,
                         int dilation_w) {
  CHECK_CUDA(cudaFree(I_gpu));
  CHECK_CUDA(cudaFree(F_gpu));
  CHECK_CUDA(cudaFree(O_gpu));
  CHECK_CUDA(cudaFree(BUF1_gpu));
  CHECK_CUDA(cudaFree(BUF2_gpu));
}
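The launches above implement the convolution as im2col followed by a single GEMM: F is viewed as a K x (C*R*S) matrix and the BUF1 workspace as (C*R*S) x (N*OH*OW), which is exactly why matmul_kernel is invoked with M = K, N = N*OH*OW, K = C*R*S. A small hedged helper (names are ours, not part of the commit) that mirrors those shapes and the workspace footprint:

#include <cuda_fp16.h>

// Hypothetical helper mirroring the shapes used by convolution_im2col above.
struct GemmShape { long long M, N, K, workspace_bytes; };

static GemmShape im2col_gemm_shape(int N, int C, int H, int W, int K, int R,
                                   int S, int pad_h, int pad_w, int stride_h,
                                   int stride_w, int dilation_h, int dilation_w) {
  const int OH = 1 + (H + 2 * pad_h - ((R - 1) * dilation_h + 1)) / stride_h;
  const int OW = 1 + (W + 2 * pad_w - ((S - 1) * dilation_w + 1)) / stride_w;
  GemmShape g;
  g.M = K;                                  // one GEMM row per filter
  g.N = (long long)N * OH * OW;             // one column per output position
  g.K = (long long)C * R * S;               // reduction over the filter window
  g.workspace_bytes = g.K * g.N * (long long)sizeof(half);  // BUF1_gpu size
  return g;
}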
@@ -0,0 +1,176 @@
#include <cstdlib>
#include <cstdio>

#include "convolution.cuh"

#define CHECK_CUDA(call)                                                 \
  do {                                                                   \
    cudaError_t status_ = call;                                          \
    if (status_ != cudaSuccess) {                                        \
      fprintf(stderr, "CUDA error (%s:%d): %s:%s\n", __FILE__, __LINE__, \
              cudaGetErrorName(status_), cudaGetErrorString(status_));   \
      exit(EXIT_FAILURE);                                                \
    }                                                                    \
  } while (0)

#define MAX_BLOCK 1024
#define NUM_SM 80
#define NUM_WARP 32  // used as the warp size here
#define BLOCK_PER_WARP (MAX_BLOCK / NUM_WARP)
#define BLOCK_SIZE 32

// Device(GPU) pointers
static float *I_gpu, *F_gpu, *O_gpu, *BUF1_gpu, *BUF2_gpu;

__global__ void im2col_kernel(float *_I, float *workspace, int N, int C, int H, int W,
                              int R, int S, int pad_h, int pad_w, int stride_h,
                              int stride_w, int dilation_h, int dilation_w,
                              int OH, int OW) {
  int tidx = threadIdx.x + blockIdx.x * blockDim.x;

  // Note: the stride below is MAX_BLOCK rather than gridDim.x * blockDim.x,
  // so blocks cover overlapping index ranges; the writes are idempotent,
  // so the result is still correct.
  for (int crs = 0; crs < C * R * S; ++crs) {
    int row_offset = crs * N * OH * OW;
    const int c = crs / (R * S);
    const int r = (crs / S) % R;
    const int s = crs % S;

    for (int nhw = tidx; nhw < N * OH * OW; nhw += MAX_BLOCK) {
      const int n = nhw / (OH * OW);
      const int oh = (nhw / OW) % OH;
      const int ow = nhw % OW;

      const int h = oh * stride_h - pad_h + r * dilation_h;
      const int w = ow * stride_w - pad_w + s * dilation_w;

      // Zero-fill padded positions instead of skipping them: the workspace
      // comes from cudaMalloc and would otherwise hold stale data there.
      workspace[row_offset + nhw] =
          (h < 0 || h >= H || w < 0 || w >= W)
              ? 0.f
              : _I[n * C * H * W + c * H * W + h * W + w];
    }
  }
}

static __global__ void matmul_kernel(float *A, float *B, float *C, int M, int N,
                                     int K) {
  int j = blockIdx.x * blockDim.x + threadIdx.x;
  int i = blockIdx.y * blockDim.y + threadIdx.y;
  int gj = blockIdx.x;
  int gi = blockIdx.y;
  if (gi * BLOCK_SIZE >= M || gj * BLOCK_SIZE >= N) return;  // boundary check
  int lj = threadIdx.x;
  int li = threadIdx.y;
  __shared__ float Alocal[BLOCK_SIZE][BLOCK_SIZE];
  __shared__ float Blocal[BLOCK_SIZE][BLOCK_SIZE];
  float c = 0.f;
  int A_row_index = (gi * BLOCK_SIZE + li);
  int B_col_index = (gj * BLOCK_SIZE + lj);
  for (int bk = 0; bk < K; bk += BLOCK_SIZE) {
    int A_col_index = bk + lj;
    Alocal[li][lj] = (A_row_index < M && A_col_index < K)
                         ? A[A_row_index * K + A_col_index]
                         : 0.f;
    int B_row_index = bk + li;
    Blocal[li][lj] = (B_row_index < K && B_col_index < N)
                         ? B[B_row_index * N + B_col_index]
                         : 0.f;
    __syncthreads();
    for (int lk = 0; lk < BLOCK_SIZE; ++lk) {
      c += Alocal[li][lk] * Blocal[lk][lj];
    }
    __syncthreads();
  }
  if (i < M && j < N) C[i * N + j] = c;
}

__global__ void reshape_kernel(float *_src, float *_dst, int N, int K, int OH, int OW) {
  int bidx = blockIdx.x;
  int widx = threadIdx.x / NUM_WARP;
  int lidx = threadIdx.x % NUM_WARP;

  for (int k = widx; k < K; k += BLOCK_PER_WARP) {
    for (int on = bidx; on < N; on += NUM_SM) {
      for (int hw = lidx; hw < OH * OW; hw += NUM_WARP) {
        _dst[on * K * OH * OW + k * OH * OW + hw] =
            _src[k * N * OH * OW + on * OH * OW + hw];
      }
    }
  }
}

void convolution_im2col(float *_I, float *_F, float *_O, float *_BUF1,
                        float *_BUF2, int N, int C, int H, int W,
                        int K, int R, int S, int pad_h, int pad_w,
                        int stride_h, int stride_w, int dilation_h,
                        int dilation_w) {
  const int OH = 1 + (H + 2 * pad_h - (((R - 1) * dilation_h) + 1)) / stride_h;
  const int OW = 1 + (W + 2 * pad_w - (((S - 1) * dilation_w) + 1)) / stride_w;

  CHECK_CUDA(
      cudaMemcpy(I_gpu, _I, sizeof(float) * N * C * H * W, cudaMemcpyHostToDevice));
  CHECK_CUDA(
      cudaMemcpy(F_gpu, _F, sizeof(float) * K * C * R * S, cudaMemcpyHostToDevice));

  dim3 griddim_im2col(NUM_SM);
  dim3 blockdim_im2col(MAX_BLOCK);
  im2col_kernel<<<griddim_im2col, blockdim_im2col>>>(I_gpu, BUF1_gpu, N, C, H, W, R, S,
                                                     pad_h, pad_w, stride_h, stride_w,
                                                     dilation_h, dilation_w, OH, OW);
  CHECK_CUDA(cudaGetLastError());

  dim3 griddim_matmul(((N * OH * OW) + BLOCK_SIZE - 1) / BLOCK_SIZE,
                      (K + BLOCK_SIZE - 1) / BLOCK_SIZE);
  dim3 blockDim_matmul(BLOCK_SIZE, BLOCK_SIZE);
  matmul_kernel<<<griddim_matmul, blockDim_matmul>>>(F_gpu, BUF1_gpu, BUF2_gpu, K,
                                                     N * OH * OW, C * R * S);
  CHECK_CUDA(cudaGetLastError());

  dim3 griddim_reshape(NUM_SM);
  dim3 blockDim_reshape(MAX_BLOCK);
  reshape_kernel<<<griddim_reshape, blockDim_reshape>>>(BUF2_gpu, O_gpu, N, K, OH, OW);
  CHECK_CUDA(cudaGetLastError());

  CHECK_CUDA(
      cudaMemcpy(_O, O_gpu, sizeof(float) * N * K * OH * OW, cudaMemcpyDeviceToHost));

  // DO NOT REMOVE; NEEDED FOR TIME MEASURE
  CHECK_CUDA(cudaDeviceSynchronize());
}

void convolution(float *_I, float *_F, float *_O, float *_BUF1, float *_BUF2,
                 int N, int C, int H, int W, int K, int R, int S, int pad_h,
                 int pad_w, int stride_h, int stride_w, int dilation_h,
                 int dilation_w) {
  // Forward to the im2col-based GPU implementation above
  convolution_im2col(_I, _F, _O, _BUF1, _BUF2, N, C, H, W, K, R, S,
                     pad_h, pad_w, stride_h, stride_w, dilation_h,
                     dilation_w);
}

void convolution_initialize(int N, int C, int H, int W, int K, int R, int S,
                            int pad_h, int pad_w, int stride_h, int stride_w,
                            int dilation_h, int dilation_w) {
  const int OH = 1 + (H + 2 * pad_h - (((R - 1) * dilation_h) + 1)) / stride_h;
  const int OW = 1 + (W + 2 * pad_w - (((S - 1) * dilation_w) + 1)) / stride_w;

  size_t alloc = sizeof(float) * N * C * H * W + sizeof(float) * K * C * R * S +
                 sizeof(float) * N * K * OH * OW +
                 sizeof(float) * C * R * S * N * OH * OW +
                 sizeof(float) * N * K * OH * OW;
  printf("Alloc Memory : %lf GB\n", (double)alloc / 1e9);

  CHECK_CUDA(cudaMalloc((void **) &I_gpu, sizeof(float) * N * C * H * W));
  CHECK_CUDA(cudaMalloc((void **) &F_gpu, sizeof(float) * K * C * R * S));
  CHECK_CUDA(cudaMalloc((void **) &O_gpu, sizeof(float) * N * K * OH * OW));
  CHECK_CUDA(cudaMalloc((void **) &BUF1_gpu, sizeof(float) * C * R * S * N * OH * OW));
  CHECK_CUDA(cudaMalloc((void **) &BUF2_gpu, sizeof(float) * N * K * OH * OW));
}

void convolution_cleanup(float *_I, float *_F, float *_O, int N, int C, int H,
                         int W, int K, int R, int S, int pad_h, int pad_w,
                         int stride_h, int stride_w, int dilation_h,
                         int dilation_w) {
  CHECK_CUDA(cudaFree(I_gpu));
  CHECK_CUDA(cudaFree(F_gpu));
  CHECK_CUDA(cudaFree(O_gpu));
  CHECK_CUDA(cudaFree(BUF1_gpu));
  CHECK_CUDA(cudaFree(BUF2_gpu));
}
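In both variants, reshape_kernel only permutes the GEMM output from the [K][N][OH*OW] layout produced by the matmul into the usual [N][K][OH*OW] output layout. A hedged host-side reference for that permutation (the loop form and function name are ours):

// Host reference for reshape_kernel: move the batch dimension outermost.
static void reshape_reference(const float *src, float *dst, int N, int K,
                              int OH, int OW) {
  for (int n = 0; n < N; ++n)
    for (int k = 0; k < K; ++k)
      for (int hw = 0; hw < OH * OW; ++hw)
        dst[(n * K + k) * OH * OW + hw] = src[(k * N + n) * OH * OW + hw];
}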
@@ -2,7 +2,7 @@ TARGET=main
 OBJECTS=main.o util.o convolution.o

 CPPFLAGS=-std=c++14 -O3 -Wall -march=native -mavx2 -mfma -fopenmp -mno-avx512f -I/usr/local/cuda/include
-CUDA_CFLAGS:=$(foreach option, $(CPPFLAGS),-Xcompiler=$(option))
+CUDA_CFLAGS:=$(foreach option, $(CPPFLAGS),-Xcompiler=$(option) -arch=sm_70)

 LDFLAGS=-L/usr/local/cuda/lib64
 LDLIBS=-lstdc++ -lcudart -lm -lcudnn
@@ -12,10 +12,9 @@
   } \
 } while (0)

 void naive_cpu_im2col(half *_I, half *workspace, int N, int C, int H, int W,
                       int R, int S, int pad_h, int pad_w, int stride_h,
                       int stride_w, int dilation_h, int dilation_w) {
-  half *I = _I;

   // Naive CPU im2col
   const int ON = N;
@@ -36,7 +35,7 @@ void naive_cpu_im2col(half *_I, half *workspace, int N, int C, int H, int W,

           workspace[((c * R * S) + (r * S) + s) * (ON * OH * OW) +
                     (on * OH * OW + oh * OW + ow)] =
-              I[n * C * H * W + c * H * W + h * W + w];
+              _I[n * C * H * W + c * H * W + h * W + w];
         }
       }
     }
@@ -46,10 +45,11 @@
 }

 void naive_cpu_matmul_TC(half *_A, half *_B, half *_C, int M, int N, int K) {
   for (int i = 0; i < M; i++) {
     for (int k = 0; k < K; k++) {
       for (int j = 0; j < N; j++) {
-        _C[i * N + j] += _A[i * K + k] * _B[k * N + j];
+        _C[i * N + j] = (half)((float)(_C[i * N + j]) +
+                               (float)_A[i * K + k] * (float)_B[k * N + j]);
       }
     }
   }
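The rewritten inner product accumulates in float and rounds back to half on every store, mainly because __half from <cuda_fp16.h> does not ship the host-side compound operators the old half_float::half type provided. A hedged illustration of the two accumulation styles (the function names are ours):

#include <cuda_fp16.h>

// Rounds to half after every step, matching naive_cpu_matmul_TC above.
static float dot_half_roundtrip(const half *a, const half *b, int K) {
  half acc = (half)0.0f;
  for (int k = 0; k < K; ++k)
    acc = (half)((float)acc + (float)a[k] * (float)b[k]);  // round each step
  return (float)acc;
}

// Keeps a float accumulator and rounds once at the end (less rounding error).
static float dot_float_acc(const half *a, const half *b, int K) {
  float acc = 0.0f;
  for (int k = 0; k < K; ++k) acc += (float)a[k] * (float)b[k];
  return acc;
}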
@@ -1,9 +1,6 @@
 #pragma once

-#include "half.hpp"
-
-using half_float::half;
-using namespace half_float::literal;
+#include <cuda_fp16.h>

 void convolution(half *_I, half *_F, half *_O, half *_BUF1, half *_BUF2,
                  int N, int C, int H, int W, int K, int R, int S, int pad_h,
@@ -5,12 +5,9 @@
 #include <string.h>

 #include "convolution.cuh"
-#include "half.hpp"
+#include <cuda_fp16.h>
 #include "util.h"

-using half_float::half;
-using namespace half_float::literal;
-
 static bool print = false;
 static bool validation = false;
 static int N = 1;
@@ -22,7 +22,7 @@ half *alloc_tensor(int N, int C, int H, int W) {

 void rand_tensor(half *m, int N, int C, int H, int W) {
   int L = N * C * H * W;
-  for (int j = 0; j < L; j++) { m[j] = (half) rand() / RAND_MAX - 0.5; }
+  for (int j = 0; j < L; j++) { m[j] = (half) ((float)rand() / RAND_MAX - 0.5); }
 }

 void zero_tensor(half *m, int N, int C, int H, int W) {
@@ -60,7 +60,7 @@ void check_convolution(half *I, half *F, half *O, int N, int C, int H, int W,
     for (int oc = 0; oc < OC; ++oc) {
       for (int oh = 0; oh < OH; ++oh) {
         for (int ow = 0; ow < OW; ++ow) {
-          half sum = 0.0_h;
+          half sum = (half)0.0;
           for (int c = 0; c < C; ++c) {
             for (int r = 0; r < R; ++r) {
               for (int s = 0; s < S; ++s) {
@@ -69,8 +69,8 @@ void check_convolution(half *I, half *F, half *O, int N, int C, int H, int W,
                 const int w = ow * stride_w - pad_w + s * dilation_w;
                 const int k = oc;
                 if (h < 0 || h >= H || w < 0 || w >= W) continue;
-                sum += I[((n * C + c) * H + h) * W + w] *
-                       F[((k * C + c) * R + r) * S + s];
+                sum = sum + (half)(I[((n * C + c) * H + h) * W + w] *
+                                   F[((k * C + c) * R + r) * S + s]);
               }
             }
           }
@@ -82,7 +82,7 @@ void check_convolution(half *I, half *F, half *O, int N, int C, int H, int W,

   bool is_valid = true;
   int cnt = 0, thr = 10;
-  half eps = 1e-3_h;
+  half eps = (half)1e-3;
   for (int on = 0; on < ON; ++on) {
     for (int oc = 0; oc < OC; ++oc) {
       for (int oh = 0; oh < OH; ++oh) {
@@ -1,9 +1,5 @@
 #pragma once

-#include "half.hpp"
-
-using half_float::half;
-using namespace half_float::literal;
+#include <cuda_fp16.h>

 double get_time();
@@ -0,0 +1,20 @@
TARGET=main
OBJECTS=util.o matmul.o main.o

CPPFLAGS=-std=c++11 -O3 -Wall -march=native -mavx2 -mno-avx512f -mfma -fopenmp
CPPFLAGS+= -I/usr/local/cuda/include/
LDFLAGS=-lm -lcudart -lcublas
LDFLAGS+=-L/usr/local/cuda/lib64

NVCC=/usr/local/cuda/bin/nvcc

all: $(TARGET)

$(TARGET): $(OBJECTS)
	g++ $(CPPFLAGS) $^ -o $@ $(LDFLAGS)

matmul.o: matmul.cu
	$(NVCC) -c -o $@ $^ -arch=sm_70

clean:
	rm -rf $(TARGET) $(OBJECTS)
@@ -0,0 +1,103 @@
#include <stdio.h>
#include <getopt.h>
#include <stdbool.h>
#include <stdlib.h>

#include <cuda_fp16.h>
#include "util.h"
#include "matmul.h"

static void print_help(const char *prog_name) {
  printf("Usage: %s [-pvh] [-n num_iterations] M N K\n", prog_name);
  printf("Options:\n");
  printf("  -p : print matrix data. (default: off)\n");
  printf("  -v : validate matrix multiplication. (default: off)\n");
  printf("  -h : print this page.\n");
  printf("  -t : number of threads (default: 1)\n");
  printf("  -n : number of iterations (default: 1)\n");
  printf("   M : number of rows of matrix A and C. (default: 8)\n");
  printf("   N : number of columns of matrix B and C. (default: 8)\n");
  printf("   K : number of columns of matrix A and rows of B. (default: 8)\n");
}

static bool print_matrix = false;
static bool validation = false;
static int M = 8, N = 8, K = 8;
static int num_iterations = 1;

static void parse_opt(int argc, char **argv) {
  int c;
  while ((c = getopt(argc, argv, "pvht:n:")) != -1) {
    switch (c) {
      case 'p':
        print_matrix = true;
        break;
      case 'v':
        validation = true;
        break;
      case 'n':
        num_iterations = atoi(optarg);
        break;
      case 'h':
      default:
        print_help(argv[0]);
        exit(0);
    }
  }
  for (int i = optind, j = 0; i < argc; ++i, ++j) {
    switch (j) {
      case 0: M = atoi(argv[i]); break;
      case 1: N = atoi(argv[i]); break;
      case 2: K = atoi(argv[i]); break;
      default: break;
    }
  }
  printf("Options:\n");
  printf("  Problem size: M = %d, N = %d, K = %d\n", M, N, K);
  printf("  Number of iterations: %d\n", num_iterations);
  printf("  Print matrix: %s\n", print_matrix ? "on" : "off");
  printf("  Validation: %s\n", validation ? "on" : "off");
  printf("\n");
}

int main(int argc, char **argv) {
  parse_opt(argc, argv);

  printf("Initializing... "); fflush(stdout);
  half *A = alloc_mat(M, K);
  half *B = alloc_mat(K, N);
  float *C = alloc_mat_float(M, N);
  rand_mat(A, M, K);
  rand_mat(B, K, N);
  matmul_init(M, N, K);
  printf("done!\n"); fflush(stdout);

  double elapsed_time_sum = 0;
  for (int i = 0; i < num_iterations; ++i) {
    printf("Calculating...(iter=%d) ", i); fflush(stdout);
    zero_mat_float(C, M, N);
    double start_time = get_time();
    matmul(A, B, C, M, N, K);
    double elapsed_time = get_time() - start_time;
    printf("%f sec\n", elapsed_time);
    elapsed_time_sum += elapsed_time;
  }

  if (print_matrix) {
    printf("MATRIX A:\n"); print_mat(A, M, K);
    printf("MATRIX B:\n"); print_mat(B, K, N);
    printf("MATRIX C:\n"); print_mat_float(C, M, N);
  }

  matmul_cleanup(A, B, C, M, N, K);

  if (validation) {
    check_matmul(A, B, C, M, N, K);
  }

  double elapsed_time_avg = elapsed_time_sum / num_iterations;
  printf("Avg. time: %f sec\n", elapsed_time_avg);
  printf("Avg. throughput: %f GFLOPS\n", 2.0 * M * N * K / elapsed_time_avg / 1e9);

  return 0;
}
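For context on the final report: the throughput line counts one multiply and one add per inner-product term, i.e. 2*M*N*K floating-point operations in total. For example, M = N = K = 4096 gives 2 * 4096^3 ≈ 1.374e11 FLOPs, so an average time of 0.01 s would report roughly 13,744 GFLOPS.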
@@ -0,0 +1,57 @@
#include <cstdio>
#include <mma.h>

#include "matmul.h"

using namespace nvcuda;

#define CHECK_CUDA(call)                                                 \
  do {                                                                   \
    cudaError_t status_ = call;                                          \
    if (status_ != cudaSuccess) {                                        \
      fprintf(stderr, "CUDA error (%s:%d): %s:%s\n", __FILE__, __LINE__, \
              cudaGetErrorName(status_), cudaGetErrorString(status_));   \
      exit(EXIT_FAILURE);                                                \
    }                                                                    \
  } while (0)

// Device(GPU) pointers
static float *A_gpu, *B_gpu, *C_gpu;

void naive_cpu_matmul(half *_A, half *_B, float *_C, int M, int N, int K) {
  for (int i = 0; i < M; i++) {
    for (int k = 0; k < K; k++) {
      for (int j = 0; j < N; j++) {
        _C[i * N + j] += (float)_A[i * K + k] * (float)_B[k * N + j];
      }
    }
  }
}

void matmul(half *_A, half *_B, float *_C, int M, int N, int K) {
  // Remove this line after you complete the matmul on GPU
  naive_cpu_matmul(_A, _B, _C, M, N, K);

  // (TODO) Upload A and B matrix to GPU

  // (TODO) Launch kernel on a GPU

  // (TODO) Download C matrix from GPU

  // DO NOT REMOVE; NEEDED FOR TIME MEASURE
  CHECK_CUDA(cudaDeviceSynchronize());
}

void matmul_init(int M, int N, int K) {
  // (TODO) Allocate device memory

  // DO NOT REMOVE; NEEDED FOR TIME MEASURE
  CHECK_CUDA(cudaDeviceSynchronize());
}

void matmul_cleanup(half *_A, half *_B, float *_C, int M, int N, int K) {
  // (TODO) Do any post-matmul cleanup work here.

  // DO NOT REMOVE; NEEDED FOR TIME MEASURE
  CHECK_CUDA(cudaDeviceSynchronize());
}
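The TODOs in this skeleton are what the completed TC variant at the top of this commit fills in. A minimal sketch mirroring that file (it assumes matmul_kernel and the device pointers are in scope, and note the skeleton's pointers would need to become half *A_gpu, *B_gpu to match the half-precision inputs):

// Sketch of the filled-in skeleton, following the completed matmul.cu above.
void matmul(half *_A, half *_B, float *_C, int M, int N, int K) {
  // Upload A and B to the GPU
  CHECK_CUDA(cudaMemcpy(A_gpu, _A, M * K * sizeof(half), cudaMemcpyHostToDevice));
  CHECK_CUDA(cudaMemcpy(B_gpu, _B, K * N * sizeof(half), cudaMemcpyHostToDevice));

  // Launch the kernel: one 32x4 block per 32x32 tile of C
  dim3 blockDim(32, 4);
  dim3 gridDim((N + 31) / 32, (M + 31) / 32);
  matmul_kernel<<<gridDim, blockDim>>>(A_gpu, B_gpu, C_gpu, M, N, K);
  CHECK_CUDA(cudaGetLastError());

  // Download C from the GPU (this blocking copy also synchronizes)
  CHECK_CUDA(cudaMemcpy(_C, C_gpu, M * N * sizeof(float), cudaMemcpyDeviceToHost));
}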
@@ -0,0 +1,7 @@
#pragma once

#include <cuda_fp16.h>

void matmul(half *_A, half *_B, float *_C, int M, int N, int K);
void matmul_init(int M, int N, int K);
void matmul_cleanup(half *_A, half *_B, float *_C, int M, int N, int K);
@@ -0,0 +1,96 @@
#include "util.h"

#include <omp.h>
#include <sys/time.h>

#include <cmath>
#include <cstdbool>
#include <cstdio>
#include <cstdlib>
#include <cstring>

double get_time() {
  struct timeval tv;
  gettimeofday(&tv, 0);
  return tv.tv_sec + tv.tv_usec * 1e-6;
}

void check_matmul(half *A, half *B, float *C, int M, int N, int K) {
  printf("Validating...\n");

  float *C_ans = alloc_mat_float(M, N);
  zero_mat_float(C_ans, M, N);

#pragma omp parallel for num_threads(20)
  for (int i = 0; i < M; ++i) {
    for (int k = 0; k < K; ++k) {
      for (int j = 0; j < N; ++j) {
        C_ans[i * N + j] = C_ans[i * N + j] + (float)((A[i * K + k]) * (B[k * N + j]));
      }
    }
  }

  bool is_valid = true;
  int cnt = 0, thr = 10;
  float eps = 1e-3;
  for (int i = 0; i < M; ++i) {
    for (int j = 0; j < N; ++j) {
      float c = C[i * N + j];
      float c_ans = C_ans[i * N + j];
      if (fabsf(c - c_ans) > eps &&
          (c_ans == 0 || fabsf((c - c_ans) / c_ans) > eps)) {
        ++cnt;
        if (cnt <= thr)
          printf("C[%d][%d] : correct_value = %f, your_value = %f\n", i, j,
                 (float)c_ans, (float)c);
        if (cnt == thr + 1)
          printf("Too many errors; only the first %d values are printed.\n", thr);
        is_valid = false;
      }
    }
  }

  if (is_valid) {
    printf("Result: VALID\n");
  } else {
    printf("Result: INVALID\n");
  }
}

void print_mat(half *m, int R, int C) {
  for (int i = 0; i < R; ++i) {
    for (int j = 0; j < C; ++j) { printf("%+.3f ", (float)(m[i * C + j])); }
    printf("\n");
  }
}

void print_mat_float(float *m, int R, int C) {
  for (int i = 0; i < R; ++i) {
    for (int j = 0; j < C; ++j) { printf("%+.3f ", (float)(m[i * C + j])); }
    printf("\n");
  }
}

half *alloc_mat(int R, int C) {
  half *m = (half *) aligned_alloc(32, sizeof(half) * R * C);
  return m;
}

float *alloc_mat_float(int R, int C) {
  float *m = (float *) aligned_alloc(32, sizeof(float) * R * C);
  return m;
}

void rand_mat(half *m, int R, int C) {
  for (int i = 0; i < R; i++) {
    for (int j = 0; j < C; j++) {
      m[i * C + j] = (half) ((float)rand() / RAND_MAX - 0.5);
    }
  }
}

void zero_mat(half *m, int R, int C) { memset(m, 0, sizeof(half) * R * C); }

void zero_mat_float(float *m, int R, int C) { memset(m, 0, sizeof(float) * R * C); }
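check_matmul accepts an entry if it passes either an absolute or a relative tolerance of 1e-3, which keeps the half-precision GEMM from failing on large-magnitude outputs. The acceptance predicate, extracted for clarity (the function name is ours):

#include <cmath>

// A value passes if it is within 1e-3 absolutely, or within 0.1% relatively.
static bool matches(float c, float c_ans, float eps = 1e-3f) {
  if (fabsf(c - c_ans) <= eps) return true;
  return c_ans != 0 && fabsf((c - c_ans) / c_ans) <= eps;
}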
@@ -0,0 +1,20 @@
#pragma once
#include <cuda_fp16.h>

double get_time();

void check_matmul(half *A, half *B, float *C, int M, int N, int K);

void print_mat(half *m, int R, int C);

void print_mat_float(float *m, int R, int C);

half* alloc_mat(int R, int C);

float* alloc_mat_float(int R, int C);

void rand_mat(half *m, int R, int C);

void zero_mat(half *m, int R, int C);

void zero_mat_float(float *m, int R, int C);