WIP on APSS24 Project:

- Add CHECK_CUDA Macro and cudaDeviceSync
- Set cudaSetDevice(0) in main.cpp
Jaehwan Lee 2024-08-20 21:04:33 +09:00
parent c64f7a5615
commit 0f63360f03
10 changed files with 52 additions and 14 deletions

View File

@@ -12,10 +12,12 @@ void Linear(Tensor *in, Tensor *w, Tensor *b, Tensor *out);
/* Data movement operations */
void Reshape(Tensor *in, Tensor *out);
/* Other operations */
void BatchNorm2d(Tensor *in, Tensor *weight, Tensor *bias, Tensor *out);
/* Convolutional operations */
void ConvTranspose2d(Tensor *in, Tensor *weight, Tensor *bias, Tensor *out);
void Conv2d(Tensor *in, Tensor *w, Tensor *b, Tensor *out);
/* Other operations */
void BatchNorm2d(Tensor *in, Tensor *weight, Tensor *bias, Tensor *out);
/* Example GPU kernel */
void LeakyReLU_cuda(Tensor *inout);

View File

@@ -1,6 +1,7 @@
#pragma once
#include <vector>
#include <cstdio>
#include "half.hpp" /* for half on CPU ('half_cpu') */
#include "cuda_fp16.h" /* for half on GPU ('half') */

View File

@@ -1,7 +1,7 @@
#!/bin/bash
srun -N 1 --partition PB --exclusive \
./main $@
./main -v $@
# ./main $@

View File

@@ -1,5 +1,15 @@
#include "layer.h"
#define CHECK_CUDA(call) \
do { \
cudaError_t status_ = call; \
if (status_ != cudaSuccess) { \
fprintf(stderr, "CUDA error (%s:%d): %s:%s\n", __FILE__, __LINE__, \
cudaGetErrorName(status_), cudaGetErrorString(status_)); \
exit(EXIT_FAILURE); \
} \
} while (0)
/* Linear
* @param [in1] in: [M, K]
* @param [in2] w: [N, K]
@@ -206,13 +216,14 @@ void LeakyReLU_cuda(Tensor *inout) {
half *d_inout;
cudaMalloc(&d_inout, N * sizeof(half));
cudaMemcpy(d_inout, inout->buf, N * sizeof(half), cudaMemcpyHostToDevice);
CHECK_CUDA(cudaMalloc(&d_inout, N * sizeof(half)));
CHECK_CUDA(cudaMemcpy(d_inout, inout->buf, N * sizeof(half), cudaMemcpyHostToDevice));
LeakyReLU_kernel<<<(N + 255) / 256, 256>>>(d_inout, N, alpha);
CHECK_CUDA(cudaDeviceSynchronize());
cudaMemcpy(inout->buf, d_inout, N * sizeof(half), cudaMemcpyDeviceToHost);
cudaFree(d_inout);
CHECK_CUDA(cudaMemcpy(inout->buf, d_inout, N * sizeof(half), cudaMemcpyDeviceToHost));
CHECK_CUDA(cudaFree(d_inout));
}
/* Conv2d
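
Note that CHECK_CUDA can only wrap calls that return a cudaError_t, so the LeakyReLU_kernel<<<...>>> launch itself is not checked directly; only the cudaDeviceSynchronize() that follows it is. A minimal, hedged sketch of how a launch check could also be added (cudaGetLastError() is the standard API for this; it is not part of this commit):

LeakyReLU_kernel<<<(N + 255) / 256, 256>>>(d_inout, N, alpha);
/* Not in this commit: catch launch-configuration errors immediately. */
CHECK_CUDA(cudaGetLastError());
/* Then surface any errors raised while the kernel executed. */
CHECK_CUDA(cudaDeviceSynchronize());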

View File

@@ -164,8 +164,11 @@ int main(int argc, char **argv) {
cudaSetDevice(i);
cudaDeviceSynchronize();
}
cudaSetDevice(0);
fprintf(stdout, "Generating images...");
fflush(stdout);
st = get_time();
/* Call the main computation (optimization target) of the program. */
@@ -175,6 +178,8 @@ int main(int argc, char **argv) {
cudaSetDevice(i);
cudaDeviceSynchronize();
}
cudaSetDevice(0);
et = get_time();
/* Print the result */
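
For context: the loop above selects each GPU in turn and waits for its queued work, which leaves the last GPU as the current device; the added cudaSetDevice(0) restores device 0 so the host code that follows (timing, result handling) targets GPU 0 again. A minimal sketch of the pattern, where num_devices is an assumed name for the GPU count used by the loop:

for (int i = 0; i < num_devices; i++) {
  cudaSetDevice(i);          /* make GPU i the current device */
  cudaDeviceSynchronize();   /* block until all work queued on GPU i finishes */
}
cudaSetDevice(0);            /* reset so later single-device calls hit GPU 0 */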

View File

@@ -12,10 +12,12 @@ void Linear(Tensor *in, Tensor *w, Tensor *b, Tensor *out);
/* Data movement operations */
void Reshape(Tensor *in, Tensor *out);
/* Other operations */
void BatchNorm2d(Tensor *in, Tensor *weight, Tensor *bias, Tensor *out);
/* Convolutional operations */
void ConvTranspose2d(Tensor *in, Tensor *weight, Tensor *bias, Tensor *out);
void Conv2d(Tensor *in, Tensor *w, Tensor *b, Tensor *out);
/* Other operations */
void BatchNorm2d(Tensor *in, Tensor *weight, Tensor *bias, Tensor *out);
/* Example GPU kernel */
void LeakyReLU_cuda(Tensor *inout);

View File

@@ -1,6 +1,7 @@
#pragma once
#include <vector>
#include <cstdio>
using std::vector;

View File

@@ -1,7 +1,7 @@
#!/bin/bash
srun -N 1 --partition PB --exclusive \
./main $@
./main -v $@
# ./main $@

View File

@@ -1,5 +1,15 @@
#include "layer.h"
#define CHECK_CUDA(call) \
do { \
cudaError_t status_ = call; \
if (status_ != cudaSuccess) { \
fprintf(stderr, "CUDA error (%s:%d): %s:%s\n", __FILE__, __LINE__, \
cudaGetErrorName(status_), cudaGetErrorString(status_)); \
exit(EXIT_FAILURE); \
} \
} while (0)
/* Linear
* @param [in1] in: [M, K]
* @param [in2] w: [N, K]
@@ -204,13 +214,14 @@ void LeakyReLU_cuda(Tensor *inout) {
float *d_inout;
cudaMalloc(&d_inout, N * sizeof(float));
cudaMemcpy(d_inout, inout->buf, N * sizeof(float), cudaMemcpyHostToDevice);
CHECK_CUDA(cudaMalloc(&d_inout, N * sizeof(float)));
CHECK_CUDA(cudaMemcpy(d_inout, inout->buf, N * sizeof(float), cudaMemcpyHostToDevice));
LeakyReLU_kernel<<<(N + 255) / 256, 256>>>(d_inout, N, alpha);
CHECK_CUDA(cudaDeviceSynchronize());
cudaMemcpy(inout->buf, d_inout, N * sizeof(float), cudaMemcpyDeviceToHost);
cudaFree(d_inout);
CHECK_CUDA(cudaMemcpy(inout->buf, d_inout, N * sizeof(float), cudaMemcpyDeviceToHost));
CHECK_CUDA(cudaFree(d_inout));
}
/* Conv2d

View File

@@ -164,8 +164,11 @@ int main(int argc, char **argv) {
cudaSetDevice(i);
cudaDeviceSynchronize();
}
cudaSetDevice(0);
fprintf(stdout, "Generating images...");
fflush(stdout);
st = get_time();
/* Call the main computation (optimization target) of the program. */
@@ -175,6 +178,8 @@ int main(int argc, char **argv) {
cudaSetDevice(i);
cudaDeviceSynchronize();
}
cudaSetDevice(0);
et = get_time();
/* Print the result */