WIP on APSS24 Project:

- Add CHECK_CUDA macro and a checked cudaDeviceSynchronize() after the kernel launch in layer.cu - Call cudaSetDevice(0) after the per-device synchronization loops in main.cpp
2024-08-20 21:04:33 +09:00 · 2024-08-20 21:04:33 +09:00 · 0f63360f03
parent c64f7a5615
commit 0f63360f03
10 changed files with 52 additions and 14 deletions
--- a/APSS24/project/advanced/include/layer.h
+++ b/APSS24/project/advanced/include/layer.h
@ -12,10 +12,12 @@ void Linear(Tensor *in, Tensor *w, Tensor *b, Tensor *out);
 /* Data movement operations */
 void Reshape(Tensor *in, Tensor *out);

-/* Other operations */
-void BatchNorm2d(Tensor *in, Tensor *weight, Tensor *bias, Tensor *out);
+/* Convolutional operations */
 void ConvTranspose2d(Tensor *in, Tensor *weight, Tensor *bias, Tensor *out);
 void Conv2d(Tensor *in, Tensor *w, Tensor *b, Tensor *out);

+/* Other operations */
+void BatchNorm2d(Tensor *in, Tensor *weight, Tensor *bias, Tensor *out);
+
 /* Example GPU kernel */
 void LeakyReLU_cuda(Tensor *inout);
--- a/APSS24/project/advanced/include/tensor.h
+++ b/APSS24/project/advanced/include/tensor.h
@ -1,6 +1,7 @@
 #pragma once

 #include <vector>
+#include <cstdio>

 #include "half.hpp" /* for half on CPU ('half_cpu') */
 #include "cuda_fp16.h" /* for half on GPU ('half') */
--- a/APSS24/project/advanced/run.sh
+++ b/APSS24/project/advanced/run.sh
@ -1,7 +1,7 @@
 #!/bin/bash

 srun -N 1 --partition PB --exclusive \
-	./main $@
+	./main -v "$@"  # quote "$@" so arguments containing spaces or globs are forwarded intact

 	# ./main $@
 		
--- a/APSS24/project/advanced/src/layer.cu
+++ b/APSS24/project/advanced/src/layer.cu
@ -1,5 +1,15 @@
 #include "layer.h"

+#define CHECK_CUDA(call) /* abort with file:line if a CUDA call fails */ \
+  do {                                                                   \
+    cudaError_t status_ = (call); /* evaluate call exactly once */       \
+    if (status_ != cudaSuccess) {                                        \
+      fprintf(stderr, "CUDA error (%s:%d): %s:%s\n", __FILE__, __LINE__, \
+              cudaGetErrorName(status_), cudaGetErrorString(status_));   \
+      exit(EXIT_FAILURE);                                                \
+    }                                                                    \
+  } while (0)
+
 /* Linear
 * @param [in1]  in: [M, K]
 * @param [in2]   w: [N, K]
@ -206,13 +216,14 @@ void LeakyReLU_cuda(Tensor *inout) {
  
  half *d_inout;

-  cudaMalloc(&d_inout, N * sizeof(half));
-  cudaMemcpy(d_inout, inout->buf, N * sizeof(half), cudaMemcpyHostToDevice);
+  CHECK_CUDA(cudaMalloc(&d_inout, N * sizeof(half)));
+  CHECK_CUDA(cudaMemcpy(d_inout, inout->buf, N * sizeof(half), cudaMemcpyHostToDevice));

  LeakyReLU_kernel<<<(N + 255) / 256, 256>>>(d_inout, N, alpha);
+  CHECK_CUDA(cudaDeviceSynchronize());

-  cudaMemcpy(inout->buf, d_inout, N * sizeof(half), cudaMemcpyDeviceToHost);
-  cudaFree(d_inout);
+  CHECK_CUDA(cudaMemcpy(inout->buf, d_inout, N * sizeof(half), cudaMemcpyDeviceToHost));
+  CHECK_CUDA(cudaFree(d_inout));
 }

 /* Conv2d
--- a/APSS24/project/advanced/src/main.cpp
+++ b/APSS24/project/advanced/src/main.cpp
@ -164,8 +164,11 @@ int main(int argc, char **argv) {
    cudaSetDevice(i);
    cudaDeviceSynchronize();
  }
+  cudaSetDevice(0);
+
  fprintf(stdout, "Generating images...");
  fflush(stdout);
+  
  st = get_time();

  /* Call the main computation (optimization target) of the program. */
@ -175,6 +178,8 @@ int main(int argc, char **argv) {
    cudaSetDevice(i);
    cudaDeviceSynchronize();
  }
+  cudaSetDevice(0);
+
  et = get_time();

  /* Print the result */
--- a/APSS24/project/basic/include/layer.h
+++ b/APSS24/project/basic/include/layer.h
@ -12,10 +12,12 @@ void Linear(Tensor *in, Tensor *w, Tensor *b, Tensor *out);
 /* Data movement operations */
 void Reshape(Tensor *in, Tensor *out);

-/* Other operations */
-void BatchNorm2d(Tensor *in, Tensor *weight, Tensor *bias, Tensor *out);
+/* Convolutional operations */
 void ConvTranspose2d(Tensor *in, Tensor *weight, Tensor *bias, Tensor *out);
 void Conv2d(Tensor *in, Tensor *w, Tensor *b, Tensor *out);

+/* Other operations */
+void BatchNorm2d(Tensor *in, Tensor *weight, Tensor *bias, Tensor *out);
+
 /* Example GPU kernel */
 void LeakyReLU_cuda(Tensor *inout);
--- a/APSS24/project/basic/include/tensor.h
+++ b/APSS24/project/basic/include/tensor.h
@ -1,6 +1,7 @@
 #pragma once

 #include <vector>
+#include <cstdio>

 using std::vector;

--- a/APSS24/project/basic/run.sh
+++ b/APSS24/project/basic/run.sh
@ -1,7 +1,7 @@
 #!/bin/bash

 srun -N 1 --partition PB --exclusive \
-	./main $@
+	./main -v "$@"  # quote "$@" so arguments containing spaces or globs are forwarded intact

 	# ./main $@
 		
--- a/APSS24/project/basic/src/layer.cu
+++ b/APSS24/project/basic/src/layer.cu
@ -1,5 +1,15 @@
 #include "layer.h"

+#define CHECK_CUDA(call) /* abort with file:line if a CUDA call fails */ \
+  do {                                                                   \
+    cudaError_t status_ = (call); /* evaluate call exactly once */       \
+    if (status_ != cudaSuccess) {                                        \
+      fprintf(stderr, "CUDA error (%s:%d): %s:%s\n", __FILE__, __LINE__, \
+              cudaGetErrorName(status_), cudaGetErrorString(status_));   \
+      exit(EXIT_FAILURE);                                                \
+    }                                                                    \
+  } while (0)
+
 /* Linear
 * @param [in1]  in: [M, K]
 * @param [in2]   w: [N, K]
@ -204,13 +214,14 @@ void LeakyReLU_cuda(Tensor *inout) {

  float *d_inout;

-  cudaMalloc(&d_inout, N * sizeof(float));
-  cudaMemcpy(d_inout, inout->buf, N * sizeof(float), cudaMemcpyHostToDevice);
+  CHECK_CUDA(cudaMalloc(&d_inout, N * sizeof(float)));
+  CHECK_CUDA(cudaMemcpy(d_inout, inout->buf, N * sizeof(float), cudaMemcpyHostToDevice));

  LeakyReLU_kernel<<<(N + 255) / 256, 256>>>(d_inout, N, alpha);
+  CHECK_CUDA(cudaDeviceSynchronize());

-  cudaMemcpy(inout->buf, d_inout, N * sizeof(float), cudaMemcpyDeviceToHost);
-  cudaFree(d_inout);
+  CHECK_CUDA(cudaMemcpy(inout->buf, d_inout, N * sizeof(float), cudaMemcpyDeviceToHost));
+  CHECK_CUDA(cudaFree(d_inout));
 }

 /* Conv2d
--- a/APSS24/project/basic/src/main.cpp
+++ b/APSS24/project/basic/src/main.cpp
@ -164,8 +164,11 @@ int main(int argc, char **argv) {
    cudaSetDevice(i);
    cudaDeviceSynchronize();
  }
+  cudaSetDevice(0);
+
  fprintf(stdout, "Generating images...");
  fflush(stdout);
+
  st = get_time();

  /* Call the main computation (optimization target) of the program. */
@ -175,6 +178,8 @@ int main(int argc, char **argv) {
    cudaSetDevice(i);
    cudaDeviceSynchronize();
  }
+  cudaSetDevice(0);
+
  et = get_time();

  /* Print the result */