WIP on APSS24 Project:
- Add CHECK_CUDA macro and a cudaDeviceSynchronize() call - Call cudaSetDevice(0) in main.cpp
This commit is contained in:
parent
c64f7a5615
commit
0f63360f03
|
@ -12,10 +12,12 @@ void Linear(Tensor *in, Tensor *w, Tensor *b, Tensor *out);
|
|||
/* Data movement operations */
|
||||
void Reshape(Tensor *in, Tensor *out);
|
||||
|
||||
/* Other operations */
|
||||
void BatchNorm2d(Tensor *in, Tensor *weight, Tensor *bias, Tensor *out);
|
||||
/* Convolutional operations */
|
||||
void ConvTranspose2d(Tensor *in, Tensor *weight, Tensor *bias, Tensor *out);
|
||||
void Conv2d(Tensor *in, Tensor *w, Tensor *b, Tensor *out);
|
||||
|
||||
/* Other operations */
|
||||
void BatchNorm2d(Tensor *in, Tensor *weight, Tensor *bias, Tensor *out);
|
||||
|
||||
/* Example GPU kernel */
|
||||
void LeakyReLU_cuda(Tensor *inout);
|
|
@ -1,6 +1,7 @@
|
|||
#pragma once
|
||||
|
||||
#include <vector>
|
||||
#include <cstdio>
|
||||
|
||||
#include "half.hpp" /* for half on CPU ('half_cpu') */
|
||||
#include "cuda_fp16.h" /* for half on GPU ('half') */
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
#!/bin/bash
|
||||
|
||||
srun -N 1 --partition PB --exclusive \
|
||||
./main $@
|
||||
./main -v $@
|
||||
|
||||
# ./main $@
|
||||
|
|
@ -1,5 +1,15 @@
|
|||
#include "layer.h"
|
||||
|
||||
/* CHECK_CUDA
 * Evaluates `call` exactly once and inspects the cudaError_t it yields.
 * When the status is not cudaSuccess, the source location plus the error
 * name and error string are printed to stderr and the process exits with
 * EXIT_FAILURE.
 * The do { ... } while (0) wrapper makes the expansion a single statement,
 * so the macro can be used safely in an unbraced if/else.
 * @param [in] call: expression that yields a cudaError_t
 */
#define CHECK_CUDA(call)                                                  \
  do {                                                                    \
    /* Parenthesized so any argument expression expands as one unit. */   \
    cudaError_t status_ = (call);                                         \
    if (status_ != cudaSuccess) {                                         \
      fprintf(stderr, "CUDA error (%s:%d): %s:%s\n", __FILE__, __LINE__,  \
              cudaGetErrorName(status_), cudaGetErrorString(status_));    \
      exit(EXIT_FAILURE);                                                 \
    }                                                                     \
  } while (0)
|
||||
|
||||
/* Linear
|
||||
* @param [in1] in: [M, K]
|
||||
* @param [in2] w: [N, K]
|
||||
|
@ -206,13 +216,14 @@ void LeakyReLU_cuda(Tensor *inout) {
|
|||
|
||||
half *d_inout;
|
||||
|
||||
cudaMalloc(&d_inout, N * sizeof(half));
|
||||
cudaMemcpy(d_inout, inout->buf, N * sizeof(half), cudaMemcpyHostToDevice);
|
||||
CHECK_CUDA(cudaMalloc(&d_inout, N * sizeof(half)));
|
||||
CHECK_CUDA(cudaMemcpy(d_inout, inout->buf, N * sizeof(half), cudaMemcpyHostToDevice));
|
||||
|
||||
LeakyReLU_kernel<<<(N + 255) / 256, 256>>>(d_inout, N, alpha);
|
||||
CHECK_CUDA(cudaDeviceSynchronize());
|
||||
|
||||
cudaMemcpy(inout->buf, d_inout, N * sizeof(half), cudaMemcpyDeviceToHost);
|
||||
cudaFree(d_inout);
|
||||
CHECK_CUDA(cudaMemcpy(inout->buf, d_inout, N * sizeof(half), cudaMemcpyDeviceToHost));
|
||||
CHECK_CUDA(cudaFree(d_inout));
|
||||
}
|
||||
|
||||
/* Conv2d
|
||||
|
|
|
@ -164,8 +164,11 @@ int main(int argc, char **argv) {
|
|||
cudaSetDevice(i);
|
||||
cudaDeviceSynchronize();
|
||||
}
|
||||
cudaSetDevice(0);
|
||||
|
||||
fprintf(stdout, "Generating images...");
|
||||
fflush(stdout);
|
||||
|
||||
st = get_time();
|
||||
|
||||
/* Call the main computation (optimization target) of the program. */
|
||||
|
@ -175,6 +178,8 @@ int main(int argc, char **argv) {
|
|||
cudaSetDevice(i);
|
||||
cudaDeviceSynchronize();
|
||||
}
|
||||
cudaSetDevice(0);
|
||||
|
||||
et = get_time();
|
||||
|
||||
/* Print the result */
|
||||
|
|
|
@ -12,10 +12,12 @@ void Linear(Tensor *in, Tensor *w, Tensor *b, Tensor *out);
|
|||
/* Data movement operations */
|
||||
void Reshape(Tensor *in, Tensor *out);
|
||||
|
||||
/* Other operations */
|
||||
void BatchNorm2d(Tensor *in, Tensor *weight, Tensor *bias, Tensor *out);
|
||||
/* Convolutional operations */
|
||||
void ConvTranspose2d(Tensor *in, Tensor *weight, Tensor *bias, Tensor *out);
|
||||
void Conv2d(Tensor *in, Tensor *w, Tensor *b, Tensor *out);
|
||||
|
||||
/* Other operations */
|
||||
void BatchNorm2d(Tensor *in, Tensor *weight, Tensor *bias, Tensor *out);
|
||||
|
||||
/* Example GPU kernel */
|
||||
void LeakyReLU_cuda(Tensor *inout);
|
|
@ -1,6 +1,7 @@
|
|||
#pragma once
|
||||
|
||||
#include <vector>
|
||||
#include <cstdio>
|
||||
|
||||
using std::vector;
|
||||
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
#!/bin/bash
|
||||
|
||||
srun -N 1 --partition PB --exclusive \
|
||||
./main $@
|
||||
./main -v $@
|
||||
|
||||
# ./main $@
|
||||
|
|
@ -1,5 +1,15 @@
|
|||
#include "layer.h"
|
||||
|
||||
/* CHECK_CUDA
 * Evaluates `call` exactly once and inspects the cudaError_t it yields.
 * When the status is not cudaSuccess, the source location plus the error
 * name and error string are printed to stderr and the process exits with
 * EXIT_FAILURE.
 * The do { ... } while (0) wrapper makes the expansion a single statement,
 * so the macro can be used safely in an unbraced if/else.
 * @param [in] call: expression that yields a cudaError_t
 */
#define CHECK_CUDA(call)                                                  \
  do {                                                                    \
    /* Parenthesized so any argument expression expands as one unit. */   \
    cudaError_t status_ = (call);                                         \
    if (status_ != cudaSuccess) {                                         \
      fprintf(stderr, "CUDA error (%s:%d): %s:%s\n", __FILE__, __LINE__,  \
              cudaGetErrorName(status_), cudaGetErrorString(status_));    \
      exit(EXIT_FAILURE);                                                 \
    }                                                                     \
  } while (0)
|
||||
|
||||
/* Linear
|
||||
* @param [in1] in: [M, K]
|
||||
* @param [in2] w: [N, K]
|
||||
|
@ -204,13 +214,14 @@ void LeakyReLU_cuda(Tensor *inout) {
|
|||
|
||||
float *d_inout;
|
||||
|
||||
cudaMalloc(&d_inout, N * sizeof(float));
|
||||
cudaMemcpy(d_inout, inout->buf, N * sizeof(float), cudaMemcpyHostToDevice);
|
||||
CHECK_CUDA(cudaMalloc(&d_inout, N * sizeof(float)));
|
||||
CHECK_CUDA(cudaMemcpy(d_inout, inout->buf, N * sizeof(float), cudaMemcpyHostToDevice));
|
||||
|
||||
LeakyReLU_kernel<<<(N + 255) / 256, 256>>>(d_inout, N, alpha);
|
||||
CHECK_CUDA(cudaDeviceSynchronize());
|
||||
|
||||
cudaMemcpy(inout->buf, d_inout, N * sizeof(float), cudaMemcpyDeviceToHost);
|
||||
cudaFree(d_inout);
|
||||
CHECK_CUDA(cudaMemcpy(inout->buf, d_inout, N * sizeof(float), cudaMemcpyDeviceToHost));
|
||||
CHECK_CUDA(cudaFree(d_inout));
|
||||
}
|
||||
|
||||
/* Conv2d
|
||||
|
|
|
@ -164,8 +164,11 @@ int main(int argc, char **argv) {
|
|||
cudaSetDevice(i);
|
||||
cudaDeviceSynchronize();
|
||||
}
|
||||
cudaSetDevice(0);
|
||||
|
||||
fprintf(stdout, "Generating images...");
|
||||
fflush(stdout);
|
||||
|
||||
st = get_time();
|
||||
|
||||
/* Call the main computation (optimization target) of the program. */
|
||||
|
@ -175,6 +178,8 @@ int main(int argc, char **argv) {
|
|||
cudaSetDevice(i);
|
||||
cudaDeviceSynchronize();
|
||||
}
|
||||
cudaSetDevice(0);
|
||||
|
||||
et = get_time();
|
||||
|
||||
/* Print the result */
|
||||
|
|
Loading…
Reference in New Issue