55 lines
1.7 KiB
Plaintext
55 lines
1.7 KiB
Plaintext
|
#include <cstdio>
|
||
|
|
||
|
#include "transpose.h"
|
||
|
|
||
|
#define CHECK_CUDA(call) \
|
||
|
do { \
|
||
|
cudaError_t status_ = call; \
|
||
|
if (status_ != cudaSuccess) { \
|
||
|
fprintf(stderr, "CUDA error (%s:%d): %s:%s\n", __FILE__, __LINE__, \
|
||
|
cudaGetErrorName(status_), cudaGetErrorString(status_)); \
|
||
|
exit(EXIT_FAILURE); \
|
||
|
} \
|
||
|
} while (0)
|
||
|
|
||
|
// Device(GPU) pointers
|
||
|
static float *A_gpu, *B_gpu;
|
||
|
|
||
|
void naive_cpu_transpose(float *A, float *B, int M, int N) {
|
||
|
for (int i = 0; i < N; ++i) {
|
||
|
for (int j = 0; j < M; ++j) {
|
||
|
B[i * M + j] = A[j * N + i];
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// A: M x N matrix, B: N x M matrix
|
||
|
void transpose(float *_A, float *_B, int M, int N) {
|
||
|
// Remove this line after you complete the transpose on GPU
|
||
|
naive_cpu_transpose(_A, _B, M, N);
|
||
|
|
||
|
// (TODO) Run transpose on GPU
|
||
|
// You can memcpy data in initialize/cleanup functions.
|
||
|
|
||
|
// DO NOT REMOVE; NEEDED FOR TIME MEASURE
|
||
|
CHECK_CUDA(cudaDeviceSynchronize());
|
||
|
}
|
||
|
|
||
|
void transpose_init(float *_A, float *_B, int M, int N) {
|
||
|
// (TODO) Allocate device memory
|
||
|
|
||
|
// (TODO) Upload A matrix to GPU
|
||
|
|
||
|
// DO NOT REMOVE; NEEDED FOR TIME MEASURE
|
||
|
CHECK_CUDA(cudaDeviceSynchronize());
|
||
|
}
|
||
|
|
||
|
void transpose_cleanup(float *_A, float *_B, int M, int N) {
|
||
|
// (TODO) Download B matrix from GPU
|
||
|
|
||
|
// (TODO) Do any post-transpose cleanup work here.
|
||
|
|
||
|
// DO NOT REMOVE; NEEDED FOR TIME MEASURE
|
||
|
CHECK_CUDA(cudaDeviceSynchronize());
|
||
|
}
|