// chundoong-lab-ta/SamsungDS22/vector_io/main.cu
#include <stdio.h>
#include <stdbool.h>
#include <stdlib.h>
#include <algorithm>
#include "util.h"
// https://developer.nvidia.com/blog/cuda-pro-tip-increase-performance-with-vectorized-memory-access/
// Abort with a diagnostic if a CUDA runtime call failed.
// - Evaluates the expression exactly once (the original expanded `e` twice,
//   which would re-run the call inside the if).
// - Wrapped in do { ... } while (0) so it is a single statement and is safe
//   under an unbraced if/else at the call site.
// - Exits with a NON-zero status: the original exit(0) reported success to
//   the shell/CI even on failure.
#define cudaCheckError(e) do { \
  cudaError_t err_ = (e); \
  if (err_ != cudaSuccess) { \
    fprintf(stderr, "Cuda failure %s:%d: '%s'\n", __FILE__, __LINE__, \
            cudaGetErrorString(err_)); \
    exit(1); \
  } \
} while (0)
#define MAX_BLOCKS (1ul << 30)
#define NMIN (1ul << 10)
#define NMAX (1ul << 31)
// Fill d_inout[0..N) with the writing thread's global index (arbitrary but
// deterministic benchmark data). Grid-stride loop: correct for any grid size.
// Indices are size_t because N reaches NMAX = 2^31 (overflows int) and the
// stride blockDim.x * gridDim.x can itself exceed INT_MAX for large launches
// (the original `int i` / int stride was signed-overflow UB at those sizes).
__global__ void init_kernel(float* d_inout, unsigned long N) {
  size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
  size_t stride = (size_t)blockDim.x * gridDim.x;
  for (size_t i = idx; i < N; i += stride) {
    d_inout[i] = idx;
  }
}
// Copy N floats from d_in to d_out with scalar (4-byte) loads/stores,
// manually unrolled 4 elements per grid-stride iteration.
// size_t indexing avoids 32-bit overflow for N up to NMAX = 2^31.
__global__ void device_copy_scalar_kernel(float* d_in, float* d_out, unsigned long N) {
  size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
  size_t stride = (size_t)blockDim.x * gridDim.x;
  for (size_t i = idx; i < N / 4; i += stride) {
    d_out[i * 4 + 0] = d_in[i * 4 + 0];
    d_out[i * 4 + 1] = d_in[i * 4 + 1];
    d_out[i * 4 + 2] = d_in[i * 4 + 2];
    d_out[i * 4 + 3] = d_in[i * 4 + 3];
  }
  // Tail: the unrolled loop covers only 4*(N/4) elements; the original
  // silently dropped the rest. Thread 0 (which always exists) copies the
  // final N % 4 elements so every N is handled, not just multiples of 4.
  if (idx == 0) {
    for (size_t i = N - N % 4; i < N; ++i) d_out[i] = d_in[i];
  }
}
// Launch the scalar copy kernel. The grid is capped at MAX_BLOCKS; the
// kernel's grid-stride loop covers any elements beyond the cap.
// Checks cudaGetLastError() because kernel launches do not return a status:
// bad launch configurations are otherwise silently dropped.
void device_copy_scalar(float* d_in, float* d_out, unsigned long N)
{
  int threads = 128;
  int blocks = std::min((N + threads - 1) / threads, MAX_BLOCKS);
  device_copy_scalar_kernel<<<blocks, threads>>>(d_in, d_out, N);
  cudaCheckError(cudaGetLastError());
}
// Copy N floats using vectorized 8-byte (float2) loads/stores: one float2
// per grid-stride iteration over N/2 vector elements. Assumes d_in/d_out
// are 8-byte aligned (holds for cudaMalloc'd buffers).
// Fixes two tail bugs in the original:
//  - it iterated i < N/4 doing two float2s each, covering 4*(N/4) floats,
//    but its fix-up only handled N % 2 == 1 — so N % 4 == 2 or 3 dropped
//    trailing elements;
//  - the fix-up thread idx == N/2 need not exist when N/2 is an exact
//    multiple of the launch size. Thread 0 always exists, so it owns the tail.
__global__ void device_copy_vector2_kernel(float* d_in, float* d_out, unsigned long N) {
  size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
  size_t stride = (size_t)blockDim.x * gridDim.x;
  float2* in2 = reinterpret_cast<float2*>(d_in);
  float2* out2 = reinterpret_cast<float2*>(d_out);
  for (size_t i = idx; i < N / 2; i += stride) {
    out2[i] = in2[i];
  }
  // Tail: copy the final scalar when N is odd.
  if (idx == 0 && N % 2 == 1) {
    d_out[N - 1] = d_in[N - 1];
  }
}
// Launch the float2 copy kernel: one thread per float2, grid capped at
// MAX_BLOCKS (grid-stride loop absorbs the excess). Launch errors are
// surfaced via cudaGetLastError(), which a kernel launch does not return.
void device_copy_vector2(float* d_in, float* d_out, unsigned long N) {
  int threads = 128;
  int blocks = std::min((N / 2 + threads - 1) / threads, MAX_BLOCKS);
  device_copy_vector2_kernel<<<blocks, threads>>>(d_in, d_out, N);
  cudaCheckError(cudaGetLastError());
}
// Copy N floats using vectorized 16-byte (float4) loads/stores, one float4
// per grid-stride iteration. Assumes 16-byte alignment (holds for
// cudaMalloc'd buffers). size_t indexing avoids int overflow at N = 2^31.
__global__ void device_copy_vector4_kernel(float* d_in, float* d_out, unsigned long N) {
  size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
  size_t stride = (size_t)blockDim.x * gridDim.x;
  float4* in4 = reinterpret_cast<float4*>(d_in);
  float4* out4 = reinterpret_cast<float4*>(d_out);
  for (size_t i = idx; i < N / 4; i += stride) {
    out4[i] = in4[i];
  }
  // Tail: thread 0 copies the final N % 4 scalars. The original assigned
  // the tail to thread idx == N/4, which does not exist when N/4 is an
  // exact multiple of the total thread count (e.g. N = 514 launches 128
  // threads, so idx 128 never occurs and the tail was silently dropped).
  if (idx == 0) {
    for (size_t i = N - N % 4; i < N; ++i) d_out[i] = d_in[i];
  }
}
// Launch the float4 copy kernel: one thread per float4, grid capped at
// MAX_BLOCKS (grid-stride loop absorbs the excess). Launch errors are
// surfaced via cudaGetLastError(), which a kernel launch does not return.
void device_copy_vector4(float* d_in, float* d_out, unsigned long N) {
  int threads = 128;
  int blocks = std::min((N / 4 + threads - 1) / threads, MAX_BLOCKS);
  device_copy_vector4_kernel<<<blocks, threads>>>(d_in, d_out, N);
  cudaCheckError(cudaGetLastError());
}
// Launch init_kernel to fill d_inout[0..N) with benchmark data.
// Grid capped at MAX_BLOCKS; the kernel's grid-stride loop covers the rest.
// cudaGetLastError() catches launch-configuration failures immediately.
void init(float *d_inout, unsigned long N) {
  int threads = 128;
  int blocks = std::min((N + threads - 1) / threads, MAX_BLOCKS);
  init_kernel<<<blocks, threads>>>(d_inout, N);
  cudaCheckError(cudaGetLastError());
}
// Benchmark scalar vs float2 vs float4 device-to-device copies for sizes
// NMIN..NMAX (powers of two) and report effective bandwidth, counting one
// read plus one write per element. timer_start/timer_stop come from util.h.
int main(int argc, char **argv) {
  float *A, *B;
  cudaCheckError(cudaMalloc(&A, sizeof(float) * NMAX));
  cudaCheckError(cudaMalloc(&B, sizeof(float) * NMAX));
  printf("Max # blocks: %lu\n", MAX_BLOCKS);
  double elapsed_times[3] = { 0 };
  init(A, NMAX);
  cudaCheckError(cudaDeviceSynchronize());
  for (unsigned long N = NMIN; N <= NMAX; N *= 2) {
    for (int i = 0; i < 3; ++i) {
      timer_start(0);
      // NOTE(review): these copy B -> A although only A was initialized;
      // the copied values don't affect measured bandwidth, but confirm the
      // direction is intentional.
      switch (i) {
      case 0:
        device_copy_scalar(B, A, N);
        break;
      case 1:
        device_copy_vector2(B, A, N);
        break;
      case 2:
        device_copy_vector4(B, A, N);
        break;
      }
      // Synchronize so timer_stop measures the kernel, not just the launch.
      cudaCheckError(cudaDeviceSynchronize());
      elapsed_times[i] = timer_stop(0);
    }
    // Bytes moved = read + write, expressed in GiB (scaled to MiB below 1).
    double size = (2 * (double) sizeof(float)) * N / (1ul << 30);
    printf("[%7.2lf %s] scalar: %f GB/s, vector2: %f GB/s, vector4: %f GB/s\n",
           size < 1 ? size * 1024 : size,
           size < 1 ? "MB" : "GB",
           size / elapsed_times[0],
           size / elapsed_times[1],
           size / elapsed_times[2]);
  }
  // Release device buffers (the original leaked both allocations).
  cudaCheckError(cudaFree(A));
  cudaCheckError(cudaFree(B));
  return 0;
}