#include <cstdio>
#include <cstdlib>
#include <algorithm>
#include <cuda_runtime.h>
#include "util.h"

// Benchmark of scalar vs. vectorized (float2 / float4) global-memory copies.
// Based on:
// https://developer.nvidia.com/blog/cuda-pro-tip-increase-performance-with-vectorized-memory-access/

// Abort with file/line and the CUDA error string if a runtime call fails.
#define cudaCheckError(e) do { \
    if ((e) != cudaSuccess) { \
        printf("Cuda failure %s:%d: '%s'\n", __FILE__, __LINE__, \
               cudaGetErrorString(e)); \
        exit(1); \
    } \
} while (0)

#define MAX_BLOCKS (1ul << 30)
#define NMIN (1ul << 10)
#define NMAX (1ul << 31)

// Grid-stride fill: every element a thread visits is set to that thread's
// global index. Indices are unsigned long because N may reach 2^31, which
// an int loop counter cannot represent (signed overflow is UB).
__global__ void init_kernel(float* d_inout, unsigned long N) {
    unsigned long idx = (unsigned long)blockIdx.x * blockDim.x + threadIdx.x;
    unsigned long stride = (unsigned long)blockDim.x * gridDim.x;
    for (unsigned long i = idx; i < N; i += stride) {
        d_inout[i] = idx;
    }
}

// Copy N floats element-by-element, manually unrolled by 4.
// The grid-stride loop covers the first N - N%4 elements; one designated
// thread (global index N/4) copies the up-to-3 leftover elements.
__global__ void device_copy_scalar_kernel(float* d_in, float* d_out, unsigned long N) {
    unsigned long idx = (unsigned long)blockIdx.x * blockDim.x + threadIdx.x;
    unsigned long stride = (unsigned long)blockDim.x * gridDim.x;
    for (unsigned long i = idx; i < N/4; i += stride) {
        d_out[i * 4 + 0] = d_in[i * 4 + 0];
        d_out[i * 4 + 1] = d_in[i * 4 + 1];
        d_out[i * 4 + 2] = d_in[i * 4 + 2];
        d_out[i * 4 + 3] = d_in[i * 4 + 3];
    }
    // In only one thread, process the final elements (if there are any).
    if (idx == N/4) {
        for (unsigned long i = N - N % 4; i < N; ++i) d_out[i] = d_in[i];
    }
}

void device_copy_scalar(float* d_in, float* d_out, unsigned long N) {
    int threads = 128;
    int blocks = (int)std::min((N + threads - 1) / threads, MAX_BLOCKS);
    device_copy_scalar_kernel<<<blocks, threads>>>(d_in, d_out, N);
    cudaCheckError(cudaGetLastError());  // catch launch-configuration errors
}

// Copy N floats as float2 (8-byte) transactions, unrolled by 2, so each
// loop iteration moves 4 floats. Requires d_in/d_out to be 8-byte aligned
// (cudaMalloc guarantees this). The main loop covers N - N%4 floats; one
// designated thread copies the up-to-3 leftover floats scalar-wise.
__global__ void device_copy_vector2_kernel(float* d_in, float* d_out, unsigned long N) {
    unsigned long idx = (unsigned long)blockIdx.x * blockDim.x + threadIdx.x;
    unsigned long stride = (unsigned long)blockDim.x * gridDim.x;
    for (unsigned long i = idx; i < N/4; i += stride) {
        reinterpret_cast<float2*>(d_out)[i*2+0] = reinterpret_cast<float2*>(d_in)[i*2+0];
        reinterpret_cast<float2*>(d_out)[i*2+1] = reinterpret_cast<float2*>(d_in)[i*2+1];
    }
    // In only one thread, process the final elements (if there are any).
    // (The tail is N%4 floats, not N%2: each loop iteration consumes 4.)
    if (idx == N/4) {
        for (unsigned long i = N - N % 4; i < N; ++i) d_out[i] = d_in[i];
    }
}

void device_copy_vector2(float* d_in, float* d_out, unsigned long N) {
    int threads = 128;
    int blocks = (int)std::min((N/2 + threads - 1) / threads, MAX_BLOCKS);
    device_copy_vector2_kernel<<<blocks, threads>>>(d_in, d_out, N);
    cudaCheckError(cudaGetLastError());
}

// Copy N floats as float4 (16-byte) transactions. Requires 16-byte
// alignment (cudaMalloc guarantees 256-byte alignment). One designated
// thread copies the up-to-3 leftover floats scalar-wise.
__global__ void device_copy_vector4_kernel(float* d_in, float* d_out, unsigned long N) {
    unsigned long idx = (unsigned long)blockIdx.x * blockDim.x + threadIdx.x;
    unsigned long stride = (unsigned long)blockDim.x * gridDim.x;
    for (unsigned long i = idx; i < N/4; i += stride) {
        reinterpret_cast<float4*>(d_out)[i] = reinterpret_cast<float4*>(d_in)[i];
    }
    // In only one thread, process the final elements (if there are any).
    unsigned long remainder = N % 4;
    if (idx == N/4 && remainder != 0) {
        while (remainder) {
            unsigned long j = N - remainder--;  // renamed: don't shadow idx
            d_out[j] = d_in[j];
        }
    }
}

void device_copy_vector4(float* d_in, float* d_out, unsigned long N) {
    int threads = 128;
    int blocks = (int)std::min((N/4 + threads - 1) / threads, MAX_BLOCKS);
    device_copy_vector4_kernel<<<blocks, threads>>>(d_in, d_out, N);
    cudaCheckError(cudaGetLastError());
}

// Launch init_kernel with one thread per element (capped at MAX_BLOCKS).
void init(float *d_inout, unsigned long N) {
    int threads = 128;
    int blocks = (int)std::min((N + threads - 1) / threads, MAX_BLOCKS);
    init_kernel<<<blocks, threads>>>(d_inout, N);
    cudaCheckError(cudaGetLastError());
}

// Sweep copy sizes from NMIN to NMAX (powers of two) and report effective
// bandwidth for the scalar, float2, and float4 copy kernels.
int main(int argc, char **argv) {
    float *A, *B;
    cudaCheckError(cudaMalloc(&A, sizeof(float) * NMAX));
    cudaCheckError(cudaMalloc(&B, sizeof(float) * NMAX));

    printf("Max # blocks: %lu\n", MAX_BLOCKS);

    double elapsed_times[3] = { 0 };

    init(A, NMAX);
    cudaCheckError(cudaDeviceSynchronize());

    for (unsigned long N = NMIN; N <= NMAX; N *= 2) {
        for (int i = 0; i < 3; ++i) {
            timer_start(0);
            // Copy from the initialized buffer A into B (d_in first, d_out
            // second); the previous argument order read uninitialized B.
            switch (i) {
            case 0: device_copy_scalar(A, B, N); break;
            case 1: device_copy_vector2(A, B, N); break;
            case 2: device_copy_vector4(A, B, N); break;
            }
            cudaCheckError(cudaDeviceSynchronize());
            // NOTE(review): assumes timer_stop returns elapsed seconds —
            // confirm against util.h.
            elapsed_times[i] = timer_stop(0);
        }

        // One read + one write per element, expressed in GiB.
        double size = (2 * (double) sizeof(float)) * N / (1ul << 30);
        printf("[%7.2lf %s] scalar: %f GB/s, vector2: %f GB/s, vector4: %f GB/s\n",
               size < 1 ? size * 1024 : size, size < 1 ? "MB" : "GB",
               size / elapsed_times[0],
               size / elapsed_times[1],
               size / elapsed_times[2]);
    }

    cudaCheckError(cudaFree(A));
    cudaCheckError(cudaFree(B));
    return 0;
}