#include <stdio.h>
#include <stdbool.h>
#include <stdlib.h>
#include <algorithm>

#include "util.h"

// https://developer.nvidia.com/blog/cuda-pro-tip-increase-performance-with-vectorized-memory-access/
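//
// Bandwidth benchmark: device-to-device copies of N floats done with scalar
// (one float per access), float2 (8-byte) and float4 (16-byte) accesses.
// The wider accesses move the same data with fewer load/store instructions,
// which is the effect the blog post above describes. The
// reinterpret_cast<float2*>/reinterpret_cast<float4*> trick below requires
// 8-/16-byte aligned pointers; cudaMalloc returns pointers aligned well
// beyond that, so the casts are safe here.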
// Minimal error-check wrapper: print the failing file/line and bail out.
#define cudaCheckError(e) { \
  if(e!=cudaSuccess) { \
    printf("Cuda failure %s:%d: '%s'\n",__FILE__,__LINE__,cudaGetErrorString(e)); \
    exit(EXIT_FAILURE); \
  } \
}

#define MAX_BLOCKS (1ul << 30)

#define NMIN (1ul << 10)
#define NMAX (1ul << 31)
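
// Sizes: the benchmark sweeps N from NMIN (1 Ki floats) to NMAX (2 Gi floats).
// At NMAX each buffer is 2^31 * 4 B = 8 GiB, so the two cudaMalloc calls in
// main() assume a GPU with at least 16 GiB of memory. MAX_BLOCKS just caps the
// 1D grid size used for the kernel launches.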

// Fill a buffer with a grid-stride loop. 64-bit indices are used because N can
// be as large as 2^31, which does not fit in a 32-bit int.
__global__ void init_kernel(float* d_inout, unsigned long N) {
  unsigned long idx = blockIdx.x * blockDim.x + threadIdx.x;
  for (unsigned long i = idx; i < N; i += blockDim.x * gridDim.x) {
    d_inout[i] = idx;
  }
}

__global__ void device_copy_scalar_kernel(float* d_in, float* d_out, unsigned long N) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  for (int i = idx; i < N/4; i += blockDim.x * gridDim.x) {
    d_out[i * 4 + 0] = d_in[i * 4 + 0];
    d_out[i * 4 + 1] = d_in[i * 4 + 1];
    d_out[i * 4 + 2] = d_in[i * 4 + 2];
    d_out[i * 4 + 3] = d_in[i * 4 + 3];
  }
}

void device_copy_scalar(float* d_in, float* d_out, unsigned long N)
{
  int threads = 128;
  int blocks = std::min((N + threads-1) / threads, MAX_BLOCKS);
  device_copy_scalar_kernel<<<blocks, threads>>>(d_in, d_out, N);
}
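
// Note: the scalar kernel copies four floats per loop iteration (indices
// i*4 .. i*4+3, i < N/4), matching the per-iteration work of the float2 and
// float4 variants below. Its launch is still sized from N, so threads with
// idx >= N/4 simply fall through the loop; the copy itself is only complete
// when N is a multiple of 4, which holds for the power-of-two sizes used in
// main().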

__global__ void device_copy_vector2_kernel(float* d_in, float* d_out, unsigned long N) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  for (int i = idx; i < N/4; i += blockDim.x * gridDim.x) {
    reinterpret_cast<float2*>(d_out)[i*2+0] = reinterpret_cast<float2*>(d_in)[i*2+0];
    reinterpret_cast<float2*>(d_out)[i*2+1] = reinterpret_cast<float2*>(d_in)[i*2+1];
  }

  // in only one thread, process the final elements (if there are any); the
  // loop above covers 4*(N/4) floats, so up to three can be left over
  int remainder = N%4;
  if (idx==N/4 && remainder!=0) {
    while(remainder) {
      int i = N - remainder--;
      d_out[i] = d_in[i];
    }
  }
}
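
// Casting the float* to float2* turns each copy into a single 8-byte load and
// store (two floats per instruction), so the same data moves with half the
// load/store instructions of the scalar version. This is only valid because
// the buffers come from cudaMalloc and are therefore sufficiently aligned for
// float2 (8-byte) accesses.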

void device_copy_vector2(float* d_in, float* d_out, unsigned long N) {
  int threads = 128;
  int blocks = std::min((N/2 + threads-1) / threads, MAX_BLOCKS);

  device_copy_vector2_kernel<<<blocks, threads>>>(d_in, d_out, N);
}

__global__ void device_copy_vector4_kernel(float* d_in, float* d_out, unsigned long N) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  for(int i = idx; i < N/4; i += blockDim.x * gridDim.x) {
    reinterpret_cast<float4*>(d_out)[i] = reinterpret_cast<float4*>(d_in)[i];
  }

  // in only one thread, process final elements (if there are any)
  int remainder = N%4;
  if (idx==N/4 && remainder!=0) {
    while(remainder) {
      int idx = N - remainder--;
      d_out[idx] = d_in[idx];
    }
  }
}
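
// float4 accesses compile to 16-byte (128-bit) loads and stores: one load and
// one store per four floats copied, a quarter of the scalar version's memory
// instructions.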

void device_copy_vector4(float* d_in, float* d_out, unsigned long N) {
  int threads = 128;
  int blocks = std::min((N/4 + threads-1) / threads, MAX_BLOCKS);

  device_copy_vector4_kernel<<<blocks, threads>>>(d_in, d_out, N);
}

void init(float *d_inout, unsigned long N) {
  int threads = 128;
  int blocks = std::min((N + threads-1) / threads, MAX_BLOCKS);

  init_kernel<<<blocks, threads>>>(d_inout, N);
}

int main(int argc, char **argv) {

  float *A, *B;
  cudaCheckError(cudaMalloc(&A, sizeof(float) * NMAX));
  cudaCheckError(cudaMalloc(&B, sizeof(float) * NMAX));

  printf("Max # blocks: %lu\n", MAX_BLOCKS);

  double elapsed_times[3] = { 0 };

  init(A, NMAX);
  cudaCheckError(cudaDeviceSynchronize());
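
  // Sweep the copy size N in powers of two. For each N, time one launch of
  // each variant; bandwidth is reported as bytes read plus bytes written
  // divided by the elapsed time (timer_start/timer_stop from util.h are
  // assumed to measure wall-clock seconds).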
  for (unsigned long N = NMIN; N <= NMAX; N *= 2) {
    for (int i = 0; i < 3; ++i) {
      timer_start(0);
      switch (i) {
        case 0:
          device_copy_scalar(B, A, N);
          break;
        case 1:
          device_copy_vector2(B, A, N);
          break;
        case 2:
          device_copy_vector4(B, A, N);
          break;
      }
      cudaCheckError(cudaDeviceSynchronize());
      elapsed_times[i] = timer_stop(0);
    }
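
    // Optional sanity baseline (not in the original benchmark): timing a plain
    // cudaMemcpy of the same N floats the same way would give the driver's
    // device-to-device copy bandwidth to compare against. A minimal sketch,
    // assuming timer_stop returns seconds as above:
    //
    //   timer_start(0);
    //   cudaCheckError(cudaMemcpy(A, B, sizeof(float) * N, cudaMemcpyDeviceToDevice));
    //   cudaCheckError(cudaDeviceSynchronize());
    //   double memcpy_seconds = timer_stop(0);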
    // Total traffic per copy in GiB: each float is read once and written once.
    double size = (2* (double) sizeof(float)) * N / (1ul << 30);
    printf("[%7.2lf %s] scalar: %f GB/s, vector2: %f GB/s, vector4: %f GB/s\n",
           size < 1 ? size * 1024 : size,
           size < 1 ? "MB" : "GB",
           size / elapsed_times[0],
           size / elapsed_times[1],
           size / elapsed_times[2]);
  }

  return 0;
}