chundoong-lab-ta/SamsungDS22/vector_io/main.cu

#include <stdio.h>
#include <stdbool.h>
#include <stdlib.h>
#include <algorithm>

#include "util.h"

// https://developer.nvidia.com/blog/cuda-pro-tip-increase-performance-with-vectorized-memory-access/
// Checks the cudaError_t result of a CUDA runtime call and terminates the
// program with a diagnostic (file, line, error string) if it is not
// cudaSuccess.
//
// The argument is evaluated exactly once and cached, so passing a call with
// side effects (cudaMalloc(...), cudaGetLastError(), ...) neither repeats the
// call nor reports a different error than the one that was tested.
// The do { ... } while (0) wrapper makes the macro behave like a single
// statement, so it is safe inside unbraced if/else.
// The process exits with a failure status so scripts can detect the error.
#define cudaCheckError(e)                                                    \
  do {                                                                       \
    cudaError_t cuda_check_status_ = (e);                                    \
    if (cuda_check_status_ != cudaSuccess) {                                 \
      fprintf(stderr, "Cuda failure %s:%d: '%s'\n", __FILE__, __LINE__,      \
              cudaGetErrorString(cuda_check_status_));                       \
      exit(EXIT_FAILURE);                                                    \
    }                                                                        \
  } while (0)


// Upper bound the host launchers apply to the number of blocks they request.
#define MAX_BLOCKS (1ul << 30)

// Smallest and largest element counts swept by the benchmark loop in main().
// NMAX is also the number of floats allocated for each device buffer.
#define NMIN (1ul << 10)
#define NMAX (1ul << 31)

// Fills d_inout[0..N) using a 1D grid of 1D blocks.
//
// Each thread starts at its global index and advances by the total number of
// launched threads, so any grid size covers the whole array. Every element a
// thread writes receives that thread's *starting* global index (converted to
// float), not the element's own index; the two coincide when the grid has at
// least N threads.
//
// Indices are computed in 64 bits: blockIdx.x * blockDim.x and the stride can
// exceed 2^31 for the grid sizes permitted by the launchers.
__global__ void init_kernel(float* d_inout, unsigned long N) {
  const size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
  const size_t stride = (size_t)blockDim.x * gridDim.x;
  const float value = (float)idx;
  for (size_t i = idx; i < N; i += stride) {
    d_inout[i] = value;
  }
}

// Copies N floats from d_in to d_out with plain scalar loads/stores.
//
// Expects a 1D grid of 1D blocks. Each loop iteration copies one group of
// four consecutive floats; threads advance by the total thread count, so any
// grid size is valid. The up-to-three elements left over when N is not a
// multiple of 4 are copied by global thread 0, which exists in every launch.
//
// Indices are 64-bit so that neither the global thread index nor i * 4 can
// wrap for large N or large grids.
__global__ void device_copy_scalar_kernel(float* d_in, float* d_out, unsigned long N) {
  const size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
  const size_t stride = (size_t)blockDim.x * gridDim.x;
  const size_t groups = N / 4;

  for (size_t i = idx; i < groups; i += stride) {
    const size_t base = i * 4;
    d_out[base + 0] = d_in[base + 0];
    d_out[base + 1] = d_in[base + 1];
    d_out[base + 2] = d_in[base + 2];
    d_out[base + 3] = d_in[base + 3];
  }

  // Tail: elements past the last complete group of four.
  if (idx == 0) {
    for (size_t j = groups * 4; j < N; ++j) {
      d_out[j] = d_in[j];
    }
  }
}

// Launches device_copy_scalar_kernel to copy N floats from d_in to d_out.
//
// Uses 128 threads per block and one thread per element, capped at
// MAX_BLOCKS blocks. The launch is asynchronous: only launch-configuration
// errors are reported here; the caller must synchronize to observe
// completion and execution errors.
// NOTE(review): the kernel copies four floats per iteration, so a quarter of
// this grid would cover N — sizing kept as-is to not alter the benchmark.
void device_copy_scalar(float* d_in, float* d_out, unsigned long N)
{
  // Nothing to copy; a zero-block launch would be an invalid configuration.
  if (N == 0) return;

  const int threads = 128;
  const unsigned long wanted = (N + threads - 1) / threads;
  const int blocks = (int)std::min(wanted, MAX_BLOCKS);

  device_copy_scalar_kernel<<<blocks, threads>>>(d_in, d_out, N);

  // Kernel launches do not return a status; fetch it once and check it.
  cudaError_t launch_status = cudaGetLastError();
  cudaCheckError(launch_status);
}

// Copies N floats from d_in to d_out using 8-byte float2 loads/stores.
//
// Expects a 1D grid of 1D blocks. Each loop iteration moves two float2
// values (four floats); threads advance by the total thread count, so any
// grid size is valid. Because an iteration covers four floats, up to three
// elements can be left over; they are copied one by one by global thread 0,
// which exists in every launch.
//
// Precondition: d_in and d_out must be aligned for float2 access (8 bytes).
// Indices are 64-bit so they cannot wrap for large N or large grids.
__global__ void device_copy_vector2_kernel(float* d_in, float* d_out, unsigned long N) {
  const size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
  const size_t stride = (size_t)blockDim.x * gridDim.x;
  const size_t groups = N / 4;

  const float2* src = reinterpret_cast<const float2*>(d_in);
  float2* dst = reinterpret_cast<float2*>(d_out);
  for (size_t i = idx; i < groups; i += stride) {
    dst[i * 2 + 0] = src[i * 2 + 0];
    dst[i * 2 + 1] = src[i * 2 + 1];
  }

  // Tail: elements past the last complete group of four floats.
  if (idx == 0) {
    for (size_t j = groups * 4; j < N; ++j) {
      d_out[j] = d_in[j];
    }
  }
}

// Launches device_copy_vector2_kernel to copy N floats from d_in to d_out.
//
// Uses 128 threads per block and one thread per pair of floats, with at
// least one block (so tiny N still gets a valid launch) and at most
// MAX_BLOCKS. The launch is asynchronous: only launch-configuration errors
// are reported here; the caller must synchronize to observe completion.
void device_copy_vector2(float* d_in, float* d_out, unsigned long N) {
  // Nothing to copy.
  if (N == 0) return;

  const int threads = 128;
  unsigned long wanted = (N / 2 + threads - 1) / threads;
  if (wanted == 0) wanted = 1;  // N == 1: a zero-block launch is invalid
  const int blocks = (int)std::min(wanted, MAX_BLOCKS);

  device_copy_vector2_kernel<<<blocks, threads>>>(d_in, d_out, N);

  // Kernel launches do not return a status; fetch it once and check it.
  cudaError_t launch_status = cudaGetLastError();
  cudaCheckError(launch_status);
}

// Copies N floats from d_in to d_out using 16-byte float4 loads/stores.
//
// Expects a 1D grid of 1D blocks. Each loop iteration moves one float4;
// threads advance by the total thread count, so any grid size is valid.
// The up-to-three elements left over when N is not a multiple of 4 are
// copied one by one by global thread 0. Thread 0 is used because it exists
// in every launch, whereas a thread with index N/4 is absent whenever N/4 is
// a multiple of the number of launched threads.
//
// Precondition: d_in and d_out must be aligned for float4 access (16 bytes).
// Indices are 64-bit so they cannot wrap for large N or large grids.
__global__ void device_copy_vector4_kernel(float* d_in, float* d_out, unsigned long N) {
  const size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
  const size_t stride = (size_t)blockDim.x * gridDim.x;
  const size_t vec_count = N / 4;

  const float4* src = reinterpret_cast<const float4*>(d_in);
  float4* dst = reinterpret_cast<float4*>(d_out);
  for (size_t i = idx; i < vec_count; i += stride) {
    dst[i] = src[i];
  }

  // Tail: elements past the last complete float4.
  if (idx == 0) {
    for (size_t j = vec_count * 4; j < N; ++j) {
      d_out[j] = d_in[j];
    }
  }
}

// Launches device_copy_vector4_kernel to copy N floats from d_in to d_out.
//
// Uses 128 threads per block and one thread per group of four floats, with
// at least one block (so N < 4 still gets a valid launch) and at most
// MAX_BLOCKS. The launch is asynchronous: only launch-configuration errors
// are reported here; the caller must synchronize to observe completion.
void device_copy_vector4(float* d_in, float* d_out, unsigned long N) {
  // Nothing to copy.
  if (N == 0) return;

  const int threads = 128;
  unsigned long wanted = (N / 4 + threads - 1) / threads;
  if (wanted == 0) wanted = 1;  // N < 4: a zero-block launch is invalid
  const int blocks = (int)std::min(wanted, MAX_BLOCKS);

  device_copy_vector4_kernel<<<blocks, threads>>>(d_in, d_out, N);

  // Kernel launches do not return a status; fetch it once and check it.
  cudaError_t launch_status = cudaGetLastError();
  cudaCheckError(launch_status);
}

// Launches init_kernel over d_inout[0..N).
//
// Uses 128 threads per block and one thread per element, capped at
// MAX_BLOCKS blocks. The launch is asynchronous: only launch-configuration
// errors are reported here; the caller must synchronize to observe
// completion and execution errors.
void init(float *d_inout, unsigned long N) {
  // Nothing to initialize; a zero-block launch would be an invalid configuration.
  if (N == 0) return;

  const int threads = 128;
  const unsigned long wanted = (N + threads - 1) / threads;
  const int blocks = (int)std::min(wanted, MAX_BLOCKS);

  init_kernel<<<blocks, threads>>>(d_inout, N);

  // Kernel launches do not return a status; fetch it once and check it.
  cudaError_t launch_status = cudaGetLastError();
  cudaCheckError(launch_status);
}

// Benchmark driver.
//
// Allocates two device buffers of NMAX floats, initializes A, and for every
// power-of-two size N in [NMIN, NMAX] times one A -> B copy with each of the
// three copy variants (scalar, float2, float4). One line per size is printed
// with the amount of data moved (read + write) and the rate obtained by
// dividing that amount by each elapsed time.
// NOTE(review): the GB/s labels assume timer_stop() returns seconds — confirm
// against util.h.
int main(int argc, char **argv) {
  (void)argc;
  (void)argv;

  float *A, *B;
  cudaCheckError(cudaMalloc(&A, sizeof(float) * NMAX));
  cudaCheckError(cudaMalloc(&B, sizeof(float) * NMAX));

  printf("Max # blocks: %lu\n", MAX_BLOCKS);

  double elapsed_times[3] = { 0 };

  init(A, NMAX);
  cudaCheckError(cudaDeviceSynchronize());

  for (unsigned long N = NMIN; N <= NMAX; N *= 2) {
    for (int i = 0; i < 3; ++i) {
      timer_start(0);
      // The copy helpers take (source, destination, count): A is the
      // initialized buffer, so it is the source and B the destination.
      switch (i) {
        case 0:
          device_copy_scalar(A, B, N);
          break;
        case 1:
          device_copy_vector2(A, B, N);
          break;
        case 2:
          device_copy_vector4(A, B, N);
          break;
      }
      // Launches are asynchronous: wait for the kernel before stopping the
      // timer, and surface any execution error.
      cudaCheckError(cudaDeviceSynchronize());
      elapsed_times[i] = timer_stop(0);
    }
    // Each copy reads N floats and writes N floats.
    double size = (2* (double) sizeof(float)) * N / (1ul << 30);
    printf("[%7.2lf %s] scalar: %f GB/s, vector2: %f GB/s, vector4: %f GB/s\n",
        size < 1 ? size * 1024 : size,
        size < 1 ? "MB" : "GB",
        size / elapsed_times[0],
        size / elapsed_times[1],
        size / elapsed_times[2]);
  }

  cudaCheckError(cudaFree(A));
  cudaCheckError(cudaFree(B));

  return 0;
}