#include <stdio.h>
#include <stdbool.h>
#include <stdlib.h>
#include <algorithm>

#include "util.h"

// https://developer.nvidia.com/blog/cuda-pro-tip-increase-performance-with-vectorized-memory-access/
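//
// Bandwidth benchmark: device-to-device copies of N floats done with scalar
// (one float per access), float2 (8-byte) and float4 (16-byte) accesses.
// The wider accesses move the same data with fewer load/store instructions,
// which is the effect the blog post above describes. The
// reinterpret_cast<float2*>/reinterpret_cast<float4*> trick below requires
// 8-/16-byte aligned pointers; cudaMalloc returns pointers aligned well
// beyond that, so the casts are safe here.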
// Minimal error-check wrapper: print the failing file/line and bail out.
#define cudaCheckError(e) { \
  if(e!=cudaSuccess) { \
    printf("Cuda failure %s:%d: '%s'\n",__FILE__,__LINE__,cudaGetErrorString(e)); \
    exit(EXIT_FAILURE); \
  } \
}

#define MAX_BLOCKS (1ul << 30)

#define NMIN (1ul << 10)
#define NMAX (1ul << 31)
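
// Sizes: the benchmark sweeps N from NMIN (1 Ki floats) to NMAX (2 Gi floats).
// At NMAX each buffer is 2^31 * 4 B = 8 GiB, so the two cudaMalloc calls in
// main() assume a GPU with at least 16 GiB of memory. MAX_BLOCKS just caps the
// 1D grid size used for the kernel launches.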

// Fill a buffer with a grid-stride loop. 64-bit indices are used because N can
// be as large as 2^31, which does not fit in a 32-bit int.
__global__ void init_kernel(float* d_inout, unsigned long N) {
  unsigned long idx = blockIdx.x * blockDim.x + threadIdx.x;
  for (unsigned long i = idx; i < N; i += blockDim.x * gridDim.x) {
    d_inout[i] = idx;
  }
}

__global__ void device_copy_scalar_kernel(float* d_in, float* d_out, unsigned long N) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  for (int i = idx; i < N/4; i += blockDim.x * gridDim.x) {
    d_out[i * 4 + 0] = d_in[i * 4 + 0];
    d_out[i * 4 + 1] = d_in[i * 4 + 1];
    d_out[i * 4 + 2] = d_in[i * 4 + 2];
    d_out[i * 4 + 3] = d_in[i * 4 + 3];
  }
}

void device_copy_scalar(float* d_in, float* d_out, unsigned long N)
{
  int threads = 128;
  int blocks = std::min((N + threads-1) / threads, MAX_BLOCKS);
  device_copy_scalar_kernel<<<blocks, threads>>>(d_in, d_out, N);
}
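
// Note: the scalar kernel copies four floats per loop iteration (indices
// i*4 .. i*4+3, i < N/4), matching the per-iteration work of the float2 and
// float4 variants below. Its launch is still sized from N, so threads with
// idx >= N/4 simply fall through the loop; the copy itself is only complete
// when N is a multiple of 4, which holds for the power-of-two sizes used in
// main().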

__global__ void device_copy_vector2_kernel(float* d_in, float* d_out, unsigned long N) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  for (int i = idx; i < N/4; i += blockDim.x * gridDim.x) {
    reinterpret_cast<float2*>(d_out)[i*2+0] = reinterpret_cast<float2*>(d_in)[i*2+0];
    reinterpret_cast<float2*>(d_out)[i*2+1] = reinterpret_cast<float2*>(d_in)[i*2+1];
  }

  // in only one thread, process the final elements (if there are any); the
  // loop above covers 4*(N/4) floats, so up to three can be left over
  int remainder = N%4;
  if (idx==N/4 && remainder!=0) {
    while(remainder) {
      int i = N - remainder--;
      d_out[i] = d_in[i];
    }
  }
}
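
// Casting the float* to float2* turns each copy into a single 8-byte load and
// store (two floats per instruction), so the same data moves with half the
// load/store instructions of the scalar version. This is only valid because
// the buffers come from cudaMalloc and are therefore sufficiently aligned for
// float2 (8-byte) accesses.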

void device_copy_vector2(float* d_in, float* d_out, unsigned long N) {
  int threads = 128;
  int blocks = std::min((N/2 + threads-1) / threads, MAX_BLOCKS);

  device_copy_vector2_kernel<<<blocks, threads>>>(d_in, d_out, N);
}

__global__ void device_copy_vector4_kernel(float* d_in, float* d_out, unsigned long N) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  for(int i = idx; i < N/4; i += blockDim.x * gridDim.x) {
    reinterpret_cast<float4*>(d_out)[i] = reinterpret_cast<float4*>(d_in)[i];
  }

  // in only one thread, process final elements (if there are any)
  int remainder = N%4;
  if (idx==N/4 && remainder!=0) {
    while(remainder) {
      int idx = N - remainder--;
      d_out[idx] = d_in[idx];
    }
  }
}
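
// float4 accesses compile to 16-byte (128-bit) loads and stores: one load and
// one store per four floats copied, a quarter of the scalar version's memory
// instructions.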

void device_copy_vector4(float* d_in, float* d_out, unsigned long N) {
  int threads = 128;
  int blocks = std::min((N/4 + threads-1) / threads, MAX_BLOCKS);

  device_copy_vector4_kernel<<<blocks, threads>>>(d_in, d_out, N);
}

void init(float *d_inout, unsigned long N) {
  int threads = 128;
  int blocks = std::min((N + threads-1) / threads, MAX_BLOCKS);

  init_kernel<<<blocks, threads>>>(d_inout, N);
}

int main(int argc, char **argv) {

  float *A, *B;
  cudaCheckError(cudaMalloc(&A, sizeof(float) * NMAX));
  cudaCheckError(cudaMalloc(&B, sizeof(float) * NMAX));

  printf("Max # blocks: %lu\n", MAX_BLOCKS);

  double elapsed_times[3] = { 0 };

  init(A, NMAX);
  cudaCheckError(cudaDeviceSynchronize());
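
  // Sweep the copy size N in powers of two. For each N, time one launch of
  // each variant; bandwidth is reported as bytes read plus bytes written
  // divided by the elapsed time (timer_start/timer_stop from util.h are
  // assumed to measure wall-clock seconds).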
  for (unsigned long N = NMIN; N <= NMAX; N *= 2) {
    for (int i = 0; i < 3; ++i) {
      timer_start(0);
      switch (i) {
        case 0:
          device_copy_scalar(B, A, N);
          break;
        case 1:
          device_copy_vector2(B, A, N);
          break;
        case 2:
          device_copy_vector4(B, A, N);
          break;
      }
      cudaCheckError(cudaDeviceSynchronize());
      elapsed_times[i] = timer_stop(0);
    }
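
    // Optional sanity baseline (not in the original benchmark): timing a plain
    // cudaMemcpy of the same N floats the same way would give the driver's
    // device-to-device copy bandwidth to compare against. A minimal sketch,
    // assuming timer_stop returns seconds as above:
    //
    //   timer_start(0);
    //   cudaCheckError(cudaMemcpy(A, B, sizeof(float) * N, cudaMemcpyDeviceToDevice));
    //   cudaCheckError(cudaDeviceSynchronize());
    //   double memcpy_seconds = timer_stop(0);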
    // Total traffic per copy in GiB: each float is read once and written once.
    double size = (2* (double) sizeof(float)) * N / (1ul << 30);
    printf("[%7.2lf %s] scalar: %f GB/s, vector2: %f GB/s, vector4: %f GB/s\n",
           size < 1 ? size * 1024 : size,
           size < 1 ? "MB" : "GB",
           size / elapsed_times[0],
           size / elapsed_times[1],
           size / elapsed_times[2]);
  }

  return 0;
}