/*
 * chundoong-lab-ta/APWS23/matmul-skeleton/main.cpp
 *
 * 231 lines
 * 5.7 KiB
 * C++
 */

#include <errno.h>
#include <getopt.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "matmul.h"
#include "util.h"
/* Command-line state: written by parse_opt(), read by main(). */

/* -p flag: when true, main() prints A, B and C after the timed runs. */
static bool print_matrix = false;
/* -v flag: when true, main() calls check_mat_mul() on the final result. */
static bool validation = false;
/* First positional argument: selects which matmul implementation main()
 * dispatches to, and which entry of matmul_type_string is shown. */
static size_t T = 0;
/* Problem dimensions (positional arguments 2-4). main() allocates A as
 * M x K, B as K x N and C as M x N. */
static size_t M = 8;
static size_t N = 8;
static size_t K = 8;
/* -n option: number of timed iterations averaged in the summary. */
static size_t num_iterations = 1;
/* Display name for each implementation, indexed by T (6 entries, matching
 * the cases 0..5 handled in main()). */
static char matmul_type_string[6][64] = {
"CPU (sequential)", "naive GPU", "GPU optimization 1",
"GPU optimization 2", "multi GPU", "cuBLAS",
};
/*
 * Write the usage summary to stdout.
 *
 * prog_name is substituted into the "Usage:" line (callers pass argv[0]);
 * every other line is fixed text describing the flags and the positional
 * arguments T, M, N and K.
 */
static void print_help(const char *prog_name) {
  /* Fixed part of the help text, emitted verbatim after the usage line. */
  static const char *const option_lines[] = {
      "Options:\n",
      " -p : print matrix. (default: off)\n",
      " -v : validate matmul. (default: off)\n",
      " -h : print this page.\n",
      " -n : number of iterations (default: 1)\n",
      " T : type of matrix multiplication (default: 0)\n",
      " 0 : CPU (sequential)\n",
      " 1 : naive GPU\n",
      " 2 : GPU optimization 1\n",
      " 3 : GPU optimization 2\n",
      " 4 : multi GPU\n",
      " 5 : cuBLAS\n",
      " M : number of rows of matrix A and C. (default: 8)\n",
      " N : number of columns of matrix B and C. (default: 8)\n",
      " K : number of columns of matrix A and rows of B. (default: 8)\n",
  };
  const size_t line_count = sizeof option_lines / sizeof option_lines[0];

  printf("Usage: %s [-pvh] [-n num_iterations] T M N K\n", prog_name);
  for (size_t idx = 0; idx < line_count; ++idx) {
    fputs(option_lines[idx], stdout);
  }
}
/*
 * Convert one command-line token into a size_t.
 *
 * Only a plain non-negative decimal integer is accepted: the token must
 * start with a digit, be consumed entirely, and fit in size_t. `what` names
 * the argument in the error message.
 *
 * On any violation an error is written to stderr and the process exits with
 * EXIT_FAILURE; the function only returns for valid input.
 */
static size_t parse_size_arg(const char *text, const char *what) {
  char *end = NULL;
  errno = 0;
  const unsigned long long value = strtoull(text, &end, 10);

  /* strtoull() skips leading whitespace and silently negates "-5" into a
   * huge unsigned value, so a leading digit is required explicitly. */
  const bool starts_with_digit = text[0] >= '0' && text[0] <= '9';
  const bool fully_consumed = starts_with_digit && *end == '\0';
  /* The round-trip cast detects values that do not fit in size_t. */
  const bool in_range =
      errno != ERANGE && (unsigned long long)(size_t)value == value;

  if (!fully_consumed || !in_range) {
    fprintf(stderr,
            "Invalid value for %s: '%s' (expected a non-negative integer)\n",
            what, text);
    exit(EXIT_FAILURE);
  }
  return (size_t)value;
}

/*
 * Parse the command line into the file-scope option variables and print the
 * benchmark banner.
 *
 * Flags:       -p (print matrices), -v (validate), -n <iterations>, -h (help).
 * Positionals: T M N K, in that order; missing ones keep their defaults and
 *              any extra positionals are ignored.
 *
 * -h and unrecognized options print the help text and exit with status 0.
 * Malformed numbers, or a T that does not index matmul_type_string, are
 * reported on stderr and exit with EXIT_FAILURE.
 */
static void parse_opt(int argc, char **argv) {
  int c;
  /* Only options that have a handler below are listed in the optstring;
   * anything else is reported by getopt() and lands in `default`. */
  while ((c = getopt(argc, argv, "pvhn:")) != -1) {
    switch (c) {
    case 'p':
      print_matrix = true;
      break;
    case 'v':
      validation = true;
      break;
    case 'n':
      num_iterations = parse_size_arg(optarg, "-n (number of iterations)");
      break;
    case 'h':
    default:
      print_help(argv[0]);
      exit(0);
    }
  }

  /* Positional arguments are assigned in order to T, M, N, K. */
  size_t *const positional[] = {&T, &M, &N, &K};
  static const char *const positional_name[] = {"T", "M", "N", "K"};
  const size_t num_positional = sizeof positional / sizeof positional[0];
  for (size_t j = 0; j < num_positional && optind + (int)j < argc; ++j) {
    *positional[j] = parse_size_arg(argv[optind + (int)j], positional_name[j]);
  }

  /* T is used as an index into matmul_type_string, so it must be checked
   * before the banner reads the table. */
  const size_t num_types =
      sizeof matmul_type_string / sizeof matmul_type_string[0];
  if (T >= num_types) {
    fprintf(stderr, "Invalid matmul type T = %zu (expected 0..%zu)\n", T,
            num_types - 1);
    print_help(argv[0]);
    exit(EXIT_FAILURE);
  }

  printf("============= Matrix Multiplication Benchmark =============\n");
  printf("- Matmul Type: %s\n", matmul_type_string[T]);
  /* %zu is the matching conversion for size_t. */
  printf("- Problem size: M = %zu, N = %zu, K = %zu\n", M, N, K);
  printf("- Number of iterations: %zu\n", num_iterations);
  printf("- Print matrix: %s\n", print_matrix ? "on" : "off");
  printf("- Validation: %s\n", validation ? "on" : "off");
}
int main(int argc, char **argv) {
parse_opt(argc, argv);
fflush(stdout);
/* Allocate and initialize matrices on CPU */
float *A, *B, *C;
alloc_mat(&A, M, K);
alloc_mat(&B, K, N);
alloc_mat(&C, M, N);
rand_mat(A, M, K);
rand_mat(B, K, N);
/* Initialize Matrix Multiplication */
switch (T) {
case 0:
matmul_cpu_initialize(M, N, K);
break;
case 1:
matmul_naive_initialize(M, N, K);
break;
case 2:
matmul_opt1_initialize(M, N, K);
break;
case 3:
matmul_opt2_initialize(M, N, K);
break;
case 4:
matmul_multigpu_initialize(M, N, K);
break;
case 5:
matmul_cublas_initialize(M, N, K);
break;
}
/* Run few warmup iterations... */
for (size_t i = 0; i < 3; i++) {
zero_mat(C, M, N);
switch (T) {
case 0:
matmul_cpu(A, B, C, M, N, K);
break;
case 1:
matmul_naive(A, B, C, M, N, K);
break;
case 2:
matmul_opt1(A, B, C, M, N, K);
break;
case 3:
matmul_opt2(A, B, C, M, N, K);
break;
case 4:
matmul_multigpu(A, B, C, M, N, K);
break;
case 5:
matmul_cublas(A, B, C, M, N, K);
break;
}
}
/* Run matrix multiplication for num_iterations */
printf("\n--------------------- Run Benchmark -----------------------\n");
double elapsed_time_sum = 0;
for (size_t i = 0; i < num_iterations; ++i) {
printf("[iter %lu] ", i);
fflush(stdout);
zero_mat(C, M, N);
double elapsed_time_iter = -get_current_time();
switch (T) {
case 0:
matmul_cpu(A, B, C, M, N, K);
break;
case 1:
matmul_naive(A, B, C, M, N, K);
break;
case 2:
matmul_opt1(A, B, C, M, N, K);
break;
case 3:
matmul_opt2(A, B, C, M, N, K);
break;
case 4:
matmul_multigpu(A, B, C, M, N, K);
break;
case 5:
matmul_cublas(A, B, C, M, N, K);
break;
}
elapsed_time_iter += get_current_time();
printf("%.4f s\n", elapsed_time_iter);
elapsed_time_sum += elapsed_time_iter;
}
if (print_matrix) {
printf("\n---------------------- Print Matrix -----------------------\n");
printf("MATRIX A:\n");
print_mat(A, M, K);
printf("MATRIX B:\n");
print_mat(B, K, N);
printf("MATRIX C:\n");
print_mat(C, M, N);
}
if (validation) {
printf("\n----------------------- Validation ------------------------\n");
check_mat_mul(A, B, C, M, N, K);
}
/* Print performance results */
double elapsed_time_avg = elapsed_time_sum / num_iterations;
printf("\n-------------------- Benchmark Summary --------------------\n");
printf("Avg. time : %.4f s\n", elapsed_time_avg);
printf("Avg. performance : %.1f GFLOPS\n",
2.0 * M * N * K / elapsed_time_avg / 1e9);
/* Finalize matrix multiplication */
switch (T) {
case 0:
matmul_cpu_initialize(M, N, K);
break;
case 1:
matmul_naive_initialize(M, N, K);
break;
case 2:
matmul_opt1_initialize(M, N, K);
break;
case 3:
matmul_opt2_initialize(M, N, K);
break;
case 4:
matmul_multigpu_initialize(M, N, K);
break;
case 5:
matmul_cublas_initialize(M, N, K);
break;
}
printf("\n===========================================================\n");
return 0;
}