#include #include #include #include #include #include "matmul.h" #include "util.h" static bool print_matrix = false; static bool validation = false; static size_t T = 0; static size_t M = 8; static size_t N = 8; static size_t K = 8; static size_t num_iterations = 1; static char matmul_type_string[6][64] = { "CPU (sequential)", "Naive GPU", "Double/Triple Buffering", "Multi GPU", "cuBLAS", "Tiling Optimization" }; static void print_help(const char *prog_name) { printf("Usage: %s [-pvh] [-n num_iterations] T M N K\n", prog_name); printf("Options:\n"); printf(" -p : print matrix. (default: off)\n"); printf(" -v : validate matmul. (default: off)\n"); printf(" -h : print this page.\n"); printf(" -n : number of iterations (default: 1)\n"); printf(" T : type of matrix multiplication (default: 0)\n"); printf(" 0 : CPU (sequential)\n"); printf(" 1 : Naive GPU\n"); printf(" 2 : Double/Triple buffering\n"); printf(" 3 : Multi GPU\n"); printf(" 4 : cuBLAS\n"); printf(" 5 : Tiling optimization\n"); printf(" M : number of rows of matrix A and C. (default: 8)\n"); printf(" N : number of columns of matrix B and C. (default: 8)\n"); printf( " K : number of columns of matrix A and rows of B. (default: 8)\n"); } static void parse_opt(int argc, char **argv) { int c; while ((c = getopt(argc, argv, "pvht:n:m:")) != -1) { switch (c) { case 'p': print_matrix = true; break; case 'v': validation = true; break; case 'n': num_iterations = atoi(optarg); break; case 'h': default: print_help(argv[0]); exit(0); } } for (int i = optind, j = 0; i < argc; ++i, ++j) { switch (j) { case 0: T = (size_t) atoi(argv[i]); break; case 1: M = (size_t) atoi(argv[i]); break; case 2: N = (size_t) atoi(argv[i]); break; case 3: K = (size_t) atoi(argv[i]); break; default: break; } } printf("============= Matrix Multiplication Benchmark =============\n"); printf("- Matmul Type: %s\n", matmul_type_string[T]); printf("- Problem size: M = %lu, N = %lu, K = %lu\n", M, N, K); printf("- Number of iterations: %lu\n", num_iterations); printf("- Print matrix: %s\n", print_matrix ? "on" : "off"); printf("- Validation: %s\n", validation ? "on" : "off"); } int main(int argc, char **argv) { parse_opt(argc, argv); fflush(stdout); /* Allocate and initialize matrices on CPU */ float *A, *B, *C; alloc_mat(&A, M, K); alloc_mat(&B, K, N); alloc_mat(&C, M, N); rand_mat(A, M, K); rand_mat(B, K, N); /* Initialize Matrix Multiplication */ switch (T) { case 0: matmul_cpu_initialize(M, N, K); break; case 1: matmul_naive_initialize(M, N, K); break; case 2: matmul_buffering_initialize(M, N, K); break; case 3: matmul_multigpu_initialize(M, N, K); break; case 4: matmul_cublas_initialize(M, N, K); break; case 5: matmul_tiling_initialize(M, N, K); break; } /* Run few warmup iterations... */ for (size_t i = 0; i < 3; i++) { zero_mat(C, M, N); switch (T) { case 0: matmul_cpu(A, B, C, M, N, K); break; case 1: matmul_naive(A, B, C, M, N, K); break; case 2: matmul_buffering(A, B, C, M, N, K); break; case 3: matmul_multigpu(A, B, C, M, N, K); break; case 4: matmul_cublas(A, B, C, M, N, K); break; case 5: matmul_tiling(A, B, C, M, N, K); break; } } /* Run matrix multiplication for num_iterations */ printf("\n--------------------- Run Benchmark -----------------------\n"); double elapsed_time_sum = 0; for (size_t i = 0; i < num_iterations; ++i) { printf("[iter %lu] ", i); fflush(stdout); zero_mat(C, M, N); double elapsed_time_iter = -get_current_time(); switch (T) { case 0: matmul_cpu(A, B, C, M, N, K); break; case 1: matmul_naive(A, B, C, M, N, K); break; case 2: matmul_buffering(A, B, C, M, N, K); break; case 3: matmul_multigpu(A, B, C, M, N, K); break; case 4: matmul_cublas(A, B, C, M, N, K); break; case 5: matmul_tiling(A, B, C, M, N, K); break; } elapsed_time_iter += get_current_time(); printf("%.4f s\n", elapsed_time_iter); elapsed_time_sum += elapsed_time_iter; } if (print_matrix) { printf("\n---------------------- Print Matrix -----------------------\n"); printf("MATRIX A:\n"); print_mat(A, M, K); printf("MATRIX B:\n"); print_mat(B, K, N); printf("MATRIX C:\n"); print_mat(C, M, N); } if (validation) { printf("\n----------------------- Validation ------------------------\n"); check_mat_mul(A, B, C, M, N, K); } /* Print performance results */ double elapsed_time_avg = elapsed_time_sum / num_iterations; printf("\n-------------------- Benchmark Summary --------------------\n"); printf("Avg. time : %.4f s\n", elapsed_time_avg); printf("Avg. performance : %.1f GFLOPS\n", 2.0 * M * N * K / elapsed_time_avg / 1e9); /* Finalize matrix multiplication */ switch (T) { case 0: matmul_cpu_finalize(M, N, K); break; case 1: matmul_naive_finalize(M, N, K); break; case 2: matmul_buffering_finalize(M, N, K); break; case 3: matmul_multigpu_finalize(M, N, K); break; case 4: matmul_cublas_finalize(M, N, K); break; case 5: matmul_tiling_finalize(M, N, K); break; } printf("\n===========================================================\n"); return 0; }