#include "mat_mul.h" #include #include #include #define CHECK_ERROR(err) \ if (err != CL_SUCCESS) { \ printf("[%s:%d] OpenCL error %d\n", __FILE__, __LINE__, err); \ exit(EXIT_FAILURE); \ } #define TS 32 #define WPT 8 #define RTS TS/WPT static cl_int err; static cl_platform_id platform; static cl_device_id device[4]; static cl_context context; static cl_command_queue queue[4]; static cl_program program; static cl_kernel kernel[4]; static cl_mem a_d, b_d, c_d, a_d1, b_d1, c_d1, a_d2, b_d2, c_d2, a_d3, b_d3, c_d3; static float *A, *B, *C; //static float *A_new, *B_new, *C_new; static int M, N, K; static int ndev; static int M_div[4]; void mat_mul(float *_A, float *_B, float *_C, int _M, int _N, int _K) { A = _A, B = _B, C = _C; M = _M, N = _N, K = _K; err = clSetKernelArg(kernel[0], 0, sizeof(cl_mem), &a_d); CHECK_ERROR(err); err = clSetKernelArg(kernel[0], 1, sizeof(cl_mem), &b_d); CHECK_ERROR(err); err = clSetKernelArg(kernel[0], 2, sizeof(cl_mem), &c_d); CHECK_ERROR(err); err = clSetKernelArg(kernel[0], 3, sizeof(int), &M_div[0]); CHECK_ERROR(err); err = clSetKernelArg(kernel[0], 4, sizeof(int), &N); CHECK_ERROR(err); err = clSetKernelArg(kernel[0], 5, sizeof(int), &K); CHECK_ERROR(err); err = clSetKernelArg(kernel[1], 0, sizeof(cl_mem), &a_d1); CHECK_ERROR(err); err = clSetKernelArg(kernel[1], 1, sizeof(cl_mem), &b_d1); CHECK_ERROR(err); err = clSetKernelArg(kernel[1], 2, sizeof(cl_mem), &c_d1); CHECK_ERROR(err); err = clSetKernelArg(kernel[1], 3, sizeof(int), &M_div[1]); CHECK_ERROR(err); err = clSetKernelArg(kernel[1], 4, sizeof(int), &N); CHECK_ERROR(err); err = clSetKernelArg(kernel[1], 5, sizeof(int), &K); CHECK_ERROR(err); err = clSetKernelArg(kernel[2], 0, sizeof(cl_mem), &a_d2); CHECK_ERROR(err); err = clSetKernelArg(kernel[2], 1, sizeof(cl_mem), &b_d2); CHECK_ERROR(err); err = clSetKernelArg(kernel[2], 2, sizeof(cl_mem), &c_d2); CHECK_ERROR(err); err = clSetKernelArg(kernel[2], 3, sizeof(int), &M_div[2]); CHECK_ERROR(err); err = clSetKernelArg(kernel[2], 4, sizeof(int), &N); CHECK_ERROR(err); err = clSetKernelArg(kernel[2], 5, sizeof(int), &K); CHECK_ERROR(err); err = clSetKernelArg(kernel[3], 0, sizeof(cl_mem), &a_d3); CHECK_ERROR(err); err = clSetKernelArg(kernel[3], 1, sizeof(cl_mem), &b_d3); CHECK_ERROR(err); err = clSetKernelArg(kernel[3], 2, sizeof(cl_mem), &c_d3); CHECK_ERROR(err); err = clSetKernelArg(kernel[3], 3, sizeof(int), &M_div[3]); CHECK_ERROR(err); err = clSetKernelArg(kernel[3], 4, sizeof(int), &N); CHECK_ERROR(err); err = clSetKernelArg(kernel[3], 5, sizeof(int), &K); CHECK_ERROR(err); for(int i=0; i a_d (gpu), B (cpu) -> b_d (gpu) err = clEnqueueWriteBuffer(queue[0], a_d, CL_TRUE, 0, M/4 * K * sizeof(float), (void*)((size_t)A), 0, NULL, NULL); CHECK_ERROR(err); err = clEnqueueWriteBuffer(queue[0], b_d, CL_TRUE, 0, K * N * sizeof(float), B, 0, NULL, NULL); CHECK_ERROR(err); err = clEnqueueWriteBuffer(queue[1], a_d1, CL_TRUE, 0, M/4 * K * sizeof(float), (void*)((size_t)A+(M/4*K*sizeof(float))), 0, NULL, NULL); CHECK_ERROR(err); err = clEnqueueWriteBuffer(queue[1], b_d1, CL_TRUE, 0, K * N * sizeof(float), B, 0, NULL, NULL); CHECK_ERROR(err); err = clEnqueueWriteBuffer(queue[2], a_d2, CL_TRUE, 0, M/4 * K * sizeof(float), (void*)((size_t)A+(M/4*K*sizeof(float)*2)), 0, NULL, NULL); CHECK_ERROR(err); err = clEnqueueWriteBuffer(queue[2], b_d2, CL_TRUE, 0, K * N * sizeof(float), B, 0, NULL, NULL); CHECK_ERROR(err); err = clEnqueueWriteBuffer(queue[3], a_d3, CL_TRUE, 0, (M/4+M%4) * K * sizeof(float), (void*)((size_t)A+(M/4*K*sizeof(float)*3)), 0, NULL, NULL); CHECK_ERROR(err); err = clEnqueueWriteBuffer(queue[3], b_d3, CL_TRUE, 0, K * N * sizeof(float), B, 0, NULL, NULL); CHECK_ERROR(err); // DO NOT REMOVE; NEEDED FOR TIME MEASURE for(int i=0; i C (cpu) err = clEnqueueReadBuffer(queue[0], c_d, CL_TRUE, 0, M/4 * N * sizeof(float), (void*)((size_t)C), 0, NULL, NULL); CHECK_ERROR(err); err = clEnqueueReadBuffer(queue[1], c_d1, CL_TRUE, 0, M/4 * N * sizeof(float), (void*)((size_t)C+(M/4*N*sizeof(float))), 0, NULL, NULL); CHECK_ERROR(err); err = clEnqueueReadBuffer(queue[2], c_d2, CL_TRUE, 0, M/4 * N * sizeof(float), (void*)((size_t)C+(M/4*N*sizeof(float)*2)), 0, NULL, NULL); CHECK_ERROR(err); err = clEnqueueReadBuffer(queue[3], c_d3, CL_TRUE, 0, (M/4+M%4) * N * sizeof(float), (void*)((size_t)C+(M/4*N*sizeof(float)*3)), 0, NULL, NULL); CHECK_ERROR(err); // DO NOT REMOVE; NEEDED FOR TIME MEASURE for(int i=0; i