#include "mat_mul.h" #include #include #include #include static float *A, *B, *C; static int M, N, K; static int num_threads; static int mpi_rank, mpi_world_size; int portion,low_bound,upper_bound; //,start,end,slice,num; MPI_Status status; MPI_Request request; static void mat_mul_omp() { // TODO: parallelize & optimize matrix multiplication int block = 80; int KB,MB,NB,kk,k,ii; #pragma omp parallel num_threads(num_threads) { int num = omp_get_thread_num(); int slice = (upper_bound-low_bound) / num_threads; int start = num * slice + low_bound; int end = (num == num_threads -1)? upper_bound : ((num+1)*slice+low_bound); int iblock = 4; int mblock = ((end-start)/iblock) * iblock + start; int nblock = (N/block) * block; int kblock = (K/block) * block; int KU = (kblock/4) * 4; #pragma omp parallel for schedule(static) private(KB,MB,NB,kk,k,ii) for (kk = 0; kk < kblock; kk+=block) { for (ii = start; ii < mblock; ii+=iblock) { for (int jj = 0; jj < nblock; jj+=block) { if(kk+block < kblock) KB = kk + block; else KB = KU; for (k = kk; k < KB; k+=4) { if(ii+iblock < mblock) MB = ii + iblock; else MB = end; for (int i = ii; i < MB; i++) { float aik = A[i*K + k]; float aik1 = A[i*K + k+1]; float aik2 = A[i*K + k+2]; float aik3 = A[i*K + k+3]; // float aik4 = A[i*K + k+4]; // float aik5 = A[i*K + k+5]; // float aik6 = A[i*K + k+6]; // float aik7 = A[i*K + k+7]; if(jj+block < nblock) NB = jj + block; else NB = N; for (int j = jj; j < NB; j++) { C[i * N + j] = C[i*N + j] + aik * B[k * N + j] + aik1 * B[(k+1) * N + j] + aik2 * B[(k+2) * N + j] + aik3 * B[(k+3) * N + j]; // + aik4 * B[(k+4) * N + j] // + aik5 * B[(k+5) * N + j] // + aik6 * B[(k+6) * N + j] // + aik7 * B[(k+7) * N + j]; } } // i } //k } // ii } // jj } // kk #pragma omp parallel for schedule(static) private(k) for (int i = start; i < end; i++) { for(k=KU;k