#include "mat_mul.h"

/* NOTE(review): the original include list lost its header names (angle-bracket
 * stripping). Reconstructed from the APIs actually used below: printf (stdio),
 * MPI_* (mpi.h), omp_* (omp.h). TODO confirm against the build system. */
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>
#include <omp.h>

#include "util.h"

/* Fully parenthesized to avoid macro-expansion precedence bugs. */
#define min(a, b) ((a) > (b) ? (b) : (a))

/* Cache-blocking tile sizes for the i/j/k loops of the kernel. */
#define ITILESIZE (32)
#define JTILESIZE (2048)
#define KTILESIZE (16)

static float *A, *B, *C;
static int M, N, K;
static int num_threads;
static int mpi_rank, mpi_world_size;
static int source, dest;
static int rows;                    /* number of A/C rows this rank computes   */
static int averow, extra, offset;   /* row-distribution bookkeeping (rank 0)   */
MPI_Status status;

/*
 * Multiply this rank's slice of A (rows x K) by B (K x N), accumulating into
 * C (rows x N), using `num_threads` OpenMP threads and i/k/j cache tiling.
 *
 * Each thread owns a contiguous band of rows; the last thread absorbs the
 * remainder when `rows` is not divisible by `num_threads`. C is accumulated
 * with +=, so C must hold its initial values before the call.
 */
static void mat_mul_omp() {
  omp_set_num_threads(num_threads);
  #pragma omp parallel
  {
    int tid = omp_get_thread_num();
    int slice = rows / num_threads;
    int is = tid * slice;
    int ie = (tid == num_threads - 1) ? rows : (tid + 1) * slice;

    for (int ii = is; ii < ie; ii += ITILESIZE) {
      for (int kk = 0; kk < K; kk += KTILESIZE) {
        for (int jj = 0; jj < N; jj += JTILESIZE) {
          for (int i = ii; i < min(ie, ii + ITILESIZE); i++) {
            for (int k = kk; k < min(K, kk + KTILESIZE); k++) {
              float ar = A[i * K + k];  /* hoist A element out of the j loop */
              for (int j = jj; j < min(N, jj + JTILESIZE); j++) {
                C[i * N + j] += ar * B[k * N + j];
              }
            }
          }
        }
      }
    }
  }
}

/*
 * Distributed matrix multiplication: C += A * B on `mpi_world_size` ranks,
 * `num_threads` OpenMP threads per rank.
 *
 * Rank 0 owns the full A/B/C, keeps the first `averow` rows for itself, and
 * sends each worker its row band of A and C plus all of B. Every rank runs
 * the OpenMP kernel on its band; workers then send their C band back and
 * rank 0 scatters the results into place.
 */
void mat_mul(float *_A, float *_B, float *_C, int _M, int _N, int _K,
             int _num_threads, int _mpi_rank, int _mpi_world_size) {
  A = _A, B = _B, C = _C;
  M = _M, N = _N, K = _K;
  num_threads = _num_threads, mpi_rank = _mpi_rank,
  mpi_world_size = _mpi_world_size;

  if (mpi_rank == 0) {
    averow = M / mpi_world_size;
    extra = M % mpi_world_size;
    offset = averow;  /* rank 0 keeps rows [0, averow) for itself */

    /* NOTE(review): this distribution loop was garbled in the source file
     * (its condition and body were eaten). Reconstructed to mirror, message
     * for message, the worker-side MPI_Recv sequence below — confirm against
     * the original assignment skeleton. The last worker absorbs the
     * remainder rows. */
    for (dest = 1; dest < mpi_world_size; dest++) {
      rows = (dest == mpi_world_size - 1) ? averow + extra : averow;
      MPI_Send(&offset, 1, MPI_INT, dest, 0, MPI_COMM_WORLD);
      MPI_Send(&rows, 1, MPI_INT, dest, 0, MPI_COMM_WORLD);
      MPI_Send(&A[offset * K], rows * K, MPI_FLOAT, dest, 0, MPI_COMM_WORLD);
      MPI_Send(B, N * K, MPI_FLOAT, dest, 0, MPI_COMM_WORLD);
      /* C is accumulated (+=) in the kernel, so workers need their band's
       * initial C values too. */
      MPI_Send(&C[offset * N], rows * N, MPI_FLOAT, dest, 0, MPI_COMM_WORLD);
      offset += rows;
    }
    rows = averow;  /* rank 0 computes its own band */
  } else if (mpi_rank > 0) {
    /* Workers allocate their own buffers (rank 0 received them as args).
     * Full-size allocations, but only the first `rows` rows of A/C are used. */
    alloc_mat(&A, M, K);
    alloc_mat(&B, K, N);
    alloc_mat(&C, M, N);
    MPI_Recv(&offset, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, &status);
    MPI_Recv(&rows, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, &status);
    MPI_Recv(A, rows * K, MPI_FLOAT, 0, 0, MPI_COMM_WORLD, &status);
    MPI_Recv(B, N * K, MPI_FLOAT, 0, 0, MPI_COMM_WORLD, &status);
    MPI_Recv(C, rows * N, MPI_FLOAT, 0, 0, MPI_COMM_WORLD, &status);
  }

  mat_mul_omp();

  if (mpi_rank > 0) {
    /* Workers return their band; offset tells rank 0 where it belongs. */
    MPI_Send(&offset, 1, MPI_INT, 0, 0, MPI_COMM_WORLD);
    MPI_Send(&rows, 1, MPI_INT, 0, 0, MPI_COMM_WORLD);
    MPI_Send(C, rows * N, MPI_FLOAT, 0, 0, MPI_COMM_WORLD);
  } else if (mpi_rank == 0) {
    /* NOTE(review): the source file is truncated mid-loop here; the gather
     * is reconstructed symmetric to the worker-side sends above. */
    for (source = 1; source < mpi_world_size; source++) {
      MPI_Recv(&offset, 1, MPI_INT, source, 0, MPI_COMM_WORLD, &status);
      MPI_Recv(&rows, 1, MPI_INT, source, 0, MPI_COMM_WORLD, &status);
      MPI_Recv(&C[offset * N], rows * N, MPI_FLOAT, source, 0, MPI_COMM_WORLD,
               &status);
    }
  }
}