chundoong-lab-ta/SamsungDS22/submissions/HW2/yh1597.yang/mat_mul.cpp

64 lines
1.6 KiB
C++

#include "mat_mul.h"
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <pthread.h>
// State shared between mat_mul() and its worker threads. mat_mul() stores
// its arguments here before creating the workers; mat_mul_thread() reads them.

// Matrix buffers: the worker reads A and B and accumulates into C.
static float *A, *B, *C;
// Dimensions as used by the worker's indexing: A is indexed i*K + k,
// B is indexed k*N + j, and C is indexed i*N + j.
static int M, N, K;
// Number of workers; the worker uses it as the modulus that assigns rows.
static int num_threads;
// Thread entry point: adds this thread's share of the product A * B into C.
//
// `data` does not point at anything; the zero-based thread index is encoded
// directly in the pointer value. Rows are handed out round-robin: this thread
// processes every row i for which i % num_threads equals its index.
//
// A, B and C are indexed as flat row-major arrays (A: i*K + k, B: k*N + j,
// C: i*N + j). Results are accumulated with +=, so on return C holds its
// previous contents plus the product for the rows owned by this thread.
//
// Always returns NULL.
static void* mat_mul_thread(void *data) {
  // Convert the pointer value back to an integer. Reinterpreting the bytes of
  // the pointer object as an int (the previous approach) breaks strict
  // aliasing and picks the wrong half of the pointer on big-endian targets.
  const int thread_num = static_cast<int>(reinterpret_cast<intptr_t>(data));

  // Tile extents along the i (row), j (column) and k (reduction) dimensions.
  const int size_i = 4096;
  const int size_j = 4096;
  const int size_k = 64;

  // Each tile end is computed as min(start + size, limit), written as a
  // subtraction so that the sum cannot overflow int near INT_MAX.
  for (int ii = 0, ii_end; ii < M; ii = ii_end) {
    ii_end = (M - ii > size_i) ? ii + size_i : M;

    for (int jj = 0, jj_end; jj < N; jj = jj_end) {
      jj_end = (N - jj > size_j) ? jj + size_j : N;

      for (int kk = 0, kk_end; kk < K; kk = kk_end) {
        kk_end = (K - kk > size_k) ? kk + size_k : K;

        for (int i = ii; i < ii_end; i++) {
          // Skip rows that belong to another thread.
          if (i % num_threads != thread_num) {
            continue;
          }

          // Offsets are computed in size_t: i*N and i*K overflow int once a
          // matrix holds more than 2^31 elements.
          float *c_row = C + static_cast<size_t>(i) * N;
          const float *a_row = A + static_cast<size_t>(i) * K;

          for (int k = kk; k < kk_end; k++) {
            const float *b_row = B + static_cast<size_t>(k) * N;

            // Innermost loop walks C and B contiguously along j.
            for (int j = jj; j < jj_end; j++) {
              c_row[j] += a_row[k] * b_row[j];
            }
          }
        }
      }
    }
  }

  return NULL;
}
// Multiplies _A by _B and accumulates the result into _C using pthreads.
//
// Parameters:
//   _A, _B, _C    matrix buffers; stored in the file-scope A/B/C for the
//                 workers. The workers accumulate into _C with +=, so _C is
//                 not cleared here.
//   _M, _N, _K    dimensions; stored in the file-scope M/N/K.
//   _num_threads  requested number of worker threads. Values below 1 are
//                 treated as 1 so the work is still performed.
//
// The function returns after every row slice has been processed. If the
// bookkeeping array cannot be allocated, or pthread_create fails for a given
// index, that slice is computed on the calling thread instead of being
// silently dropped.
void mat_mul(float *_A, float *_B, float *_C, int _M, int _N, int _K, int _num_threads) {
  A = _A, B = _B, C = _C;
  M = _M, N = _N, K = _K;
  // Workers pick rows with `i % num_threads`; a non-positive count would
  // create no workers at all and leave C untouched.
  num_threads = (_num_threads > 0) ? _num_threads : 1;

  // One slot per worker, sized to the actual request rather than a fixed
  // stack array, so large thread counts cannot overrun the buffer.
  struct worker_slot {
    pthread_t tid;
    bool started;
  };
  worker_slot *slots = static_cast<worker_slot *>(
      std::malloc(sizeof(worker_slot) * static_cast<size_t>(num_threads)));

  // Launch the workers. The index travels in the pointer value itself, so no
  // per-thread argument storage has to outlive this loop.
  if (slots != NULL) {
    for (int idx = 0; idx < num_threads; idx++) {
      void *arg = reinterpret_cast<void *>(static_cast<intptr_t>(idx));
      slots[idx].started =
          (pthread_create(&slots[idx].tid, NULL, mat_mul_thread, arg) == 0);
    }
  }

  // Compute any slice whose worker could not be started on this thread.
  // Slices touch disjoint rows of C, so this is safe while workers run.
  for (int idx = 0; idx < num_threads; idx++) {
    if (slots == NULL || !slots[idx].started) {
      mat_mul_thread(reinterpret_cast<void *>(static_cast<intptr_t>(idx)));
    }
  }

  // Wait for the workers that were actually created.
  if (slots != NULL) {
    for (int idx = 0; idx < num_threads; idx++) {
      if (slots[idx].started) {
        pthread_join(slots[idx].tid, NULL);
      }
    }
  }

  std::free(slots);
}