chundoong-lab-ta/SamsungDS22/submissions/HW2/ym.tai/mat_mul.cpp

125 lines
3.1 KiB
C++
Raw Normal View History

2022-09-29 18:01:45 +09:00
#include "mat_mul.h"
#include "util.h"
#include <stdio.h>
#include <stdlib.h>
#include <cstdlib>
#include <cstdio>
#include <pthread.h>
// Larger of two values. Each argument appears twice in the expansion, so do
// not pass expressions with side effects (e.g. max(i++, j)).
#define max(a,b) (((a) > (b)) ? (a) : (b))
// Smaller of two values; same double-evaluation caveat as max().
#define min(a,b) (((a) < (b)) ? (a) : (b))
// Matrix pointers shared between mat_mul() and its worker threads. The code
// below indexes them as flat row-major arrays: A[i * K + k], B[k * N + j],
// C[i * N + j].
static float *A, *B, *C; //, *B_T;
// Dimensions taken from mat_mul(): i ranges over M, j over N, k over K.
static int M, N, K;
// Worker count taken from mat_mul(); workers read it to pick their row range.
static int num_threads;
#if 0
/*
 * Baseline kernel (compiled out by the enclosing #if 0).
 *
 * Accumulates the full product into C with a plain triple loop over the
 * file-scope A, B, C and M, N, K. The thread argument is not used.
 * Always returns NULL.
 */
static void* mat_mul_thread(void *data) {
  // TODO: parallelize & optimize matrix multiplication
  for (int row = 0; row < M; ++row) {
    const float *a_row = A + row * K;
    float *c_row = C + row * N;
    for (int col = 0; col < N; ++col) {
      int depth = 0;
      while (depth < K) {
        // Same accumulation order as a straight k = 0..K-1 sweep.
        c_row[col] += a_row[depth] * B[depth * N + col];
        ++depth;
      }
    }
  }
  return NULL;
}
/*
 * Baseline driver (compiled out by the enclosing #if 0).
 *
 * Stores the operands and dimensions in the file-scope variables, then runs
 * the whole multiplication on one worker thread and waits for it.
 * _num_threads is recorded but only a single thread is started.
 */
void mat_mul(float *_A, float *_B, float *_C, int _M, int _N, int _K, int _num_threads) {
  A = _A, B = _B, C = _C;
  M = _M, N = _N, K = _K;
  num_threads = _num_threads;

  // TODO: create '_num_threads' pthreads
  pthread_t thread;
  if (pthread_create(&thread, NULL, mat_mul_thread, NULL) == 0) {
    pthread_join(thread, NULL);
  } else {
    // No thread was started, so joining would read an uninitialised handle;
    // run the kernel on the calling thread instead.
    mat_mul_thread(NULL);
  }
}
#else
/*
 * Worker entry point: accumulates one band of rows of C += A * B.
 *
 * data points to an int holding this worker's index. Rows are split into
 * contiguous bands of M / num_threads rows; the worker with index
 * num_threads - 1 also takes any leftover rows up to M.
 *
 * The k and j dimensions are walked in 64-wide tiles. When K is a multiple
 * of 4, four k-steps are combined per pass: the four products are summed
 * first and the sum is then added to C. Otherwise each product is added to
 * C on its own.
 *
 * Always returns NULL.
 */
static void* mat_mul_thread(void *data) {
  const int tile = 64;
  const int worker = *(int *)data;
  const int band = M / num_threads;
  const int first_row = worker * band;
  int row_limit = (worker + 1) * band;
  if (worker == num_threads - 1) {
    row_limit = M;  // last worker absorbs the remainder rows
  }
  const int unroll4 = ((K % 4) == 0);

  for (int k_lo = 0; k_lo < K; k_lo += tile) {
    const int k_hi = min(k_lo + tile, K);
    for (int j_lo = 0; j_lo < N; j_lo += tile) {
      const int j_hi = min(j_lo + tile, N);
      for (int row = first_row; row < row_limit; row++) {
        const float *a_row = A + row * K;
        float *c_row = C + row * N;

        if (unroll4) {
          // The tile width is a multiple of 4 and so is K, so k + 3 never
          // runs past k_hi.
          for (int k = k_lo; k < k_hi; k += 4) {
            const float a0 = a_row[k];
            const float a1 = a_row[k + 1];
            const float a2 = a_row[k + 2];
            const float a3 = a_row[k + 3];
            const float *b0 = B + k * N;
            const float *b1 = B + (k + 1) * N;
            const float *b2 = B + (k + 2) * N;
            const float *b3 = B + (k + 3) * N;
            for (int j = j_lo; j < j_hi; j++) {
              const float p0 = a0 * b0[j];
              const float p1 = a1 * b1[j];
              const float p2 = a2 * b2[j];
              const float p3 = a3 * b3[j];
              c_row[j] += (p0 + p1 + p2 + p3);
            }
          }
        } else {
          for (int k = k_lo; k < k_hi; k++) {
            const float a_ik = a_row[k];
            const float *b_row = B + k * N;
            for (int j = j_lo; j < j_hi; j++) {
              c_row[j] += a_ik * b_row[j];
            }
          }
        }
      }
    }
  }
  return NULL;
}
/*
 * Accumulates A * B into C using up to _num_threads pthreads.
 *
 * _A, _B, _C    operand and result buffers; stored in the file-scope
 *               pointers that mat_mul_thread() reads.
 * _M, _N, _K    dimensions; stored in the file-scope M, N, K.
 * _num_threads  number of row bands / workers; values below 1 are treated
 *               as 1 so the product is still computed.
 *
 * Every band is always processed: if bookkeeping memory cannot be allocated,
 * or a thread cannot be started, the affected bands are computed on the
 * calling thread instead. All memory allocated here is released before
 * returning.
 */
void mat_mul(float *_A, float *_B, float *_C, int _M, int _N, int _K, int _num_threads) {
  A = _A, B = _B, C = _C;
  M = _M, N = _N, K = _K;
  num_threads = (_num_threads < 1) ? 1 : _num_threads;

  pthread_t *threads = (pthread_t *)malloc((size_t)num_threads * sizeof *threads);
  // One slot per worker, so each worker's index outlives the launch loop
  // without a separate heap allocation per thread.
  int *tids = (int *)malloc((size_t)num_threads * sizeof *tids);

  if (threads == NULL || tids == NULL) {
    // Out of memory for bookkeeping: fall back to a single band computed here.
    free(threads);
    free(tids);
    num_threads = 1;
    int only = 0;
    mat_mul_thread(&only);
    return;
  }

  int launched = 0;
  while (launched < num_threads) {
    tids[launched] = launched;
    if (pthread_create(&threads[launched], NULL, mat_mul_thread, &tids[launched]) != 0) {
      break;
    }
    launched++;
  }

  // Bands whose thread could not be started are computed on this thread.
  // Workers write disjoint row ranges of C, so this can overlap with the
  // threads that are already running.
  for (int i = launched; i < num_threads; i++) {
    tids[i] = i;
    mat_mul_thread(&tids[i]);
  }

  for (int i = 0; i < launched; i++) {
    pthread_join(threads[i], NULL);
  }

  free(tids);
  free(threads);
}
#endif