125 lines
3.1 KiB
C++
125 lines
3.1 KiB
C++
|
#include "mat_mul.h"
|
||
|
#include "util.h"
|
||
|
|
||
|
#include <stdio.h>
|
||
|
#include <stdlib.h>
|
||
|
|
||
|
#include <cstdlib>
|
||
|
#include <cstdio>
|
||
|
#include <pthread.h>
|
||
|
|
||
|
// Evaluates to the larger of a and b. This is a function-like macro, so the
// selected argument is evaluated twice: do not pass expressions with side
// effects (e.g. max(i++, j)).
#define max(a,b) (((a) > (b)) ? (a) : (b))
|
||
|
// Evaluates to the smaller of a and b. This is a function-like macro, so the
// selected argument is evaluated twice: do not pass expressions with side
// effects (e.g. min(i++, j)).
#define min(a,b) (((a) < (b)) ? (a) : (b))
|
||
|
|
||
|
// Input matrices (A, B) and output matrix (C), stored by mat_mul() so that
// mat_mul_thread() can reach them. The kernels in this file index them as
// flat arrays: A[i * K + k], B[k * N + j] and C[i * N + j].
static float *A, *B, *C; //, *B_T;
|
||
|
// Matrix dimensions stored by mat_mul(): the kernels iterate i over M,
// j over N and k over K.
static int M, N, K;
|
||
|
// Worker-thread count stored by mat_mul(); the active mat_mul_thread() uses
// it to split the M rows between workers.
static int num_threads;
|
||
|
|
||
|
#if 0
|
||
|
static void* mat_mul_thread(void *data) {
|
||
|
// TODO: parallelize & optimize matrix multiplication
|
||
|
for (int i = 0; i < M; ++i) {
|
||
|
for (int j = 0; j < N; ++j) {
|
||
|
for (int k = 0; k < K; ++k) {
|
||
|
C[i * N + j] += A[i * K + k] * B[k * N + j];
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
return NULL;
|
||
|
}
|
||
|
|
||
|
// Baseline entry point: publishes the operands and dimensions through the
// file-scope variables, then runs mat_mul_thread() once on a single worker
// thread and waits for it to finish.
//
// _A, _B, _C:     input matrices A and B and the output matrix C.
// _M, _N, _K:     matrix dimensions as used by mat_mul_thread().
// _num_threads:   stored in num_threads; this variant always uses one worker.
void mat_mul(float *_A, float *_B, float *_C, int _M, int _N, int _K, int _num_threads) {
  A = _A, B = _B, C = _C;
  M = _M, N = _N, K = _K;
  num_threads = _num_threads;

  // TODO: create '_num_threads' pthreads
  pthread_t thread;
  if (pthread_create(&thread, NULL, mat_mul_thread, NULL) != 0) {
    // The worker could not be started, so `thread` is not a valid handle and
    // must not be joined. Do the work on the calling thread instead.
    mat_mul_thread(NULL);
    return;
  }
  pthread_join(thread, NULL);
}
|
||
|
|
||
|
#else
|
||
|
|
||
|
static void* mat_mul_thread(void *data) {
|
||
|
const int block_size = 64;
|
||
|
int tid, blk_size;
|
||
|
int row_start, row_end;
|
||
|
|
||
|
tid = *(int *)(data);
|
||
|
blk_size = M / num_threads;
|
||
|
|
||
|
row_start = tid * blk_size;
|
||
|
row_end = (tid == num_threads - 1) ? M : ((tid + 1) * blk_size);
|
||
|
|
||
|
if ((K % 4) == 0) {
|
||
|
float c0, c1, c2, c3;
|
||
|
for (int kk = 0; kk < K; kk += block_size) {
|
||
|
for (int jj = 0; jj < N; jj += block_size) {
|
||
|
for (int i = row_start; i < row_end; i++) {
|
||
|
int iK = i * K;
|
||
|
for (int k = kk; k < min(kk + block_size, K); k += 4) {
|
||
|
float Aik0 = A[iK + k];
|
||
|
float Aik1 = A[iK + k + 1];
|
||
|
float Aik2 = A[iK + k + 2];
|
||
|
float Aik3 = A[iK + k + 3];
|
||
|
int iN = i * N;
|
||
|
int k0N = k * N;
|
||
|
int k1N = (k + 1)* N;
|
||
|
int k2N = (k + 2)* N;
|
||
|
int k3N = (k + 3)* N;
|
||
|
for (int j = jj; j < min(jj + block_size, N); j++) {
|
||
|
c0 = Aik0 * B[k0N + j];
|
||
|
c1 = Aik1 * B[k1N + j];
|
||
|
c2 = Aik2 * B[k2N + j];
|
||
|
c3 = Aik3 * B[k3N + j];
|
||
|
C[iN + j] += (c0 + c1 + c2 + c3);
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
else {
|
||
|
for (int kk = 0; kk < K; kk += block_size) {
|
||
|
for (int jj = 0; jj < N; jj += block_size) {
|
||
|
for (int i = row_start; i < row_end; i++) {
|
||
|
int iK = i * K;
|
||
|
for (int k = kk; k < min(kk + block_size, K); k++) {
|
||
|
float Aik = A[iK + k];
|
||
|
int iN = i * N;
|
||
|
int kN = k * N;
|
||
|
for (int j = jj; j < min(jj + block_size, N); j++) {
|
||
|
C[iN + j] += Aik * B[kN + j];
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
return NULL;
|
||
|
}
|
||
|
|
||
|
void mat_mul(float *_A, float *_B, float *_C, int _M, int _N, int _K, int _num_threads) {
|
||
|
A = _A, B = _B, C = _C;
|
||
|
M = _M, N = _N, K = _K;
|
||
|
num_threads = _num_threads;
|
||
|
|
||
|
pthread_t *thread;
|
||
|
thread = (pthread_t *)malloc(num_threads * sizeof(pthread_t));
|
||
|
|
||
|
for (int i = 0; i < num_threads; i++) {
|
||
|
int *tid;
|
||
|
tid = (int *)malloc( sizeof(int) );
|
||
|
*tid = i;
|
||
|
pthread_create(&thread[i], NULL, mat_mul_thread, (void *)tid);
|
||
|
}
|
||
|
|
||
|
for (int i = 0; i < num_threads; i++) {
|
||
|
pthread_join(thread[i], NULL);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
#endif
|