// chundoong-lab-ta/SamsungDS22/submissions/HW2/yc.cho/mat_mul.cpp

#include "mat_mul.h"
#include <algorithm>
#include <cstdlib>
#include <cstdio>
#include <pthread.h>

// Shared state for one mat_mul() call. mat_mul() stores its arguments here
// before spawning workers; mat_mul_thread() reads them.
static float *A;         // left operand, indexed as A[i * K + k]
static float *B;         // right operand, indexed as B[k * N + j]
static float *C;         // result, indexed as C[i * N + j]
static int M;            // loop bound for the row index i
static int N;            // loop bound for the column index j
static int K;            // loop bound for the inner index k
static int num_threads;  // number of worker slices the rows are split into
/*
 * Worker entry point: accumulates a contiguous band of rows of C += A * B.
 *
 * data points to an int holding this worker's index in [0, num_threads).
 * The index expressions below treat A as M x K, B as K x N and C as M x N,
 * all row-major. Products are added into C with +=, so whatever C already
 * holds is kept -- presumably the caller zeroes C first (TODO confirm).
 *
 * Always returns NULL.
 */
static void* mat_mul_thread(void *data) {
  const int pid = *(const int *) data;

  // Spread the M rows as evenly as possible: every worker gets M / num_threads
  // rows and the first M % num_threads workers take one extra, so no single
  // worker is left with the whole remainder.
  const int base = M / num_threads;
  const int extra = M % num_threads;
  const int start = pid * base + (pid < extra ? pid : extra);
  const int end = start + base + (pid < extra ? 1 : 0);

  // Depth of one tile along k. Hand-picked value; rationale not recorded.
  const int bs = 45;

  // The k dimension is tiled outermost so the same bs rows of B are reused
  // for every row of this worker's band (presumably for cache reuse).
  for (int k0 = 0; k0 < K; k0 += bs) {
    // Upper bound of this tile, clamped to K; computed once per tile.
    const int k1 = (K - k0 < bs) ? K : k0 + bs;

    for (int i = start; i < end; ++i) {
      // Row offsets are computed in size_t so i * N and i * K cannot
      // overflow int on large matrices.
      float *c_row = C + (size_t) i * N;
      const float *a_row = A + (size_t) i * K;

      for (int k = k0; k < k1; ++k) {
        const float a_ik = a_row[k];
        const float *b_row = B + (size_t) k * N;

        // Innermost loop walks C and B rows contiguously.
        for (int j = 0; j < N; ++j) {
          c_row[j] += a_ik * b_row[j];
        }
      }
    }
  }
  return NULL;
}

/*
 * Multiplies matrices using _num_threads worker slices.
 *
 * Stores the arguments in the file-scope globals, starts one pthread per
 * slice running mat_mul_thread(), waits for all of them, and releases every
 * temporary allocation before returning.
 *
 * _A, _B, _C    operand and result buffers (not owned; never freed here)
 * _M, _N, _K    matrix dimensions as used by mat_mul_thread()
 * _num_threads  number of slices; values <= 0 do no work
 *
 * If a thread cannot be created, or the bookkeeping arrays cannot be
 * allocated, the affected slices are executed on the calling thread instead,
 * so the result is still complete.
 */
void mat_mul(float *_A, float *_B, float *_C, int _M, int _N, int _K, int _num_threads) {
  A = _A, B = _B, C = _C;
  M = _M, N = _N, K = _K;
  num_threads = _num_threads;

  if (num_threads <= 0) {
    return;
  }

  pthread_t *threads = (pthread_t *) malloc(sizeof(*threads) * num_threads);
  // One array holds every worker index; it outlives the workers and is freed
  // once after the joins instead of leaking one allocation per thread.
  int *ids = (int *) malloc(sizeof(*ids) * num_threads);

  if (threads == NULL || ids == NULL) {
    // No memory for thread bookkeeping: run every slice serially. The call
    // is synchronous, so passing the address of the loop counter is safe.
    for (int t = 0; t < num_threads; ++t) {
      mat_mul_thread(&t);
    }
    free(ids);
    free(threads);
    return;
  }

  // Handles of successfully created threads are packed at the front of
  // 'threads', so only valid handles are ever joined.
  int created = 0;
  for (int t = 0; t < num_threads; ++t) {
    ids[t] = t;
    if (pthread_create(&threads[created], NULL, mat_mul_thread, &ids[t]) == 0) {
      ++created;
    } else {
      // Could not start a thread for this slice: compute it inline.
      mat_mul_thread(&ids[t]);
    }
  }

  for (int t = 0; t < created; ++t) {
    pthread_join(threads[t], NULL);
  }

  free(ids);
  free(threads);
}