#include "util.h"

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>
#include <math.h>
#include <sys/time.h>
#include <omp.h>

/* Start timestamps in seconds, one slot per timer: timer_start(i) stores
 * the current time in slot i and timer_stop(i) subtracts it from "now". */
static double start_time[8];

/*
 * Returns the current time reported by gettimeofday() as a double, in
 * seconds, with the microsecond field folded in as the fractional part.
 */
static double get_time(void) {
    struct timeval tv;
    gettimeofday(&tv, NULL);
    return (double) tv.tv_sec + (double) tv.tv_usec * 1e-6;
}

/*
 * Records the current time as the start point of timer slot i.
 *
 * i must be a valid index into start_time. An out-of-range index is reported
 * on stderr and ignored rather than written outside the array.
 */
void timer_start(int i) {
    const int slots = (int) (sizeof start_time / sizeof start_time[0]);
    if (i < 0 || i >= slots) {
        fprintf(stderr, "timer_start: invalid timer index %d (valid: 0..%d)\n",
                i, slots - 1);
        return;
    }
    start_time[i] = get_time();
}

/*
 * Returns the number of seconds elapsed since timer slot i was last started.
 * The slot is only read, so the same timer can be sampled repeatedly.
 *
 * i must be a valid index into start_time. An out-of-range index is reported
 * on stderr and yields 0.0 rather than reading outside the array.
 */
double timer_stop(int i) {
    const int slots = (int) (sizeof start_time / sizeof start_time[0]);
    if (i < 0 || i >= slots) {
        fprintf(stderr, "timer_stop: invalid timer index %d (valid: 0..%d)\n",
                i, slots - 1);
        return 0.0;
    }
    return get_time() - start_time[i];
}

/*
 * Validates a convolution result against a straightforward reference
 * implementation and prints the verdict to stdout.
 *
 * Tensor layouts, as implied by the indexing below:
 *   input  : N x C x H x W
 *   filter : K x C x R x S
 *   output : N x K x OH x OW
 * with OH = (H + 2*pad - dilation*(R-1) - 1) / stride + 1 and OW computed the
 * same way from W and S. Filter taps that land outside the input (the
 * padding region) contribute nothing. stride must be non-zero.
 *
 * An output element is accepted when it equals the reference exactly, when
 * the absolute error is at most 1e-3, or when the reference is non-zero and
 * the relative error is at most 1e-3. The first 10 mismatches are printed,
 * followed by "Result: VALID" or "Result: INVALID".
 */
void check_convolution(float *input, float *output, float *filter,
    int N, int C, int H, int W, int K, int R, int S,
    int pad, int dilation, int stride) {
  printf("Validating...\n");

  int OH = (H + 2 * pad - dilation * (R - 1) - 1) / stride + 1;
  int OW = (W + 2 * pad - dilation * (S - 1) - 1) / stride + 1;

  // Strides are kept in size_t so element offsets of large tensors are not
  // computed in int arithmetic, where they could overflow.
  const size_t in_plane = (size_t) H * W;
  const size_t in_image = (size_t) C * in_plane;
  const size_t flt_plane = (size_t) R * S;
  const size_t flt_bank = (size_t) C * flt_plane;
  const size_t out_plane = (size_t) OH * OW;
  const size_t out_image = (size_t) K * out_plane;

  // Reference result, computed with a direct 7-deep loop nest.
  float *O_ans;
  alloc_tensor(&O_ans, N, K, OH, OW);
  zero_tensor(O_ans, N, K, OH, OW);
#pragma omp parallel for collapse(2)
  for (int n = 0; n < N; ++n) {
    for (int k = 0; k < K; ++k) {
      for (int oh = 0; oh < OH; ++oh) {
        for (int ow = 0; ow < OW; ++ow) {
          float o = 0.f;
          for (int c = 0; c < C; ++c) {
            for (int r = 0; r < R; ++r) {
              for (int s = 0; s < S; ++s) {
                int h = oh * stride - pad + r * dilation;
                int w = ow * stride - pad + s * dilation;
                // Taps in the padding region are skipped (implicit zeros).
                if (h < 0 || h >= H || w < 0 || w >= W) continue;
                float i = input[n * in_image + c * in_plane + (size_t) h * W + w];
                float f = filter[k * flt_bank + c * flt_plane + (size_t) r * S + s];
                o += i * f;
              }
            }
          }
          O_ans[n * out_image + k * out_plane + (size_t) oh * OW + ow] = o;
        }
      }
    }
  }

  bool is_valid = true;
  int cnt = 0, thr = 10;
  float eps = 1e-3f;
  for (int n = 0; n < N; ++n) {
    for (int k = 0; k < K; ++k) {
      for (int oh = 0; oh < OH; ++oh) {
        for (int ow = 0; ow < OW; ++ow) {
          size_t idx = n * out_image + k * out_plane + (size_t) oh * OW + ow;
          float o = output[idx];
          float o_ans = O_ans[idx];
          float abs_err = fabsf(o - o_ans);
          // Phrased as "is it close?" rather than "is it far?": every
          // comparison against NaN is false, so a NaN in the output fails
          // all three acceptance tests and is reported as a mismatch
          // instead of slipping through as valid.
          bool matches = (o == o_ans)
              || abs_err <= eps
              || (o_ans != 0.0f && abs_err / fabsf(o_ans) <= eps);
          if (!matches) {
            ++cnt;
            if (cnt <= thr)
              printf("output[%d][%d][%d][%d] : correct_value = %f, your_value = %f\n", n, k, oh, ow, o_ans, o);
            if (cnt == thr + 1)
              printf("Too many error, only first %d values are printed.\n", thr);
            is_valid = false;
          }
        }
      }
    }
  }

  // The reference buffer is private to this function; release it so that
  // repeated validation runs do not leak.
  free(O_ans);

  if (is_valid) {
    printf("Result: VALID\n");
  } else {
    printf("Result: INVALID\n");
  }
  fflush(stdout);
}

/*
 * Allocates a 32-byte-aligned buffer for a D0 x D1 x D2 x D3 tensor of
 * floats and stores its address in *t. The contents are uninitialized;
 * release the buffer with free().
 *
 * On failure (negative dimension, byte count overflowing size_t, or out of
 * memory) a message is printed to stderr and the process exits with
 * EXIT_FAILURE.
 */
void alloc_tensor(float **t, int D0, int D1, int D2, int D3) {
  const size_t alignment = 32;
  const size_t size_max = (size_t) -1;
  const int dims[4] = {D0, D1, D2, D3};

  // Multiply up the byte count, refusing negative extents and overflow.
  size_t bytes = sizeof(float);
  bool ok = true;
  for (int d = 0; d < 4 && ok; ++d) {
    if (dims[d] < 0) {
      ok = false;
    } else if (dims[d] > 0 && bytes > size_max / (size_t) dims[d]) {
      ok = false;
    } else {
      bytes *= (size_t) dims[d];
    }
  }

  if (ok) {
    if (bytes == 0) {
      // An empty tensor still gets one aligned unit, so a NULL result
      // always means a genuine allocation failure.
      bytes = alignment;
    } else if (bytes > size_max - (alignment - 1)) {
      ok = false;
    } else {
      // aligned_alloc() requires the size to be a multiple of the alignment.
      bytes = (bytes + alignment - 1) / alignment * alignment;
    }
  }

  float *buf = NULL;
  if (ok) {
    buf = aligned_alloc(alignment, bytes);
  }
  *t = buf;

  if (buf == NULL) {
    fprintf(stderr, "Failed to allocate memory for matrix.\n");
    exit(EXIT_FAILURE);
  }
}

/*
 * Fills a D0 x D1 x D2 x D3 float tensor with pseudo-random values in
 * [-0.5, 0.5].
 *
 * Only the first D1 x D2 x D3 slice is drawn from rand(); the remaining
 * D0 - 1 slices are copies of that slice. A tensor with any non-positive
 * dimension is left untouched.
 */
void rand_tensor(float *m, int D0, int D1, int D2, int D3) {
  // An empty tensor has no first slice; filling one would write past m.
  if (D0 <= 0 || D1 <= 0 || D2 <= 0 || D3 <= 0) {
    return;
  }

  // Element count of one slice, in size_t so large tensors do not overflow
  // int offset arithmetic.
  const size_t slice = (size_t) D1 * D2 * D3;

  // Deliberately sequential: rand() updates hidden global state and is not
  // required to be free of data races, so it must not be called from
  // several threads at once. Filling in order also makes the result
  // reproducible for a given srand() seed.
  for (size_t idx = 0; idx < slice; ++idx) {
    m[idx] = (float) ((float) rand() / RAND_MAX - 0.5);
  }

  // Replicate the first slice into every remaining slice.
  for (int i = 1; i < D0; i++) {
    memcpy(m + (size_t) i * slice, m, sizeof(float) * slice);
  }
}

/*
 * Clears a D0 x D1 x D2 x D3 float tensor by setting every one of its bytes
 * to zero.
 */
void zero_tensor(float *t, int D0, int D1, int D2, int D3) {
  // Byte count accumulated left to right in size_t, one extent at a time.
  size_t byte_count = sizeof(float);
  byte_count *= (size_t) D0;
  byte_count *= (size_t) D1;
  byte_count *= (size_t) D2;
  byte_count *= (size_t) D3;

  memset(t, 0, byte_count);
}