120 lines
3.2 KiB
C++
120 lines
3.2 KiB
C++
|
#include "util.h"
|
||
|
|
||
|
#include <stdio.h>
|
||
|
#include <stdlib.h>
|
||
|
#include <string.h>
|
||
|
#include <stdbool.h>
|
||
|
#include <math.h>
|
||
|
#include <sys/time.h>
|
||
|
#include <omp.h>
|
||
|
|
||
|
/*
 * Wall-clock timestamps (in seconds) recorded by the most recent
 * timer_start() call for each timer slot.
 */
static double start_time[8];

/*
 * Returns the current wall-clock time in seconds, with microsecond
 * resolution, as reported by gettimeofday().
 */
static double get_time(void) {
  struct timeval tv;
  gettimeofday(&tv, NULL);
  return tv.tv_sec + tv.tv_usec * 1e-6;
}

/* Returns true when `i` is a usable index into start_time. */
static bool timer_slot_is_valid(int i) {
  const int slot_count = (int)(sizeof start_time / sizeof start_time[0]);
  return i >= 0 && i < slot_count;
}

/*
 * Starts (or restarts) timer slot `i` by recording the current time.
 *
 * An out-of-range slot index is reported on stderr and otherwise ignored
 * instead of writing outside the start_time array.
 */
void timer_start(int i) {
  if (!timer_slot_is_valid(i)) {
    fprintf(stderr, "timer_start: invalid timer index %d\n", i);
    return;
  }
  start_time[i] = get_time();
}

/*
 * Returns the number of seconds elapsed since timer slot `i` was last
 * started. The slot is not reset, so the function may be called repeatedly.
 *
 * An out-of-range slot index is reported on stderr and yields 0.0 instead
 * of reading outside the start_time array.
 */
double timer_stop(int i) {
  if (!timer_slot_is_valid(i)) {
    fprintf(stderr, "timer_stop: invalid timer index %d\n", i);
    return 0.0;
  }
  return get_time() - start_time[i];
}
|
||
|
|
||
|
/*
 * Validates a convolution result against a direct reference computation
 * and prints the verdict ("Result: VALID" / "Result: INVALID") to stdout.
 *
 * The indexing below treats the buffers as dense row-major arrays:
 *   input  : [N][C][H][W]
 *   filter : [K][C][R][S]
 *   output : [N][K][OH][OW]
 * where
 *   OH = (H + 2 * pad - dilation * (R - 1) - 1) / stride + 1
 *   OW = (W + 2 * pad - dilation * (S - 1) - 1) / stride + 1
 * Input positions that fall outside [0, H) x [0, W) contribute nothing
 * (i.e. zero padding).
 *
 * An output element is accepted when it is within 1e-3 of the reference
 * either in absolute terms or (for a non-zero reference) in relative terms.
 * The first 10 mismatches are printed individually.
 *
 * Parameters:
 *   input, output, filter - tensors described above; `output` is the result
 *                           under test and is only read.
 *   N, C, H, W            - input extents.
 *   K, R, S               - number of filters and filter height/width.
 *   pad, dilation, stride - convolution parameters applied to both spatial
 *                           axes.
 */
void check_convolution(float *input, float *output, float *filter,
                       int N, int C, int H, int W, int K, int R, int S,
                       int pad, int dilation, int stride) {
  printf("Validating...\n");

  int OH = (H + 2 * pad - dilation * (R - 1) - 1) / stride + 1;
  int OW = (W + 2 * pad - dilation * (S - 1) - 1) / stride + 1;

  float *O_ans;
  alloc_tensor(&O_ans, N, K, OH, OW);
  zero_tensor(O_ans, N, K, OH, OW);

  /* Reference convolution; the (n, k) iteration space is split across a
   * fixed team of 40 threads. Flat indices are built in size_t so large
   * tensors do not overflow int arithmetic. */
#pragma omp parallel for collapse(2) num_threads(40)
  for (int n = 0; n < N; ++n) {
    for (int k = 0; k < K; ++k) {
      for (int oh = 0; oh < OH; ++oh) {
        for (int ow = 0; ow < OW; ++ow) {
          float acc = 0.f;
          for (int c = 0; c < C; ++c) {
            for (int r = 0; r < R; ++r) {
              for (int s = 0; s < S; ++s) {
                int h = oh * stride - pad + r * dilation;
                int w = ow * stride - pad + s * dilation;
                /* Taps that land in the padding region add zero. */
                if (h < 0 || h >= H || w < 0 || w >= W) continue;
                size_t in_idx = (((size_t)n * C + c) * H + h) * W + w;
                size_t flt_idx = (((size_t)k * C + c) * R + r) * S + s;
                acc += input[in_idx] * filter[flt_idx];
              }
            }
          }
          O_ans[(((size_t)n * K + k) * OH + oh) * OW + ow] = acc;
        }
      }
    }
  }

  bool is_valid = true;
  int cnt = 0, thr = 10;
  const float eps = 1e-3f;
  for (int n = 0; n < N; ++n) {
    for (int k = 0; k < K; ++k) {
      for (int oh = 0; oh < OH; ++oh) {
        for (int ow = 0; ow < OW; ++ow) {
          size_t idx = (((size_t)n * K + k) * OH + oh) * OW + ow;
          float o = output[idx];
          float o_ans = O_ans[idx];

          /* Phrase the check as "is it within tolerance?" so that a NaN in
           * `o` (every comparison false) counts as a mismatch instead of
           * slipping through. Exact equality also covers matching
           * infinities, whose difference would be NaN. */
          bool within_tolerance =
              o == o_ans ||
              fabsf(o - o_ans) <= eps ||
              (o_ans != 0 && fabsf((o - o_ans) / o_ans) <= eps);
          if (within_tolerance) continue;

          ++cnt;
          if (cnt <= thr)
            printf("output[%d][%d][%d][%d] : correct_value = %f, your_value = %f\n", n, k, oh, ow, o_ans, o);
          if (cnt == thr + 1)
            printf("Too many error, only first %d values are printed.\n", thr);
          is_valid = false;
        }
      }
    }
  }

  /* The reference buffer is private to this function; release it. */
  free(O_ans);

  if (is_valid) {
    printf("Result: VALID\n");
  } else {
    printf("Result: INVALID\n");
  }
}
|
||
|
|
||
|
/*
 * Allocates a 32-byte-aligned buffer large enough for a D0 x D1 x D2 x D3
 * tensor of floats and stores its address in *t. The contents are
 * uninitialized. The buffer comes from aligned_alloc(), so it is released
 * with free().
 *
 * The requested byte count is rounded up to a multiple of the alignment
 * (aligned_alloc requires this in C11), and a tensor with zero elements
 * still receives a small valid allocation so that an empty tensor is not
 * mistaken for an out-of-memory condition.
 *
 * On failure (negative extent, size overflow, or out of memory) an error is
 * printed to stderr and the process exits with EXIT_FAILURE; the function
 * never returns with *t == NULL.
 */
void alloc_tensor(float **t, int D0, int D1, int D2, int D3) {
  const size_t alignment = 32;
  const size_t size_max = (size_t)-1;
  const int dims[4] = {D0, D1, D2, D3};

  /* Multiply the extents with overflow checking. */
  size_t count = 1;
  int representable = 1;
  for (int d = 0; d < 4; ++d) {
    if (dims[d] < 0) {
      representable = 0;
      break;
    }
    const size_t extent = (size_t)dims[d];
    if (extent != 0 && count > size_max / extent) {
      representable = 0;
      break;
    }
    count *= extent;
  }
  /* Leave headroom for the round-up to the alignment below. */
  if (representable && count > (size_max - (alignment - 1)) / sizeof(float)) {
    representable = 0;
  }

  *t = NULL;
  if (representable) {
    size_t bytes = count * sizeof(float);
    bytes = (bytes + alignment - 1) / alignment * alignment;
    if (bytes == 0) bytes = alignment;
    *t = (float *) aligned_alloc(alignment, bytes);
  }

  if (*t == NULL) {
    fprintf(stderr, "Failed to allocate memory for matrix.\n");
    exit(EXIT_FAILURE);
  }
}
|
||
|
|
||
|
/*
 * Fills a dense D0 x D1 x D2 x D3 float tensor with pseudo-random values
 * drawn from rand(), each mapped to the range [-0.5, 0.5].
 *
 * Elements are written in increasing linear order, one rand() call per
 * element, so the result depends on the current rand() seed. The linear
 * index is kept in size_t so that tensors with more than INT_MAX elements
 * do not overflow int arithmetic. Nothing is written when any extent is
 * zero or negative.
 */
void rand_tensor(float *m, int D0, int D1, int D2, int D3) {
  if (D0 <= 0 || D1 <= 0 || D2 <= 0 || D3 <= 0) {
    return;
  }

  const size_t total = (size_t)D0 * (size_t)D1 * (size_t)D2 * (size_t)D3;
  for (size_t idx = 0; idx < total; ++idx) {
    m[idx] = (float) rand() / RAND_MAX - 0.5;
  }
}
|
||
|
|
||
|
|
||
|
/*
 * Clears a dense D0 x D1 x D2 x D3 float tensor by setting every byte of
 * its storage to zero.
 */
void zero_tensor(float *t, int D0, int D1, int D2, int D3) {
  /* Accumulate the byte count one extent at a time, in size_t. */
  size_t n_bytes = sizeof(float);
  n_bytes *= D0;
  n_bytes *= D1;
  n_bytes *= D2;
  n_bytes *= D3;

  memset(t, 0, n_bytes);
}
|