#include "util.h"

#include <math.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>

/* Number of independent stopwatch slots usable with timer_start/timer_stop. */
enum { NUM_TIMERS = 8 };

/* Byte alignment requested for every tensor allocation. */
enum { TENSOR_ALIGNMENT = 32 };

/* Start timestamps (seconds), one per stopwatch slot. */
static double start_time[NUM_TIMERS];

/* Returns the current gettimeofday() reading as seconds in a double. */
static double get_time(void) {
  struct timeval tv;
  gettimeofday(&tv, NULL);
  return tv.tv_sec + tv.tv_usec * 1e-6;
}

/*
 * Records the current time as the start of stopwatch slot i.
 * Slots outside [0, NUM_TIMERS) are ignored instead of writing out of bounds.
 */
void timer_start(int i) {
  if (i < 0 || i >= NUM_TIMERS) {
    return;
  }
  start_time[i] = get_time();
}

/*
 * Returns the seconds elapsed since the last timer_start(i).
 * Returns 0.0 for a slot outside [0, NUM_TIMERS).
 */
double timer_stop(int i) {
  if (i < 0 || i >= NUM_TIMERS) {
    return 0.0;
  }
  return get_time() - start_time[i];
}

/*
 * Decides whether a computed value agrees with its reference value.
 *
 * Two values agree when they are equal, when both are NaN, or when either
 * their absolute error or (for a non-zero reference) their relative error is
 * at most eps. Every test is phrased as "error <= eps" so that a NaN error
 * counts as a mismatch; with "error > eps" a NaN would silently pass.
 */
static bool values_match(float got, float want, float eps) {
  if (got == want) {
    return true; /* also covers infinities of the same sign */
  }
  if (isnan(got) && isnan(want)) {
    return true;
  }
  if (fabsf(got - want) <= eps) {
    return true;
  }
  if (want == 0.0f) {
    return false; /* relative error is undefined against a zero reference */
  }
  return fabsf((got - want) / want) <= eps;
}

/*
 * Validates a convolution result against a straightforward reference
 * implementation and prints the verdict to stdout.
 *
 * Indexing used by this function:
 *   input  is read  as [N][C][H][W]
 *   filter is read  as [K][C][R][S]
 *   output is read  as [N][K][OH][OW]
 * where OH and OW are derived from H/W, pad, dilation and stride below.
 * Input positions that fall outside the H x W image contribute nothing
 * (i.e. the padding is treated as zeros).
 *
 * stride must be non-zero (it is used as a divisor).
 *
 * Prints up to 10 mismatching elements, then "Result: VALID" or
 * "Result: INVALID".
 */
void check_convolution(float *input, float *output, float *filter, int N, int C,
                       int H, int W, int K, int R, int S, int pad, int dilation,
                       int stride) {
  printf("Validating...\n");

  const int OH = (H + 2 * pad - dilation * (R - 1) - 1) / stride + 1;
  const int OW = (W + 2 * pad - dilation * (S - 1) - 1) / stride + 1;

  float *O_ans;
  alloc_tensor(&O_ans, N, K, OH, OW);
  zero_tensor(O_ans, N, K, OH, OW);

  /* Reference result. Offsets are computed in size_t so that large tensors
   * do not overflow int arithmetic. */
#pragma omp parallel for collapse(2)
  for (int n = 0; n < N; ++n) {
    for (int k = 0; k < K; ++k) {
      for (int oh = 0; oh < OH; ++oh) {
        for (int ow = 0; ow < OW; ++ow) {
          float acc = 0.f;
          for (int c = 0; c < C; ++c) {
            for (int r = 0; r < R; ++r) {
              for (int s = 0; s < S; ++s) {
                const int h = oh * stride - pad + r * dilation;
                const int w = ow * stride - pad + s * dilation;
                if (h < 0 || h >= H || w < 0 || w >= W) {
                  continue; /* padding region */
                }
                const size_t in_idx =
                    (((size_t) n * C + c) * H + h) * W + w;
                const size_t flt_idx =
                    (((size_t) k * C + c) * R + r) * S + s;
                acc += input[in_idx] * filter[flt_idx];
              }
            }
          }
          const size_t out_idx = (((size_t) n * K + k) * OH + oh) * OW + ow;
          O_ans[out_idx] = acc;
        }
      }
    }
  }

  size_t mismatches = 0;
  const int max_reported = 10;
  const float eps = 1e-3f;

  /* The loops visit elements in memory order, so a running flat index is
   * equivalent to recomputing the 4-D offset each time. */
  size_t idx = 0;
  for (int n = 0; n < N; ++n) {
    for (int k = 0; k < K; ++k) {
      for (int oh = 0; oh < OH; ++oh) {
        for (int ow = 0; ow < OW; ++ow, ++idx) {
          const float o = output[idx];
          const float o_ans = O_ans[idx];
          if (values_match(o, o_ans, eps)) {
            continue;
          }
          ++mismatches;
          if (mismatches <= (size_t) max_reported) {
            printf("output[%d][%d][%d][%d] : correct_value = %f, your_value = %f\n",
                   n, k, oh, ow, o_ans, o);
          }
          if (mismatches == (size_t) max_reported + 1) {
            printf("Too many error, only first %d values are printed.\n",
                   max_reported);
          }
        }
      }
    }
  }

  if (mismatches == 0) {
    printf("Result: VALID\n");
  } else {
    printf("Result: INVALID\n");
  }
  fflush(stdout);

  /* The reference buffer is private to this function; release it. */
  free(O_ans);
}

/* Reports an allocation failure and terminates with a failing exit status. */
static void alloc_failure(void) {
  printf("Failed to allocate memory for matrix.\n");
  exit(EXIT_FAILURE);
}

/*
 * Allocates a 32-byte-aligned buffer of D0*D1*D2*D3 floats and stores it in
 * *t. The contents are uninitialized. The buffer is released with free().
 *
 * Terminates the program (after printing a message) if a dimension is
 * negative, the byte count overflows size_t, or the allocation fails.
 */
void alloc_tensor(float **t, int D0, int D1, int D2, int D3) {
  const int dims[4] = {D0, D1, D2, D3};
  size_t bytes = sizeof(float);

  for (int d = 0; d < 4; ++d) {
    if (dims[d] < 0 ||
        (dims[d] > 0 && bytes > SIZE_MAX / (size_t) dims[d])) {
      alloc_failure();
    }
    bytes *= (size_t) dims[d];
  }

  /* C11 requires the size passed to aligned_alloc to be a multiple of the
   * alignment, so round up; an empty tensor still gets one aligned unit so
   * that a NULL return always means failure. */
  if (bytes > SIZE_MAX - (TENSOR_ALIGNMENT - 1)) {
    alloc_failure();
  }
  bytes = (bytes + (TENSOR_ALIGNMENT - 1)) / TENSOR_ALIGNMENT * TENSOR_ALIGNMENT;
  if (bytes == 0) {
    bytes = TENSOR_ALIGNMENT;
  }

  *t = (float *) aligned_alloc(TENSOR_ALIGNMENT, bytes);
  if (*t == NULL) {
    alloc_failure();
  }
}

/*
 * Fills a D0 x D1 x D2 x D3 tensor with pseudo-random values derived from
 * rand(), each computed as rand() / RAND_MAX - 0.5.
 *
 * Only the first D1*D2*D3 slice is generated; it is then copied into each of
 * the remaining D0-1 slices, so all slices along D0 are identical.
 * Does nothing if any dimension is not positive.
 */
void rand_tensor(float *m, int D0, int D1, int D2, int D3) {
  if (D0 <= 0 || D1 <= 0 || D2 <= 0 || D3 <= 0) {
    return;
  }

  const size_t slice = (size_t) D1 * (size_t) D2 * (size_t) D3;

  /* Filled serially: rand() keeps hidden global state and is not required
   * to be thread-safe, so it must not be called from a parallel loop. */
  for (size_t e = 0; e < slice; ++e) {
    m[e] = (float) rand() / RAND_MAX - 0.5;
  }

  for (int i = 1; i < D0; i++) {
    memcpy(m + (size_t) i * slice, m, sizeof(float) * slice);
  }
}

/* Sets every byte of a D0 x D1 x D2 x D3 float tensor to zero. */
void zero_tensor(float *t, int D0, int D1, int D2, int D3) {
  memset(t, 0, sizeof(float) * D0 * D1 * D2 * D3);
}