#include "convolution.h" #include #include ///추가 #include #include #include #include #include #include #include #include #include #include "util.h" #include #include static float *input, *output, *filter; static int N, C, H, W; static int K, R, S; static int OH, OW; static int pad; static int dilation; static int stride; static int mpi_rank, mpi_world_size; static int tag=0; // 추가 static int num_threads = 100; static int N_nums[2] = {0,0}; static int N_offset[2] = {0,0}; // 추가 int omp_get_thread_num(void); int omp_get_num_threads(void); //static double elapsed_time; //#define Timer_ID 1 void convolution( float *_input, float *_output, float *_filter, int _N, int _C, int _H, int _W, int _K, int _R, int _S, int _pad, int _dilation, int _stride) { input = _input; output = _output; filter = _filter; N =_N; C =_C; H =_H; W=_W; K=_K; R=_R; S=_S; pad=_pad; dilation=_dilation; stride=_stride; OH = (H + 2 * pad - dilation * (R - 1) - 1) / stride + 1; OW = (W + 2 * pad - dilation * (S - 1) - 1) / stride + 1; MPI_Request request; MPI_Status status; int N_num = N/mpi_world_size; // 노드별 N 개수 정의 for(int i=0; i= H || w < 0 || w >= W) continue; input_temp[0] = input[h_re + (ow_re +(s+0) * dilation)]; input_temp[1] = input[h_re + (ow_re + (s+1) * dilation)]; input_temp[2] = input[h_re + (ow_re + (s+2) * dilation)]; input_temp[3] = input[h_re + (ow_re + (s+3) * dilation)]; input_temp[4] = input[h_re + (ow_re + (s+4) * dilation)]; input_temp[5] = input[h_re + (ow_re + (s+5) * dilation)]; input_temp[6] = input[h_re + (ow_re + (s+6) * dilation)]; input_temp[7] = input[h_re + (ow_re + (s+7) * dilation)]; input_temp[8] = input[h_re + (ow_re + (s+8)* dilation)]; input_temp[9] = input[h_re + (ow_re + (s+9) * dilation)]; input_temp[10] = input[h_re + (ow_re + (s+10) * dilation)]; input_temp[11] = input[h_re + (ow_re + (s+11) * dilation)]; input_temp[12] = input[h_re + (ow_re + (s+12) * dilation)]; input_temp[13] = input[h_re + (ow_re + (s+13) * dilation)]; input_temp[14] = input[h_re + (ow_re + (s+14) * dilation)]; input_temp[15] = input[h_re + (ow_re + (s+15) * dilation)]; __m512 i0 = _mm512_loadu_ps(input_temp); __m512 f0 = _mm512_loadu_ps(&filter[fil_re + (s+0)]); o_result = _mm512_fmadd_ps(i0, f0, o_result); } /* float o_result_sum = 0.0f; int tt_end; if(S%16 == 0) tt_end=16; else tt_end=S%16; for(int tt=0; tt= H || w < 0 || w >= W) continue; float i = input[n * C * H * W + c * H * W + h * W + w]; float f = filter[k * C * R * S + c * R * S + r * S + s]; o += i * f; } output[n * K * OH * OW + k * OH * OW + oh * OW + ow] = o; } } } } } } } // convolution 수행 끝 // elapsed_time = timer_stop(Timer_ID); // printf("4. 계산 시간 %f sec\n", elapsed_time); //timer_start(Timer_ID); // 계산결과 output 전송/수신 if(mpi_rank != 0) { MPI_Isend(output, (N_nums[mpi_rank]*OH*OW*K), MPI_FLOAT, 0, 0, MPI_COMM_WORLD, &request); } else { for(int i=1; i