/* chundoong-lab-ta/SHPC2022/hw2/vectordot/vectordot.c -- naive and AVX/FMA dot-product kernels. */

#include <immintrin.h>
#include <math.h>
/**
 * Scalar reference dot product.
 *
 * Accumulates A[i] * B[i] for i = 0 .. N-1 into a single float, strictly in
 * index order.
 *
 * @param A  first input vector; at least N floats are read
 * @param B  second input vector; at least N floats are read
 * @param N  number of elements to process; N <= 0 yields 0.0f
 * @return   the accumulated sum of products
 */
float vectordot_naive(float *A, float *B, int N) {
  float c = 0.f;

  for (int i = 0; i < N; ++i) {
    c += A[i] * B[i];
  }

  return c;
}
/**
 * Dot product of two float vectors using 256-bit AVX fused multiply-add.
 *
 * Full groups of 8 floats are accumulated lane-wise with _mm256_fmadd_ps and
 * then reduced horizontally (8 -> 4 -> 2 -> 1 lanes). Any remaining N % 8
 * elements are added by a scalar loop, so no memory beyond index N-1 is read.
 * Loads are unaligned, so A and B need no particular alignment.
 *
 * @param A  first input vector; exactly N floats are read
 * @param B  second input vector; exactly N floats are read
 * @param N  number of elements to process; N <= 0 yields 0.0f
 * @return   the accumulated sum of products
 */
#if defined(__GNUC__)
/* Lets GCC/Clang build this function for AVX+FMA even when the translation
 * unit is compiled without -mavx -mfma; redundant when those flags are set. */
__attribute__((target("avx,fma")))
#endif
float vectordot_fma(float *A, float *B, int N) {
  if (N <= 0) {
    return 0.f;
  }

  /* Last index (exclusive) covered by whole 8-float groups. */
  const int vec_end = N - (N % 8);

  __m256 acc = _mm256_setzero_ps();
  int i = 0;
  for (; i < vec_end; i += 8) {
    /* loadu: callers are not required to supply 32-byte-aligned pointers. */
    const __m256 a8 = _mm256_loadu_ps(A + i);
    const __m256 b8 = _mm256_loadu_ps(B + i);
    acc = _mm256_fmadd_ps(a8, b8, acc);
  }

  /* Horizontal reduction of the 8 partial sums: 8 -> 4 lanes. */
  const __m128 upper4 = _mm256_extractf128_ps(acc, 1);
  const __m128 lower4 = _mm256_castps256_ps128(acc);
  const __m128 sum4 = _mm_add_ps(lower4, upper4);
  /* 4 -> 2 lanes: add the high pair onto the low pair. */
  const __m128 sum2 = _mm_add_ps(sum4, _mm_movehl_ps(sum4, sum4));
  /* 2 -> 1 lane: bring lane 1 down to lane 0 and add. */
  const __m128 sum1 = _mm_add_ss(sum2, _mm_shuffle_ps(sum2, sum2, 0x1));
  float c = _mm_cvtss_f32(sum1);

  /* Scalar tail for the N % 8 elements the vector loop did not cover. */
  for (; i < N; ++i) {
    c += A[i] * B[i];
  }

  return c;
}