2022-10-04 13:56:31 +09:00
|
|
|
#include <immintrin.h>
|
|
|
|
#include <math.h>
|
|
|
|
|
|
|
|
/**
 * Scalar reference implementation of the dot product.
 *
 * Walks both vectors front to back and accumulates the products in a single
 * float, so the additions happen in index order.
 *
 * @param A  first input vector, at least N readable elements
 * @param B  second input vector, at least N readable elements
 * @param N  number of elements to combine; N <= 0 yields 0
 * @return   sum of A[i] * B[i] for i in [0, N)
 */
float vectordot_naive(float *A, float *B, int N) {
  const float *lhs = A;
  const float *rhs = B;
  float sum = 0.f;

  /* Count down the remaining elements while advancing both cursors. */
  for (int left = N; left > 0; --left) {
    sum += *lhs * *rhs;
    ++lhs;
    ++rhs;
  }

  return sum;
}
|
|
|
|
|
|
|
|
/**
 * Dot product of two float vectors using AVX/FMA intrinsics.
 *
 * The first N - N % 8 elements are consumed eight at a time with a fused
 * multiply-add into an 8-lane accumulator. The remaining N % 8 elements are
 * folded into a separate scalar accumulator, also with a fused multiply-add.
 * Finally the eight lanes are summed left to right and added to the scalar
 * part.
 *
 * A and B may have any alignment. The executing CPU must support AVX and FMA.
 *
 * @param A  first input vector, at least N readable elements
 * @param B  second input vector, at least N readable elements
 * @param N  number of elements to combine; N <= 0 yields 0
 * @return   sum of A[i] * B[i] for i in [0, N)
 */
#if defined(__GNUC__) || defined(__clang__)
/* Lets this function use AVX/FMA even if the file is built without
 * -mavx -mfma on the command line. */
__attribute__((target("avx,fma")))
#endif
float vectordot_fma(float *A, float *B, int N) {
  if (N <= 0) {
    return 0.f;
  }

  /* Index of the first element that does not fit in a full 8-wide step. */
  const int vec_end = N - N % 8;

  __m256 acc = _mm256_setzero_ps();
  for (int i = 0; i < vec_end; i += 8) {
    /* Unaligned loads: nothing in the signature promises 32-byte alignment,
     * and the aligned form faults on misaligned addresses. */
    const __m256 a = _mm256_loadu_ps(&A[i]);
    const __m256 b = _mm256_loadu_ps(&B[i]);
    acc = _mm256_fmadd_ps(a, b, acc);
  }

  /* Leftover elements: scalar fused multiply-add in lane 0 (single rounding,
   * same result as fmaf, without needing libm at link time). */
  __m128 tail = _mm_setzero_ps();
  for (int i = vec_end; i < N; ++i) {
    tail = _mm_fmadd_ss(_mm_set_ss(A[i]), _mm_set_ss(B[i]), tail);
  }

  /* Spill the accumulator through a store instead of casting &acc to
   * float *, which is an incompatible pointer conversion. */
  float lanes[8];
  _mm256_storeu_ps(lanes, acc);

  const float lane_sum = lanes[0] + lanes[1] + lanes[2] + lanes[3] +
                         lanes[4] + lanes[5] + lanes[6] + lanes[7];

  return _mm_cvtss_f32(tail) + lane_sum;
}
|