Previous The reduction_intrinsics_interleave8.h file |
Parent Interleaving 8 times |
Outline | Next The main_intrinsics_interleave8.cpp file |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 |
#include <immintrin.h> #include "reduction_intrinsics_interleave8.h" ///Do the Reduction /** @param ptabValue : input table * @param nbElement : number of elements in the input table * @return sum of all the elements of the input table */ float reduction(const float * tabValue, long unsigned int nbElement){ long unsigned int vecSize(VECTOR_ALIGNEMENT/sizeof(float)); long unsigned int nbVec(nbElement/(vecSize*8lu)); float res(0.0f); __m256 vecRes1 = _mm256_broadcast_ss(&res); __m256 vecRes2 = _mm256_broadcast_ss(&res); __m256 vecRes3 = _mm256_broadcast_ss(&res); __m256 vecRes4 = _mm256_broadcast_ss(&res); __m256 vecRes5 = _mm256_broadcast_ss(&res); __m256 vecRes6 = _mm256_broadcast_ss(&res); __m256 vecRes7 = _mm256_broadcast_ss(&res); __m256 vecRes8 = _mm256_broadcast_ss(&res); for(long unsigned int i(0lu); i < nbVec; ++i){ __m256 vecValue1 = _mm256_load_ps(tabValue + 8lu*i*vecSize); vecRes1 = _mm256_add_ps(vecRes1, vecValue1); __m256 vecValue2 = _mm256_load_ps(tabValue + (8lu*i + 1lu)*vecSize); vecRes2 = _mm256_add_ps(vecRes2, vecValue2); __m256 vecValue3 = _mm256_load_ps(tabValue + (8lu*i + 2lu)*vecSize); vecRes3 = _mm256_add_ps(vecRes3, vecValue3); __m256 vecValue4 = _mm256_load_ps(tabValue + (8lu*i + 3lu)*vecSize); vecRes4 = _mm256_add_ps(vecRes4, vecValue4); __m256 vecValue5 = _mm256_load_ps(tabValue + (8lu*i + 4lu)*vecSize); vecRes5 = _mm256_add_ps(vecRes5, vecValue5); __m256 vecValue6 = _mm256_load_ps(tabValue + (8lu*i + 5lu)*vecSize); vecRes6 = _mm256_add_ps(vecRes6, vecValue6); __m256 vecValue7 = _mm256_load_ps(tabValue + (8lu*i + 6lu)*vecSize); vecRes7 = _mm256_add_ps(vecRes7, vecValue7); __m256 vecValue8 = _mm256_load_ps(tabValue + (8lu*i + 7lu)*vecSize); vecRes8 = _mm256_add_ps(vecRes8, vecValue8); } __m256 vecRes12 = _mm256_add_ps(vecRes1, vecRes2); __m256 vecRes34 = _mm256_add_ps(vecRes3, vecRes4); __m256 vecRes56 = _mm256_add_ps(vecRes5, vecRes6); __m256 vecRes78 = _mm256_add_ps(vecRes7, vecRes8); __m256 vecRes1234 = _mm256_add_ps(vecRes12, vecRes34); __m256 vecRes5678 = _mm256_add_ps(vecRes56, vecRes78); __m256 vecRes = _mm256_add_ps(vecRes1234, vecRes5678); float tmp[8lu]; _mm256_storeu_ps(tmp, vecRes); for(long unsigned int i(0lu); i < 8lu; ++i){ res += tmp[i]; } return res; } |
Previous The reduction_intrinsics_interleave8.h file |
Parent Interleaving 8 times |
Outline | Next The main_intrinsics_interleave8.cpp file |