Previous The barycentre_intrinsics.h file |
Parent The intrinsics version of barycentre |
Outline | Next The CMakeLists.txt file |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 |
#include <immintrin.h> #include "barycentre_intrinsics.h" ///Compute the barycentre of the x,y,a tables /** @param[out] gx : barycentre on X axis * @param[out] gy : barycentre on Y axis * @param tabX : coordinates on X axis * @param tabY : coordinates on Y axis * @param tabA : signal amplitude * @param nbElement : number of elements of the input tables */ void barycentre(float & gx, float & gy, const float * tabX, const float* tabY, const float* tabA, long unsigned int nbElement){ gx = 0.0f; gy = 0.0f; long unsigned int vecSize(VECTOR_ALIGNEMENT/sizeof(float)); long unsigned int nbVec(nbElement/(vecSize*2lu)); __m256 vecResX1 = _mm256_broadcast_ss(&gx); __m256 vecResX2 = _mm256_broadcast_ss(&gx); __m256 vecResY1 = _mm256_broadcast_ss(&gy); __m256 vecResY2 = _mm256_broadcast_ss(&gy); for(long unsigned int i(0lu); i < nbVec; ++i){ __m256 vecValueA1 = _mm256_load_ps(tabA + 2lu*i*vecSize); __m256 vecValueX1 = _mm256_load_ps(tabX + 2lu*i*vecSize); __m256 vecAX1 = _mm256_mul_ps(vecValueA1, vecValueX1); vecResX1 = _mm256_add_ps(vecResX1, vecAX1); __m256 vecValueY1 = _mm256_load_ps(tabY + 2lu*i*vecSize); __m256 vecAY1 = _mm256_mul_ps(vecValueA1, vecValueY1); vecResY1 = _mm256_add_ps(vecResY1, vecAY1); __m256 vecValueA2 = _mm256_load_ps(tabA + (2lu*i + 1lu)*vecSize); __m256 vecValueX2 = _mm256_load_ps(tabX + (2lu*i + 1lu)*vecSize); __m256 vecAX2 = _mm256_mul_ps(vecValueA2, vecValueX2); vecResX2 = _mm256_add_ps(vecResX2, vecAX2); __m256 vecValueY2 = _mm256_load_ps(tabY + (2lu*i + 1lu)*vecSize); __m256 vecAY2 = _mm256_mul_ps(vecValueA2, vecValueY2); vecResY2 = _mm256_add_ps(vecResY2, vecAY2); } __m256 vecResX = _mm256_add_ps(vecResX1, vecResX2); float tmp[8lu]; _mm256_storeu_ps(tmp, vecResX); for(long unsigned int i(0lu); i < 8lu; ++i){ gx += tmp[i]; } gx /= (float)nbElement; __m256 vecResY = _mm256_add_ps(vecResY1, vecResY2); _mm256_storeu_ps(tmp, vecResY); for(long unsigned int i(0lu); i < 8lu; ++i){ gy += tmp[i]; } gy /= (float)nbElement; } |
Previous The barycentre_intrinsics.h file |
Parent The intrinsics version of barycentre |
Outline | Next The CMakeLists.txt file |