9.4.2 : The barycentre_intrinsics.cpp file

The barycentre_intrinsics.cpp file :
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
#include <immintrin.h>
#include "barycentre_intrinsics.h"

///Compute the barycentre of the x,y,a tables
/**	@param[out] gx : barycentre on X axis
 * 	@param[out] gy : barycentre on Y axis
 * 	@param tabX : coordinates on X axis
 * 	@param tabY : coordinates on Y axis
 * 	@param tabA : signal amplitude
 * 	@param nbElement : number of elements of the input tables
*/
void barycentre(float & gx, float & gy, const float * tabX, const float* tabY, const float* tabA, long unsigned int nbElement){
	gx = 0.0f;
	gy = 0.0f;
	long unsigned int vecSize(VECTOR_ALIGNEMENT/sizeof(float));
	long unsigned int nbVec(nbElement/(vecSize*2lu));
	
	__m256 vecResX1 = _mm256_broadcast_ss(&gx);
	__m256 vecResX2 = _mm256_broadcast_ss(&gx);
	
	__m256 vecResY1 = _mm256_broadcast_ss(&gy);
	__m256 vecResY2 = _mm256_broadcast_ss(&gy);
	
	for(long unsigned int i(0lu); i < nbVec; ++i){
		
		__m256 vecValueA1 = _mm256_load_ps(tabA + 2lu*i*vecSize);
		__m256 vecValueX1 = _mm256_load_ps(tabX + 2lu*i*vecSize);
		__m256 vecAX1 = _mm256_mul_ps(vecValueA1, vecValueX1);
		vecResX1 = _mm256_add_ps(vecResX1, vecAX1);
		
		__m256 vecValueY1 = _mm256_load_ps(tabY + 2lu*i*vecSize);
		__m256 vecAY1 = _mm256_mul_ps(vecValueA1, vecValueY1);
		vecResY1 = _mm256_add_ps(vecResY1, vecAY1);
		
		
		__m256 vecValueA2 = _mm256_load_ps(tabA + (2lu*i + 1lu)*vecSize);
		__m256 vecValueX2 = _mm256_load_ps(tabX + (2lu*i + 1lu)*vecSize);
		__m256 vecAX2 = _mm256_mul_ps(vecValueA2, vecValueX2);
		vecResX2 = _mm256_add_ps(vecResX2, vecAX2);
		
		__m256 vecValueY2 = _mm256_load_ps(tabY + (2lu*i + 1lu)*vecSize);
		__m256 vecAY2 = _mm256_mul_ps(vecValueA2, vecValueY2);
		vecResY2 = _mm256_add_ps(vecResY2, vecAY2);
	}
	
	__m256 vecResX = _mm256_add_ps(vecResX1, vecResX2);
	float tmp[8lu];
	_mm256_storeu_ps(tmp, vecResX);
	for(long unsigned int i(0lu); i < 8lu; ++i){
		gx += tmp[i];
	}
	gx /= (float)nbElement;
	
	__m256 vecResY = _mm256_add_ps(vecResY1, vecResY2);
	_mm256_storeu_ps(tmp, vecResY);
	for(long unsigned int i(0lu); i < 8lu; ++i){
		gy += tmp[i];
	}
	gy /= (float)nbElement;
}