8.5.3.2 : The reduction_intrinsics_interleave8.cpp file

Here is the content of the reduction_intrinsics_interleave8.cpp file:
#include <immintrin.h>
#include "reduction_intrinsics_interleave8.h"

///Do the Reduction
/**	@param ptabValue : input table
 * 	@param nbElement : number of elements in the input table
 * 	@return sum of all the elements of the input table
*/
float reduction(const float * tabValue, long unsigned int nbElement){
	long unsigned int vecSize(VECTOR_ALIGNEMENT/sizeof(float));
	long unsigned int nbVec(nbElement/(vecSize*8lu));
	float res(0.0f);
	__m256 vecRes1 = _mm256_broadcast_ss(&res);
	__m256 vecRes2 = _mm256_broadcast_ss(&res);
	__m256 vecRes3 = _mm256_broadcast_ss(&res);
	__m256 vecRes4 = _mm256_broadcast_ss(&res);
	
	__m256 vecRes5 = _mm256_broadcast_ss(&res);
	__m256 vecRes6 = _mm256_broadcast_ss(&res);
	__m256 vecRes7 = _mm256_broadcast_ss(&res);
	__m256 vecRes8 = _mm256_broadcast_ss(&res);
	
	for(long unsigned int i(0lu); i < nbVec; ++i){
		__m256 vecValue1 = _mm256_load_ps(tabValue + 8lu*i*vecSize);
		vecRes1 = _mm256_add_ps(vecRes1, vecValue1);
		__m256 vecValue2 = _mm256_load_ps(tabValue + (8lu*i + 1lu)*vecSize);
		vecRes2 = _mm256_add_ps(vecRes2, vecValue2);
		__m256 vecValue3 = _mm256_load_ps(tabValue + (8lu*i + 2lu)*vecSize);
		vecRes3 = _mm256_add_ps(vecRes3, vecValue3);
		__m256 vecValue4 = _mm256_load_ps(tabValue + (8lu*i + 3lu)*vecSize);
		vecRes4 = _mm256_add_ps(vecRes4, vecValue4);
		
		__m256 vecValue5 = _mm256_load_ps(tabValue + (8lu*i + 4lu)*vecSize);
		vecRes5 = _mm256_add_ps(vecRes5, vecValue5);
		__m256 vecValue6 = _mm256_load_ps(tabValue + (8lu*i + 5lu)*vecSize);
		vecRes6 = _mm256_add_ps(vecRes6, vecValue6);
		__m256 vecValue7 = _mm256_load_ps(tabValue + (8lu*i + 6lu)*vecSize);
		vecRes7 = _mm256_add_ps(vecRes7, vecValue7);
		__m256 vecValue8 = _mm256_load_ps(tabValue + (8lu*i + 7lu)*vecSize);
		vecRes8 = _mm256_add_ps(vecRes8, vecValue8);
		
	}
	__m256 vecRes12 = _mm256_add_ps(vecRes1, vecRes2);
	__m256 vecRes34 = _mm256_add_ps(vecRes3, vecRes4);
	__m256 vecRes56 = _mm256_add_ps(vecRes5, vecRes6);
	__m256 vecRes78 = _mm256_add_ps(vecRes7, vecRes8);
	
	__m256 vecRes1234 = _mm256_add_ps(vecRes12, vecRes34);
	__m256 vecRes5678 = _mm256_add_ps(vecRes56, vecRes78);
	
	__m256 vecRes = _mm256_add_ps(vecRes1234, vecRes5678);
	float tmp[8lu];
	_mm256_storeu_ps(tmp, vecRes);
	for(long unsigned int i(0lu); i < 8lu; ++i){
		res += tmp[i];
	}
	return res;
}