10.6.2 : The sgemm_intrinsics_pitch.cpp file

The sgemm_intrinsics_pitch.cpp file:
#include <immintrin.h>

#include <string.h>
#include "sgemm_intrinsics_pitch.h"

///Single precision product of two square matrices stored with padded rows (matOut = matX * matY)
/**	@param[out] matOut : result matrix, zeroed then accumulated in place
 * 	@param matX : left operand
 * 	@param matY : right operand
 * 	@param size : number of rows of each matrix (also the length of the dot products)
 * 	@param pitch : number of floats added to size to obtain the row stride (size + pitch)
 * 
 * 	All three matrices are addressed with the same row stride of (size + pitch) floats.
 * 	Each row is processed by packs of VECTOR_ALIGNEMENT/sizeof(float) floats over the whole
 * 	stride, so the padding columns of matOut are written too.
 * 	_mm256_load_ps and _mm256_store_ps are the aligned variants : every row of matY and matOut
 * 	has to start on a 32 bytes boundary.
 * 	NOTE(review) : assumes VECTOR_ALIGNEMENT/sizeof(float) is 8 (the number of floats in a __m256)
 * 	and that (size + pitch) is a multiple of it, otherwise some columns are skipped -- confirm against the header and the callers
*/
void sgemm(float* matOut, const float* matX, const float* matY, long unsigned int size, long unsigned int pitch){
	const long unsigned int rowStride = size + pitch;						//Floats between two consecutive rows
	const long unsigned int lanes = VECTOR_ALIGNEMENT/sizeof(float);		//Floats handled per vector step
	const long unsigned int packedLen = (rowStride/lanes)*lanes;			//Part of a row covered by full vectors
	
	//The result is used as the accumulator, so it has to start from zero
	memset(matOut, 0, sizeof(float)*rowStride*size);
	
	const float * xRow = matX;
	float * outRow = matOut;
	for(long unsigned int row = 0lu; row < size; ++row, xRow += rowStride, outRow += rowStride){
		const float * yRow = matY;
		for(long unsigned int depth = 0lu; depth < size; ++depth, yRow += rowStride){
			//Same X coefficient in every lane : it scales the whole current Y row
			const __m256 xCoef = _mm256_broadcast_ss(xRow + depth);
			
			for(long unsigned int col = 0lu; col < packedLen; col += lanes){
				const __m256 yVals = _mm256_load_ps(yRow + col);
				const __m256 partial = _mm256_load_ps(outRow + col);
				//out[row][col..] += X[row][depth] * Y[depth][col..]
				_mm256_store_ps(outRow + col, _mm256_add_ps(partial, _mm256_mul_ps(xCoef, yVals)));
			}
		}
	}
}