Previous The sgemm_intrinsics_pitch.h file |
Parent Intrinsics implementation with a pitch |
Outline | Next The main_sgemm_intrinsics_pitch.cpp file |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 |
#include <immintrin.h> #include <string.h> #include "sgemm_intrinsics_pitch.h" ///Compute the Matrix-Matrix product of the x,y matrices /** @param[out] matOut : result * @param matX : left matrix * @param matY : right matrix * @param size : size of the square matrices * @param pitch : pitch of the matrix */ void sgemm(float* matOut, const float* matX, const float* matY, long unsigned int size, long unsigned int pitch){ long unsigned int colSize(size + pitch); memset(matOut, 0, sizeof(float)*colSize*size); long unsigned int vecSize(VECTOR_ALIGNEMENT/sizeof(float)); long unsigned int nbVec(colSize/vecSize); for(long unsigned int i(0lu); i < size; ++i){ //Iterate over X rows const float * rowX = matX + i*colSize; //Get current X row float * rowOut = matOut + i*colSize; //Get current out row for(long unsigned int k(0lu); k < size; ++k){ //Part of dot product __m256 regX = _mm256_broadcast_ss(rowX + k); const float* rowY = matY + k*colSize; //Get current Y row for(long unsigned int j(0lu); j < nbVec; ++j){ __m256 regY = _mm256_load_ps(rowY + vecSize*j); __m256 regRes = _mm256_load_ps(rowOut + vecSize*j); regRes = _mm256_add_ps(regRes, _mm256_mul_ps(regX, regY)); _mm256_store_ps(rowOut + vecSize*j, regRes); } } } } |
Previous The sgemm_intrinsics_pitch.h file |
Parent Intrinsics implementation with a pitch |
Outline | Next The main_sgemm_intrinsics_pitch.cpp file |