SSE3 adds the capability to work horizontally within a register, which facilitates complex arithmetic. These algorithms can be found in the Intel Optimization Reference Manual, but here they are written with intrinsics instead of assembly language.
/*
 * Multiply four single-precision complex numbers at once using SSE2.
 *
 * ab, cd: each points to two __m128 registers holding four complex values
 *         in interleaved (real, imag) layout: {a0,b0,a1,b1} / {a2,b2,a3,b3}.
 * ef:     receives the four products ab[i]*cd[i] in the same layout.
 *
 * Uses (a+bi)(c+di) = (ac-bd) + (ad+bc)i: the shuffles de-interleave the
 * real and imaginary parts so all four products are formed in parallel,
 * then the unpacks re-interleave the results.
 *
 * Fix: the original declaration had no return type (implicit int, invalid
 * in C99/C11 and the function returns nothing) — it must be `void`.
 */
inline static void intrin_sse2_cmul(__m128 *ab, __m128 *cd, __m128 *ef)
{
    __m128 aa, bb, cc, dd, ac, ad, bd, bc, ee, ff;
    /* De-interleave: aa/cc = the four real parts, bb/dd = the four
       imaginary parts. */
    aa = _mm_shuffle_ps(ab[0], ab[1], _MM_SHUFFLE(2,0,2,0));
    bb = _mm_shuffle_ps(ab[0], ab[1], _MM_SHUFFLE(3,1,3,1));
    cc = _mm_shuffle_ps(cd[0], cd[1], _MM_SHUFFLE(2,0,2,0));
    dd = _mm_shuffle_ps(cd[0], cd[1], _MM_SHUFFLE(3,1,3,1));
    /* All four cross products. */
    ac = _mm_mul_ps(aa, cc);
    ad = _mm_mul_ps(aa, dd);
    bd = _mm_mul_ps(bb, dd);
    bc = _mm_mul_ps(bb, cc);
    ee = _mm_sub_ps(ac, bd);   /* real parts: ac - bd */
    ff = _mm_add_ps(bc, ad);   /* imag parts: bc + ad */
    /* Re-interleave back to (real, imag) pairs. */
    ef[0] = _mm_unpacklo_ps(ee, ff);
    ef[1] = _mm_unpackhi_ps(ee, ff);
}
To be continued — I will explain these later.
I am trying to use SyntaxHighlighter, but there seems to be a bug in the current version. Bad luck for me.
inline static __m128 intrin_sse3_cmul(__m128 ab, __m128 cd)
{
__m128 aa, bb, dc, x0, x1;
aa = _mm_moveldup_ps(ab);
bb = _mm_movehdup_ps(ab);
x0 = _mm_mul_ps(aa, cd); //ac ad
dc = _mm_shuffle_ps(cd, cd, _MM_SHUFFLE(2,3,0,1));
x1 = _mm_mul_ps(bb, dc); //bd bc
return _mm_addsub_ps(x0, x1);
}
To be continued.
// Element-wise complex multiply: ef[i] = ab[i] * cd[i] for i in [0, size).
// size must be a multiple of 4, and all three buffers must be 16-byte
// aligned because the aligned _mm_load_ps/_mm_store_ps forms are used.
void sse2_cmul(complex float *ab, complex float *cd, complex float *ef, int size)
{
    __m128 AB[2], CD[2], EF[2];
    // Process four interleaved complex values (two registers) per pass.
    for (int idx = 0; idx < size; idx += 4) {
        AB[0] = _mm_load_ps((float *)(ab + idx));
        AB[1] = _mm_load_ps((float *)(ab + idx + 2));
        CD[0] = _mm_load_ps((float *)(cd + idx));
        CD[1] = _mm_load_ps((float *)(cd + idx + 2));
        intrin_sse2_cmul(AB, CD, EF);
        _mm_store_ps((float *)(ef + idx), EF[0]);
        _mm_store_ps((float *)(ef + idx + 2), EF[1]);
    }
}
SSE3
/* NOTE(review): this is a byte-for-byte duplicate of the intrin_sse3_cmul
   shown earlier in the post; in an actual source file one of the two
   copies must be removed, or the compiler will reject the redefinition.
   Computes two interleaved complex products: for ab = {a0,b0,a1,b1} and
   cd = {c0,d0,c1,d1}, returns {ac-bd, ad+bc} for each pair. */
inline static __m128 intrin_sse3_cmul(__m128 ab, __m128 cd)
{
__m128 aa, bb, dc, x0, x1;
aa = _mm_moveldup_ps(ab); /* duplicate even lanes: real parts {a0,a0,a1,a1} */
bb = _mm_movehdup_ps(ab); /* duplicate odd lanes: imag parts {b0,b0,b1,b1} */
x0 = _mm_mul_ps(aa, cd); //ac ad
dc = _mm_shuffle_ps(cd, cd, _MM_SHUFFLE(2,3,0,1)); /* swap within pairs: {d0,c0,d1,c1} */
x1 = _mm_mul_ps(bb, dc); //bd bc
/* addsub: even lanes subtract, odd lanes add -> {ac-bd, ad+bc, ...} */
return _mm_addsub_ps(x0, x1);
}
没有评论:
发表评论