2011年9月4日星期日

Fast Complex multiply with SSE

SSE3 add the capability to work horizontally in a register to facilitate complex arithmetic. Those algorithm can be found in Intel Optimization Reference Manual, but here they are in intrinsic format instead of assemble language.
inline static intrin_sse2_cmul(__m128 *ab, __m128 *cd, __m128 *ef)
{
    __m128 aa, bb, cc, dd, ac, ad, bd, bc, ee, ff;
    aa = _mm_shuffle_ps(ab[0], ab[1], _MM_SHUFFLE(2,0,2,0));
    bb = _mm_shuffle_ps(ab[0], ab[1], _MM_SHUFFLE(3,1,3,1));
    cc = _mm_shuffle_ps(cd[0], cd[1], _MM_SHUFFLE(2,0,2,0));
    dd = _mm_shuffle_ps(cd[0], cd[1], _MM_SHUFFLE(3,1,3,1));
    ac = _mm_mul_ps(aa, cc);
    ad = _mm_mul_ps(aa, dd);
    bd = _mm_mul_ps(bb, dd);
    bc = _mm_mul_ps(bb, cc);
    ee = _mm_sub_ps(ac, bd);
    ff = _mm_add_ps(bc, ad);
    ef[0] = _mm_unpacklo_ps(ee, ff);
    ef[1] = _mm_unpackhi_ps(ee, ff);
}
To be continued. I will explain those later. I am trying to use SyntaxHighlighter, there seems to be a bug in current version. Bad luck me.
inline static __m128 intrin_sse3_cmul(__m128 ab, __m128 cd)
{
    __m128 aa, bb, dc, x0, x1;
    aa = _mm_moveldup_ps(ab);
    bb = _mm_movehdup_ps(ab);
    x0 = _mm_mul_ps(aa, cd);    //ac ad
    dc = _mm_shuffle_ps(cd, cd, _MM_SHUFFLE(2,3,0,1));
    x1 = _mm_mul_ps(bb, dc);    //bd bc
    return _mm_addsub_ps(x0, x1);

}
TO be continued.
// size must be multiple of 4! 
void sse2_cmul(complex float *ab, complex float *cd, complex float *ef, int size)
{
    int i;
    __m128 AB[2], CD[2], EF[2];
    for(i=0; i < size; i=i+4)
    {
        AB[0] = _mm_load_ps((float*)(ab+i));
        AB[1] = _mm_load_ps((float*)(ab+i+2));
        CD[0] = _mm_load_ps((float*)(cd+i));
        CD[1] = _mm_load_ps((float*)(cd+i+2));
        intrin_sse2_cmul(AB, CD, EF);
        _mm_store_ps((float*)(ef+i), EF[0]);
        _mm_store_ps((float*)(ef+i+2), EF[1]);
    }
}
SSE3
inline static __m128 intrin_sse3_cmul(__m128 ab, __m128 cd)
{
    __m128 aa, bb, dc, x0, x1;
    aa = _mm_moveldup_ps(ab);
    bb = _mm_movehdup_ps(ab);
    x0 = _mm_mul_ps(aa, cd);    //ac ad
    dc = _mm_shuffle_ps(cd, cd, _MM_SHUFFLE(2,3,0,1));
    x1 = _mm_mul_ps(bb, dc);    //bd bc
    return _mm_addsub_ps(x0, x1);

}

没有评论:

发表评论