#pragma once

#include <AK/SoundEngine/Common/AkSimdTypes.h>
#include <AK/SoundEngine/Common/AkTypes.h>
#include <xmmintrin.h>
#include <smmintrin.h>
#include <emmintrin.h>

#if defined(__FMA__) || defined(__AVX2__)
#include <immintrin.h>
#endif
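
// Rounds a byte count up to the next multiple of 16, the SSE vector alignment.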
#define AKSIMD_ALIGNSIZE( __Size__ ) (((__Size__) + 15) & ~15)

#define AKSIMD_LOAD_V4F32( __addr__ ) _mm_loadu_ps( (AkReal32*)(__addr__) )
#define AKSIMD_LOADU_V4F32( __addr__ ) _mm_loadu_ps( (__addr__) )
#define AKSIMD_LOAD1_V4F32( __scalar__ ) _mm_load1_ps( &(__scalar__) )
#define AKSIMD_SET_V4F32( __scalar__ ) _mm_set_ps1( (__scalar__) )
#define AKSIMD_SETV_V2F64( _b, _a ) _mm_castpd_ps(_mm_set_pd( (_b), (_a) ))
#define AKSIMD_SETV_V4F32( _d, _c, _b, _a ) _mm_set_ps( (_d), (_c), (_b), (_a) )
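
// Builds a lane-wise condition mask from the low 4 bits of x:
// lane i is all ones when bit i of x is set, all zeros otherwise.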
static AkForceInline AKSIMD_V4COND AKSIMD_SETMASK_V4COND( AkUInt32 x )
{
    __m128i temp = _mm_set_epi32(8, 4, 2, 1);
    __m128i xvec = _mm_set1_epi32(x);
    __m128i xand = _mm_and_si128(xvec, temp);
    return _mm_castsi128_ps(_mm_cmpeq_epi32(temp, xand));
}

#define AKSIMD_SETZERO_V4F32() _mm_setzero_ps()
#define AKSIMD_LOAD_SS_V4F32( __addr__ ) _mm_load_ss( (__addr__) )

#define AKSIMD_STORE_V4F32( __addr__, __vec__ ) _mm_storeu_ps( (AkReal32*)(__addr__), (__vec__) )
#define AKSIMD_STOREU_V4F32( __addr__, __vec__ ) _mm_storeu_ps( (AkReal32*)(__addr__), (__vec__) )
#define AKSIMD_STORE1_V4F32( __addr__, __vec__ ) _mm_store_ss( (AkReal32*)(__addr__), (__vec__) )
#define AKSIMD_STORE1_V2F64( __addr__, __vec__ ) _mm_store_sd( (AkReal64*)(__addr__), _mm_castps_pd(__vec__) )

#define AKSIMD_SHUFFLE( fp3, fp2, fp1, fp0 ) _MM_SHUFFLE( (fp3), (fp2), (fp1), (fp0) )
#define AKSIMD_SHUFFLE_V4F32( a, b, i ) _mm_shuffle_ps( a, b, i )
#define AKSIMD_SHUFFLE_V4I32( a, b, i ) _mm_castps_si128(_mm_shuffle_ps( _mm_castsi128_ps(a), _mm_castsi128_ps(b), i ))
#define AKSIMD_MOVEHL_V4F32( a, b ) _mm_movehl_ps( a, b )
#define AKSIMD_MOVELH_V4F32( a, b ) _mm_movelh_ps( a, b )
#define AKSIMD_SHUFFLE_BADC( __a__ ) _mm_shuffle_ps( (__a__), (__a__), _MM_SHUFFLE(2,3,0,1))
#define AKSIMD_SHUFFLE_CDAB( __a__ ) _mm_shuffle_ps( (__a__), (__a__), _MM_SHUFFLE(1,0,3,2))
#define AKSIMD_SHUFFLE_BCDA( __a__ ) AKSIMD_SHUFFLE_V4F32( (__a__), (__a__), _MM_SHUFFLE(0,3,2,1))
#define AKSIMD_DUP_ODD(__vv) AKSIMD_SHUFFLE_V4F32(__vv, __vv, AKSIMD_SHUFFLE(3,3,1,1))
#define AKSIMD_DUP_EVEN(__vv) AKSIMD_SHUFFLE_V4F32(__vv, __vv, AKSIMD_SHUFFLE(2,2,0,0))

#define AKSIMD_SUB_V4F32( a, b ) _mm_sub_ps( a, b )
#define AKSIMD_SUB_SS_V4F32( a, b ) _mm_sub_ss( a, b )
#define AKSIMD_ADD_V4F32( a, b ) _mm_add_ps( a, b )
#define AKSIMD_ADD_SS_V4F32( a, b ) _mm_add_ss( a, b )
#define AKSIMD_MUL_V4F32( a, b ) _mm_mul_ps( a, b )
#define AKSIMD_DIV_V4F32( a, b ) _mm_div_ps( a, b )
#define AKSIMD_MUL_SS_V4F32( a, b ) _mm_mul_ss( a, b )

#if defined(__FMA__) || defined(__AVX2__)
#define AKSIMD_MADD_V4F32( __a__, __b__, __c__ ) _mm_fmadd_ps( (__a__), (__b__), (__c__) )
#define AKSIMD_MSUB_V4F32( __a__, __b__, __c__ ) _mm_fmsub_ps( (__a__), (__b__), (__c__) )
#else
#define AKSIMD_MADD_V4F32( __a__, __b__, __c__ ) _mm_add_ps( _mm_mul_ps( (__a__), (__b__) ), (__c__) )
#define AKSIMD_MSUB_V4F32( __a__, __b__, __c__ ) _mm_sub_ps( _mm_mul_ps( (__a__), (__b__) ), (__c__) )
#endif
#define AKSIMD_MADD_SS_V4F32( __a__, __b__, __c__ ) _mm_add_ss( _mm_mul_ss( (__a__), (__b__) ), (__c__) )

#define AKSIMD_MIN_V4F32( a, b ) _mm_min_ps( a, b )
#define AKSIMD_MAX_V4F32( a, b ) _mm_max_ps( a, b )
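
// Sign-bit tricks: ABS clears the IEEE sign bit (andnot with -0.f); NEG flips it (xor with -0.f).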
#define AKSIMD_ABS_V4F32( a ) _mm_andnot_ps(_mm_set1_ps(-0.f), a)
#define AKSIMD_NEG_V4F32( __a__ ) _mm_xor_ps(_mm_set1_ps(-0.f), __a__)
#define AKSIMD_SQRT_V4F32( __a__ ) _mm_sqrt_ps( (__a__) )
#define AKSIMD_RSQRT_V4F32( __a__ ) _mm_rsqrt_ps( (__a__) )
#define AKSIMD_RECIP_V4F32( __a__ ) _mm_rcp_ps(__a__)
#define AKSIMD_XOR_V4F32( a, b ) _mm_xor_ps(a,b)
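
// Approximate ceil: add just under 0.5 and round to nearest. Only valid within the
// int32 range of _mm_cvtps_epi32, and inexact for inputs within ~1e-5 of an integer.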
static AkForceInline AKSIMD_V4F32 AKSIMD_CEIL_V4F32(const AKSIMD_V4F32 & x)
{
    static const AKSIMD_V4F32 vEpsilon = { 0.49999f, 0.49999f, 0.49999f, 0.49999f };
    return _mm_cvtepi32_ps(_mm_cvtps_epi32(_mm_add_ps(x, vEpsilon)));
}
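
// Returns the sum of the four lanes, broadcast to every lane:
// 0xB1 swaps adjacent pairs, 0x4E swaps the 64-bit halves.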
static AkForceInline AKSIMD_V4F32 AKSIMD_HORIZONTALADD_V4F32(AKSIMD_V4F32 vVec)
{
    __m128 vAb = _mm_shuffle_ps(vVec, vVec, 0xB1);
    __m128 vHaddAb = _mm_add_ps(vVec, vAb);
    __m128 vHaddCd = _mm_shuffle_ps(vHaddAb, vHaddAb, 0x4E);
    __m128 vHaddAbcd = _mm_add_ps(vHaddAb, vHaddCd);
    return vHaddAbcd;
}
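
// Four-term dot product: lane-wise multiply followed by a horizontal add;
// the scalar result is broadcast to all lanes.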
static AkForceInline AKSIMD_V4F32 AKSIMD_DOTPRODUCT( AKSIMD_V4F32 & vVec, const AKSIMD_V4F32 & vfSigns )
{
    AKSIMD_V4F32 vfDotProduct = AKSIMD_MUL_V4F32( vVec, vfSigns );
    return AKSIMD_HORIZONTALADD_V4F32( vfDotProduct );
}
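
// Multiplies two pairs of complex numbers stored as (real, imag) in adjacent lanes:
// (a+ib)(c+id) = (ac-bd) + i(ad+bc). The sign mask negates the b*d terms.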
static AkForceInline AKSIMD_V4F32 AKSIMD_COMPLEXMUL_V4F32( const AKSIMD_V4F32 vCIn1, const AKSIMD_V4F32 vCIn2 )
{
    static const AKSIMD_V4F32 vSign = { -0.f, 0.f, -0.f, 0.f };
    AKSIMD_V4F32 vTmp1 = AKSIMD_SHUFFLE_V4F32( vCIn1, vCIn1, AKSIMD_SHUFFLE(2,2,0,0));
    vTmp1 = AKSIMD_MUL_V4F32( vTmp1, vCIn2 );
    AKSIMD_V4F32 vTmp2 = AKSIMD_SHUFFLE_V4F32( vCIn1, vCIn1, AKSIMD_SHUFFLE(3,3,1,1));
    vTmp2 = AKSIMD_XOR_V4F32( vTmp2, vSign );
    vTmp2 = AKSIMD_MADD_V4F32( vTmp2, AKSIMD_SHUFFLE_BADC( vCIn2 ), vTmp1 );
    return vTmp2;
}

#ifdef AK_SSE3
#include <pmmintrin.h>
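
// SSE3 variant of the complex multiply: moveldup/movehdup duplicate the real and
// imaginary parts, and addsub applies the alternating signs without a mask constant.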
static AKSIMD_V4F32 AKSIMD_COMPLEXMUL_SSE3( const AKSIMD_V4F32 vCIn1, const AKSIMD_V4F32 vCIn2 )
{
    AKSIMD_V4F32 vXMM0 = _mm_moveldup_ps(vCIn1);
    vXMM0 = AKSIMD_MUL_V4F32(vXMM0, vCIn2);
    AKSIMD_V4F32 vXMM1 = _mm_shuffle_ps(vCIn2, vCIn2, 0xB1);
    AKSIMD_V4F32 vXMM2 = _mm_movehdup_ps(vCIn1);
    vXMM2 = AKSIMD_MUL_V4F32( vXMM2, vXMM1 );
    AKSIMD_V4F32 vCOut = _mm_addsub_ps(vXMM0, vXMM2);
    return vCOut;
}
#endif
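
// ADDSUB subtracts in the even lanes and adds in the odd lanes. Without SSE3 it is
// emulated by flipping the sign of b's even lanes before a plain add.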
#if defined(__SSE3__)
#define AKSIMD_ADDSUB_V4F32( a, b ) _mm_addsub_ps( a, b )
#else
#define AKSIMD_ADDSUB_V4F32( a, b ) _mm_add_ps( a, _mm_xor_ps(b, AKSIMD_SETV_V4F32(0.f, -0.f, 0.f, -0.f)))
#endif

#if defined _MSC_VER && ( _MSC_VER <= 1600 )
#define AKSIMD_ASSERTFLUSHZEROMODE AKASSERT( _MM_GET_FLUSH_ZERO_MODE(dummy) == _MM_FLUSH_ZERO_ON )
#elif defined(AK_CPU_X86) || defined(AK_CPU_X86_64)
#define AKSIMD_ASSERTFLUSHZEROMODE AKASSERT( _MM_GET_FLUSH_ZERO_MODE() == _MM_FLUSH_ZERO_ON )
#else
#define AKSIMD_ASSERTFLUSHZEROMODE
#endif

#define AKSIMD_ADD_V4I32( a, b ) _mm_add_epi32( a, b )
#define AKSIMD_CMPLT_V4I32( a, b ) _mm_cmplt_epi32(a,b)
#define AKSIMD_CMPGT_V4I32( a, b ) _mm_cmpgt_epi32(a,b)
#define AKSIMD_OR_V4I32( a, b ) _mm_or_si128(a,b)
#define AKSIMD_XOR_V4I32( a, b ) _mm_xor_si128(a,b)
#define AKSIMD_SUB_V4I32( a, b ) _mm_sub_epi32(a,b)
#define AKSIMD_NOT_V4I32( a ) _mm_xor_si128(a,_mm_set1_epi32(~0))
#define AKSIMD_OR_V4F32( a, b ) _mm_or_ps(a,b)
#define AKSIMD_AND_V4F32( a, b ) _mm_and_ps(a,b)
#define AKSIMD_ANDNOT_V4F32( a, b ) _mm_andnot_ps(a,b)
#define AKSIMD_NOT_V4F32( a ) _mm_xor_ps(a,_mm_castsi128_ps(_mm_set1_epi32(~0)))
#define AKSIMD_OR_V4COND( a, b ) _mm_or_ps(a,b)
#define AKSIMD_AND_V4COND( a, b ) _mm_and_ps(a,b)

#define AKSIMD_MULLO16_V4I32( a, b ) _mm_mullo_epi16(a, b)
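
// Lane-wise 32-bit multiply returning the low 32 bits of each product. SSE4.1 has
// _mm_mullo_epi32 natively; the SSE2 fallback multiplies the even and odd lanes with
// _mm_mul_epu32 and interleaves the low halves of the results back together.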
static AkForceInline AKSIMD_V4I32 AKSIMD_MULLO_V4I32(const AKSIMD_V4I32 vIn1, const AKSIMD_V4I32 vIn2)
{
#ifdef __SSE4_1__
    return _mm_mullo_epi32(vIn1, vIn2);
#else
    __m128i tmp1 = _mm_mul_epu32(vIn1, vIn2);
    __m128i tmp2 = _mm_mul_epu32(_mm_srli_si128(vIn1, 4), _mm_srli_si128(vIn2, 4));
    return _mm_unpacklo_epi32(_mm_shuffle_epi32(tmp1, _MM_SHUFFLE(0, 0, 2, 0)), _mm_shuffle_epi32(tmp2, _MM_SHUFFLE(0, 0, 2, 0)));
#endif
}

#define AKSIMD_UNPACKLO_V4F32( a, b ) _mm_unpacklo_ps( a, b )
#define AKSIMD_UNPACKHI_V4F32( a, b ) _mm_unpackhi_ps( a, b )
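
// Gathers one 32-bit pair of 16-bit values from each of the four addresses and
// deinterleaves: the first returned vector holds the sign-extended even (first)
// values, the second the odd (second) values.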
static AkForceInline AKSIMD_V4I32X2 AKSIMD_GATHER_V4I32_AND_DEINTERLEAVE_V4I32X2(AkInt16* addr3, AkInt16* addr2, AkInt16* addr1, AkInt16* addr0)
{
    __m128i data[4] = {
        _mm_set1_epi32(*(AkInt32*)addr0),
        _mm_set1_epi32(*(AkInt32*)addr1),
        _mm_set1_epi32(*(AkInt32*)addr2),
        _mm_set1_epi32(*(AkInt32*)addr3),
    };
    __m128i group[2] = {
        _mm_unpacklo_epi32(data[0], data[1]),
        _mm_unpacklo_epi32(data[2], data[3]),
    };
    __m128i shuffle = _mm_unpacklo_epi64(group[0], group[1]);
    AKSIMD_V4I32X2 ret{
        _mm_srai_epi32(_mm_slli_epi32(shuffle, 16), 16),
        _mm_srai_epi32(shuffle, 16)
    };
    return ret;
}
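
// 64-bit variant: gathers four 16-bit values per address and deinterleaves them into
// four vectors, one per position (0..3), each lane sign-extended to 32 bits.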
static AkForceInline AKSIMD_V4I32X4 AKSIMD_GATHER_V4I64_AND_DEINTERLEAVE_V4I32X4(AkInt16* addr3, AkInt16* addr2, AkInt16* addr1, AkInt16* addr0)
{
    __m128i data[4] = {
        _mm_set1_epi64x(*(AkInt64*)addr0),
        _mm_set1_epi64x(*(AkInt64*)addr1),
        _mm_set1_epi64x(*(AkInt64*)addr2),
        _mm_set1_epi64x(*(AkInt64*)addr3),
    };
    __m128i group[2] = {
        _mm_unpacklo_epi64(data[0], data[1]),
        _mm_unpacklo_epi64(data[2], data[3]),
    };
    __m128i shuffle[2] = {
        _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(group[0]), _mm_castsi128_ps(group[1]), 0x88)),
        _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(group[0]), _mm_castsi128_ps(group[1]), 0xDD)),
    };
    AKSIMD_V4I32X4 ret{
        _mm_srai_epi32(_mm_slli_epi32(shuffle[0], 16), 16),
        _mm_srai_epi32(shuffle[0], 16),
        _mm_srai_epi32(_mm_slli_epi32(shuffle[1], 16), 16),
        _mm_srai_epi32(shuffle[1], 16),
    };
    return ret;
}

#define AKSIMD_CMP_CTRLMASK __m128

#define AKSIMD_LTEQ_V4F32( __a__, __b__ ) _mm_cmple_ps( (__a__), (__b__) )
#define AKSIMD_LT_V4F32( __a__, __b__ ) _mm_cmplt_ps( (__a__), (__b__) )
#define AKSIMD_GTEQ_V4F32( __a__, __b__ ) _mm_cmpge_ps( (__a__), (__b__) )
#define AKSIMD_GT_V4F32( __a__, __b__ ) _mm_cmpgt_ps( (__a__), (__b__) )
#define AKSIMD_EQ_V4F32( __a__, __b__ ) _mm_cmpeq_ps( (__a__), (__b__) )

static AkForceInline AKSIMD_V4F32 AKSIMD_VSEL_V4F32( AKSIMD_V4F32 vA, AKSIMD_V4F32 vB, AKSIMD_V4F32 vMask )
{
#if defined(__SSE4_1__)
    return _mm_blendv_ps(vA, vB, vMask);
#else
    vB = _mm_and_ps( vB, vMask );
    vA = _mm_andnot_ps( vMask, vA );
    return _mm_or_ps( vA, vB );
#endif
}

#define AKSIMD_SEL_GTEQ_V4F32( __a__, __b__, __cond1__, __cond2__ ) AKSIMD_VSEL_V4F32( __a__, __b__, AKSIMD_GTEQ_V4F32( __cond1__, __cond2__ ) )
#define AKSIMD_SEL_GTEZ_V4F32( __a__, __b__, __c__ ) AKSIMD_VSEL_V4F32( (__c__), (__b__), AKSIMD_GTEQ_V4F32( __a__, _mm_set1_ps(0) ) )
#define AKSIMD_SPLAT_V4F32(var, idx) AKSIMD_SHUFFLE_V4F32(var,var, AKSIMD_SHUFFLE(idx,idx,idx,idx))
#define AKSIMD_MASK_V4F32( __a__ ) _mm_movemask_ps( __a__ )
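
// TESTZERO is true when every bit of the vector is zero; TESTONES when every bit is
// set. Both compare lane-wise and require all 16 byte-mask bits to agree.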
static AkForceInline bool AKSIMD_TESTZERO_V4I32(AKSIMD_V4I32 a)
{
    return _mm_movemask_epi8(_mm_cmpeq_epi32(a, _mm_setzero_si128())) == 0xFFFF;
}
#define AKSIMD_TESTZERO_V4F32( __a__ ) AKSIMD_TESTZERO_V4I32(_mm_castps_si128(__a__))
#define AKSIMD_TESTZERO_V4COND( __a__ ) AKSIMD_TESTZERO_V4F32(__a__)

static AkForceInline bool AKSIMD_TESTONES_V4I32(AKSIMD_V4I32 a)
{
    return _mm_movemask_epi8(_mm_cmpeq_epi32(a, _mm_set1_epi32(~0))) == 0xFFFF;
}
#define AKSIMD_TESTONES_V4F32( __a__ ) AKSIMD_TESTONES_V4I32(_mm_castps_si128(__a__))
#define AKSIMD_TESTONES_V4COND( __a__ ) AKSIMD_TESTONES_V4F32(__a__)

#define AKSIMD_LOADU_V4I32( __addr__ ) _mm_loadu_si128( (__addr__) )
#define AKSIMD_LOAD_V4I32( __addr__ ) _mm_loadu_si128( (__addr__) )
#define AKSIMD_SETZERO_V4I32() _mm_setzero_si128()
#define AKSIMD_SET_V4I32( __scalar__ ) _mm_set1_epi32( (__scalar__) )
#define AKSIMD_SETV_V4I32( _d, _c, _b, _a ) _mm_set_epi32( (_d), (_c), (_b), (_a) )
#define AKSIMD_SETV_V2I64( _b, _a ) _mm_set_epi64x( (_b), (_a) )
#define AKSIMD_INSERT_V4I32( a, i, index ) _mm_insert_epi32(a, i, index)
#define AKSIMD_INSERT_V2I64( a, i, index ) _mm_insert_epi64(a, i, index)
#define AKSIMD_STORE_V4I32( __addr__, __vec__ ) _mm_storeu_si128( (__m128i*)(__addr__), (__vec__) )
#define AKSIMD_STOREU_V4I32( __addr__, __vec__ ) _mm_storeu_si128( (__m128i*)(__addr__), (__vec__) )

#define AKSIMD_CONVERT_V4I32_TO_V4F32( __vec__ ) _mm_cvtepi32_ps( (__vec__) )
#define AKSIMD_ROUND_V4F32_TO_V4I32( __vec__ ) _mm_cvtps_epi32( (__vec__) )
#define AKSIMD_TRUNCATE_V4F32_TO_V4I32( __vec__ ) _mm_cvttps_epi32( (__vec__) )
#define AKSIMD_AND_V4I32( __a__, __b__ ) _mm_and_si128( (__a__), (__b__) )
#define AKSIMD_CMPGT_V8I16( __a__, __b__ ) _mm_cmpgt_epi16( (__a__), (__b__) )
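
// Half-precision (IEEE 754 binary16) conversions. The LO/HI macros unpack four halves
// into the upper 16 bits of each 32-bit lane before handing them to the helper.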
#define AKSIMD_CONVERT_V4F16_TO_V4F32_LO(__vec__) AKSIMD_CONVERT_V4F16_TO_V4F32_HELPER( _mm_unpacklo_epi16(_mm_setzero_si128(), __vec__))
#define AKSIMD_CONVERT_V4F16_TO_V4F32_HI(__vec__) AKSIMD_CONVERT_V4F16_TO_V4F32_HELPER( _mm_unpackhi_epi16(_mm_setzero_si128(), __vec__))

static AkForceInline AKSIMD_V4F32 AKSIMD_CONVERT_V4F16_TO_V4F32_HELPER(AKSIMD_V4I32 vec)
{
    __m128i expMantData = _mm_and_si128(vec, _mm_set1_epi32(0x7fff0000));
    __m128i expMantShifted = _mm_srli_epi32(expMantData, 3);
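
    // Move exponent+mantissa into single-precision position and rebias: 0x77800000 is
    // 2^112 as a float, which shifts the exponent bias from 15 to 127 and also maps
    // half denormals to the correct single-precision values.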
    __m128i expMantFloat = _mm_castps_si128(_mm_mul_ps(_mm_castsi128_ps(expMantShifted), _mm_castsi128_ps(_mm_set1_epi32(0x77800000))));
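
    // A half exponent field of 31 encodes Inf/NaN; force the float exponent to all ones.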
    __m128i infnanCheck = _mm_cmpgt_epi32(expMantData, _mm_set1_epi32(0x7bffffff));
    __m128i infnanExp = _mm_and_si128(infnanCheck, _mm_set1_epi32(255 << 23));
    __m128i expMantWithInfNan = _mm_or_si128(expMantFloat, infnanExp);
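
    // Reattach the sign, which has stayed in the top bit of each lane since the unpack.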
    __m128i signData = _mm_and_si128(vec, _mm_set1_epi32(0x80000000));
    __m128 assembledFloat = _mm_castsi128_ps(_mm_or_si128(signData, expMantWithInfNan));
    return assembledFloat;
}
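
// Converts four floats to half-precision with round-to-nearest-even, handling
// denormals and Inf/NaN; the four halves are packed into the low 64 bits of the result.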
static AkForceInline AKSIMD_V4I32 AKSIMD_CONVERT_V4F32_TO_V4F16(AKSIMD_V4F32 vec)
{
    __m128i signData = _mm_and_si128(_mm_castps_si128(vec), _mm_set1_epi32(0x80000000));
    __m128i unsignedVec = _mm_and_si128(_mm_castps_si128(vec), _mm_set1_epi32(0x7fffffff));
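
    // Denormal path: for |x| < 2^-14 the half result is subnormal. Adding 0.5f makes
    // the FP adder align x's mantissa with 0.5's, leaving the half's bits in the low
    // mantissa bits; the shift then moves them into the upper 16 bits of the lane.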
    __m128 denormedVec = _mm_add_ps(_mm_castsi128_ps(unsignedVec), _mm_set1_ps(0.5f));
    __m128i denormResult = _mm_slli_epi32(_mm_castps_si128(denormedVec), 16);
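
    // Normalized path: 0xC8000FFF = -(112 << 23) + 0xFFF rebiasses the exponent from
    // 127 to 15 and pre-adds 0xFFF for rounding; the sign-extended mantissa LSB turns
    // that into round-to-nearest-even. The final shift lines the half bits up with
    // the upper 16 bits of the lane.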
    __m128i subnormMagic = _mm_set1_epi32(0xC8000FFF);
    __m128i normRoundPart1 = _mm_add_epi32(unsignedVec, subnormMagic);
    __m128i mantLsb = _mm_slli_epi32(unsignedVec, 31 - 13);
    __m128i mantSignExtendLsb = _mm_srai_epi32(mantLsb, 31);
    __m128i normRoundPart2 = _mm_sub_epi32(normRoundPart1, mantSignExtendLsb);
    __m128i normResult = _mm_slli_epi32(normRoundPart2, 3);
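
    // Per-lane select between the denormal and normalized encodings.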
    __m128i normalMinimum = _mm_set1_epi32((127 - 14) << 23);
    __m128i denormMask = _mm_cmpgt_epi32(normalMinimum, unsignedVec);
    __m128i nonNanFloat = _mm_or_si128(_mm_and_si128(denormMask, denormResult), _mm_andnot_si128(denormMask, normResult));
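
    // Inputs at or above 0x47800000 (65536.0f) do not fit in a half: emit Inf, or a
    // quiet NaN (with a mantissa bit set) when the input itself was NaN.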
    __m128i isNotInfNanMask = _mm_cmplt_epi32(unsignedVec, _mm_set1_epi32(0x47800000));
    __m128i mantissaData = _mm_and_si128(unsignedVec, _mm_set1_epi32(0x007fffff));
    __m128i isNanMask = _mm_cmpgt_epi32(unsignedVec, _mm_set1_epi32(0x7F800000));
    __m128i nantissaBit = _mm_and_si128(isNanMask, _mm_set1_epi32(0x02000000));
    __m128i infData = _mm_andnot_si128(mantissaData, _mm_set1_epi32(0x7c000000));
    __m128i infNanFloat = _mm_or_si128(infData, nantissaBit);
    __m128i resultWithInfNan = _mm_or_si128(_mm_and_si128(isNotInfNanMask, nonNanFloat), _mm_andnot_si128(isNotInfNanMask, infNanFloat));

    __m128i signedResult = _mm_or_si128(signData, resultWithInfNan);
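
    // Compact the upper 16 bits of each 32-bit lane into the low 64 bits of the result.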
    __m128i resultEpi16Lo = _mm_shufflelo_epi16(signedResult, 0xD);
    __m128i resultEpi16Hi = _mm_shufflehi_epi16(signedResult, 0xD);
    __m128 resultEpi16 = _mm_shuffle_ps(_mm_castsi128_ps(resultEpi16Lo), _mm_castsi128_ps(resultEpi16Hi), 0xE4);
    __m128i result = _mm_castps_si128(_mm_shuffle_ps(resultEpi16, _mm_setzero_ps(), 0x8));
    return result;
}

#define AKSIMD_CAST_V2F64_TO_V4F32( __vec__ ) _mm_castpd_ps(__vec__)
#define AKSIMD_CAST_V2F64_TO_V4I32( __vec__ ) _mm_castpd_si128(__vec__)
#define AKSIMD_CAST_V4F32_TO_V2F64( __vec__ ) _mm_castps_pd(__vec__)
#define AKSIMD_CAST_V4F32_TO_V4I32( __vec__ ) _mm_castps_si128(__vec__)
#define AKSIMD_CAST_V4I32_TO_V2F64( __vec__ ) _mm_castsi128_pd(__vec__)
#define AKSIMD_CAST_V4I32_TO_V4F32( __vec__ ) _mm_castsi128_ps(__vec__)
#define AKSIMD_CAST_V4COND_TO_V4F32( __vec__ ) (__vec__)
#define AKSIMD_CAST_V4F32_TO_V4COND( __vec__ ) (__vec__)
#define AKSIMD_CAST_V4COND_TO_V4I32( __vec__ ) _mm_castps_si128(__vec__)
#define AKSIMD_CAST_V4I32_TO_V4COND( __vec__ ) _mm_castsi128_ps(__vec__)

#define AKSIMD_UNPACKLO_VECTOR8I16( a, b ) _mm_unpacklo_epi16( a, b )
#define AKSIMD_UNPACKHI_VECTOR8I16( a, b ) _mm_unpackhi_epi16( a, b )
#define AKSIMD_PACKS_V4I32( a, b ) _mm_packs_epi32( a, b )

#define AKSIMD_SHIFTLEFT_V4I32( __vec__, __shiftBy__ ) \
    _mm_slli_epi32( (__vec__), (__shiftBy__) )
#define AKSIMD_SHIFTRIGHT_V4I32( __vec__, __shiftBy__ ) \
    _mm_srli_epi32( (__vec__), (__shiftBy__) )
#define AKSIMD_SHIFTRIGHTARITH_V4I32( __vec__, __shiftBy__ ) \
    _mm_srai_epi32( (__vec__), (__shiftBy__) )
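
// MMX (__m64) fallbacks, only available on 32-bit x86 builds. Callers must issue
// AKSIMD_MMX_EMPTY before using x87 floating point again.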
#if defined( AK_CPU_X86 )
typedef __m64 AKSIMD_V2F32;

#define AKSIMD_SETZERO_V2F32() _mm_setzero_si64()
#define AKSIMD_CMPGT_V2I32( a, b ) _mm_cmpgt_pi16(a,b)
#define AKSIMD_UNPACKLO_VECTOR4I16( a, b ) _mm_unpacklo_pi16( a, b )
#define AKSIMD_UNPACKHI_VECTOR4I16( a, b ) _mm_unpackhi_pi16( a, b )
#define AKSIMD_SHIFTLEFT_V2I32( __vec__, __shiftBy__ ) \
    _mm_slli_pi32( (__vec__), (__shiftBy__) )
#define AKSIMD_SHIFTRIGHTARITH_V2I32( __vec__, __shiftBy__ ) \
    _mm_srai_pi32( (__vec__), (__shiftBy__) )
#define AKSIMD_MMX_EMPTY _mm_empty()
#endif