
/*******************************************************************************
The content of this file includes portions of the AUDIOKINETIC Wwise Technology
released in source code form as part of the SDK installer package.

Commercial License Usage

Licensees holding valid commercial licenses to the AUDIOKINETIC Wwise Technology
may use this file in accordance with the end user license agreement provided
with the software or, alternatively, in accordance with the terms contained in a
written agreement between you and Audiokinetic Inc.

Apache License Usage

Alternatively, this file may be used under the Apache License, Version 2.0 (the
"Apache License"); you may not use this file except in compliance with the
Apache License. You may obtain a copy of the Apache License at
http://www.apache.org/licenses/LICENSE-2.0.

Unless required by applicable law or agreed to in writing, software distributed
under the Apache License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
OR CONDITIONS OF ANY KIND, either express or implied. See the Apache License for
the specific language governing permissions and limitations under the License.

Copyright (c) 2023 Audiokinetic Inc.
*******************************************************************************/
// AkSimd.h

/// \file
/// AKSIMD - SSE implementation

#ifndef _AK_SIMD_SSE_H_
#define _AK_SIMD_SSE_H_

#include <AK/SoundEngine/Common/AkTypes.h>
#include <xmmintrin.h>
#include <smmintrin.h>
#include <emmintrin.h>

#if defined(__FMA__) || defined(__AVX2__)
#include <immintrin.h>
#endif
////////////////////////////////////////////////////////////////////////
/// @name Platform specific defines for prefetching
//@{

#define AKSIMD_ARCHCACHELINESIZE (64) ///< Assumed cache line width for architectures on this platform
#define AKSIMD_ARCHMAXPREFETCHSIZE (512) ///< Use this to control how much prefetching maximum is desirable (assuming 8-way cache)

/// Cross-platform memory prefetch of effective address assuming non-temporal data
#define AKSIMD_PREFETCHMEMORY( __offset__, __add__ ) _mm_prefetch(((char *)(__add__))+(__offset__), _MM_HINT_NTA )

//@}
////////////////////////////////////////////////////////////////////////

////////////////////////////////////////////////////////////////////////
/// @name Platform specific memory size alignment for allocation purposes
//@{

#define AKSIMD_ALIGNSIZE( __Size__ ) (((__Size__) + 15) & ~15)
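// Illustrative note (not part of the original header): AKSIMD_ALIGNSIZE rounds a byte count
// up to the next multiple of 16. For example, ((20 + 15) & ~15) == 32, while an already
// aligned size such as 16 stays 16. A hypothetical allocation could use:
//   AkUInt32 uAllocSize = AKSIMD_ALIGNSIZE( uNumSamples * sizeof(AkReal32) );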
//@}
////////////////////////////////////////////////////////////////////////

////////////////////////////////////////////////////////////////////////
/// @name AKSIMD types
//@{

typedef float AKSIMD_F32; ///< 32-bit float
typedef __m128 AKSIMD_V4F32; ///< Vector of 4 32-bit floats
typedef AKSIMD_V4F32 AKSIMD_V4COND; ///< Vector of 4 comparison results
typedef AKSIMD_V4F32 AKSIMD_V4FCOND; ///< Vector of 4 comparison results
typedef __m128i AKSIMD_V4I32; ///< Vector of 4 32-bit signed integers

struct AKSIMD_V4I32X2 { ///< Pair of vectors of 4 32-bit signed integers
    AKSIMD_V4I32 val[2];
};

struct AKSIMD_V4I32X4 { ///< Quartet of vectors of 4 32-bit signed integers
    AKSIMD_V4I32 val[4];
};

typedef AKSIMD_V4I32 AKSIMD_V4ICOND;

//@}
////////////////////////////////////////////////////////////////////////

////////////////////////////////////////////////////////////////////////
/// @name AKSIMD loading / setting
//@{

/// Loads four single-precision floating-point values from unaligned
/// memory (see _mm_loadu_ps)
#define AKSIMD_LOAD_V4F32( __addr__ ) _mm_loadu_ps( (AkReal32*)(__addr__) )

/// Loads four single-precision floating-point values from unaligned
/// memory (see _mm_loadu_ps)
#define AKSIMD_LOADU_V4F32( __addr__ ) _mm_loadu_ps( (__addr__) )

/// Loads a single single-precision, floating-point value, copying it into
/// all four words (see _mm_load1_ps, _mm_load_ps1)
#define AKSIMD_LOAD1_V4F32( __scalar__ ) _mm_load1_ps( &(__scalar__) )

/// Sets the four single-precision, floating-point values to in_value (see
/// _mm_set1_ps, _mm_set_ps1)
#define AKSIMD_SET_V4F32( __scalar__ ) _mm_set_ps1( (__scalar__) )

/// Populates the vector with the two double-precision, floating-point values provided,
/// reinterpreted as four single-precision floats (see _mm_set_pd)
#define AKSIMD_SETV_V2F64( _b, _a ) _mm_castpd_ps(_mm_set_pd( (_b), (_a) ))

/// Populates the full vector with the 4 floating point elements provided
#define AKSIMD_SETV_V4F32( _d, _c, _b, _a ) _mm_set_ps( (_d), (_c), (_b), (_a) )

/// Populates the full vector from the mask bits x[3:0], setting each element to 0 or ~0
static AkForceInline AKSIMD_V4COND AKSIMD_SETMASK_V4COND( AkUInt32 x )
{
    __m128i temp = _mm_set_epi32(8, 4, 2, 1);
    __m128i xvec = _mm_set1_epi32(x);
    __m128i xand = _mm_and_si128(xvec, temp);
    return _mm_castsi128_ps(_mm_cmpeq_epi32(temp, xand));
}
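// Illustrative example (not part of the original header): bit i of x selects lane i.
//   AKSIMD_V4COND mask = AKSIMD_SETMASK_V4COND( 0x5 );
//   // lanes 0 and 2 are now all-ones (~0) and lanes 1 and 3 are zero, so
//   // AKSIMD_VSEL_V4F32( vA, vB, mask ) would take lanes 0 and 2 from vB
//   // and lanes 1 and 3 from vA.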
/// Sets the four single-precision, floating-point values to zero (see
/// _mm_setzero_ps)
#define AKSIMD_SETZERO_V4F32() _mm_setzero_ps()

/// Loads a single-precision, floating-point value into the low word
/// and clears the upper three words.
/// r0 := *p; r1 := 0.0 ; r2 := 0.0 ; r3 := 0.0 (see _mm_load_ss)
#define AKSIMD_LOAD_SS_V4F32( __addr__ ) _mm_load_ss( (__addr__) )

//@}
////////////////////////////////////////////////////////////////////////

////////////////////////////////////////////////////////////////////////
/// @name AKSIMD storing
//@{

/// Stores four single-precision, floating-point values. The address
/// does not need to be 16-byte aligned (see _mm_storeu_ps).
#define AKSIMD_STORE_V4F32( __addr__, __vec__ ) _mm_storeu_ps( (AkReal32*)(__addr__), (__vec__) )

/// Stores four single-precision, floating-point values. The address
/// does not need to be 16-byte aligned (see _mm_storeu_ps).
#define AKSIMD_STOREU_V4F32( __addr__, __vec__ ) _mm_storeu_ps( (AkReal32*)(__addr__), (__vec__) )

/// Stores the lower single-precision, floating-point value.
/// *p := a0 (see _mm_store_ss)
#define AKSIMD_STORE1_V4F32( __addr__, __vec__ ) _mm_store_ss( (AkReal32*)(__addr__), (__vec__) )

/// Stores the lower double-precision, floating-point value.
/// *p := a0 (see _mm_store_sd)
#define AKSIMD_STORE1_V2F64( __addr__, __vec__ ) _mm_store_sd( (AkReal64*)(__addr__), _mm_castps_pd(__vec__) )

//@}
////////////////////////////////////////////////////////////////////////

////////////////////////////////////////////////////////////////////////
/// @name AKSIMD shuffling
//@{

// Macro for shuffle parameter for AKSIMD_SHUFFLE_V4F32() (see _MM_SHUFFLE)
#define AKSIMD_SHUFFLE( fp3, fp2, fp1, fp0 ) _MM_SHUFFLE( (fp3), (fp2), (fp1), (fp0) )

/// Selects four specific single-precision, floating-point values from
/// a and b, based on the mask i (see _mm_shuffle_ps)
// Usage: AKSIMD_SHUFFLE_V4F32( vec1, vec2, AKSIMD_SHUFFLE( z, y, x, w ) )
#define AKSIMD_SHUFFLE_V4F32( a, b, i ) _mm_shuffle_ps( a, b, i )

#define AKSIMD_SHUFFLE_V4I32( a, b, i ) _mm_castps_si128(_mm_shuffle_ps( _mm_castsi128_ps(a), _mm_castsi128_ps(b), i ))

/// Moves the upper two single-precision, floating-point values of b to
/// the lower two single-precision, floating-point values of the result.
/// The upper two single-precision, floating-point values of a are passed
/// through to the result.
/// r3 := a3; r2 := a2; r1 := b3; r0 := b2 (see _mm_movehl_ps)
#define AKSIMD_MOVEHL_V4F32( a, b ) _mm_movehl_ps( a, b )

/// Moves the lower two single-precision, floating-point values of b to
/// the upper two single-precision, floating-point values of the result.
/// The lower two single-precision, floating-point values of a are passed
/// through to the result.
/// r3 := b1 ; r2 := b0 ; r1 := a1 ; r0 := a0 (see _mm_movelh_ps)
#define AKSIMD_MOVELH_V4F32( a, b ) _mm_movelh_ps( a, b )

/// Swap the 2 lower floats together and the 2 higher floats together.
#define AKSIMD_SHUFFLE_BADC( __a__ ) _mm_shuffle_ps( (__a__), (__a__), _MM_SHUFFLE(2,3,0,1))

/// Swap the 2 lower floats with the 2 higher floats.
#define AKSIMD_SHUFFLE_CDAB( __a__ ) _mm_shuffle_ps( (__a__), (__a__), _MM_SHUFFLE(1,0,3,2))

/// Barrel-shift all floats by one.
#define AKSIMD_SHUFFLE_BCDA( __a__ ) AKSIMD_SHUFFLE_V4F32( (__a__), (__a__), _MM_SHUFFLE(0,3,2,1))

/// Duplicates the odd items into the even items (d c b a -> d d b b)
#define AKSIMD_DUP_ODD(__vv) AKSIMD_SHUFFLE_V4F32(__vv, __vv, AKSIMD_SHUFFLE(3,3,1,1))

/// Duplicates the even items into the odd items (d c b a -> c c a a)
#define AKSIMD_DUP_EVEN(__vv) AKSIMD_SHUFFLE_V4F32(__vv, __vv, AKSIMD_SHUFFLE(2,2,0,0))

//@}
////////////////////////////////////////////////////////////////////////

////////////////////////////////////////////////////////////////////////
/// @name AKSIMD arithmetic
//@{

/// Subtracts the four single-precision, floating-point values of
/// a and b (a - b) (see _mm_sub_ps)
#define AKSIMD_SUB_V4F32( a, b ) _mm_sub_ps( a, b )

/// Subtracts the lower single-precision, floating-point values of a and b.
/// The upper three single-precision, floating-point values are passed through from a.
/// r0 := a0 - b0 ; r1 := a1 ; r2 := a2 ; r3 := a3 (see _mm_sub_ss)
#define AKSIMD_SUB_SS_V4F32( a, b ) _mm_sub_ss( a, b )

/// Adds the four single-precision, floating-point values of
/// a and b (see _mm_add_ps)
#define AKSIMD_ADD_V4F32( a, b ) _mm_add_ps( a, b )

/// Adds the lower single-precision, floating-point values of a and b; the
/// upper three single-precision, floating-point values are passed through from a.
/// r0 := a0 + b0; r1 := a1; r2 := a2; r3 := a3 (see _mm_add_ss)
#define AKSIMD_ADD_SS_V4F32( a, b ) _mm_add_ss( a, b )

/// Multiplies the four single-precision, floating-point values
/// of a and b (see _mm_mul_ps)
#define AKSIMD_MUL_V4F32( a, b ) _mm_mul_ps( a, b )

#define AKSIMD_DIV_V4F32( a, b ) _mm_div_ps( a, b )

/// Multiplies the lower single-precision, floating-point values of
/// a and b; the upper three single-precision, floating-point values
/// are passed through from a.
/// r0 := a0 * b0; r1 := a1; r2 := a2; r3 := a3 (see _mm_mul_ss)
#define AKSIMD_MUL_SS_V4F32( a, b ) _mm_mul_ss( a, b )

/// Vector multiply-add operation. (If we're targeting a platform or architecture with FMA
/// (AVX2 implies FMA), using the FMA intrinsics directly tends to be slightly more desirable.)
#if defined(__FMA__) || defined(__AVX2__)
#define AKSIMD_MADD_V4F32( __a__, __b__, __c__ ) _mm_fmadd_ps( (__a__), (__b__) , (__c__) )
#define AKSIMD_MSUB_V4F32( __a__, __b__, __c__ ) _mm_fmsub_ps( (__a__), (__b__) , (__c__) )
#else
#define AKSIMD_MADD_V4F32( __a__, __b__, __c__ ) _mm_add_ps( _mm_mul_ps( (__a__), (__b__) ), (__c__) )
#define AKSIMD_MSUB_V4F32( __a__, __b__, __c__ ) _mm_sub_ps( _mm_mul_ps( (__a__), (__b__) ), (__c__) )
#endif
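// Illustrative example (not part of the original header): on either path AKSIMD_MADD_V4F32
// computes (a * b) + c per lane; only the rounding differs (the FMA path rounds once).
// A hypothetical use is applying a per-lane gain plus an offset:
//   AKSIMD_V4F32 vOut = AKSIMD_MADD_V4F32( vIn, vGain, vOffset ); // vIn*vGain + vOffset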
/// Vector multiply-add operation.
#define AKSIMD_MADD_SS_V4F32( __a__, __b__, __c__ ) _mm_add_ss( _mm_mul_ss( (__a__), (__b__) ), (__c__) )

/// Computes the minima of the four single-precision, floating-point
/// values of a and b (see _mm_min_ps)
#define AKSIMD_MIN_V4F32( a, b ) _mm_min_ps( a, b )

/// Computes the maxima of the four single-precision, floating-point
/// values of a and b (see _mm_max_ps)
#define AKSIMD_MAX_V4F32( a, b ) _mm_max_ps( a, b )

/// Computes the absolute value
#define AKSIMD_ABS_V4F32( a ) _mm_andnot_ps(_mm_set1_ps(-0.f), a)

/// Changes the sign
#define AKSIMD_NEG_V4F32( __a__ ) _mm_xor_ps(_mm_set1_ps(-0.f), __a__)

/// Vector square root approximation (see _mm_sqrt_ps)
#define AKSIMD_SQRT_V4F32( __a__ ) _mm_sqrt_ps( (__a__) )

/// Vector reciprocal square root approximation 1/sqrt(a), or equivalently, sqrt(1/a)
#define AKSIMD_RSQRT_V4F32( __a__ ) _mm_rsqrt_ps( (__a__) )

/// Reciprocal of x (1/x)
#define AKSIMD_RECIP_V4F32(__a__) _mm_rcp_ps(__a__)

/// Binary xor for single-precision floating-point
#define AKSIMD_XOR_V4F32( a, b ) _mm_xor_ps(a,b)

/// Rounds to upper value
static AkForceInline AKSIMD_V4F32 AKSIMD_CEIL_V4F32(const AKSIMD_V4F32 & x)
{
    static const AKSIMD_V4F32 vEpsilon = { 0.49999f, 0.49999f, 0.49999f, 0.49999f };
    return _mm_cvtepi32_ps(_mm_cvtps_epi32(_mm_add_ps(x, vEpsilon)));
}
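// Illustrative note (not part of the original header): AKSIMD_CEIL_V4F32 approximates the
// ceiling by adding just under 0.5 and rounding to nearest, e.g. 1.25f + 0.49999f = 1.74999f
// rounds to 2, while an exact integer such as 3.0f becomes 3.49999f and rounds back to 3,
// so whole numbers pass through unchanged.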
/// Faked in-place vector horizontal add - each element will represent sum of all elements
/// \akwarning
/// Don't expect this to be very efficient.
/// \endakwarning
static AkForceInline AKSIMD_V4F32 AKSIMD_HORIZONTALADD_V4F32(AKSIMD_V4F32 vVec)
{
    __m128 vAb = _mm_shuffle_ps(vVec, vVec, 0xB1);
    __m128 vHaddAb = _mm_add_ps(vVec, vAb);
    __m128 vHaddCd = _mm_shuffle_ps(vHaddAb, vHaddAb, 0x4E);
    __m128 vHaddAbcd = _mm_add_ps(vHaddAb, vHaddCd);
    return vHaddAbcd;
}
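// Illustrative example (not part of the original header): with lanes { 1.f, 2.f, 3.f, 4.f },
// the first shuffle/add produces the pairwise sums { 3, 3, 7, 7 } and the second produces
// { 10, 10, 10, 10 }, i.e. every lane ends up holding the total.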
static AkForceInline AKSIMD_V4F32 AKSIMD_DOTPRODUCT( AKSIMD_V4F32 & vVec, const AKSIMD_V4F32 & vfSigns )
{
    AKSIMD_V4F32 vfDotProduct = AKSIMD_MUL_V4F32( vVec, vfSigns );
    return AKSIMD_HORIZONTALADD_V4F32( vfDotProduct );
}

/// Cross-platform SIMD multiplication of 2 complex data elements with interleaved real and imaginary parts
static AkForceInline AKSIMD_V4F32 AKSIMD_COMPLEXMUL_V4F32( const AKSIMD_V4F32 vCIn1, const AKSIMD_V4F32 vCIn2 )
{
    static const AKSIMD_V4F32 vSign = { -0.f, 0.f, -0.f, 0.f };

    AKSIMD_V4F32 vTmp1 = AKSIMD_SHUFFLE_V4F32( vCIn1, vCIn1, AKSIMD_SHUFFLE(2,2,0,0));
    vTmp1 = AKSIMD_MUL_V4F32( vTmp1, vCIn2 );
    AKSIMD_V4F32 vTmp2 = AKSIMD_SHUFFLE_V4F32( vCIn1, vCIn1, AKSIMD_SHUFFLE(3,3,1,1));
    vTmp2 = AKSIMD_XOR_V4F32( vTmp2, vSign );
    vTmp2 = AKSIMD_MADD_V4F32( vTmp2, AKSIMD_SHUFFLE_BADC( vCIn2 ), vTmp1 );
    return vTmp2;
}
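// Illustrative example (not part of the original header): each input holds two complex
// numbers stored lane-wise as { re0, im0, re1, im1 }. With vCIn1 lanes { 1, 2, 1, 2 }
// (1+2i twice) and vCIn2 lanes { 3, 4, 3, 4 } (3+4i twice), the result lanes are
// { -5, 10, -5, 10 }, i.e. (1+2i)*(3+4i) = -5+10i in both complex slots.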
#ifdef AK_SSE3
#include <pmmintrin.h>

/// Cross-platform SIMD multiplication of 2 complex data elements with interleaved real and imaginary parts
static AKSIMD_V4F32 AKSIMD_COMPLEXMUL_SSE3( const AKSIMD_V4F32 vCIn1, const AKSIMD_V4F32 vCIn2 )
{
    AKSIMD_V4F32 vXMM0 = _mm_moveldup_ps(vCIn1); // multiplier real (a1, a1, a0, a0)
    vXMM0 = AKSIMD_MUL_V4F32(vXMM0, vCIn2); // temp1 (a1d1, a1c1, a0d0, a0c0)
    AKSIMD_V4F32 xMM1 = _mm_shuffle_ps(vCIn2, vCIn2, 0xB1); // shuf multiplicand (c1, d1, c0, d0)
    AKSIMD_V4F32 xMM2 = _mm_movehdup_ps(vCIn1); // multiplier imag (b1, b1, b0, b0)
    xMM2 = AKSIMD_MUL_V4F32( xMM2, xMM1); // temp2 (b1c1, b1d1, b0c0, b0d0)
    AKSIMD_V4F32 vCOut = _mm_addsub_ps(vXMM0, xMM2); // (a1d1+b1c1, a1c1-b1d1, a0d0+b0c0, a0c0-b0d0)
    return vCOut;
}
#endif
#if __SSE3__
// Alternatively add and subtract packed single-precision (32-bit) floating-point elements in a
// to/from packed elements in b, and store the results in dst.
#define AKSIMD_ADDSUB_V4F32( a, b ) _mm_addsub_ps( a, b)
#else
// Alternatively add and subtract packed single-precision (32-bit) floating-point elements in a
// to/from packed elements in b, and store the results in dst.
#define AKSIMD_ADDSUB_V4F32( a, b ) _mm_add_ps( a, _mm_xor_ps(b, AKSIMD_SETV_V4F32(0.f, -0.f, 0.f, -0.f)))
#endif

#if defined _MSC_VER && ( _MSC_VER <= 1600 )
#define AKSIMD_ASSERTFLUSHZEROMODE AKASSERT( _MM_GET_FLUSH_ZERO_MODE(dummy) == _MM_FLUSH_ZERO_ON )
#elif defined(AK_CPU_X86) || defined(AK_CPU_X86_64)
#define AKSIMD_ASSERTFLUSHZEROMODE AKASSERT( _MM_GET_FLUSH_ZERO_MODE() == _MM_FLUSH_ZERO_ON )
#else
#define AKSIMD_ASSERTFLUSHZEROMODE
#endif

//@}
////////////////////////////////////////////////////////////////////////

////////////////////////////////////////////////////////////////////////
/// @name AKSIMD integer arithmetic
//@{

/// Adds the four integer values of a and b
#define AKSIMD_ADD_V4I32( a, b ) _mm_add_epi32( a, b )

#define AKSIMD_CMPLT_V4I32( a, b ) _mm_cmplt_epi32(a,b)
#define AKSIMD_CMPGT_V4I32( a, b ) _mm_cmpgt_epi32(a,b)
#define AKSIMD_OR_V4I32( a, b ) _mm_or_si128(a,b)
#define AKSIMD_XOR_V4I32( a, b ) _mm_xor_si128(a,b)
#define AKSIMD_SUB_V4I32( a, b ) _mm_sub_epi32(a,b)
#define AKSIMD_NOT_V4I32( a ) _mm_xor_si128(a,_mm_set1_epi32(~0))

#define AKSIMD_OR_V4F32( a, b ) _mm_or_ps(a,b)
#define AKSIMD_AND_V4F32( a, b ) _mm_and_ps(a,b)
#define AKSIMD_ANDNOT_V4F32( a, b ) _mm_andnot_ps(a,b)
#define AKSIMD_NOT_V4F32( a ) _mm_xor_ps(a,_mm_castsi128_ps(_mm_set1_epi32(~0)))

#define AKSIMD_OR_V4COND( a, b ) _mm_or_ps(a,b)
#define AKSIMD_AND_V4COND( a, b ) _mm_and_ps(a,b)

/// Multiplies the low 16 bits of each element of a by b and stores the result in a V4I32 (no overflow)
#define AKSIMD_MULLO16_V4I32( a , b) _mm_mullo_epi16(a, b)

/// Multiplies the low 32 bits of each element of a by b and stores the result in a V4I32 (no overflow)
static AkForceInline AKSIMD_V4I32 AKSIMD_MULLO_V4I32(const AKSIMD_V4I32 vIn1, const AKSIMD_V4I32 vIn2)
{
#ifdef __SSE4_1__ // use SSE 4.1 version directly where possible
    return _mm_mullo_epi32(vIn1, vIn2);
#else // use SSE 2 otherwise
    __m128i tmp1 = _mm_mul_epu32(vIn1, vIn2); // mul 2,0
    __m128i tmp2 = _mm_mul_epu32(_mm_srli_si128(vIn1, 4), _mm_srli_si128(vIn2, 4)); // mul 3,1
    return _mm_unpacklo_epi32(_mm_shuffle_epi32(tmp1, _MM_SHUFFLE(0, 0, 2, 0)), _mm_shuffle_epi32(tmp2, _MM_SHUFFLE(0, 0, 2, 0))); // shuffle results to [63..0] and pack
#endif
}
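// Illustrative example (not part of the original header): both paths of AKSIMD_MULLO_V4I32
// keep only the low 32 bits of each 32x32 product, e.g. lanes { 2, 3, 4, 5 } multiplied by
// { 10, 10, 10, 10 } yield { 20, 30, 40, 50 }; the SSE2 fallback builds this from two
// widening multiplies of the even and odd lanes before packing the low halves back together.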
//@}
////////////////////////////////////////////////////////////////////////

////////////////////////////////////////////////////////////////////////
/// @name AKSIMD packing / unpacking
//@{

/// Selects and interleaves the lower two single-precision, floating-point
/// values from a and b (see _mm_unpacklo_ps)
#define AKSIMD_UNPACKLO_V4F32( a, b ) _mm_unpacklo_ps( a, b )

/// Selects and interleaves the upper two single-precision, floating-point
/// values from a and b (see _mm_unpackhi_ps)
#define AKSIMD_UNPACKHI_V4F32( a, b ) _mm_unpackhi_ps( a, b )

// Given four pointers, gathers 32-bits of data from each location,
// deinterleaves them as 16-bits of each, and sign-extends to 32-bits
// e.g. (*addr[0]) := (b a)
// e.g. (*addr[1]) := (d c)
// e.g. (*addr[2]) := (f e)
// e.g. (*addr[3]) := (h g)
// return struct has
// val[0] := (g e c a)
// val[1] := (h f d b)
static AkForceInline AKSIMD_V4I32X2 AKSIMD_GATHER_V4I32_AND_DEINTERLEAVE_V4I32X2(AkInt16* addr3, AkInt16* addr2, AkInt16* addr1, AkInt16* addr0)
{
    __m128i data[4] = {
        _mm_set1_epi32(*(AkInt32*)addr0),
        _mm_set1_epi32(*(AkInt32*)addr1),
        _mm_set1_epi32(*(AkInt32*)addr2),
        _mm_set1_epi32(*(AkInt32*)addr3),
    };

    __m128i group[2] = {
        _mm_unpacklo_epi32(data[0], data[1]),
        _mm_unpacklo_epi32(data[2], data[3]),
    };

    __m128i shuffle = _mm_unpacklo_epi64(group[0], group[1]);

    AKSIMD_V4I32X2 ret{
        _mm_srai_epi32(_mm_slli_epi32(shuffle, 16), 16),
        _mm_srai_epi32(shuffle, 16)
    };
    return ret;
}
// Given four pointers, gathers 64-bits of data from each location,
// deinterleaves them as 16-bits of each, and sign-extends to 32-bits
// e.g. (*addr[0]) := (d c b a)
// e.g. (*addr[1]) := (h g f e)
// e.g. (*addr[2]) := (l k j i)
// e.g. (*addr[3]) := (p o n m)
// return struct has
// val[0] := (m i e a)
// val[1] := (n j f b)
// val[2] := (o k g c)
// val[3] := (p l h d)
static AkForceInline AKSIMD_V4I32X4 AKSIMD_GATHER_V4I64_AND_DEINTERLEAVE_V4I32X4(AkInt16* addr3, AkInt16* addr2, AkInt16* addr1, AkInt16* addr0)
{
    __m128i data[4] = {
        _mm_set1_epi64x(*(AkInt64*)addr0),
        _mm_set1_epi64x(*(AkInt64*)addr1),
        _mm_set1_epi64x(*(AkInt64*)addr2),
        _mm_set1_epi64x(*(AkInt64*)addr3),
    };

    __m128i group[2] = {
        _mm_unpacklo_epi64(data[0], data[1]),
        _mm_unpacklo_epi64(data[2], data[3]),
    };

    __m128i shuffle[2] = {
        _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(group[0]), _mm_castsi128_ps(group[1]), 0x88)),
        _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(group[0]), _mm_castsi128_ps(group[1]), 0xDD)),
    };

    AKSIMD_V4I32X4 ret{
        _mm_srai_epi32(_mm_slli_epi32(shuffle[0],16),16),
        _mm_srai_epi32(shuffle[0],16),
        _mm_srai_epi32(_mm_slli_epi32(shuffle[1],16),16),
        _mm_srai_epi32(shuffle[1],16),
    };
    return ret;
}
//@}
////////////////////////////////////////////////////////////////////////

////////////////////////////////////////////////////////////////////////
/// @name AKSIMD vector comparison
/// Apart from AKSIMD_SEL_GTEQ_V4F32, these implementations are limited to a few platforms.
//@{

#define AKSIMD_CMP_CTRLMASK __m128

/// Vector "<=" operation (see _mm_cmple_ps)
#define AKSIMD_LTEQ_V4F32( __a__, __b__ ) _mm_cmple_ps( (__a__), (__b__) )

#define AKSIMD_LT_V4F32( __a__, __b__ ) _mm_cmplt_ps( (__a__), (__b__) )

/// Vector ">=" operation (see _mm_cmpge_ps)
#define AKSIMD_GTEQ_V4F32( __a__, __b__ ) _mm_cmpge_ps( (__a__), (__b__) )

#define AKSIMD_GT_V4F32( __a__, __b__ ) _mm_cmpgt_ps( (__a__), (__b__) )

/// Vector "==" operation (see _mm_cmpeq_ps)
#define AKSIMD_EQ_V4F32( __a__, __b__ ) _mm_cmpeq_ps( (__a__), (__b__) )

/// Returns a where the control mask is 0 and b where the control mask is non-zero.
/// The control mask is usually produced by the comparison operations above.
static AkForceInline AKSIMD_V4F32 AKSIMD_VSEL_V4F32( AKSIMD_V4F32 vA, AKSIMD_V4F32 vB, AKSIMD_V4F32 vMask )
{
    vB = _mm_and_ps( vB, vMask );
    vA = _mm_andnot_ps( vMask, vA );
    return _mm_or_ps( vA, vB );
}
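// Illustrative example (not part of the original header): a minimal sketch of per-lane
// selection, clamping the negative lanes of a hypothetical vSamples vector to zero.
//   AKSIMD_V4COND vIsNeg = AKSIMD_LT_V4F32( vSamples, AKSIMD_SETZERO_V4F32() );
//   AKSIMD_V4F32 vClamped = AKSIMD_VSEL_V4F32( vSamples, AKSIMD_SETZERO_V4F32(), vIsNeg );
//   // lanes where vSamples < 0 now hold 0.f; all other lanes pass through unchanged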
// (cond1 >= cond2) ? b : a.
#define AKSIMD_SEL_GTEQ_V4F32( __a__, __b__, __cond1__, __cond2__ ) AKSIMD_VSEL_V4F32( __a__, __b__, AKSIMD_GTEQ_V4F32( __cond1__, __cond2__ ) )

// (a >= 0) ? b : c, with the arguments in the same order as the C++ ternary operator.
#define AKSIMD_SEL_GTEZ_V4F32( __a__, __b__, __c__ ) AKSIMD_VSEL_V4F32( (__c__), (__b__), AKSIMD_GTEQ_V4F32( __a__, _mm_set1_ps(0) ) )

#define AKSIMD_SPLAT_V4F32(var, idx) AKSIMD_SHUFFLE_V4F32(var,var, AKSIMD_SHUFFLE(idx,idx,idx,idx))

#define AKSIMD_MASK_V4F32( __a__ ) _mm_movemask_ps( __a__ )

// Returns true if every element of the provided vector is zero
static AkForceInline bool AKSIMD_TESTZERO_V4I32(AKSIMD_V4I32 a)
{
    return _mm_movemask_epi8(_mm_cmpeq_epi32(a, _mm_setzero_si128())) == 0xFFFF;
}
#define AKSIMD_TESTZERO_V4F32( __a__ ) AKSIMD_TESTZERO_V4I32(_mm_castps_si128(__a__))
#define AKSIMD_TESTZERO_V4COND( __a__ ) AKSIMD_TESTZERO_V4F32(__a__)

// Returns true if every element of the provided vector is all ones
static AkForceInline bool AKSIMD_TESTONES_V4I32(AKSIMD_V4I32 a)
{
    return _mm_movemask_epi8(_mm_cmpeq_epi32(a, _mm_set1_epi32(~0))) == 0xFFFF;
}
#define AKSIMD_TESTONES_V4F32( __a__ ) AKSIMD_TESTONES_V4I32(_mm_castps_si128(__a__))
#define AKSIMD_TESTONES_V4COND( __a__ ) AKSIMD_TESTONES_V4F32(__a__)

//@}
////////////////////////////////////////////////////////////////////////
/// Loads unaligned 128-bit value (see _mm_loadu_si128)
#define AKSIMD_LOADU_V4I32( __addr__ ) _mm_loadu_si128( (__addr__) )

/// Loads a 128-bit value; the address does not need to be 16-byte aligned (see _mm_loadu_si128)
#define AKSIMD_LOAD_V4I32( __addr__ ) _mm_loadu_si128( (__addr__) )

/// Sets the four 32-bit integer values to zero (see _mm_setzero_si128)
#define AKSIMD_SETZERO_V4I32() _mm_setzero_si128()

#define AKSIMD_SET_V4I32( __scalar__ ) _mm_set1_epi32( (__scalar__) )
#define AKSIMD_SETV_V4I32( _d, _c, _b, _a ) _mm_set_epi32( (_d), (_c), (_b), (_a) )
#define AKSIMD_SETV_V2I64( _b, _a ) _mm_set_epi64x( (_b), (_a) )

/// Sets the 32b integer i at the location specified by index in a
#define AKSIMD_INSERT_V4I32( a, i, index) _mm_insert_epi32(a, i, index)

/// Sets the 64b integer i at the location specified by index in a
#define AKSIMD_INSERT_V2I64( a, i, index) _mm_insert_epi64(a, i, index)

/// Stores four 32-bit integer values. The address
/// does not need to be 16-byte aligned (see _mm_storeu_si128).
#define AKSIMD_STORE_V4I32( __addr__, __vec__ ) _mm_storeu_si128( (__m128i*)(__addr__), (__vec__) )

/// Stores four 32-bit integer values. The address
/// does not need to be 16-byte aligned (see _mm_storeu_si128).
#define AKSIMD_STOREU_V4I32( __addr__, __vec__ ) _mm_storeu_si128( (__m128i*)(__addr__), (__vec__) )
////////////////////////////////////////////////////////////////////////
/// @name AKSIMD conversion
//@{

/// Converts the four signed 32-bit integer values of a to single-precision,
/// floating-point values (see _mm_cvtepi32_ps)
#define AKSIMD_CONVERT_V4I32_TO_V4F32( __vec__ ) _mm_cvtepi32_ps( (__vec__) )

/// Converts the four single-precision, floating-point values of a to signed
/// 32-bit integer values by rounding (see _mm_cvtps_epi32)
#define AKSIMD_ROUND_V4F32_TO_V4I32( __vec__ ) _mm_cvtps_epi32( (__vec__) )

/// Converts the four single-precision, floating-point values of a to signed
/// 32-bit integer values by truncating (see _mm_cvttps_epi32)
#define AKSIMD_TRUNCATE_V4F32_TO_V4I32( __vec__ ) _mm_cvttps_epi32( (__vec__) )

/// Computes the bitwise AND of the 128-bit value in a and the
/// 128-bit value in b (see _mm_and_si128)
#define AKSIMD_AND_V4I32( __a__, __b__ ) _mm_and_si128( (__a__), (__b__) )

/// Compares the 8 signed 16-bit integers in a and the 8 signed
/// 16-bit integers in b for greater than (see _mm_cmpgt_epi16)
#define AKSIMD_CMPGT_V8I16( __a__, __b__ ) _mm_cmpgt_epi16( (__a__), (__b__) )

/// Converts the 4 half-precision floats in the lower 64-bits of the provided
/// vector to 4 full-precision floats
#define AKSIMD_CONVERT_V4F16_TO_V4F32_LO(__vec__) AKSIMD_CONVERT_V4F16_TO_V4F32_HELPER( _mm_unpacklo_epi16(_mm_setzero_si128(), __vec__))

/// Converts the 4 half-precision floats in the upper 64-bits of the provided
/// vector to 4 full-precision floats
#define AKSIMD_CONVERT_V4F16_TO_V4F32_HI(__vec__) AKSIMD_CONVERT_V4F16_TO_V4F32_HELPER( _mm_unpackhi_epi16(_mm_setzero_si128(), __vec__))

static AkForceInline AKSIMD_V4F32 AKSIMD_CONVERT_V4F16_TO_V4F32_HELPER(AKSIMD_V4I32 vec)
{
    __m128i expMantData = _mm_and_si128(vec, _mm_set1_epi32(0x7fff0000));
    __m128i expMantShifted = _mm_srli_epi32(expMantData, 3); // shift so that the float16 exp/mant is now split along float32's bounds

    // magic number to scale the fp16 exp range into the fp32 exp range (also renormalizes any denorms)
    __m128i expMantFloat = _mm_castps_si128(_mm_mul_ps(_mm_castsi128_ps(expMantShifted), _mm_castsi128_ps(_mm_set1_epi32(0x77800000))));

    // if the fp16 value was inf or nan, preserve the inf/nan exponent field (we can just 'or' the new inf-bits over the scaled result)
    __m128i infnanCheck = _mm_cmpgt_epi32(expMantData, _mm_set1_epi32(0x7bffffff));
    __m128i infnanExp = _mm_and_si128(infnanCheck, _mm_set1_epi32(255 << 23));
    __m128i expMantWithInfNan = _mm_or_si128(expMantFloat, infnanExp);

    // reincorporate the sign
    __m128i signData = _mm_and_si128(vec, _mm_set1_epi32(0x80000000));
    __m128 assembledFloat = _mm_castsi128_ps(_mm_or_si128(signData, expMantWithInfNan));
    return assembledFloat;
}
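// Illustrative note (not part of the original header): the helper expects each half-float in
// the upper 16 bits of its 32-bit lane, which is why the _LO/_HI macros unpack the source
// against a zero vector (the zeros land in the low halves) before calling it.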
/// Converts the 4 full-precision floats vector to 4 half-precision floats
/// occupying the lower bits and leaving the upper bits as zero
static AkForceInline AKSIMD_V4I32 AKSIMD_CONVERT_V4F32_TO_V4F16(AKSIMD_V4F32 vec)
{
    __m128i signData = _mm_and_si128(_mm_castps_si128(vec), _mm_set1_epi32(0x80000000));
    __m128i unsignedVec = _mm_and_si128(_mm_castps_si128(vec), _mm_set1_epi32(0x7fffffff));

    // do the processing for values that will be denormed in float16
    // Add 0.5 to get the value within range and round; then move the mantissa data up
    __m128 denormedVec = _mm_add_ps(_mm_castsi128_ps(unsignedVec), _mm_set1_ps(0.5f));
    __m128i denormResult = _mm_slli_epi32(_mm_castps_si128(denormedVec), 16);

    // processing for values that will be normal in float16
    __m128i subnormMagic = _mm_set1_epi32(0xC8000FFF); // -131072 + rounding bias
    __m128i normRoundPart1 = _mm_add_epi32(unsignedVec, subnormMagic);
    __m128i mantLsb = _mm_slli_epi32(unsignedVec, 31 - 13);
    __m128i mantSignExtendLsb = _mm_srai_epi32(mantLsb, 31); // Extend Lsb so that it's -1 when set
    __m128i normRoundPart2 = _mm_sub_epi32(normRoundPart1, mantSignExtendLsb); // and subtract the sign-extended bit to finish rounding up
    __m128i normResult = _mm_slli_epi32(normRoundPart2, 3);

    // Combine the norm and subnorm paths together
    __m128i normalMinimum = _mm_set1_epi32((127 - 14) << 23); // smallest float32 that yields a normalized float16
    __m128i denormMask = _mm_cmpgt_epi32(normalMinimum, unsignedVec);
    __m128i nonNanFloat = _mm_or_si128(_mm_and_si128(denormMask, denormResult), _mm_andnot_si128(denormMask, normResult));

    // apply inf/nan check
    __m128i isNotInfNanMask = _mm_cmplt_epi32(unsignedVec, _mm_set1_epi32(0x47800000)); // true where the value stays below 65536.0f, i.e. still representable as a finite float16
    __m128i mantissaData = _mm_and_si128(unsignedVec, _mm_set1_epi32(0x007fffff));
    __m128i isNanMask = _mm_cmpgt_epi32(unsignedVec, _mm_set1_epi32(0x7F800000)); // mark the parts of the vector where we have a mantissa (i.e. NaN) as 0xffffffff
    __m128i nanMantissaBit = _mm_and_si128(isNanMask, _mm_set1_epi32(0x02000000)); // set the NaN mantissa bit if the mantissa suggests this is NaN
    __m128i infData = _mm_andnot_si128(mantissaData, _mm_set1_epi32(0x7c000000)); // grab the exponent data from the unsigned vec with no mantissa
    __m128i infNanFloat = _mm_or_si128(infData, nanMantissaBit); // if we have a non-zero mantissa, add the NaN mantissa bit
    __m128i resultWithInfNan = _mm_or_si128(_mm_and_si128(isNotInfNanMask, nonNanFloat), _mm_andnot_si128(isNotInfNanMask, infNanFloat));

    // reincorporate the original sign
    __m128i signedResult = _mm_or_si128(signData, resultWithInfNan);

    // store results packed in lower 64 bits, and set upper 64 to zero
    __m128i resultEpi16Lo = _mm_shufflelo_epi16(signedResult, 0xD); // move 16b ints (x,x,x,x,d,c,b,a) down to (x,x,x,x,x,x,d,b)
    __m128i resultEpi16Hi = _mm_shufflehi_epi16(signedResult, 0xD); // move 16b ints (h,g,f,e,x,x,x,x) down to (x,x,h,f,x,x,x,x)
    __m128 resultEpi16 = _mm_shuffle_ps(_mm_castsi128_ps(resultEpi16Lo), _mm_castsi128_ps(resultEpi16Hi), 0xE4); // combine - (x, x, h, f, x, x, d, b)
    __m128i result = _mm_castps_si128(_mm_shuffle_ps(resultEpi16, _mm_setzero_ps(), 0x8)); // reshuffle with zero - (0,0,0,0,h,f,d,b)
    return result;
}
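// Illustrative example (not part of the original header): a hedged sketch of a round trip
// through the half-precision helpers; the reconstructed values match the inputs up to
// float16 rounding.
//   AKSIMD_V4F32 vIn = AKSIMD_SETV_V4F32( 4.f, 3.f, 2.f, 1.f );
//   AKSIMD_V4I32 vHalf = AKSIMD_CONVERT_V4F32_TO_V4F16( vIn ); // 4 halfs packed in the low 64 bits
//   AKSIMD_V4F32 vBack = AKSIMD_CONVERT_V4F16_TO_V4F32_LO( vHalf );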
//@}
////////////////////////////////////////////////////////////////////////

////////////////////////////////////////////////////////////////////////
/// @name AKSIMD cast
//@{

/// Cast vector of type AKSIMD_V2F64 to type AKSIMD_V4F32. This intrinsic is only
/// used for compilation and does not generate any instructions, thus it has zero latency.
#define AKSIMD_CAST_V2F64_TO_V4F32( __vec__ ) _mm_castpd_ps(__vec__)

/// Cast vector of type AKSIMD_V2F64 to type AKSIMD_V4I32. This intrinsic is only
/// used for compilation and does not generate any instructions, thus it has zero latency.
#define AKSIMD_CAST_V2F64_TO_V4I32( __vec__ ) _mm_castpd_si128(__vec__)

/// Cast vector of type AKSIMD_V4F32 to type AKSIMD_V2F64. This intrinsic is only
/// used for compilation and does not generate any instructions, thus it has zero latency.
#define AKSIMD_CAST_V4F32_TO_V2F64( __vec__ ) _mm_castps_pd(__vec__)

/// Cast vector of type AKSIMD_V4F32 to type AKSIMD_V4I32. This intrinsic is only
/// used for compilation and does not generate any instructions, thus it has zero latency.
#define AKSIMD_CAST_V4F32_TO_V4I32( __vec__ ) _mm_castps_si128(__vec__)

/// Cast vector of type AKSIMD_V4I32 to type AKSIMD_V2F64. This intrinsic is only
/// used for compilation and does not generate any instructions, thus it has zero latency.
#define AKSIMD_CAST_V4I32_TO_V2F64( __vec__ ) _mm_castsi128_pd(__vec__)

/// Cast vector of type AKSIMD_V4I32 to type AKSIMD_V4F32. This intrinsic is only
/// used for compilation and does not generate any instructions, thus it has zero latency.
#define AKSIMD_CAST_V4I32_TO_V4F32( __vec__ ) _mm_castsi128_ps(__vec__)

/// Cast vector of type AKSIMD_V4COND to AKSIMD_V4F32.
#define AKSIMD_CAST_V4COND_TO_V4F32( __vec__ ) (__vec__)

/// Cast vector of type AKSIMD_V4F32 to AKSIMD_V4COND.
#define AKSIMD_CAST_V4F32_TO_V4COND( __vec__ ) (__vec__)

/// Cast vector of type AKSIMD_V4COND to AKSIMD_V4I32.
#define AKSIMD_CAST_V4COND_TO_V4I32( __vec__ ) _mm_castps_si128(__vec__)

/// Cast vector of type AKSIMD_V4I32 to AKSIMD_V4COND.
#define AKSIMD_CAST_V4I32_TO_V4COND( __vec__ ) _mm_castsi128_ps(__vec__)

//@}
////////////////////////////////////////////////////////////////////////
/// Interleaves the lower 4 signed or unsigned 16-bit integers in a with
/// the lower 4 signed or unsigned 16-bit integers in b (see _mm_unpacklo_epi16)
#define AKSIMD_UNPACKLO_VECTOR8I16( a, b ) _mm_unpacklo_epi16( a, b )

/// Interleaves the upper 4 signed or unsigned 16-bit integers in a with
/// the upper 4 signed or unsigned 16-bit integers in b (see _mm_unpackhi_epi16)
#define AKSIMD_UNPACKHI_VECTOR8I16( a, b ) _mm_unpackhi_epi16( a, b )

/// Packs the 8 signed 32-bit integers from a and b into signed 16-bit
/// integers and saturates (see _mm_packs_epi32)
#define AKSIMD_PACKS_V4I32( a, b ) _mm_packs_epi32( a, b )

////////////////////////////////////////////////////////////////////////
/// @name AKSIMD shifting
//@{

/// Shifts the 4 signed or unsigned 32-bit integers in a left by
/// in_shiftBy bits while shifting in zeros (see _mm_slli_epi32)
#define AKSIMD_SHIFTLEFT_V4I32( __vec__, __shiftBy__ ) \
    _mm_slli_epi32( (__vec__), (__shiftBy__) )

/// Shifts the 4 signed or unsigned 32-bit integers in a right by
/// in_shiftBy bits while shifting in zeros (see _mm_srli_epi32)
#define AKSIMD_SHIFTRIGHT_V4I32( __vec__, __shiftBy__ ) \
    _mm_srli_epi32( (__vec__), (__shiftBy__) )

/// Shifts the 4 signed 32-bit integers in a right by in_shiftBy
/// bits while shifting in the sign bit (see _mm_srai_epi32)
#define AKSIMD_SHIFTRIGHTARITH_V4I32( __vec__, __shiftBy__ ) \
    _mm_srai_epi32( (__vec__), (__shiftBy__) )

//@}
////////////////////////////////////////////////////////////////////////
#if defined( AK_CPU_X86 ) /// MMX

typedef __m64 AKSIMD_V2F32; ///< Vector of 2 32-bit floats

#define AKSIMD_SETZERO_V2F32() _mm_setzero_si64();

#define AKSIMD_CMPGT_V2I32( a, b ) _mm_cmpgt_pi16(a,b)

/// Interleaves the lower 2 signed or unsigned 16-bit integers in a with
/// the lower 2 signed or unsigned 16-bit integers in b (see _mm_unpacklo_pi16)
#define AKSIMD_UNPACKLO_VECTOR4I16( a, b ) _mm_unpacklo_pi16( a, b )

/// Interleaves the upper 2 signed or unsigned 16-bit integers in a with
/// the upper 2 signed or unsigned 16-bit integers in b (see _mm_unpackhi_pi16)
#define AKSIMD_UNPACKHI_VECTOR4I16( a, b ) _mm_unpackhi_pi16( a, b )

/// Shifts the 2 signed or unsigned 32-bit integers in a left by
/// in_shiftBy bits while shifting in zeros (see _mm_slli_pi32)
#define AKSIMD_SHIFTLEFT_V2I32( __vec__, __shiftBy__ ) \
    _mm_slli_pi32( (__vec__), (__shiftBy__) )

/// Shifts the 2 signed 32-bit integers in a right by in_shiftBy
/// bits while shifting in the sign bit (see _mm_srai_pi32)
#define AKSIMD_SHIFTRIGHTARITH_V2I32( __vec__, __shiftBy__ ) \
    _mm_srai_pi32( (__vec__), (__shiftBy__) )

/// Used when ending a block of code that utilizes any MMX construct on x86 code
/// so that the x87 FPU can be used again
#define AKSIMD_MMX_EMPTY _mm_empty();

#endif

#endif //_AK_SIMD_SSE_H_