AkSimdAvx2.h 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275
  1. /*******************************************************************************
  2. The content of this file includes portions of the AUDIOKINETIC Wwise Technology
  3. released in source code form as part of the SDK installer package.
  4. Commercial License Usage
  5. Licensees holding valid commercial licenses to the AUDIOKINETIC Wwise Technology
  6. may use this file in accordance with the end user license agreement provided
  7. with the software or, alternatively, in accordance with the terms contained in a
  8. written agreement between you and Audiokinetic Inc.
  9. Apache License Usage
  10. Alternatively, this file may be used under the Apache License, Version 2.0 (the
  11. "Apache License"); you may not use this file except in compliance with the
  12. Apache License. You may obtain a copy of the Apache License at
  13. http://www.apache.org/licenses/LICENSE-2.0.
  14. Unless required by applicable law or agreed to in writing, software distributed
  15. under the Apache License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
  16. OR CONDITIONS OF ANY KIND, either express or implied. See the Apache License for
  17. the specific language governing permissions and limitations under the License.
  18. Copyright (c) 2023 Audiokinetic Inc.
  19. *******************************************************************************/
  20. // AkSimdAvx2.h
  21. /// \file
  22. /// AKSIMD - AVX2 implementation
  23. #ifndef _AK_SIMD_AVX2_H_
  24. #define _AK_SIMD_AVX2_H_
  25. #include <AK/SoundEngine/Common/AkTypes.h>
  26. #include <AK/SoundEngine/Platforms/SSE/AkSimd.h>
  27. #if !defined(__AVX2__)
  28. #error "Inclusion of AkSimdAvx2.h requires AVX2 instruction sets to be defined on platform"
  29. #endif
  30. #include <AK/SoundEngine/Platforms/SSE/AkSimdAvx.h>
  31. #include <string.h>
  32. ////////////////////////////////////////////////////////////////////////
  33. /// @name AKSIMD arithmetic
  34. //@{
  35. /// Cross-platform SIMD multiplication of 8 complex data elements with interleaved real and imaginary parts,
  36. /// and taking advantage of fused-multiply-add instructions
  37. static AkForceInline AKSIMD_V8F32 AKSIMD_COMPLEXMUL_AVX2(const AKSIMD_V8F32 cIn1, const AKSIMD_V8F32 cIn2)
  38. {
  39. __m256 real1Ext = _mm256_moveldup_ps(cIn1); // reals extended (a3, a3, a2, a2, a1, a1, a0, a0)
  40. __m256 in2Shuf = _mm256_shuffle_ps(cIn2, cIn2, 0xB1); // shuf multiplicand (c3, d3, c2, d2, c1, d1, c0, d0)
  41. __m256 imag1Ext = _mm256_movehdup_ps(cIn1); // multiplier imag (b3, b3, b2, b2, b1, b1, b0, b0)
  42. __m256 temp = _mm256_mul_ps(imag1Ext, in2Shuf); // temp (b3c3, b3d3, b2c2, b2d2, b1c1, b1d1, b0c0, b0d0)
  43. __m256 out = _mm256_fmaddsub_ps(real1Ext, cIn2, temp); // final (a3d3+b3c3, a3c3-b3d3, a2d2+b2c2, a2c2-b2d2, a1d1+b1c1, a1c1-b1d1, a0d0+b0c0, a0c0-b0d0)
  44. return out;
  45. }
  46. /// Vector multiply-add-sub operations (wrap _mm256_fmaddsub_ps and _mm256_fmsubadd_ps).
  /// MADDSUB computes (__a__ * __b__) - __c__ in even-indexed elements and (__a__ * __b__) + __c__
  /// in odd-indexed elements; MSUBADD applies the opposite add/subtract pattern.
  47. #define AKSIMD_MADDSUB_V8F32( __a__, __b__, __c__ ) _mm256_fmaddsub_ps( (__a__), (__b__), (__c__) )
  48. #define AKSIMD_MSUBADD_V8F32( __a__, __b__, __c__ ) _mm256_fmsubadd_ps( (__a__), (__b__), (__c__) )
  49. /// Vector multiply-add operations (wrap _mm256_fmadd_ps and _mm256_fmsub_ps).
  /// MADD computes (__a__ * __b__) + __c__ and MSUB computes (__a__ * __b__) - __c__ for every element.
  50. #define AKSIMD_MADD_V8F32( __a__, __b__, __c__ ) _mm256_fmadd_ps( (__a__), (__b__) , (__c__) )
  51. #define AKSIMD_MSUB_V8F32( __a__, __b__, __c__ ) _mm256_fmsub_ps( (__a__), (__b__) , (__c__) )
  52. //@}
  53. ////////////////////////////////////////////////////////////////////////
  54. ////////////////////////////////////////////////////////////////////////
  55. /// @name AKSIMD shuffling
  56. //@{
  57. /// For each destination byte, select the byte of a from the same 128b lane that is indexed by the low 4 bits
  58. /// of the corresponding control byte in b (or, if the control byte has its top bit set, set the dest to zero) (see _mm_shuffle_epi8)
  59. #define AKSIMD_SHUFFLEB_V8I32(a, b) _mm256_shuffle_epi8(a, b)
  60. /// For each 16b integer, select one of the values from a and b using the provided control mask - if the
  61. /// nth bit is false, the nth value from a will be selected; if true, the value from b will be selected.
  62. /// (the mask applies to each 128b lane identically)
  63. #define AKSIMD_BLEND_V16I16(a, b, i) _mm256_blend_epi16(a, b, i)
  /// Returns a copy of a in which the 128b lane selected by idx is replaced with m128
  /// (see _mm256_inserti128_si256)
  64. #define AKSIMD_INSERT_V2I128( a, m128, idx) _mm256_inserti128_si256(a, m128, idx)
  65. /// For each 128b lane of the result, select one of the four input 128b lanes across a and b,
  66. /// based on the mask i. AKSIMD_SHUFFLE can still be directly used as a control
  67. #define AKSIMD_PERMUTE_2X128_V8I32( a, b, i ) _mm256_permute2x128_si256(a, b, i)
  68. /// Selects the lower of each of the 128b lanes in a and b to be the result ( B A ), ( D C ) -> ( C A )
  69. #define AKSIMD_DEINTERLEAVELANES_LO_V8I32( a, b ) AKSIMD_PERMUTE_2X128_V8I32(a, b, AKSIMD_PERMUTE128(2, 0))
  70. /// Selects the higher of each of the 128b lanes in a and b to be the result ( B A ), ( D C) -> ( D B )
  71. #define AKSIMD_DEINTERLEAVELANES_HI_V8I32( a, b ) AKSIMD_PERMUTE_2X128_V8I32(a, b, AKSIMD_PERMUTE128(3, 1))
  72. /// Shuffle 64b elements across the 128b lanes of a, based on the mask i.
  73. /// AKSIMD_SHUFFLE can still be directly used as a control.
  /// The float vector is reinterpreted as 64b elements for the permute and cast back afterwards.
  74. #define AKSIMD_PERMUTE_4X64_V8F32( a, i ) _mm256_castpd_ps(_mm256_permute4x64_pd(_mm256_castps_pd(a), i))
  75. //@}
  76. ////////////////////////////////////////////////////////////////////////
  77. ////////////////////////////////////////////////////////////////////////
  78. /// @name AKSIMD conversion
  79. //@{
  80. /// Converts the eight signed 16b integer values of __vec__ to signed 32-bit integer values
  /// by sign extension (see _mm256_cvtepi16_epi32)
  81. #define AKSIMD_CONVERT_V8I16_TO_V8I32( __vec__ ) _mm256_cvtepi16_epi32( (__vec__) )
  82. //@}
  83. ////////////////////////////////////////////////////////////////////////
  84. ////////////////////////////////////////////////////////////////////////
  85. /// @name AKSIMD integer arithmetic
  86. //@{
  87. /// Adds the eight 32-bit integer values of a and b
  88. #define AKSIMD_ADD_V8I32( a, b ) _mm256_add_epi32( a, b )
  /// Signed 32-bit "less than" comparison of a and b, implemented as a "greater than"
  /// comparison with the operands swapped (see _mm256_cmpgt_epi32)
  89. #define AKSIMD_CMPLT_V8I32( a, b ) _mm256_cmpgt_epi32( b, a )
  /// Signed 32-bit "greater than" comparison of a and b (see _mm256_cmpgt_epi32)
  90. #define AKSIMD_CMPGT_V8I32( a, b ) _mm256_cmpgt_epi32( a, b )
  /// Computes the bitwise OR of the 256-bit values in a and b
  91. #define AKSIMD_OR_V8I32( a, b ) _mm256_or_si256(a,b)
  /// Computes the bitwise XOR of the 256-bit values in a and b
  92. #define AKSIMD_XOR_V8I32( a, b ) _mm256_xor_si256(a,b)
  /// Subtracts each 32-bit integer of b from the corresponding 32-bit integer of a
  93. #define AKSIMD_SUB_V8I32( a, b ) _mm256_sub_epi32(a,b)
  94. /// Computes the bitwise AND of the 256-bit value in __a__ and the
  95. /// 256-bit value in __b__ (see _mm_and_si128)
  96. #define AKSIMD_AND_V8I32( __a__, __b__ ) _mm256_and_si256( (__a__), (__b__) )
  97. /// Multiplies each 32-bit int value of a by b and returns the lower 32b of the result (no overflow or clamp)
  98. #define AKSIMD_MULLO_V8I32( a , b) _mm256_mullo_epi32(a, b)
  99. /// Multiplies each 16-bit integer of a by the corresponding 16-bit integer of b and keeps the
  /// low 16 bits of each product (see _mm256_mullo_epi16)
  /// NOTE(review): the _V8I32 suffix suggests callers pass 32-bit elements whose values fit in 16 bits - confirm against callers
  100. #define AKSIMD_MULLO16_V8I32( a , b) _mm256_mullo_epi16(a, b)
  101. /// Subtracts each 16b integer of b from the corresponding 16b integer of a
  102. #define AKSIMD_SUB_V16I16( a, b ) _mm256_sub_epi16( a, b )
  103. /// Compares the 16 signed 16-bit integers in __a__ and the 16 signed
  104. /// 16-bit integers in __b__ for greater than (see _mm_cmpgt_epi16)
  105. #define AKSIMD_CMPGT_V16I16( __a__, __b__ ) _mm256_cmpgt_epi16( (__a__), (__b__) )
  106. //@}
  107. ////////////////////////////////////////////////////////////////////////
  108. ////////////////////////////////////////////////////////////////////////
  109. /// @name AKSIMD packing / unpacking
  110. //@{
  111. /// Interleaves the lower 4 signed or unsigned 16-bit integers in each 128b lane of a
  112. /// with the lower 4 signed or unsigned 16-bit integers in each 128b lane of b
  113. /// (see _mm_unpacklo_epi16)
  114. #define AKSIMD_UNPACKLO_VECTOR16I16( a, b ) _mm256_unpacklo_epi16( a, b )
  115. /// Interleaves the upper 4 signed or unsigned 16-bit integers in each 128b lane of a
  116. /// with the upper 4 signed or unsigned 16-bit integers in each 128b lane of b
  117. /// (see _mm_unpackhi_epi16)
  118. #define AKSIMD_UNPACKHI_VECTOR16I16( a, b ) _mm256_unpackhi_epi16( a, b )
  119. /// Packs the 8 signed 32-bit integers from each of a and b into 16 signed 16-bit
  120. /// integers and saturates; packing is done independently within each 128b lane (see _mm_packs_epi32)
  121. #define AKSIMD_PACKS_V8I32( a, b ) _mm256_packs_epi32( a, b )
  122. //@}
  123. ////////////////////////////////////////////////////////////////////////
  124. ////////////////////////////////////////////////////////////////////////
  125. /// @name AKSIMD shifting
  126. //@{
  127. /// Shifts the 8 signed or unsigned 32-bit integers in __vec__ left by
  128. /// __shiftBy__ bits while shifting in zeros (see _mm_slli_epi32)
  129. #define AKSIMD_SHIFTLEFT_V8I32( __vec__, __shiftBy__ ) \
  130. _mm256_slli_epi32( (__vec__), (__shiftBy__) )
  131. /// Shifts the 8 signed or unsigned 32-bit integers in __vec__ left-wards by
  132. /// SIXTEEN bits while shifting in zeros (see _mm_shuffle_epi8).
  /// Implemented as a byte shuffle: in every 32-bit element the two low source bytes are moved to the
  /// two high byte positions, and the -1 control bytes zero the two low byte positions.
  133. #define AKSIMD_SHIFTLEFT16_V8I32( __vec__ ) \
  134. _mm256_shuffle_epi8( (__vec__), _mm256_set_epi8( \
  135. 0xd, 0xc, -1, -1, \
  136. 0x9, 0x8, -1, -1, \
  137. 0x5, 0x4, -1, -1, \
  138. 0x1, 0x0, -1, -1, \
  139. 0xd, 0xc, -1, -1, \
  140. 0x9, 0x8, -1, -1, \
  141. 0x5, 0x4, -1, -1, \
  142. 0x1, 0x0, -1, -1) )
  143. /// Shifts the 8 32-bit integers in __vec__ right by __shiftBy__
  144. /// bits while shifting in zeroes, i.e. a logical shift (see _mm_srli_epi32)
  145. #define AKSIMD_SHIFTRIGHT_V8I32( __vec__, __shiftBy__ ) \
  146. _mm256_srli_epi32( (__vec__), (__shiftBy__) )
  147. /// Shifts the 8 signed 32-bit integers in __vec__ right by __shiftBy__
  148. /// bits while shifting in the sign bit, i.e. an arithmetic shift (see _mm_srai_epi32)
  149. #define AKSIMD_SHIFTRIGHTARITH_V8I32( __vec__, __shiftBy__ ) \
  150. _mm256_srai_epi32( (__vec__), (__shiftBy__) )
  151. //@}
  152. ////////////////////////////////////////////////////////////////////////
  153. ////////////////////////////////////////////////////////////////////////
  154. /// @name AKSIMD gather
  155. //@{
  156. /// To use these, provide a base_ptr, and an expression that calculates an
  157. /// array index for the provided base_ptr. The expression can be a lambda,
  158. /// such as follows:
  159. /// AKSIMD_V8I32 viData = AKSIMD_GATHER_EPI32(src, [uIndex, uStep](int i)
  160. /// { return (uIndex + uStep * i); });
  161. /// This tends to perform better than a native VGATHER on most CPUs
  162. template <typename T, typename Function>
  163. inline AKSIMD_V8I32 AKSIMD_GATHER_EPI32(const T* __restrict base_ptr, Function expr)
  164. {
  165. __m256i vals = _mm256_setzero_si256();
  166. __m128i valsTemp[2] = { _mm_setzero_si128(),_mm_setzero_si128() };
  167. #define _GATHER_SIM_FETCH(_x) \
  168. {\
  169. AkInt32 val;\
  170. memcpy(&val, (base_ptr + expr(_x)), sizeof(val)); \
  171. valsTemp[_x/4] = _mm_insert_epi32(valsTemp[_x/4], val, _x%4);\
  172. }
  173. _GATHER_SIM_FETCH(0);
  174. _GATHER_SIM_FETCH(1);
  175. _GATHER_SIM_FETCH(2);
  176. _GATHER_SIM_FETCH(3);
  177. _GATHER_SIM_FETCH(4);
  178. _GATHER_SIM_FETCH(5);
  179. _GATHER_SIM_FETCH(6);
  180. _GATHER_SIM_FETCH(7);
  181. #undef _GATHER_SIM_FETCH
  182. vals = _mm256_setr_m128i(valsTemp[0], valsTemp[1]);
  183. return vals;
  184. }
  185. template <typename T, typename Function>
  186. inline AKSIMD_V8I32 AKSIMD_GATHER_EPI64(const T* base_ptr, Function expr)
  187. {
  188. __m256i vals = _mm256_setzero_si256();
  189. __m128i valsTemp[2] = { _mm_setzero_si128(),_mm_setzero_si128() };
  190. #define _GATHER_SIM_FETCH(_x) \
  191. {\
  192. AkInt64 val; \
  193. memcpy(&val, (base_ptr + expr(_x)), sizeof(val)); \
  194. valsTemp[_x/2] = _mm_insert_epi64(valsTemp[_x/2], val, _x%2);\
  195. }
  196. _GATHER_SIM_FETCH(0);
  197. _GATHER_SIM_FETCH(1);
  198. _GATHER_SIM_FETCH(2);
  199. _GATHER_SIM_FETCH(3);
  200. #undef _GATHER_SIM_FETCH
  201. vals = _mm256_setr_m128i(valsTemp[0], valsTemp[1]);
  202. return vals;
  203. }
  204. template <typename T, typename Function>
  205. inline AKSIMD_V8F32 AKSIMD_GATHER_PS(const T* base_ptr, Function expr)
  206. {
  207. return _mm256_castsi256_ps(AKSIMD_GATHER_EPI32(base_ptr, expr));
  208. }
  209. template <typename T, typename Function>
  210. inline AKSIMD_V4F64 AKSIMD_GATHER_PD(const T* base_ptr, Function expr)
  211. {
  212. return _mm256_castsi256_pd(AKSIMD_GATHER_EPI64(base_ptr, expr));
  213. }
  214. //@}
  215. ////////////////////////////////////////////////////////////////////////
  216. #endif //_AK_SIMD_AVX2_H_