/*===----------------- avxifmaintrin.h - IFMA intrinsics -------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

/* This header refuses to compile unless __IMMINTRIN_H is already defined;
 * as the diagnostic says, include <immintrin.h> rather than this file. */
#ifndef __IMMINTRIN_H
#error "Never use <avxifmaintrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef __AVXIFMAINTRIN_H
#define __AVXIFMAINTRIN_H

/* Define the default attributes for the functions in this file.
 *
 * Both macros mark a function as always-inline, no-debug and targeting the
 * "avxifma" feature; they differ only in the minimum vector width they
 * request (128 or 256 bits). They are helper macros for the definitions that
 * follow in this header. */
#define __DEFAULT_FN_ATTRS128                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("avxifma"),        \
                 __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("avxifma"),        \
                 __min_vector_width__(256)))

// NOTE(review): the original note here read "must vex-encoding". Presumably
// the intrinsics below are required to use the VEX-encoded forms of the
// instructions -- confirm against the instruction-set documentation.

/// Multiplies the packed unsigned 52-bit integers found in each 64-bit
/// element of \a __Y and \a __Z, forming a 104-bit intermediate result per
/// element. The high 52 bits of each intermediate result are added to the
/// corresponding unsigned 64-bit integer in \a __X, and the sums form the
/// returned vector (called \a dst in the pseudo-code below).
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i
/// _mm_madd52hi_avx_epu64 (__m128i __X, __m128i __Y, __m128i __Z)
/// \endcode
///
/// This intrinsic corresponds to the \c VPMADD52HUQ instruction.
///
/// \return
///    A 128-bit vector holding the result \a dst.
43 /// \param __X 44 /// A 128-bit vector of [2 x i64] 45 /// \param __Y 46 /// A 128-bit vector of [2 x i64] 47 /// \param __Z 48 /// A 128-bit vector of [2 x i64] 49 /// 50 /// \code{.operation} 51 /// FOR j := 0 to 1 52 /// i := j*64 53 /// tmp[127:0] := ZeroExtend64(__Y[i+51:i]) * ZeroExtend64(__Z[i+51:i]) 54 /// dst[i+63:i] := __X[i+63:i] + ZeroExtend64(tmp[103:52]) 55 /// ENDFOR 56 /// dst[MAX:128] := 0 57 /// \endcode 58 static __inline__ __m128i __DEFAULT_FN_ATTRS128 59 _mm_madd52hi_avx_epu64(__m128i __X, __m128i __Y, __m128i __Z) { 60 return (__m128i)__builtin_ia32_vpmadd52huq128((__v2di)__X, (__v2di)__Y, 61 (__v2di)__Z); 62 } 63 64 /// Multiply packed unsigned 52-bit integers in each 64-bit element of \a __Y 65 /// and \a __Z to form a 104-bit intermediate result. Add the high 52-bit 66 /// unsigned integer from the intermediate result with the corresponding 67 /// unsigned 64-bit integer in \a __X, and store the results in \a dst. 68 /// 69 /// \headerfile <immintrin.h> 70 /// 71 /// \code 72 /// __m256i 73 /// _mm256_madd52hi_avx_epu64 (__m256i __X, __m256i __Y, __m256i __Z) 74 /// \endcode 75 /// 76 /// This intrinsic corresponds to the \c VPMADD52HUQ instruction. 77 /// 78 /// \return 79 /// return __m256i dst. 
80 /// \param __X 81 /// A 256-bit vector of [4 x i64] 82 /// \param __Y 83 /// A 256-bit vector of [4 x i64] 84 /// \param __Z 85 /// A 256-bit vector of [4 x i64] 86 /// 87 /// \code{.operation} 88 /// FOR j := 0 to 3 89 /// i := j*64 90 /// tmp[127:0] := ZeroExtend64(__Y[i+51:i]) * ZeroExtend64(__Z[i+51:i]) 91 /// dst[i+63:i] := __X[i+63:i] + ZeroExtend64(tmp[103:52]) 92 /// ENDFOR 93 /// dst[MAX:256] := 0 94 /// \endcode 95 static __inline__ __m256i __DEFAULT_FN_ATTRS256 96 _mm256_madd52hi_avx_epu64(__m256i __X, __m256i __Y, __m256i __Z) { 97 return (__m256i)__builtin_ia32_vpmadd52huq256((__v4di)__X, (__v4di)__Y, 98 (__v4di)__Z); 99 } 100 101 /// Multiply packed unsigned 52-bit integers in each 64-bit element of \a __Y 102 /// and \a __Z to form a 104-bit intermediate result. Add the low 52-bit 103 /// unsigned integer from the intermediate result with the corresponding 104 /// unsigned 64-bit integer in \a __X, and store the results in \a dst. 105 /// 106 /// \headerfile <immintrin.h> 107 /// 108 /// \code 109 /// __m128i 110 /// _mm_madd52lo_avx_epu64 (__m128i __X, __m128i __Y, __m128i __Z) 111 /// \endcode 112 /// 113 /// This intrinsic corresponds to the \c VPMADD52LUQ instruction. 114 /// 115 /// \return 116 /// return __m128i dst. 
117 /// \param __X 118 /// A 128-bit vector of [2 x i64] 119 /// \param __Y 120 /// A 128-bit vector of [2 x i64] 121 /// \param __Z 122 /// A 128-bit vector of [2 x i64] 123 /// 124 /// \code{.operation} 125 /// FOR j := 0 to 1 126 /// i := j*64 127 /// tmp[127:0] := ZeroExtend64(__Y[i+51:i]) * ZeroExtend64(__Z[i+51:i]) 128 /// dst[i+63:i] := __X[i+63:i] + ZeroExtend64(tmp[51:0]) 129 /// ENDFOR 130 /// dst[MAX:128] := 0 131 /// \endcode 132 static __inline__ __m128i __DEFAULT_FN_ATTRS128 133 _mm_madd52lo_avx_epu64(__m128i __X, __m128i __Y, __m128i __Z) { 134 return (__m128i)__builtin_ia32_vpmadd52luq128((__v2di)__X, (__v2di)__Y, 135 (__v2di)__Z); 136 } 137 138 /// Multiply packed unsigned 52-bit integers in each 64-bit element of \a __Y 139 /// and \a __Z to form a 104-bit intermediate result. Add the low 52-bit 140 /// unsigned integer from the intermediate result with the corresponding 141 /// unsigned 64-bit integer in \a __X, and store the results in \a dst. 142 /// 143 /// \headerfile <immintrin.h> 144 /// 145 /// \code 146 /// __m256i 147 /// _mm256_madd52lo_avx_epu64 (__m256i __X, __m256i __Y, __m256i __Z) 148 /// \endcode 149 /// 150 /// This intrinsic corresponds to the \c VPMADD52LUQ instruction. 151 /// 152 /// \return 153 /// return __m256i dst. 
154 /// \param __X 155 /// A 256-bit vector of [4 x i64] 156 /// \param __Y 157 /// A 256-bit vector of [4 x i64] 158 /// \param __Z 159 /// A 256-bit vector of [4 x i64] 160 /// 161 /// \code{.operation} 162 /// FOR j := 0 to 3 163 /// i := j*64 164 /// tmp[127:0] := ZeroExtend64(__Y[i+51:i]) * ZeroExtend64(__Z[i+51:i]) 165 /// dst[i+63:i] := __X[i+63:i] + ZeroExtend64(tmp[51:0]) 166 /// ENDFOR 167 /// dst[MAX:256] := 0 168 /// \endcode 169 static __inline__ __m256i __DEFAULT_FN_ATTRS256 170 _mm256_madd52lo_avx_epu64(__m256i __X, __m256i __Y, __m256i __Z) { 171 return (__m256i)__builtin_ia32_vpmadd52luq256((__v4di)__X, (__v4di)__Y, 172 (__v4di)__Z); 173 } 174 #undef __DEFAULT_FN_ATTRS128 175 #undef __DEFAULT_FN_ATTRS256 176 177 #endif // __AVXIFMAINTRIN_H 178