/*===---- pmmintrin.h - SSE3 intrinsics ------------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __PMMINTRIN_H
#define __PMMINTRIN_H

#if !defined(__i386__) && !defined(__x86_64__)
#error "This header is only meant to be used on x86 and x64 architecture"
#endif

#include <emmintrin.h>

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__, __nodebug__,                              \
                 __target__("sse3,no-evex512"), __min_vector_width__(128)))

/// Loads data from an unaligned memory location to elements in a 128-bit
///    vector.
///
///    If the address of the data is not 16-byte aligned, the instruction may
///    read two adjacent aligned blocks of memory to retrieve the requested
///    data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VLDDQU </c> instruction.
///
/// \param __p
///    A pointer to a 128-bit integer vector containing integer values.
/// \returns A 128-bit vector containing the moved values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_lddqu_si128(__m128i_u const *__p)
{
  return (__m128i)__builtin_ia32_lddqu((char const *)__p);
}

/// Adds the even-indexed values and subtracts the odd-indexed values of
///    two 128-bit vectors of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VADDSUBPS </c> instruction.
///
/// \param __a
///    A 128-bit vector of [4 x float] containing the left source operand.
/// \param __b
///    A 128-bit vector of [4 x float] containing the right source operand.
/// \returns A 128-bit vector of [4 x float] containing the alternating sums and
///    differences of both operands.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_addsub_ps(__m128 __a, __m128 __b)
{
  return __builtin_ia32_addsubps((__v4sf)__a, (__v4sf)__b);
}

/// Horizontally adds the adjacent pairs of values contained in two
///    128-bit vectors of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VHADDPS </c> instruction.
///
/// \param __a
///    A 128-bit vector of [4 x float] containing one of the source operands.
///    The horizontal sums of the values are stored in the lower bits of the
///    destination.
/// \param __b
///    A 128-bit vector of [4 x float] containing one of the source operands.
///    The horizontal sums of the values are stored in the upper bits of the
///    destination.
/// \returns A 128-bit vector of [4 x float] containing the horizontal sums of
///    both operands.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_hadd_ps(__m128 __a, __m128 __b)
{
  return __builtin_ia32_haddps((__v4sf)__a, (__v4sf)__b);
}

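/* Editorial usage sketch (not part of the upstream header): assuming an
 * SSE3-capable target, _mm_addsub_ps and _mm_hadd_ps combine elements as
 * shown below. The variable names and values are illustrative only, and
 * elements are listed with element 0 first.
 *
 *   __m128 a = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f); // a = {1, 2, 3, 4}
 *   __m128 b = _mm_set_ps(8.0f, 7.0f, 6.0f, 5.0f); // b = {5, 6, 7, 8}
 *   __m128 s = _mm_addsub_ps(a, b); // {1-5, 2+6, 3-7, 4+8} = {-4, 8, -4, 12}
 *   __m128 h = _mm_hadd_ps(a, b);   // {1+2, 3+4, 5+6, 7+8} = {3, 7, 11, 15}
 */
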
/// Horizontally subtracts the adjacent pairs of values contained in two
///    128-bit vectors of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VHSUBPS </c> instruction.
///
/// \param __a
///    A 128-bit vector of [4 x float] containing one of the source operands.
///    The horizontal differences between the values are stored in the lower
///    bits of the destination.
/// \param __b
///    A 128-bit vector of [4 x float] containing one of the source operands.
///    The horizontal differences between the values are stored in the upper
///    bits of the destination.
/// \returns A 128-bit vector of [4 x float] containing the horizontal
///    differences of both operands.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_hsub_ps(__m128 __a, __m128 __b)
{
  return __builtin_ia32_hsubps((__v4sf)__a, (__v4sf)__b);
}

/// Moves and duplicates odd-indexed values from a 128-bit vector
///    of [4 x float] to float values stored in a 128-bit vector of
///    [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVSHDUP </c> instruction.
///
/// \param __a
///    A 128-bit vector of [4 x float]. \n
///    Bits [127:96] of the source are written to bits [127:96] and [95:64] of
///    the destination. \n
///    Bits [63:32] of the source are written to bits [63:32] and [31:0] of the
///    destination.
/// \returns A 128-bit vector of [4 x float] containing the moved and duplicated
///    values.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_movehdup_ps(__m128 __a)
{
  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 1, 1, 3, 3);
}

/// Duplicates even-indexed values from a 128-bit vector of
///    [4 x float] to float values stored in a 128-bit vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVSLDUP </c> instruction.
///
/// \param __a
///    A 128-bit vector of [4 x float]. \n
///    Bits [95:64] of the source are written to bits [127:96] and [95:64] of
///    the destination. \n
///    Bits [31:0] of the source are written to bits [63:32] and [31:0] of the
///    destination.
/// \returns A 128-bit vector of [4 x float] containing the moved and duplicated
///    values.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_moveldup_ps(__m128 __a)
{
  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 2, 2);
}

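/* Editorial usage sketch (not part of the upstream header): the horizontal
 * subtraction and duplication shuffles above rearrange elements as shown
 * below; the values are illustrative only (element 0 is listed first).
 *
 *   __m128 v = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f); // v = {1, 2, 3, 4}
 *   __m128 d = _mm_hsub_ps(v, v);  // {1-2, 3-4, 1-2, 3-4} = {-1, -1, -1, -1}
 *   __m128 o = _mm_movehdup_ps(v); // {2, 2, 4, 4}  (odd elements duplicated)
 *   __m128 e = _mm_moveldup_ps(v); // {1, 1, 3, 3}  (even elements duplicated)
 */
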
/// Adds the even-indexed values and subtracts the odd-indexed values of
///    two 128-bit vectors of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VADDSUBPD </c> instruction.
///
/// \param __a
///    A 128-bit vector of [2 x double] containing the left source operand.
/// \param __b
///    A 128-bit vector of [2 x double] containing the right source operand.
/// \returns A 128-bit vector of [2 x double] containing the alternating sums
///    and differences of both operands.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_addsub_pd(__m128d __a, __m128d __b)
{
  return __builtin_ia32_addsubpd((__v2df)__a, (__v2df)__b);
}

/// Horizontally adds the pairs of values contained in two 128-bit
///    vectors of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VHADDPD </c> instruction.
///
/// \param __a
///    A 128-bit vector of [2 x double] containing one of the source operands.
///    The horizontal sum of the values is stored in the lower bits of the
///    destination.
/// \param __b
///    A 128-bit vector of [2 x double] containing one of the source operands.
///    The horizontal sum of the values is stored in the upper bits of the
///    destination.
/// \returns A 128-bit vector of [2 x double] containing the horizontal sums of
///    both operands.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_hadd_pd(__m128d __a, __m128d __b)
{
  return __builtin_ia32_haddpd((__v2df)__a, (__v2df)__b);
}

/// Horizontally subtracts the pairs of values contained in two 128-bit
///    vectors of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VHSUBPD </c> instruction.
///
/// \param __a
///    A 128-bit vector of [2 x double] containing one of the source operands.
///    The horizontal difference of the values is stored in the lower bits of
///    the destination.
/// \param __b
///    A 128-bit vector of [2 x double] containing one of the source operands.
///    The horizontal difference of the values is stored in the upper bits of
///    the destination.
/// \returns A 128-bit vector of [2 x double] containing the horizontal
///    differences of both operands.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_hsub_pd(__m128d __a, __m128d __b)
{
  return __builtin_ia32_hsubpd((__v2df)__a, (__v2df)__b);
}

/// Moves and duplicates one double-precision value to double-precision
///    values stored in a 128-bit vector of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_loaddup_pd(double const *dp);
/// \endcode
///
/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
///
/// \param dp
///    A pointer to a double-precision value to be moved and duplicated.
/// \returns A 128-bit vector of [2 x double] containing the moved and
///    duplicated values.
#define _mm_loaddup_pd(dp) _mm_load1_pd(dp)

/// Moves and duplicates the double-precision value in the lower bits of
///    a 128-bit vector of [2 x double] to double-precision values stored in a
///    128-bit vector of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
///
/// \param __a
///    A 128-bit vector of [2 x double]. Bits [63:0] are written to bits
///    [127:64] and [63:0] of the destination.
/// \returns A 128-bit vector of [2 x double] containing the moved and
///    duplicated values.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_movedup_pd(__m128d __a)
{
  return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
}

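/* Editorial usage sketch (not part of the upstream header): the [2 x double]
 * forms above behave like their [4 x float] counterparts but operate on one
 * pair per operand; the values are illustrative only (element 0 is listed
 * first).
 *
 *   __m128d a = _mm_set_pd(2.0, 1.0); // a = {1, 2}
 *   __m128d b = _mm_set_pd(4.0, 3.0); // b = {3, 4}
 *   __m128d s = _mm_addsub_pd(a, b);  // {1-3, 2+4} = {-2, 6}
 *   __m128d h = _mm_hadd_pd(a, b);    // {1+2, 3+4} = {3, 7}
 *   __m128d d = _mm_movedup_pd(b);    // {3, 3}
 */
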
/// Establishes a linear address memory range to be monitored and puts
///    the processor in the monitor event pending state. Data stored in the
///    monitored address range causes the processor to exit the pending state.
///
///    The \c MONITOR instruction can be used in kernel mode, and in other modes
///    if MSR <c> C001_0015h[MonMwaitUserEn] </c> is set.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c MONITOR instruction.
///
/// \param __p
///    The memory range to be monitored. The size of the range is determined by
///    CPUID function 0000_0005h.
/// \param __extensions
///    Optional extensions for the monitoring state.
/// \param __hints
///    Optional hints for the monitoring state.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_monitor(void const *__p, unsigned __extensions, unsigned __hints)
{
  __builtin_ia32_monitor(__p, __extensions, __hints);
}

/// Used with the \c MONITOR instruction to wait while the processor is in
///    the monitor event pending state. Data stored in the monitored address
///    range, or an interrupt, causes the processor to exit the pending state.
///
///    The \c MWAIT instruction can be used in kernel mode, and in other modes if
///    MSR <c> C001_0015h[MonMwaitUserEn] </c> is set.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c MWAIT instruction.
///
/// \param __extensions
///    Optional extensions for the monitoring state, which can vary by
///    processor.
/// \param __hints
///    Optional hints for the monitoring state, which can vary by processor.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_mwait(unsigned __extensions, unsigned __hints)
{
  __builtin_ia32_mwait(__extensions, __hints);
}

#undef __DEFAULT_FN_ATTRS

#endif /* __PMMINTRIN_H */

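/* Editorial usage sketch (not part of the upstream header): a minimal
 * MONITOR/MWAIT wait loop, assuming execution in a mode where the
 * instructions are permitted (see the notes above) and that another agent
 * eventually writes to `flag`. MWAIT can return spuriously or on an
 * interrupt, so the monitor is re-armed each iteration.
 *
 *   volatile int flag = 0;
 *   while (!flag) {
 *     _mm_monitor((const void *)&flag, 0, 0); // arm monitoring on &flag
 *     if (!flag)
 *       _mm_mwait(0, 0);                      // wait for a store or interrupt
 *   }
 */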