/*===---- pmmintrin.h - SSE3 intrinsics ------------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __PMMINTRIN_H
#define __PMMINTRIN_H

#include <emmintrin.h>

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("sse3"), __min_vector_width__(128)))

/// Loads data from an unaligned memory location to elements in a 128-bit
///    vector.
///
///    If the address of the data is not 16-byte aligned, the instruction may
///    read two adjacent aligned blocks of memory to retrieve the requested
///    data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VLDDQU </c> instruction.
///
/// \param __p
///    A pointer to a 128-bit integer vector containing integer values.
/// \returns A 128-bit vector containing the moved values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_lddqu_si128(__m128i const *__p)
{
  return (__m128i)__builtin_ia32_lddqu((char const *)__p);
}

/// Alternately subtracts and adds the corresponding values of two 128-bit
///    vectors of [4 x float]: even-indexed elements of the result contain
///    differences, and odd-indexed elements contain sums.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VADDSUBPS </c> instruction.
///
/// \param __a
///    A 128-bit vector of [4 x float] containing the left source operand.
/// \param __b
///    A 128-bit vector of [4 x float] containing the right source operand.
/// \returns A 128-bit vector of [4 x float] containing the alternating sums and
///    differences of both operands.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_addsub_ps(__m128 __a, __m128 __b)
{
  return __builtin_ia32_addsubps((__v4sf)__a, (__v4sf)__b);
}

/// Horizontally adds the adjacent pairs of values contained in two
///    128-bit vectors of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VHADDPS </c> instruction.
///
/// \param __a
///    A 128-bit vector of [4 x float] containing one of the source operands.
///    The horizontal sums of the values are stored in the lower bits of the
///    destination.
/// \param __b
///    A 128-bit vector of [4 x float] containing one of the source operands.
///    The horizontal sums of the values are stored in the upper bits of the
///    destination.
/// \returns A 128-bit vector of [4 x float] containing the horizontal sums of
///    both operands.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_hadd_ps(__m128 __a, __m128 __b)
{
  return __builtin_ia32_haddps((__v4sf)__a, (__v4sf)__b);
}

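/* A minimal usage sketch, not part of the SSE3 API itself: the hypothetical
 * helper below reduces a 128-bit vector of [4 x float] to the sum of its four
 * elements by applying _mm_hadd_ps twice. */
static __inline__ float __DEFAULT_FN_ATTRS
__pmmintrin_example_hsum_ps(__m128 __v)
{
  __m128 __t = _mm_hadd_ps(__v, __v); /* { v0+v1, v2+v3, v0+v1, v2+v3 } */
  __t = _mm_hadd_ps(__t, __t);        /* every element holds v0+v1+v2+v3 */
  return _mm_cvtss_f32(__t);          /* extract the lowest element */
}
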
/// Horizontally subtracts the adjacent pairs of values contained in two
///    128-bit vectors of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VHSUBPS </c> instruction.
///
/// \param __a
///    A 128-bit vector of [4 x float] containing one of the source operands.
///    The horizontal differences between the values are stored in the lower
///    bits of the destination.
/// \param __b
///    A 128-bit vector of [4 x float] containing one of the source operands.
///    The horizontal differences between the values are stored in the upper
///    bits of the destination.
/// \returns A 128-bit vector of [4 x float] containing the horizontal
///    differences of both operands.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_hsub_ps(__m128 __a, __m128 __b)
{
  return __builtin_ia32_hsubps((__v4sf)__a, (__v4sf)__b);
}

/// Moves and duplicates odd-indexed values from a 128-bit vector
///    of [4 x float] to float values stored in a 128-bit vector of
///    [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVSHDUP </c> instruction.
///
/// \param __a
///    A 128-bit vector of [4 x float]. \n
///    Bits [127:96] of the source are written to bits [127:96] and [95:64] of
///    the destination. \n
///    Bits [63:32] of the source are written to bits [63:32] and [31:0] of the
///    destination.
/// \returns A 128-bit vector of [4 x float] containing the moved and duplicated
///    values.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_movehdup_ps(__m128 __a)
{
  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 1, 1, 3, 3);
}

/// Duplicates even-indexed values from a 128-bit vector of
///    [4 x float] to float values stored in a 128-bit vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVSLDUP </c> instruction.
///
/// \param __a
///    A 128-bit vector of [4 x float]. \n
///    Bits [95:64] of the source are written to bits [127:96] and [95:64] of
///    the destination. \n
///    Bits [31:0] of the source are written to bits [63:32] and [31:0] of the
///    destination.
/// \returns A 128-bit vector of [4 x float] containing the moved and duplicated
///    values.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_moveldup_ps(__m128 __a)
{
  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 2, 2);
}

/// Subtracts the lower values and adds the upper values of two 128-bit
///    vectors of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VADDSUBPD </c> instruction.
///
/// \param __a
///    A 128-bit vector of [2 x double] containing the left source operand.
/// \param __b
///    A 128-bit vector of [2 x double] containing the right source operand.
/// \returns A 128-bit vector of [2 x double] containing the alternating sums
///    and differences of both operands.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_addsub_pd(__m128d __a, __m128d __b)
{
  return __builtin_ia32_addsubpd((__v2df)__a, (__v2df)__b);
}

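/* A minimal usage sketch, not part of the SSE3 API itself: the hypothetical
 * helper below multiplies two pairs of complex floats stored in interleaved
 * { real, imag } order, using the classic MOVSLDUP/MOVSHDUP/ADDSUBPS idiom. */
static __inline__ __m128 __DEFAULT_FN_ATTRS
__pmmintrin_example_cmul_ps(__m128 __a, __m128 __b)
{
  __m128 __re = _mm_moveldup_ps(__a);   /* { a0.re, a0.re, a1.re, a1.re } */
  __m128 __im = _mm_movehdup_ps(__a);   /* { a0.im, a0.im, a1.im, a1.im } */
  __m128 __t1 = _mm_mul_ps(__re, __b);  /* { a.re*b.re, a.re*b.im, ... }  */
  __m128 __sw = _mm_shuffle_ps(__b, __b, _MM_SHUFFLE(2, 3, 0, 1));
                                        /* { b0.im, b0.re, b1.im, b1.re } */
  __m128 __t2 = _mm_mul_ps(__im, __sw); /* { a.im*b.im, a.im*b.re, ... }  */
  return _mm_addsub_ps(__t1, __t2);     /* { re*re - im*im, re*im + im*re, ... } */
}
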
/// Horizontally adds the pairs of values contained in two 128-bit
///    vectors of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VHADDPD </c> instruction.
///
/// \param __a
///    A 128-bit vector of [2 x double] containing one of the source operands.
///    The horizontal sum of the values is stored in the lower bits of the
///    destination.
/// \param __b
///    A 128-bit vector of [2 x double] containing one of the source operands.
///    The horizontal sum of the values is stored in the upper bits of the
///    destination.
/// \returns A 128-bit vector of [2 x double] containing the horizontal sums of
///    both operands.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_hadd_pd(__m128d __a, __m128d __b)
{
  return __builtin_ia32_haddpd((__v2df)__a, (__v2df)__b);
}

/// Horizontally subtracts the pairs of values contained in two 128-bit
///    vectors of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VHSUBPD </c> instruction.
///
/// \param __a
///    A 128-bit vector of [2 x double] containing one of the source operands.
///    The horizontal difference of the values is stored in the lower bits of
///    the destination.
/// \param __b
///    A 128-bit vector of [2 x double] containing one of the source operands.
///    The horizontal difference of the values is stored in the upper bits of
///    the destination.
/// \returns A 128-bit vector of [2 x double] containing the horizontal
///    differences of both operands.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_hsub_pd(__m128d __a, __m128d __b)
{
  return __builtin_ia32_hsubpd((__v2df)__a, (__v2df)__b);
}

/// Moves and duplicates one double-precision value to double-precision
///    values stored in a 128-bit vector of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_loaddup_pd(double const *dp);
/// \endcode
///
/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
///
/// \param dp
///    A pointer to a double-precision value to be moved and duplicated.
/// \returns A 128-bit vector of [2 x double] containing the moved and
///    duplicated values.
#define _mm_loaddup_pd(dp) _mm_load1_pd(dp)

/// Moves and duplicates the double-precision value in the lower bits of
///    a 128-bit vector of [2 x double] to double-precision values stored in a
///    128-bit vector of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
///
/// \param __a
///    A 128-bit vector of [2 x double]. Bits [63:0] are written to bits
///    [127:64] and [63:0] of the destination.
/// \returns A 128-bit vector of [2 x double] containing the moved and
///    duplicated values.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_movedup_pd(__m128d __a)
{
  return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
}

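/* A minimal usage sketch, not part of the SSE3 API itself: the hypothetical
 * helper below reduces a 128-bit vector of [2 x double] to the sum of its two
 * elements with a single horizontal add. */
static __inline__ double __DEFAULT_FN_ATTRS
__pmmintrin_example_hsum_pd(__m128d __v)
{
  __m128d __t = _mm_hadd_pd(__v, __v); /* both elements hold v0+v1 */
  return _mm_cvtsd_f64(__t);           /* extract the lower element */
}
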
/// Establishes a linear address memory range to be monitored and puts
///    the processor in the monitor event pending state. Data stored in the
///    monitored address range causes the processor to exit the pending state.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> MONITOR </c> instruction.
///
/// \param __p
///    The memory range to be monitored. The size of the range is determined by
///    CPUID function 0000_0005h.
/// \param __extensions
///    Optional extensions for the monitoring state.
/// \param __hints
///    Optional hints for the monitoring state.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_monitor(void const *__p, unsigned __extensions, unsigned __hints)
{
  __builtin_ia32_monitor(__p, __extensions, __hints);
}

/// Used with the MONITOR instruction to wait while the processor is in
///    the monitor event pending state. Data stored in the monitored address
///    range causes the processor to exit the pending state.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> MWAIT </c> instruction.
///
/// \param __extensions
///    Optional extensions for the monitoring state, which may vary by
///    processor.
/// \param __hints
///    Optional hints for the monitoring state, which may vary by processor.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_mwait(unsigned __extensions, unsigned __hints)
{
  __builtin_ia32_mwait(__extensions, __hints);
}

#undef __DEFAULT_FN_ATTRS

#endif /* __PMMINTRIN_H */