1 /*===---- smmintrin.h - SSE4 intrinsics ------------------------------------=== 2 * 3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 * See https://llvm.org/LICENSE.txt for license information. 5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 * 7 *===-----------------------------------------------------------------------=== 8 */ 9 10 #ifndef __SMMINTRIN_H 11 #define __SMMINTRIN_H 12 13 #if !defined(__i386__) && !defined(__x86_64__) 14 #error "This header is only meant to be used on x86 and x64 architecture" 15 #endif 16 17 #include <tmmintrin.h> 18 19 /* Define the default attributes for the functions in this file. */ 20 #define __DEFAULT_FN_ATTRS \ 21 __attribute__((__always_inline__, __nodebug__, \ 22 __target__("sse4.1,no-evex512"), __min_vector_width__(128))) 23 24 /* SSE4 Rounding macros. */ 25 #define _MM_FROUND_TO_NEAREST_INT 0x00 26 #define _MM_FROUND_TO_NEG_INF 0x01 27 #define _MM_FROUND_TO_POS_INF 0x02 28 #define _MM_FROUND_TO_ZERO 0x03 29 #define _MM_FROUND_CUR_DIRECTION 0x04 30 31 #define _MM_FROUND_RAISE_EXC 0x00 32 #define _MM_FROUND_NO_EXC 0x08 33 34 #define _MM_FROUND_NINT (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEAREST_INT) 35 #define _MM_FROUND_FLOOR (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF) 36 #define _MM_FROUND_CEIL (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF) 37 #define _MM_FROUND_TRUNC (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO) 38 #define _MM_FROUND_RINT (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION) 39 #define _MM_FROUND_NEARBYINT (_MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION) 40 41 /// Rounds up each element of the 128-bit vector of [4 x float] to an 42 /// integer and returns the rounded values in a 128-bit vector of 43 /// [4 x float]. 44 /// 45 /// \headerfile <x86intrin.h> 46 /// 47 /// \code 48 /// __m128 _mm_ceil_ps(__m128 X); 49 /// \endcode 50 /// 51 /// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction. 
///
/// \param X
///    A 128-bit vector of [4 x float] values to be rounded up.
/// \returns A 128-bit vector of [4 x float] containing the rounded values.
#define _mm_ceil_ps(X) _mm_round_ps((X), _MM_FROUND_CEIL)

/// Rounds up each element of the 128-bit vector of [2 x double] to an
///    integer and returns the rounded values in a 128-bit vector of
///    [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_ceil_pd(__m128d X);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double] values to be rounded up.
/// \returns A 128-bit vector of [2 x double] containing the rounded values.
#define _mm_ceil_pd(X) _mm_round_pd((X), _MM_FROUND_CEIL)

/// Copies three upper elements of the first 128-bit vector operand to
///    the corresponding three upper elements of the 128-bit result vector of
///    [4 x float]. Rounds up the lowest element of the second 128-bit vector
///    operand to an integer and copies it to the lowest element of the 128-bit
///    result vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_ceil_ss(__m128 X, __m128 Y);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
///    rounded up to the nearest integer and copied to the corresponding bits
///    of the result.
/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
///    values.
#define _mm_ceil_ss(X, Y) _mm_round_ss((X), (Y), _MM_FROUND_CEIL)

/// Copies the upper element of the first 128-bit vector operand to the
///    corresponding upper element of the 128-bit result vector of [2 x double].
///    Rounds up the lower element of the second 128-bit vector operand to an
///    integer and copies it to the lower element of the 128-bit result vector
///    of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_ceil_sd(__m128d X, __m128d Y);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
///    rounded up to the nearest integer and copied to the corresponding bits
///    of the result.
/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
///    values.
#define _mm_ceil_sd(X, Y) _mm_round_sd((X), (Y), _MM_FROUND_CEIL)

/// Rounds down each element of the 128-bit vector of [4 x float] to an
///    integer and returns the rounded values in a 128-bit vector of
///    [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_floor_ps(__m128 X);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float] values to be rounded down.
/// \returns A 128-bit vector of [4 x float] containing the rounded values.
#define _mm_floor_ps(X) _mm_round_ps((X), _MM_FROUND_FLOOR)

/// Rounds down each element of the 128-bit vector of [2 x double] to an
///    integer and returns the rounded values in a 128-bit vector of
///    [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_floor_pd(__m128d X);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double].
/// \returns A 128-bit vector of [2 x double] containing the rounded values.
#define _mm_floor_pd(X) _mm_round_pd((X), _MM_FROUND_FLOOR)

/// Copies three upper elements of the first 128-bit vector operand to
///    the corresponding three upper elements of the 128-bit result vector of
///    [4 x float]. Rounds down the lowest element of the second 128-bit vector
///    operand to an integer and copies it to the lowest element of the 128-bit
///    result vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_floor_ss(__m128 X, __m128 Y);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
///    rounded down to the nearest integer and copied to the corresponding bits
///    of the result.
/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
///    values.
#define _mm_floor_ss(X, Y) _mm_round_ss((X), (Y), _MM_FROUND_FLOOR)

/// Copies the upper element of the first 128-bit vector operand to the
///    corresponding upper element of the 128-bit result vector of [2 x double].
///    Rounds down the lower element of the second 128-bit vector operand to an
///    integer and copies it to the lower element of the 128-bit result vector
///    of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_floor_sd(__m128d X, __m128d Y);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
///    rounded down to the nearest integer and copied to the corresponding bits
///    of the result.
/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
///    values.
#define _mm_floor_sd(X, Y) _mm_round_sd((X), (Y), _MM_FROUND_FLOOR)

/// Rounds each element of the 128-bit vector of [4 x float] to an
///    integer value according to the rounding control specified by the second
///    argument and returns the rounded values in a 128-bit vector of
///    [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_round_ps(__m128 X, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float].
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used \n
///      1: The PE field is not updated \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M \n
///      1: Use the current MXCSR setting \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest \n
///      01: Downward (toward negative infinity) \n
///      10: Upward (toward positive infinity) \n
///      11: Truncated
/// \returns A 128-bit vector of [4 x float] containing the rounded values.
#define _mm_round_ps(X, M) \
  ((__m128)__builtin_ia32_roundps((__v4sf)(__m128)(X), (M)))

/// Copies three upper elements of the first 128-bit vector operand to
///    the corresponding three upper elements of the 128-bit result vector of
///    [4 x float]. Rounds the lowest element of the second 128-bit vector
///    operand to an integer value according to the rounding control specified
///    by the third argument and copies it to the lowest element of the 128-bit
///    result vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_round_ss(__m128 X, __m128 Y, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
///    rounded to the nearest integer using the specified rounding control and
///    copied to the corresponding bits of the result.
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used \n
///      1: The PE field is not updated \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M \n
///      1: Use the current MXCSR setting \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest \n
///      01: Downward (toward negative infinity) \n
///      10: Upward (toward positive infinity) \n
///      11: Truncated
/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
///    values.
#define _mm_round_ss(X, Y, M) \
  ((__m128)__builtin_ia32_roundss((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), \
                                  (M)))

/// Rounds each element of the 128-bit vector of [2 x double] to an
///    integer value according to the rounding control specified by the second
///    argument and returns the rounded values in a 128-bit vector of
///    [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_round_pd(__m128d X, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double].
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used \n
///      1: The PE field is not updated \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M \n
///      1: Use the current MXCSR setting \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest \n
///      01: Downward (toward negative infinity) \n
///      10: Upward (toward positive infinity) \n
///      11: Truncated
/// \returns A 128-bit vector of [2 x double] containing the rounded values.
#define _mm_round_pd(X, M) \
  ((__m128d)__builtin_ia32_roundpd((__v2df)(__m128d)(X), (M)))

/// Copies the upper element of the first 128-bit vector operand to the
///    corresponding upper element of the 128-bit result vector of [2 x double].
///    Rounds the lower element of the second 128-bit vector operand to an
///    integer value according to the rounding control specified by the third
///    argument and copies it to the lower element of the 128-bit result vector
///    of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_round_sd(__m128d X, __m128d Y, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
///    rounded to the nearest integer using the specified rounding control and
///    copied to the corresponding bits of the result.
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used \n
///      1: The PE field is not updated \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M \n
///      1: Use the current MXCSR setting \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest \n
///      01: Downward (toward negative infinity) \n
///      10: Upward (toward positive infinity) \n
///      11: Truncated
/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
///    values.
#define _mm_round_sd(X, Y, M) \
  ((__m128d)__builtin_ia32_roundsd((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), \
                                   (M)))

/* SSE4 Packed Blending Intrinsics.  */
/// Returns a 128-bit vector of [2 x double] where the values are
///    selected from either the first or second operand as specified by the
///    third operand, the control mask.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_blend_pd(__m128d V1, __m128d V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction.
///
/// \param V1
///    A 128-bit vector of [2 x double].
/// \param V2
///    A 128-bit vector of [2 x double].
/// \param M
///    An immediate integer operand, with mask bits [1:0] specifying how the
///    values are to be copied. The position of the mask bit corresponds to the
///    index of a copied value. When a mask bit is 0, the corresponding 64-bit
///    element in operand \a V1 is copied to the same position in the result.
///    When a mask bit is 1, the corresponding 64-bit element in operand \a V2
///    is copied to the same position in the result.
/// \returns A 128-bit vector of [2 x double] containing the copied values.
#define _mm_blend_pd(V1, V2, M) \
  ((__m128d)__builtin_ia32_blendpd((__v2df)(__m128d)(V1), \
                                   (__v2df)(__m128d)(V2), (int)(M)))

/// Returns a 128-bit vector of [4 x float] where the values are selected
///    from either the first or second operand as specified by the third
///    operand, the control mask.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_blend_ps(__m128 V1, __m128 V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VBLENDPS / BLENDPS </c> instruction.
///
/// \param V1
///    A 128-bit vector of [4 x float].
/// \param V2
///    A 128-bit vector of [4 x float].
/// \param M
///    An immediate integer operand, with mask bits [3:0] specifying how the
///    values are to be copied. The position of the mask bit corresponds to the
///    index of a copied value. When a mask bit is 0, the corresponding 32-bit
///    element in operand \a V1 is copied to the same position in the result.
///    When a mask bit is 1, the corresponding 32-bit element in operand \a V2
///    is copied to the same position in the result.
/// \returns A 128-bit vector of [4 x float] containing the copied values.
412 #define _mm_blend_ps(V1, V2, M) \ 413 ((__m128)__builtin_ia32_blendps((__v4sf)(__m128)(V1), (__v4sf)(__m128)(V2), \ 414 (int)(M))) 415 416 /// Returns a 128-bit vector of [2 x double] where the values are 417 /// selected from either the first or second operand as specified by the 418 /// third operand, the control mask. 419 /// 420 /// \headerfile <x86intrin.h> 421 /// 422 /// This intrinsic corresponds to the <c> VBLENDVPD / BLENDVPD </c> instruction. 423 /// 424 /// \param __V1 425 /// A 128-bit vector of [2 x double]. 426 /// \param __V2 427 /// A 128-bit vector of [2 x double]. 428 /// \param __M 429 /// A 128-bit vector operand, with mask bits 127 and 63 specifying how the 430 /// values are to be copied. The position of the mask bit corresponds to the 431 /// most significant bit of a copied value. When a mask bit is 0, the 432 /// corresponding 64-bit element in operand \a __V1 is copied to the same 433 /// position in the result. When a mask bit is 1, the corresponding 64-bit 434 /// element in operand \a __V2 is copied to the same position in the result. 435 /// \returns A 128-bit vector of [2 x double] containing the copied values. 436 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_blendv_pd(__m128d __V1, 437 __m128d __V2, 438 __m128d __M) { 439 return (__m128d)__builtin_ia32_blendvpd((__v2df)__V1, (__v2df)__V2, 440 (__v2df)__M); 441 } 442 443 /// Returns a 128-bit vector of [4 x float] where the values are 444 /// selected from either the first or second operand as specified by the 445 /// third operand, the control mask. 446 /// 447 /// \headerfile <x86intrin.h> 448 /// 449 /// This intrinsic corresponds to the <c> VBLENDVPS / BLENDVPS </c> instruction. 450 /// 451 /// \param __V1 452 /// A 128-bit vector of [4 x float]. 453 /// \param __V2 454 /// A 128-bit vector of [4 x float]. 455 /// \param __M 456 /// A 128-bit vector operand, with mask bits 127, 95, 63, and 31 specifying 457 /// how the values are to be copied. 
The position of the mask bit corresponds 458 /// to the most significant bit of a copied value. When a mask bit is 0, the 459 /// corresponding 32-bit element in operand \a __V1 is copied to the same 460 /// position in the result. When a mask bit is 1, the corresponding 32-bit 461 /// element in operand \a __V2 is copied to the same position in the result. 462 /// \returns A 128-bit vector of [4 x float] containing the copied values. 463 static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_blendv_ps(__m128 __V1, 464 __m128 __V2, 465 __m128 __M) { 466 return (__m128)__builtin_ia32_blendvps((__v4sf)__V1, (__v4sf)__V2, 467 (__v4sf)__M); 468 } 469 470 /// Returns a 128-bit vector of [16 x i8] where the values are selected 471 /// from either of the first or second operand as specified by the third 472 /// operand, the control mask. 473 /// 474 /// \headerfile <x86intrin.h> 475 /// 476 /// This intrinsic corresponds to the <c> VPBLENDVB / PBLENDVB </c> instruction. 477 /// 478 /// \param __V1 479 /// A 128-bit vector of [16 x i8]. 480 /// \param __V2 481 /// A 128-bit vector of [16 x i8]. 482 /// \param __M 483 /// A 128-bit vector operand, with mask bits 127, 119, 111...7 specifying 484 /// how the values are to be copied. The position of the mask bit corresponds 485 /// to the most significant bit of a copied value. When a mask bit is 0, the 486 /// corresponding 8-bit element in operand \a __V1 is copied to the same 487 /// position in the result. When a mask bit is 1, the corresponding 8-bit 488 /// element in operand \a __V2 is copied to the same position in the result. 489 /// \returns A 128-bit vector of [16 x i8] containing the copied values. 
490 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_blendv_epi8(__m128i __V1, 491 __m128i __V2, 492 __m128i __M) { 493 return (__m128i)__builtin_ia32_pblendvb128((__v16qi)__V1, (__v16qi)__V2, 494 (__v16qi)__M); 495 } 496 497 /// Returns a 128-bit vector of [8 x i16] where the values are selected 498 /// from either of the first or second operand as specified by the third 499 /// operand, the control mask. 500 /// 501 /// \headerfile <x86intrin.h> 502 /// 503 /// \code 504 /// __m128i _mm_blend_epi16(__m128i V1, __m128i V2, const int M); 505 /// \endcode 506 /// 507 /// This intrinsic corresponds to the <c> VPBLENDW / PBLENDW </c> instruction. 508 /// 509 /// \param V1 510 /// A 128-bit vector of [8 x i16]. 511 /// \param V2 512 /// A 128-bit vector of [8 x i16]. 513 /// \param M 514 /// An immediate integer operand, with mask bits [7:0] specifying how the 515 /// values are to be copied. The position of the mask bit corresponds to the 516 /// index of a copied value. When a mask bit is 0, the corresponding 16-bit 517 /// element in operand \a V1 is copied to the same position in the result. 518 /// When a mask bit is 1, the corresponding 16-bit element in operand \a V2 519 /// is copied to the same position in the result. 520 /// \returns A 128-bit vector of [8 x i16] containing the copied values. 521 #define _mm_blend_epi16(V1, V2, M) \ 522 ((__m128i)__builtin_ia32_pblendw128((__v8hi)(__m128i)(V1), \ 523 (__v8hi)(__m128i)(V2), (int)(M))) 524 525 /* SSE4 Dword Multiply Instructions. */ 526 /// Multiples corresponding elements of two 128-bit vectors of [4 x i32] 527 /// and returns the lower 32 bits of the each product in a 128-bit vector of 528 /// [4 x i32]. 529 /// 530 /// \headerfile <x86intrin.h> 531 /// 532 /// This intrinsic corresponds to the <c> VPMULLD / PMULLD </c> instruction. 533 /// 534 /// \param __V1 535 /// A 128-bit integer vector. 536 /// \param __V2 537 /// A 128-bit integer vector. 
538 /// \returns A 128-bit integer vector containing the products of both operands. 539 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mullo_epi32(__m128i __V1, 540 __m128i __V2) { 541 return (__m128i)((__v4su)__V1 * (__v4su)__V2); 542 } 543 544 /// Multiplies corresponding even-indexed elements of two 128-bit 545 /// vectors of [4 x i32] and returns a 128-bit vector of [2 x i64] 546 /// containing the products. 547 /// 548 /// \headerfile <x86intrin.h> 549 /// 550 /// This intrinsic corresponds to the <c> VPMULDQ / PMULDQ </c> instruction. 551 /// 552 /// \param __V1 553 /// A 128-bit vector of [4 x i32]. 554 /// \param __V2 555 /// A 128-bit vector of [4 x i32]. 556 /// \returns A 128-bit vector of [2 x i64] containing the products of both 557 /// operands. 558 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mul_epi32(__m128i __V1, 559 __m128i __V2) { 560 return (__m128i)__builtin_ia32_pmuldq128((__v4si)__V1, (__v4si)__V2); 561 } 562 563 /* SSE4 Floating Point Dot Product Instructions. */ 564 /// Computes the dot product of the two 128-bit vectors of [4 x float] 565 /// and returns it in the elements of the 128-bit result vector of 566 /// [4 x float]. 567 /// 568 /// The immediate integer operand controls which input elements 569 /// will contribute to the dot product, and where the final results are 570 /// returned. 571 /// 572 /// \headerfile <x86intrin.h> 573 /// 574 /// \code 575 /// __m128 _mm_dp_ps(__m128 X, __m128 Y, const int M); 576 /// \endcode 577 /// 578 /// This intrinsic corresponds to the <c> VDPPS / DPPS </c> instruction. 579 /// 580 /// \param X 581 /// A 128-bit vector of [4 x float]. 582 /// \param Y 583 /// A 128-bit vector of [4 x float]. 584 /// \param M 585 /// An immediate integer operand. Mask bits [7:4] determine which elements 586 /// of the input vectors are used, with bit [4] corresponding to the lowest 587 /// element and bit [7] corresponding to the highest element of each [4 x 588 /// float] vector. 
///    If a bit is set, the corresponding elements from the two
///    input vectors are used as an input for dot product; otherwise that input
///    is treated as zero. Bits [3:0] determine which elements of the result
///    will receive a copy of the final dot product, with bit [0] corresponding
///    to the lowest element and bit [3] corresponding to the highest element of
///    each [4 x float] subvector. If a bit is set, the dot product is returned
///    in the corresponding element; otherwise that element is set to zero.
/// \returns A 128-bit vector of [4 x float] containing the dot product.
#define _mm_dp_ps(X, Y, M) \
  ((__m128)__builtin_ia32_dpps((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (M)))

/// Computes the dot product of the two 128-bit vectors of [2 x double]
///    and returns it in the elements of the 128-bit result vector of
///    [2 x double].
///
///    The immediate integer operand controls which input
///    elements will contribute to the dot product, and where the final results
///    are returned.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_dp_pd(__m128d X, __m128d Y, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VDPPD / DPPD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double].
/// \param Y
///    A 128-bit vector of [2 x double].
/// \param M
///    An immediate integer operand. Mask bits [5:4] determine which elements
///    of the input vectors are used, with bit [4] corresponding to the lowest
///    element and bit [5] corresponding to the highest element of each [2 x
///    double] vector. If a bit is set, the corresponding elements from the two
///    input vectors are used as an input for dot product; otherwise that input
///    is treated as zero.
Bits [1:0] determine which elements of the result 626 /// will receive a copy of the final dot product, with bit [0] corresponding 627 /// to the lowest element and bit [1] corresponding to the highest element of 628 /// each [2 x double] vector. If a bit is set, the dot product is returned in 629 /// the corresponding element; otherwise that element is set to zero. 630 #define _mm_dp_pd(X, Y, M) \ 631 ((__m128d)__builtin_ia32_dppd((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), \ 632 (M))) 633 634 /* SSE4 Streaming Load Hint Instruction. */ 635 /// Loads integer values from a 128-bit aligned memory location to a 636 /// 128-bit integer vector. 637 /// 638 /// \headerfile <x86intrin.h> 639 /// 640 /// This intrinsic corresponds to the <c> VMOVNTDQA / MOVNTDQA </c> instruction. 641 /// 642 /// \param __V 643 /// A pointer to a 128-bit aligned memory location that contains the integer 644 /// values. 645 /// \returns A 128-bit integer vector containing the data stored at the 646 /// specified memory location. 647 static __inline__ __m128i __DEFAULT_FN_ATTRS 648 _mm_stream_load_si128(const void *__V) { 649 return (__m128i)__builtin_nontemporal_load((const __v2di *)__V); 650 } 651 652 /* SSE4 Packed Integer Min/Max Instructions. */ 653 /// Compares the corresponding elements of two 128-bit vectors of 654 /// [16 x i8] and returns a 128-bit vector of [16 x i8] containing the lesser 655 /// of the two values. 656 /// 657 /// \headerfile <x86intrin.h> 658 /// 659 /// This intrinsic corresponds to the <c> VPMINSB / PMINSB </c> instruction. 660 /// 661 /// \param __V1 662 /// A 128-bit vector of [16 x i8]. 663 /// \param __V2 664 /// A 128-bit vector of [16 x i8] 665 /// \returns A 128-bit vector of [16 x i8] containing the lesser values. 
666 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epi8(__m128i __V1, 667 __m128i __V2) { 668 return (__m128i)__builtin_elementwise_min((__v16qs)__V1, (__v16qs)__V2); 669 } 670 671 /// Compares the corresponding elements of two 128-bit vectors of 672 /// [16 x i8] and returns a 128-bit vector of [16 x i8] containing the 673 /// greater value of the two. 674 /// 675 /// \headerfile <x86intrin.h> 676 /// 677 /// This intrinsic corresponds to the <c> VPMAXSB / PMAXSB </c> instruction. 678 /// 679 /// \param __V1 680 /// A 128-bit vector of [16 x i8]. 681 /// \param __V2 682 /// A 128-bit vector of [16 x i8]. 683 /// \returns A 128-bit vector of [16 x i8] containing the greater values. 684 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epi8(__m128i __V1, 685 __m128i __V2) { 686 return (__m128i)__builtin_elementwise_max((__v16qs)__V1, (__v16qs)__V2); 687 } 688 689 /// Compares the corresponding elements of two 128-bit vectors of 690 /// [8 x u16] and returns a 128-bit vector of [8 x u16] containing the lesser 691 /// value of the two. 692 /// 693 /// \headerfile <x86intrin.h> 694 /// 695 /// This intrinsic corresponds to the <c> VPMINUW / PMINUW </c> instruction. 696 /// 697 /// \param __V1 698 /// A 128-bit vector of [8 x u16]. 699 /// \param __V2 700 /// A 128-bit vector of [8 x u16]. 701 /// \returns A 128-bit vector of [8 x u16] containing the lesser values. 702 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu16(__m128i __V1, 703 __m128i __V2) { 704 return (__m128i)__builtin_elementwise_min((__v8hu)__V1, (__v8hu)__V2); 705 } 706 707 /// Compares the corresponding elements of two 128-bit vectors of 708 /// [8 x u16] and returns a 128-bit vector of [8 x u16] containing the 709 /// greater value of the two. 710 /// 711 /// \headerfile <x86intrin.h> 712 /// 713 /// This intrinsic corresponds to the <c> VPMAXUW / PMAXUW </c> instruction. 714 /// 715 /// \param __V1 716 /// A 128-bit vector of [8 x u16]. 
717 /// \param __V2 718 /// A 128-bit vector of [8 x u16]. 719 /// \returns A 128-bit vector of [8 x u16] containing the greater values. 720 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu16(__m128i __V1, 721 __m128i __V2) { 722 return (__m128i)__builtin_elementwise_max((__v8hu)__V1, (__v8hu)__V2); 723 } 724 725 /// Compares the corresponding elements of two 128-bit vectors of 726 /// [4 x i32] and returns a 128-bit vector of [4 x i32] containing the lesser 727 /// value of the two. 728 /// 729 /// \headerfile <x86intrin.h> 730 /// 731 /// This intrinsic corresponds to the <c> VPMINSD / PMINSD </c> instruction. 732 /// 733 /// \param __V1 734 /// A 128-bit vector of [4 x i32]. 735 /// \param __V2 736 /// A 128-bit vector of [4 x i32]. 737 /// \returns A 128-bit vector of [4 x i32] containing the lesser values. 738 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epi32(__m128i __V1, 739 __m128i __V2) { 740 return (__m128i)__builtin_elementwise_min((__v4si)__V1, (__v4si)__V2); 741 } 742 743 /// Compares the corresponding elements of two 128-bit vectors of 744 /// [4 x i32] and returns a 128-bit vector of [4 x i32] containing the 745 /// greater value of the two. 746 /// 747 /// \headerfile <x86intrin.h> 748 /// 749 /// This intrinsic corresponds to the <c> VPMAXSD / PMAXSD </c> instruction. 750 /// 751 /// \param __V1 752 /// A 128-bit vector of [4 x i32]. 753 /// \param __V2 754 /// A 128-bit vector of [4 x i32]. 755 /// \returns A 128-bit vector of [4 x i32] containing the greater values. 756 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epi32(__m128i __V1, 757 __m128i __V2) { 758 return (__m128i)__builtin_elementwise_max((__v4si)__V1, (__v4si)__V2); 759 } 760 761 /// Compares the corresponding elements of two 128-bit vectors of 762 /// [4 x u32] and returns a 128-bit vector of [4 x u32] containing the lesser 763 /// value of the two. 
764 /// 765 /// \headerfile <x86intrin.h> 766 /// 767 /// This intrinsic corresponds to the <c> VPMINUD / PMINUD </c> instruction. 768 /// 769 /// \param __V1 770 /// A 128-bit vector of [4 x u32]. 771 /// \param __V2 772 /// A 128-bit vector of [4 x u32]. 773 /// \returns A 128-bit vector of [4 x u32] containing the lesser values. 774 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu32(__m128i __V1, 775 __m128i __V2) { 776 return (__m128i)__builtin_elementwise_min((__v4su)__V1, (__v4su)__V2); 777 } 778 779 /// Compares the corresponding elements of two 128-bit vectors of 780 /// [4 x u32] and returns a 128-bit vector of [4 x u32] containing the 781 /// greater value of the two. 782 /// 783 /// \headerfile <x86intrin.h> 784 /// 785 /// This intrinsic corresponds to the <c> VPMAXUD / PMAXUD </c> instruction. 786 /// 787 /// \param __V1 788 /// A 128-bit vector of [4 x u32]. 789 /// \param __V2 790 /// A 128-bit vector of [4 x u32]. 791 /// \returns A 128-bit vector of [4 x u32] containing the greater values. 792 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu32(__m128i __V1, 793 __m128i __V2) { 794 return (__m128i)__builtin_elementwise_max((__v4su)__V1, (__v4su)__V2); 795 } 796 797 /* SSE4 Insertion and Extraction from XMM Register Instructions. */ 798 /// Takes the first argument \a X and inserts an element from the second 799 /// argument \a Y as selected by the third argument \a N. That result then 800 /// has elements zeroed out also as selected by the third argument \a N. The 801 /// resulting 128-bit vector of [4 x float] is then returned. 802 /// 803 /// \headerfile <x86intrin.h> 804 /// 805 /// \code 806 /// __m128 _mm_insert_ps(__m128 X, __m128 Y, const int N); 807 /// \endcode 808 /// 809 /// This intrinsic corresponds to the <c> VINSERTPS </c> instruction. 810 /// 811 /// \param X 812 /// A 128-bit vector source operand of [4 x float]. 
///    With the exception of those bits in the result copied from parameter
///    \a Y and zeroed by bits [3:0] of \a N, all bits from this parameter are
///    copied to the result.
/// \param Y
///    A 128-bit vector source operand of [4 x float]. One single-precision
///    floating-point element from this source, as determined by the immediate
///    parameter, is copied to the result.
/// \param N
///    Specifies which bits from operand \a Y will be copied, which bits in the
///    result they will be copied to, and which bits in the result will be
///    cleared. The following assignments are made: \n
///    Bits [7:6] specify the bits to copy from operand \a Y: \n
///      00: Selects bits [31:0] from operand \a Y. \n
///      01: Selects bits [63:32] from operand \a Y. \n
///      10: Selects bits [95:64] from operand \a Y. \n
///      11: Selects bits [127:96] from operand \a Y. \n
///    Bits [5:4] specify the bits in the result to which the selected bits
///    from operand \a Y are copied: \n
///      00: Copies the selected bits from \a Y to result bits [31:0]. \n
///      01: Copies the selected bits from \a Y to result bits [63:32]. \n
///      10: Copies the selected bits from \a Y to result bits [95:64]. \n
///      11: Copies the selected bits from \a Y to result bits [127:96]. \n
///    Bits[3:0]: If any of these bits are set, the corresponding result
///    element is cleared.
/// \returns A 128-bit vector of [4 x float] containing the copied
///    single-precision floating point elements from the operands.
#define _mm_insert_ps(X, Y, N) \
  __builtin_ia32_insertps128((X), (Y), (N))

/// Extracts a 32-bit integer from a 128-bit vector of [4 x float] and
///    returns it, using the immediate value parameter \a N as a selector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_extract_ps(__m128 X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTPS / EXTRACTPS </c>
/// instruction.
///
/// \param X
///    A 128-bit vector of [4 x float].
/// \param N
///    An immediate value. Bits [1:0] determine which bits from the argument
///    \a X are extracted and returned: \n
///    00: Bits [31:0] of parameter \a X are returned. \n
///    01: Bits [63:32] of parameter \a X are returned. \n
///    10: Bits [95:64] of parameter \a X are returned. \n
///    11: Bits [127:96] of parameter \a X are returned.
/// \returns A 32-bit integer containing the extracted 32 bits of float data.
#define _mm_extract_ps(X, N) \
  __builtin_bit_cast(int, \
                     __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), \
                                                 (int)(N)))

/* Miscellaneous insert and extract macros.  */
/* Store the single-precision element of X selected by index N into D.  */
#define _MM_EXTRACT_FLOAT(D, X, N) \
  do { \
    (D) = __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N)); \
  } while (0)

/* Build an index suitable for _mm_insert_ps: X is shifted into bits [7:6],
   Y into bits [5:4], and the zeroing bits Z are OR'ed in unshifted.  */
#define _MM_MK_INSERTPS_NDX(X, Y, Z) (((X) << 6) | ((Y) << 4) | (Z))

/* Extract a float from X at index N into the first index of the return.  */
#define _MM_PICK_OUT_PS(X, N) \
  _mm_insert_ps(_mm_setzero_ps(), (X), _MM_MK_INSERTPS_NDX((N), 0, 0x0e))

/* Insert int into packed integer array at index.  */
/// Constructs a 128-bit vector of [16 x i8] by first making a copy of
///    the 128-bit integer vector parameter, and then inserting the lower 8 bits
///    of an integer parameter \a I into an offset specified by the immediate
///    value parameter \a N.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_insert_epi8(__m128i X, int I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPINSRB / PINSRB </c> instruction.
///
/// \param X
///    A 128-bit integer vector of [16 x i8]. This vector is copied to the
///    result and then one of the sixteen elements in the result vector is
///    replaced by the lower 8 bits of \a I.
/// \param I
///    An integer. The lower 8 bits of this operand are written to the result
///    beginning at the offset specified by \a N.
/// \param N
///    An immediate value. Bits [3:0] specify the bit offset in the result at
///    which the lower 8 bits of \a I are written. \n
///    0000: Bits [7:0] of the result are used for insertion. \n
///    0001: Bits [15:8] of the result are used for insertion. \n
///    0010: Bits [23:16] of the result are used for insertion. \n
///    0011: Bits [31:24] of the result are used for insertion. \n
///    0100: Bits [39:32] of the result are used for insertion. \n
///    0101: Bits [47:40] of the result are used for insertion. \n
///    0110: Bits [55:48] of the result are used for insertion. \n
///    0111: Bits [63:56] of the result are used for insertion. \n
///    1000: Bits [71:64] of the result are used for insertion. \n
///    1001: Bits [79:72] of the result are used for insertion. \n
///    1010: Bits [87:80] of the result are used for insertion. \n
///    1011: Bits [95:88] of the result are used for insertion. \n
///    1100: Bits [103:96] of the result are used for insertion. \n
///    1101: Bits [111:104] of the result are used for insertion. \n
///    1110: Bits [119:112] of the result are used for insertion. \n
///    1111: Bits [127:120] of the result are used for insertion.
/// \returns A 128-bit integer vector containing the constructed values.
922 #define _mm_insert_epi8(X, I, N) \ 923 ((__m128i)__builtin_ia32_vec_set_v16qi((__v16qi)(__m128i)(X), (int)(I), \ 924 (int)(N))) 925 926 /// Constructs a 128-bit vector of [4 x i32] by first making a copy of 927 /// the 128-bit integer vector parameter, and then inserting the 32-bit 928 /// integer parameter \a I at the offset specified by the immediate value 929 /// parameter \a N. 930 /// 931 /// \headerfile <x86intrin.h> 932 /// 933 /// \code 934 /// __m128i _mm_insert_epi32(__m128i X, int I, const int N); 935 /// \endcode 936 /// 937 /// This intrinsic corresponds to the <c> VPINSRD / PINSRD </c> instruction. 938 /// 939 /// \param X 940 /// A 128-bit integer vector of [4 x i32]. This vector is copied to the 941 /// result and then one of the four elements in the result vector is 942 /// replaced by \a I. 943 /// \param I 944 /// A 32-bit integer that is written to the result beginning at the offset 945 /// specified by \a N. 946 /// \param N 947 /// An immediate value. Bits [1:0] specify the bit offset in the result at 948 /// which the integer \a I is written. \n 949 /// 00: Bits [31:0] of the result are used for insertion. \n 950 /// 01: Bits [63:32] of the result are used for insertion. \n 951 /// 10: Bits [95:64] of the result are used for insertion. \n 952 /// 11: Bits [127:96] of the result are used for insertion. 953 /// \returns A 128-bit integer vector containing the constructed values. 954 #define _mm_insert_epi32(X, I, N) \ 955 ((__m128i)__builtin_ia32_vec_set_v4si((__v4si)(__m128i)(X), (int)(I), \ 956 (int)(N))) 957 958 #ifdef __x86_64__ 959 /// Constructs a 128-bit vector of [2 x i64] by first making a copy of 960 /// the 128-bit integer vector parameter, and then inserting the 64-bit 961 /// integer parameter \a I, using the immediate value parameter \a N as an 962 /// insertion location selector. 
963 /// 964 /// \headerfile <x86intrin.h> 965 /// 966 /// \code 967 /// __m128i _mm_insert_epi64(__m128i X, long long I, const int N); 968 /// \endcode 969 /// 970 /// This intrinsic corresponds to the <c> VPINSRQ / PINSRQ </c> instruction. 971 /// 972 /// \param X 973 /// A 128-bit integer vector of [2 x i64]. This vector is copied to the 974 /// result and then one of the two elements in the result vector is replaced 975 /// by \a I. 976 /// \param I 977 /// A 64-bit integer that is written to the result beginning at the offset 978 /// specified by \a N. 979 /// \param N 980 /// An immediate value. Bit [0] specifies the bit offset in the result at 981 /// which the integer \a I is written. \n 982 /// 0: Bits [63:0] of the result are used for insertion. \n 983 /// 1: Bits [127:64] of the result are used for insertion. \n 984 /// \returns A 128-bit integer vector containing the constructed values. 985 #define _mm_insert_epi64(X, I, N) \ 986 ((__m128i)__builtin_ia32_vec_set_v2di((__v2di)(__m128i)(X), (long long)(I), \ 987 (int)(N))) 988 #endif /* __x86_64__ */ 989 990 /* Extract int from packed integer array at index. This returns the element 991 * as a zero extended value, so it is unsigned. 992 */ 993 /// Extracts an 8-bit element from the 128-bit integer vector of 994 /// [16 x i8], using the immediate value parameter \a N as a selector. 995 /// 996 /// \headerfile <x86intrin.h> 997 /// 998 /// \code 999 /// int _mm_extract_epi8(__m128i X, const int N); 1000 /// \endcode 1001 /// 1002 /// This intrinsic corresponds to the <c> VPEXTRB / PEXTRB </c> instruction. 1003 /// 1004 /// \param X 1005 /// A 128-bit integer vector. 1006 /// \param N 1007 /// An immediate value. Bits [3:0] specify which 8-bit vector element from 1008 /// the argument \a X to extract and copy to the result. \n 1009 /// 0000: Bits [7:0] of parameter \a X are extracted. \n 1010 /// 0001: Bits [15:8] of the parameter \a X are extracted. 
///    \n
///    0010: Bits [23:16] of the parameter \a X are extracted. \n
///    0011: Bits [31:24] of the parameter \a X are extracted. \n
///    0100: Bits [39:32] of the parameter \a X are extracted. \n
///    0101: Bits [47:40] of the parameter \a X are extracted. \n
///    0110: Bits [55:48] of the parameter \a X are extracted. \n
///    0111: Bits [63:56] of the parameter \a X are extracted. \n
///    1000: Bits [71:64] of the parameter \a X are extracted. \n
///    1001: Bits [79:72] of the parameter \a X are extracted. \n
///    1010: Bits [87:80] of the parameter \a X are extracted. \n
///    1011: Bits [95:88] of the parameter \a X are extracted. \n
///    1100: Bits [103:96] of the parameter \a X are extracted. \n
///    1101: Bits [111:104] of the parameter \a X are extracted. \n
///    1110: Bits [119:112] of the parameter \a X are extracted. \n
///    1111: Bits [127:120] of the parameter \a X are extracted.
/// \returns An unsigned integer, whose lower 8 bits are selected from the
///    128-bit integer vector parameter and the remaining bits are assigned
///    zeros.
#define _mm_extract_epi8(X, N) \
  ((int)(unsigned char)__builtin_ia32_vec_ext_v16qi( \
      (__v16qi)(__m128i)(X), (int)(N)))

/// Extracts a 32-bit element from the 128-bit integer vector of
///    [4 x i32], using the immediate value parameter \a N as a selector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_extract_epi32(__m128i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPEXTRD / PEXTRD </c> instruction.
///
/// \param X
///    A 128-bit integer vector.
/// \param N
///    An immediate value. Bits [1:0] specify which 32-bit vector element from
///    the argument \a X to extract and copy to the result. \n
///    00: Bits [31:0] of the parameter \a X are extracted.
///    \n
///    01: Bits [63:32] of the parameter \a X are extracted. \n
///    10: Bits [95:64] of the parameter \a X are extracted. \n
///    11: Bits [127:96] of the parameter \a X are extracted.
/// \returns A 32-bit integer containing the element selected from the
///    128-bit integer vector parameter.
#define _mm_extract_epi32(X, N) \
  ((int)__builtin_ia32_vec_ext_v4si((__v4si)(__m128i)(X), (int)(N)))

/// Extracts a 64-bit element from the 128-bit integer vector of
///    [2 x i64], using the immediate value parameter \a N as a selector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// long long _mm_extract_epi64(__m128i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPEXTRQ / PEXTRQ </c> instruction
/// in 64-bit mode.
///
/// \param X
///    A 128-bit integer vector.
/// \param N
///    An immediate value. Bit [0] specifies which 64-bit vector element from
///    the argument \a X to return. \n
///    0: Bits [63:0] are returned. \n
///    1: Bits [127:64] are returned.
/// \returns A 64-bit integer.
#define _mm_extract_epi64(X, N) \
  ((long long)__builtin_ia32_vec_ext_v2di((__v2di)(__m128i)(X), (int)(N)))

/* SSE4 128-bit Packed Integer Comparisons.  */
/// Tests whether the specified bits in a 128-bit integer vector are all
///    zeros.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
///
/// \param __M
///    A 128-bit integer vector containing the bits to be tested.
/// \param __V
///    A 128-bit integer vector selecting which bits to test in operand \a __M.
/// \returns TRUE if the specified bits are all zeros; FALSE otherwise.
1093 static __inline__ int __DEFAULT_FN_ATTRS _mm_testz_si128(__m128i __M, 1094 __m128i __V) { 1095 return __builtin_ia32_ptestz128((__v2di)__M, (__v2di)__V); 1096 } 1097 1098 /// Tests whether the specified bits in a 128-bit integer vector are all 1099 /// ones. 1100 /// 1101 /// \headerfile <x86intrin.h> 1102 /// 1103 /// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction. 1104 /// 1105 /// \param __M 1106 /// A 128-bit integer vector containing the bits to be tested. 1107 /// \param __V 1108 /// A 128-bit integer vector selecting which bits to test in operand \a __M. 1109 /// \returns TRUE if the specified bits are all ones; FALSE otherwise. 1110 static __inline__ int __DEFAULT_FN_ATTRS _mm_testc_si128(__m128i __M, 1111 __m128i __V) { 1112 return __builtin_ia32_ptestc128((__v2di)__M, (__v2di)__V); 1113 } 1114 1115 /// Tests whether the specified bits in a 128-bit integer vector are 1116 /// neither all zeros nor all ones. 1117 /// 1118 /// \headerfile <x86intrin.h> 1119 /// 1120 /// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction. 1121 /// 1122 /// \param __M 1123 /// A 128-bit integer vector containing the bits to be tested. 1124 /// \param __V 1125 /// A 128-bit integer vector selecting which bits to test in operand \a __M. 1126 /// \returns TRUE if the specified bits are neither all zeros nor all ones; 1127 /// FALSE otherwise. 1128 static __inline__ int __DEFAULT_FN_ATTRS _mm_testnzc_si128(__m128i __M, 1129 __m128i __V) { 1130 return __builtin_ia32_ptestnzc128((__v2di)__M, (__v2di)__V); 1131 } 1132 1133 /// Tests whether the specified bits in a 128-bit integer vector are all 1134 /// ones. 1135 /// 1136 /// \headerfile <x86intrin.h> 1137 /// 1138 /// \code 1139 /// int _mm_test_all_ones(__m128i V); 1140 /// \endcode 1141 /// 1142 /// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction. 1143 /// 1144 /// \param V 1145 /// A 128-bit integer vector containing the bits to be tested. 
/// \returns TRUE if the bits specified in the operand are all set to 1; FALSE
///    otherwise.
#define _mm_test_all_ones(V) \
  _mm_testc_si128((V), _mm_set1_epi32(-1))

/// Checks whether the selected bits of a 128-bit integer vector are
///    neither all zeros nor all ones.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_test_mix_ones_zeros(__m128i M, __m128i V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
///
/// \param M
///    A 128-bit integer vector containing the bits to be tested.
/// \param V
///    A 128-bit integer vector selecting which bits to test in operand \a M.
/// \returns TRUE if the specified bits are neither all zeros nor all ones;
///    FALSE otherwise.
#define _mm_test_mix_ones_zeros(M, V) \
  _mm_testnzc_si128((M), (V))

/// Checks whether the selected bits of a 128-bit integer vector are all
///    zeros.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_test_all_zeros(__m128i M, __m128i V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
///
/// \param M
///    A 128-bit integer vector containing the bits to be tested.
/// \param V
///    A 128-bit integer vector selecting which bits to test in operand \a M.
/// \returns TRUE if the specified bits are all zeros; FALSE otherwise.
#define _mm_test_all_zeros(M, V) \
  _mm_testz_si128((M), (V))

/* SSE4 64-bit Packed Integer Comparisons.  */
/// Compares each of the corresponding 64-bit values of the 128-bit
///    integer vectors for equality.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPCMPEQQ / PCMPEQQ </c> instruction.
///
/// \param __V1
///    A 128-bit integer vector.
/// \param __V2
///    A 128-bit integer vector.
1199 /// \returns A 128-bit integer vector containing the comparison results. 1200 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi64(__m128i __V1, 1201 __m128i __V2) { 1202 return (__m128i)((__v2di)__V1 == (__v2di)__V2); 1203 } 1204 1205 /* SSE4 Packed Integer Sign-Extension. */ 1206 /// Sign-extends each of the lower eight 8-bit integer elements of a 1207 /// 128-bit vector of [16 x i8] to 16-bit values and returns them in a 1208 /// 128-bit vector of [8 x i16]. The upper eight elements of the input vector 1209 /// are unused. 1210 /// 1211 /// \headerfile <x86intrin.h> 1212 /// 1213 /// This intrinsic corresponds to the <c> VPMOVSXBW / PMOVSXBW </c> instruction. 1214 /// 1215 /// \param __V 1216 /// A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are 1217 /// sign-extended to 16-bit values. 1218 /// \returns A 128-bit vector of [8 x i16] containing the sign-extended values. 1219 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi8_epi16(__m128i __V) { 1220 /* This function always performs a signed extension, but __v16qi is a char 1221 which may be signed or unsigned, so use __v16qs. */ 1222 return (__m128i) __builtin_convertvector( 1223 __builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 1224 7), 1225 __v8hi); 1226 } 1227 1228 /// Sign-extends each of the lower four 8-bit integer elements of a 1229 /// 128-bit vector of [16 x i8] to 32-bit values and returns them in a 1230 /// 128-bit vector of [4 x i32]. The upper twelve elements of the input 1231 /// vector are unused. 1232 /// 1233 /// \headerfile <x86intrin.h> 1234 /// 1235 /// This intrinsic corresponds to the <c> VPMOVSXBD / PMOVSXBD </c> instruction. 1236 /// 1237 /// \param __V 1238 /// A 128-bit vector of [16 x i8]. The lower four 8-bit elements are 1239 /// sign-extended to 32-bit values. 1240 /// \returns A 128-bit vector of [4 x i32] containing the sign-extended values. 
1241 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi8_epi32(__m128i __V) { 1242 /* This function always performs a signed extension, but __v16qi is a char 1243 which may be signed or unsigned, so use __v16qs. */ 1244 return (__m128i) __builtin_convertvector( 1245 __builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4si); 1246 } 1247 1248 /// Sign-extends each of the lower two 8-bit integer elements of a 1249 /// 128-bit integer vector of [16 x i8] to 64-bit values and returns them in 1250 /// a 128-bit vector of [2 x i64]. The upper fourteen elements of the input 1251 /// vector are unused. 1252 /// 1253 /// \headerfile <x86intrin.h> 1254 /// 1255 /// This intrinsic corresponds to the <c> VPMOVSXBQ / PMOVSXBQ </c> instruction. 1256 /// 1257 /// \param __V 1258 /// A 128-bit vector of [16 x i8]. The lower two 8-bit elements are 1259 /// sign-extended to 64-bit values. 1260 /// \returns A 128-bit vector of [2 x i64] containing the sign-extended values. 1261 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi8_epi64(__m128i __V) { 1262 /* This function always performs a signed extension, but __v16qi is a char 1263 which may be signed or unsigned, so use __v16qs. */ 1264 return (__m128i) __builtin_convertvector( 1265 __builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1), __v2di); 1266 } 1267 1268 /// Sign-extends each of the lower four 16-bit integer elements of a 1269 /// 128-bit integer vector of [8 x i16] to 32-bit values and returns them in 1270 /// a 128-bit vector of [4 x i32]. The upper four elements of the input 1271 /// vector are unused. 1272 /// 1273 /// \headerfile <x86intrin.h> 1274 /// 1275 /// This intrinsic corresponds to the <c> VPMOVSXWD / PMOVSXWD </c> instruction. 1276 /// 1277 /// \param __V 1278 /// A 128-bit vector of [8 x i16]. The lower four 16-bit elements are 1279 /// sign-extended to 32-bit values. 1280 /// \returns A 128-bit vector of [4 x i32] containing the sign-extended values. 
1281 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi16_epi32(__m128i __V) { 1282 return (__m128i) __builtin_convertvector( 1283 __builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4si); 1284 } 1285 1286 /// Sign-extends each of the lower two 16-bit integer elements of a 1287 /// 128-bit integer vector of [8 x i16] to 64-bit values and returns them in 1288 /// a 128-bit vector of [2 x i64]. The upper six elements of the input 1289 /// vector are unused. 1290 /// 1291 /// \headerfile <x86intrin.h> 1292 /// 1293 /// This intrinsic corresponds to the <c> VPMOVSXWQ / PMOVSXWQ </c> instruction. 1294 /// 1295 /// \param __V 1296 /// A 128-bit vector of [8 x i16]. The lower two 16-bit elements are 1297 /// sign-extended to 64-bit values. 1298 /// \returns A 128-bit vector of [2 x i64] containing the sign-extended values. 1299 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi16_epi64(__m128i __V) { 1300 return (__m128i) __builtin_convertvector( 1301 __builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1), __v2di); 1302 } 1303 1304 /// Sign-extends each of the lower two 32-bit integer elements of a 1305 /// 128-bit integer vector of [4 x i32] to 64-bit values and returns them in 1306 /// a 128-bit vector of [2 x i64]. The upper two elements of the input vector 1307 /// are unused. 1308 /// 1309 /// \headerfile <x86intrin.h> 1310 /// 1311 /// This intrinsic corresponds to the <c> VPMOVSXDQ / PMOVSXDQ </c> instruction. 1312 /// 1313 /// \param __V 1314 /// A 128-bit vector of [4 x i32]. The lower two 32-bit elements are 1315 /// sign-extended to 64-bit values. 1316 /// \returns A 128-bit vector of [2 x i64] containing the sign-extended values. 1317 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi32_epi64(__m128i __V) { 1318 return (__m128i) __builtin_convertvector( 1319 __builtin_shufflevector((__v4si)__V, (__v4si)__V, 0, 1), __v2di); 1320 } 1321 1322 /* SSE4 Packed Integer Zero-Extension. 
*/ 1323 /// Zero-extends each of the lower eight 8-bit integer elements of a 1324 /// 128-bit vector of [16 x i8] to 16-bit values and returns them in a 1325 /// 128-bit vector of [8 x i16]. The upper eight elements of the input vector 1326 /// are unused. 1327 /// 1328 /// \headerfile <x86intrin.h> 1329 /// 1330 /// This intrinsic corresponds to the <c> VPMOVZXBW / PMOVZXBW </c> instruction. 1331 /// 1332 /// \param __V 1333 /// A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are 1334 /// zero-extended to 16-bit values. 1335 /// \returns A 128-bit vector of [8 x i16] containing the zero-extended values. 1336 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu8_epi16(__m128i __V) { 1337 return (__m128i) __builtin_convertvector( 1338 __builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6, 1339 7), 1340 __v8hi); 1341 } 1342 1343 /// Zero-extends each of the lower four 8-bit integer elements of a 1344 /// 128-bit vector of [16 x i8] to 32-bit values and returns them in a 1345 /// 128-bit vector of [4 x i32]. The upper twelve elements of the input 1346 /// vector are unused. 1347 /// 1348 /// \headerfile <x86intrin.h> 1349 /// 1350 /// This intrinsic corresponds to the <c> VPMOVZXBD / PMOVZXBD </c> instruction. 1351 /// 1352 /// \param __V 1353 /// A 128-bit vector of [16 x i8]. The lower four 8-bit elements are 1354 /// zero-extended to 32-bit values. 1355 /// \returns A 128-bit vector of [4 x i32] containing the zero-extended values. 1356 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu8_epi32(__m128i __V) { 1357 return (__m128i) __builtin_convertvector( 1358 __builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4si); 1359 } 1360 1361 /// Zero-extends each of the lower two 8-bit integer elements of a 1362 /// 128-bit integer vector of [16 x i8] to 64-bit values and returns them in 1363 /// a 128-bit vector of [2 x i64]. The upper fourteen elements of the input 1364 /// vector are unused. 
1365 /// 1366 /// \headerfile <x86intrin.h> 1367 /// 1368 /// This intrinsic corresponds to the <c> VPMOVZXBQ / PMOVZXBQ </c> instruction. 1369 /// 1370 /// \param __V 1371 /// A 128-bit vector of [16 x i8]. The lower two 8-bit elements are 1372 /// zero-extended to 64-bit values. 1373 /// \returns A 128-bit vector of [2 x i64] containing the zero-extended values. 1374 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu8_epi64(__m128i __V) { 1375 return (__m128i) __builtin_convertvector( 1376 __builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1), __v2di); 1377 } 1378 1379 /// Zero-extends each of the lower four 16-bit integer elements of a 1380 /// 128-bit integer vector of [8 x i16] to 32-bit values and returns them in 1381 /// a 128-bit vector of [4 x i32]. The upper four elements of the input 1382 /// vector are unused. 1383 /// 1384 /// \headerfile <x86intrin.h> 1385 /// 1386 /// This intrinsic corresponds to the <c> VPMOVZXWD / PMOVZXWD </c> instruction. 1387 /// 1388 /// \param __V 1389 /// A 128-bit vector of [8 x i16]. The lower four 16-bit elements are 1390 /// zero-extended to 32-bit values. 1391 /// \returns A 128-bit vector of [4 x i32] containing the zero-extended values. 1392 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu16_epi32(__m128i __V) { 1393 return (__m128i) __builtin_convertvector( 1394 __builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4si); 1395 } 1396 1397 /// Zero-extends each of the lower two 16-bit integer elements of a 1398 /// 128-bit integer vector of [8 x i16] to 64-bit values and returns them in 1399 /// a 128-bit vector of [2 x i64]. The upper six elements of the input vector 1400 /// are unused. 1401 /// 1402 /// \headerfile <x86intrin.h> 1403 /// 1404 /// This intrinsic corresponds to the <c> VPMOVZXWQ / PMOVZXWQ </c> instruction. 1405 /// 1406 /// \param __V 1407 /// A 128-bit vector of [8 x i16]. The lower two 16-bit elements are 1408 /// zero-extended to 64-bit values. 
1409 /// \returns A 128-bit vector of [2 x i64] containing the zero-extended values. 1410 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu16_epi64(__m128i __V) { 1411 return (__m128i) __builtin_convertvector( 1412 __builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1), __v2di); 1413 } 1414 1415 /// Zero-extends each of the lower two 32-bit integer elements of a 1416 /// 128-bit integer vector of [4 x i32] to 64-bit values and returns them in 1417 /// a 128-bit vector of [2 x i64]. The upper two elements of the input vector 1418 /// are unused. 1419 /// 1420 /// \headerfile <x86intrin.h> 1421 /// 1422 /// This intrinsic corresponds to the <c> VPMOVZXDQ / PMOVZXDQ </c> instruction. 1423 /// 1424 /// \param __V 1425 /// A 128-bit vector of [4 x i32]. The lower two 32-bit elements are 1426 /// zero-extended to 64-bit values. 1427 /// \returns A 128-bit vector of [2 x i64] containing the zero-extended values. 1428 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu32_epi64(__m128i __V) { 1429 return (__m128i) __builtin_convertvector( 1430 __builtin_shufflevector((__v4su)__V, (__v4su)__V, 0, 1), __v2di); 1431 } 1432 1433 /* SSE4 Pack with Unsigned Saturation. */ 1434 /// Converts 32-bit signed integers from both 128-bit integer vector 1435 /// operands into 16-bit unsigned integers, and returns the packed result. 1436 /// Values greater than 0xFFFF are saturated to 0xFFFF. Values less than 1437 /// 0x0000 are saturated to 0x0000. 1438 /// 1439 /// \headerfile <x86intrin.h> 1440 /// 1441 /// This intrinsic corresponds to the <c> VPACKUSDW / PACKUSDW </c> instruction. 1442 /// 1443 /// \param __V1 1444 /// A 128-bit vector of [4 x i32]. Each 32-bit element is treated as a 1445 /// signed integer and is converted to a 16-bit unsigned integer with 1446 /// saturation. Values greater than 0xFFFF are saturated to 0xFFFF. Values 1447 /// less than 0x0000 are saturated to 0x0000. The converted [4 x i16] values 1448 /// are written to the lower 64 bits of the result. 
1449 /// \param __V2 1450 /// A 128-bit vector of [4 x i32]. Each 32-bit element is treated as a 1451 /// signed integer and is converted to a 16-bit unsigned integer with 1452 /// saturation. Values greater than 0xFFFF are saturated to 0xFFFF. Values 1453 /// less than 0x0000 are saturated to 0x0000. The converted [4 x i16] values 1454 /// are written to the higher 64 bits of the result. 1455 /// \returns A 128-bit vector of [8 x i16] containing the converted values. 1456 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi32(__m128i __V1, 1457 __m128i __V2) { 1458 return (__m128i)__builtin_ia32_packusdw128((__v4si)__V1, (__v4si)__V2); 1459 } 1460 1461 /* SSE4 Multiple Packed Sums of Absolute Difference. */ 1462 /// Subtracts 8-bit unsigned integer values and computes the absolute 1463 /// values of the differences to the corresponding bits in the destination. 1464 /// Then sums of the absolute differences are returned according to the bit 1465 /// fields in the immediate operand. 1466 /// 1467 /// \headerfile <x86intrin.h> 1468 /// 1469 /// \code 1470 /// __m128i _mm_mpsadbw_epu8(__m128i X, __m128i Y, const int M); 1471 /// \endcode 1472 /// 1473 /// This intrinsic corresponds to the <c> VMPSADBW / MPSADBW </c> instruction. 1474 /// 1475 /// \param X 1476 /// A 128-bit vector of [16 x i8]. 1477 /// \param Y 1478 /// A 128-bit vector of [16 x i8]. 
1479 /// \param M 1480 /// An 8-bit immediate operand specifying how the absolute differences are to 1481 /// be calculated, according to the following algorithm: 1482 /// \code 1483 /// // M2 represents bit 2 of the immediate operand 1484 /// // M10 represents bits [1:0] of the immediate operand 1485 /// i = M2 * 4; 1486 /// j = M10 * 4; 1487 /// for (k = 0; k < 8; k = k + 1) { 1488 /// d0 = abs(X[i + k + 0] - Y[j + 0]); 1489 /// d1 = abs(X[i + k + 1] - Y[j + 1]); 1490 /// d2 = abs(X[i + k + 2] - Y[j + 2]); 1491 /// d3 = abs(X[i + k + 3] - Y[j + 3]); 1492 /// r[k] = d0 + d1 + d2 + d3; 1493 /// } 1494 /// \endcode 1495 /// \returns A 128-bit integer vector containing the sums of the sets of 1496 /// absolute differences between both operands. 1497 #define _mm_mpsadbw_epu8(X, Y, M) \ 1498 ((__m128i)__builtin_ia32_mpsadbw128((__v16qi)(__m128i)(X), \ 1499 (__v16qi)(__m128i)(Y), (M))) 1500 1501 /// Finds the minimum unsigned 16-bit element in the input 128-bit 1502 /// vector of [8 x u16] and returns it and along with its index. 1503 /// 1504 /// \headerfile <x86intrin.h> 1505 /// 1506 /// This intrinsic corresponds to the <c> VPHMINPOSUW / PHMINPOSUW </c> 1507 /// instruction. 1508 /// 1509 /// \param __V 1510 /// A 128-bit vector of [8 x u16]. 1511 /// \returns A 128-bit value where bits [15:0] contain the minimum value found 1512 /// in parameter \a __V, bits [18:16] contain the index of the minimum value 1513 /// and the remaining bits are set to 0. 1514 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_minpos_epu16(__m128i __V) { 1515 return (__m128i)__builtin_ia32_phminposuw128((__v8hi)__V); 1516 } 1517 1518 /* Handle the sse4.2 definitions here. */ 1519 1520 /* These definitions are normally in nmmintrin.h, but gcc puts them in here 1521 so we'll do the same. 
*/ 1522 1523 #undef __DEFAULT_FN_ATTRS 1524 #define __DEFAULT_FN_ATTRS \ 1525 __attribute__((__always_inline__, __nodebug__, __target__("sse4.2"))) 1526 1527 /* These specify the type of data that we're comparing. */ 1528 #define _SIDD_UBYTE_OPS 0x00 1529 #define _SIDD_UWORD_OPS 0x01 1530 #define _SIDD_SBYTE_OPS 0x02 1531 #define _SIDD_SWORD_OPS 0x03 1532 1533 /* These specify the type of comparison operation. */ 1534 #define _SIDD_CMP_EQUAL_ANY 0x00 1535 #define _SIDD_CMP_RANGES 0x04 1536 #define _SIDD_CMP_EQUAL_EACH 0x08 1537 #define _SIDD_CMP_EQUAL_ORDERED 0x0c 1538 1539 /* These macros specify the polarity of the operation. */ 1540 #define _SIDD_POSITIVE_POLARITY 0x00 1541 #define _SIDD_NEGATIVE_POLARITY 0x10 1542 #define _SIDD_MASKED_POSITIVE_POLARITY 0x20 1543 #define _SIDD_MASKED_NEGATIVE_POLARITY 0x30 1544 1545 /* These macros are used in _mm_cmpXstri() to specify the return. */ 1546 #define _SIDD_LEAST_SIGNIFICANT 0x00 1547 #define _SIDD_MOST_SIGNIFICANT 0x40 1548 1549 /* These macros are used in _mm_cmpXstri() to specify the return. */ 1550 #define _SIDD_BIT_MASK 0x00 1551 #define _SIDD_UNIT_MASK 0x40 1552 1553 /* SSE4.2 Packed Comparison Intrinsics. */ 1554 /// Uses the immediate operand \a M to perform a comparison of string 1555 /// data with implicitly defined lengths that is contained in source operands 1556 /// \a A and \a B. Returns a 128-bit integer vector representing the result 1557 /// mask of the comparison. 1558 /// 1559 /// \headerfile <x86intrin.h> 1560 /// 1561 /// \code 1562 /// __m128i _mm_cmpistrm(__m128i A, __m128i B, const int M); 1563 /// \endcode 1564 /// 1565 /// This intrinsic corresponds to the <c> VPCMPISTRM / PCMPISTRM </c> 1566 /// instruction. 1567 /// 1568 /// \param A 1569 /// A 128-bit integer vector containing one of the source operands to be 1570 /// compared. 1571 /// \param B 1572 /// A 128-bit integer vector containing one of the source operands to be 1573 /// compared. 
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words, the type of comparison to perform, and the format of the return
///    value. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
///    Bit [6]: Determines whether the result is zero-extended or expanded to 16
///             bytes. \n
///      0: The result is zero-extended to 16 bytes. \n
///      1: The result is expanded to 16 bytes (this expansion is performed by
///         repeating each bit 8 or 16 times).
/// \returns A 128-bit integer vector representing the result mask of the
///    comparison.
#define _mm_cmpistrm(A, B, M)                                                  \
  ((__m128i)__builtin_ia32_pcmpistrm128((__v16qi)(__m128i)(A),                 \
                                        (__v16qi)(__m128i)(B), (int)(M)))

/// Uses the immediate operand \a M to perform a comparison of string
///    data with implicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns an integer representing the result index of the
///    comparison.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpistri(__m128i A, __m128i B, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words, the type of comparison to perform, and the format of the return
///    value. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results.
///                \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
///    Bit [6]: Determines whether the index of the lowest set bit or the
///             highest set bit is returned. \n
///      0: The index of the least significant set bit. \n
///      1: The index of the most significant set bit. \n
/// \returns An integer representing the result index of the comparison.
#define _mm_cmpistri(A, B, M)                                                  \
  ((int)__builtin_ia32_pcmpistri128((__v16qi)(__m128i)(A),                     \
                                    (__v16qi)(__m128i)(B), (int)(M)))

/// Uses the immediate operand \a M to perform a comparison of string
///    data with explicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns a 128-bit integer vector representing the result
///    mask of the comparison.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_cmpestrm(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPESTRM / PCMPESTRM </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LA
///    An integer that specifies the length of the string in \a A.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LB
///    An integer that specifies the length of the string in \a B.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words, the type of comparison to perform, and the format of the return
///    value. \n
///    Bits [1:0]: Determine source data format.
///    \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
///    Bit [6]: Determines whether the result is zero-extended or expanded to 16
///             bytes. \n
///      0: The result is zero-extended to 16 bytes. \n
///      1: The result is expanded to 16 bytes (this expansion is performed by
///         repeating each bit 8 or 16 times). \n
/// \returns A 128-bit integer vector representing the result mask of the
///    comparison.
#define _mm_cmpestrm(A, LA, B, LB, M)                                          \
  ((__m128i)__builtin_ia32_pcmpestrm128((__v16qi)(__m128i)(A), (int)(LA),      \
                                        (__v16qi)(__m128i)(B), (int)(LB),      \
                                        (int)(M)))

/// Uses the immediate operand \a M to perform a comparison of string
///    data with explicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns an integer representing the result index of the
///    comparison.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpestri(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LA
///    An integer that specifies the length of the string in \a A.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LB
///    An integer that specifies the length of the string in \a B.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words, the type of comparison to perform, and the format of the return
///    value. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B.
///          \n
///    Bit [6]: Determines whether the index of the lowest set bit or the
///             highest set bit is returned. \n
///      0: The index of the least significant set bit. \n
///      1: The index of the most significant set bit. \n
/// \returns An integer representing the result index of the comparison.
#define _mm_cmpestri(A, LA, B, LB, M)                                          \
  ((int)__builtin_ia32_pcmpestri128((__v16qi)(__m128i)(A), (int)(LA),          \
                                    (__v16qi)(__m128i)(B), (int)(LB),          \
                                    (int)(M)))

/* SSE4.2 Packed Comparison Intrinsics and EFlag Reading.  */
/// Uses the immediate operand \a M to perform a comparison of string
///    data with implicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the bit mask is zero and the length of the
///    string in \a B is the maximum; otherwise, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpistra(__m128i A, __m128i B, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A.
///          The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
/// \returns 1 if the bit mask is zero and the length of the string in \a B is
///    the maximum; otherwise, 0.
#define _mm_cmpistra(A, B, M)                                                  \
  ((int)__builtin_ia32_pcmpistria128((__v16qi)(__m128i)(A),                    \
                                     (__v16qi)(__m128i)(B), (int)(M)))

/// Uses the immediate operand \a M to perform a comparison of string
///    data with implicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the bit mask is non-zero; otherwise, returns
///    0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpistrc(__m128i A, __m128i B, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format.
///    \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B.
/// \returns 1 if the bit mask is non-zero; otherwise, 0.
#define _mm_cmpistrc(A, B, M)                                                  \
  ((int)__builtin_ia32_pcmpistric128((__v16qi)(__m128i)(A),                    \
                                     (__v16qi)(__m128i)(B), (int)(M)))

/// Uses the immediate operand \a M to perform a comparison of string
///    data with implicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns bit 0 of the resulting bit mask.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpistro(__m128i A, __m128i B, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
/// \returns Bit 0 of the resulting bit mask.
#define _mm_cmpistro(A, B, M)                                                  \
  ((int)__builtin_ia32_pcmpistrio128((__v16qi)(__m128i)(A),                    \
                                     (__v16qi)(__m128i)(B), (int)(M)))

/// Uses the immediate operand \a M to perform a comparison of string
///    data with implicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the length of the string in \a A is less than
///    the maximum; otherwise, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpistrs(__m128i A, __m128i B, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
/// \returns 1 if the length of the string in \a A is less than the maximum;
///    otherwise, 0.
#define _mm_cmpistrs(A, B, M)                                                  \
  ((int)__builtin_ia32_pcmpistris128((__v16qi)(__m128i)(A),                    \
                                     (__v16qi)(__m128i)(B), (int)(M)))

/// Uses the immediate operand \a M to perform a comparison of string
///    data with implicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the length of the string in \a B is less than
///    the maximum; otherwise, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpistrz(__m128i A, __m128i B, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results.
///                \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B.
/// \returns 1 if the length of the string in \a B is less than the maximum;
///    otherwise, 0.
#define _mm_cmpistrz(A, B, M)                                                  \
  ((int)__builtin_ia32_pcmpistriz128((__v16qi)(__m128i)(A),                    \
                                     (__v16qi)(__m128i)(B), (int)(M)))

/// Uses the immediate operand \a M to perform a comparison of string
///    data with explicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the bit mask is zero and the length of the
///    string in \a B is the maximum; otherwise, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpestra(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LA
///    An integer that specifies the length of the string in \a A.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LB
///    An integer that specifies the length of the string in \a B.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A.
///          \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B.
/// \returns 1 if the bit mask is zero and the length of the string in \a B is
///    the maximum; otherwise, 0.
#define _mm_cmpestra(A, LA, B, LB, M)                                          \
  ((int)__builtin_ia32_pcmpestria128((__v16qi)(__m128i)(A), (int)(LA),         \
                                     (__v16qi)(__m128i)(B), (int)(LB),         \
                                     (int)(M)))

/// Uses the immediate operand \a M to perform a comparison of string
///    data with explicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the resulting mask is non-zero; otherwise,
///    returns 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpestrc(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LA
///    An integer that specifies the length of the string in \a A.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LB
///    An integer that specifies the length of the string in \a B.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
/// \returns 1 if the resulting mask is non-zero; otherwise, 0.
#define _mm_cmpestrc(A, LA, B, LB, M)                                          \
  ((int)__builtin_ia32_pcmpestric128((__v16qi)(__m128i)(A), (int)(LA),         \
                                     (__v16qi)(__m128i)(B), (int)(LB),         \
                                     (int)(M)))

/// Uses the immediate operand \a M to perform a comparison of string
///    data with explicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns bit 0 of the resulting bit mask.
2144 /// 2145 /// \headerfile <x86intrin.h> 2146 /// 2147 /// \code 2148 /// int _mm_cmpestro(__m128i A, int LA, __m128i B, int LB, const int M); 2149 /// \endcode 2150 /// 2151 /// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c> 2152 /// instruction. 2153 /// 2154 /// \param A 2155 /// A 128-bit integer vector containing one of the source operands to be 2156 /// compared. 2157 /// \param LA 2158 /// An integer that specifies the length of the string in \a A. 2159 /// \param B 2160 /// A 128-bit integer vector containing one of the source operands to be 2161 /// compared. 2162 /// \param LB 2163 /// An integer that specifies the length of the string in \a B. 2164 /// \param M 2165 /// An 8-bit immediate operand specifying whether the characters are bytes or 2166 /// words and the type of comparison to perform. \n 2167 /// Bits [1:0]: Determine source data format. \n 2168 /// 00: 16 unsigned bytes \n 2169 /// 01: 8 unsigned words \n 2170 /// 10: 16 signed bytes \n 2171 /// 11: 8 signed words \n 2172 /// Bits [3:2]: Determine comparison type and aggregation method. \n 2173 /// 00: Subset: Each character in \a B is compared for equality with all 2174 /// the characters in \a A. \n 2175 /// 01: Ranges: Each character in \a B is compared to \a A. The comparison 2176 /// basis is greater than or equal for even-indexed elements in \a A, 2177 /// and less than or equal for odd-indexed elements in \a A. \n 2178 /// 10: Match: Compare each pair of corresponding characters in \a A and 2179 /// \a B for equality. \n 2180 /// 11: Substring: Search \a B for substring matches of \a A. \n 2181 /// Bits [5:4]: Determine whether to perform a one's complement on the bit 2182 /// mask of the comparison results. \n 2183 /// 00: No effect. \n 2184 /// 01: Negate the bit mask. \n 2185 /// 10: No effect. \n 2186 /// 11: Negate the bit mask only for bits with an index less than or equal 2187 /// to the size of \a A or \a B. 
2188 /// \returns Returns bit 0 of the resulting bit mask. 2189 #define _mm_cmpestro(A, LA, B, LB, M) \ 2190 ((int)__builtin_ia32_pcmpestrio128((__v16qi)(__m128i)(A), (int)(LA), \ 2191 (__v16qi)(__m128i)(B), (int)(LB), \ 2192 (int)(M))) 2193 2194 /// Uses the immediate operand \a M to perform a comparison of string 2195 /// data with explicitly defined lengths that is contained in source operands 2196 /// \a A and \a B. Returns 1 if the length of the string in \a A is less than 2197 /// the maximum, otherwise, returns 0. 2198 /// 2199 /// \headerfile <x86intrin.h> 2200 /// 2201 /// \code 2202 /// int _mm_cmpestrs(__m128i A, int LA, __m128i B, int LB, const int M); 2203 /// \endcode 2204 /// 2205 /// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c> 2206 /// instruction. 2207 /// 2208 /// \param A 2209 /// A 128-bit integer vector containing one of the source operands to be 2210 /// compared. 2211 /// \param LA 2212 /// An integer that specifies the length of the string in \a A. 2213 /// \param B 2214 /// A 128-bit integer vector containing one of the source operands to be 2215 /// compared. 2216 /// \param LB 2217 /// An integer that specifies the length of the string in \a B. 2218 /// \param M 2219 /// An 8-bit immediate operand specifying whether the characters are bytes or 2220 /// words and the type of comparison to perform. \n 2221 /// Bits [1:0]: Determine source data format. \n 2222 /// 00: 16 unsigned bytes \n 2223 /// 01: 8 unsigned words \n 2224 /// 10: 16 signed bytes \n 2225 /// 11: 8 signed words \n 2226 /// Bits [3:2]: Determine comparison type and aggregation method. \n 2227 /// 00: Subset: Each character in \a B is compared for equality with all 2228 /// the characters in \a A. \n 2229 /// 01: Ranges: Each character in \a B is compared to \a A. The comparison 2230 /// basis is greater than or equal for even-indexed elements in \a A, 2231 /// and less than or equal for odd-indexed elements in \a A. 
\n 2232 /// 10: Match: Compare each pair of corresponding characters in \a A and 2233 /// \a B for equality. \n 2234 /// 11: Substring: Search \a B for substring matches of \a A. \n 2235 /// Bits [5:4]: Determine whether to perform a one's complement in the bit 2236 /// mask of the comparison results. \n 2237 /// 00: No effect. \n 2238 /// 01: Negate the bit mask. \n 2239 /// 10: No effect. \n 2240 /// 11: Negate the bit mask only for bits with an index less than or equal 2241 /// to the size of \a A or \a B. \n 2242 /// \returns Returns 1 if the length of the string in \a A is less than the 2243 /// maximum, otherwise, returns 0. 2244 #define _mm_cmpestrs(A, LA, B, LB, M) \ 2245 ((int)__builtin_ia32_pcmpestris128((__v16qi)(__m128i)(A), (int)(LA), \ 2246 (__v16qi)(__m128i)(B), (int)(LB), \ 2247 (int)(M))) 2248 2249 /// Uses the immediate operand \a M to perform a comparison of string 2250 /// data with explicitly defined lengths that is contained in source operands 2251 /// \a A and \a B. Returns 1 if the length of the string in \a B is less than 2252 /// the maximum, otherwise, returns 0. 2253 /// 2254 /// \headerfile <x86intrin.h> 2255 /// 2256 /// \code 2257 /// int _mm_cmpestrz(__m128i A, int LA, __m128i B, int LB, const int M); 2258 /// \endcode 2259 /// 2260 /// This intrinsic corresponds to the <c> VPCMPESTRI </c> instruction. 2261 /// 2262 /// \param A 2263 /// A 128-bit integer vector containing one of the source operands to be 2264 /// compared. 2265 /// \param LA 2266 /// An integer that specifies the length of the string in \a A. 2267 /// \param B 2268 /// A 128-bit integer vector containing one of the source operands to be 2269 /// compared. 2270 /// \param LB 2271 /// An integer that specifies the length of the string in \a B. 2272 /// \param M 2273 /// An 8-bit immediate operand specifying whether the characters are bytes or 2274 /// words and the type of comparison to perform. \n 2275 /// Bits [1:0]: Determine source data format. 
\n 2276 /// 00: 16 unsigned bytes \n 2277 /// 01: 8 unsigned words \n 2278 /// 10: 16 signed bytes \n 2279 /// 11: 8 signed words \n 2280 /// Bits [3:2]: Determine comparison type and aggregation method. \n 2281 /// 00: Subset: Each character in \a B is compared for equality with all 2282 /// the characters in \a A. \n 2283 /// 01: Ranges: Each character in \a B is compared to \a A. The comparison 2284 /// basis is greater than or equal for even-indexed elements in \a A, 2285 /// and less than or equal for odd-indexed elements in \a A. \n 2286 /// 10: Match: Compare each pair of corresponding characters in \a A and 2287 /// \a B for equality. \n 2288 /// 11: Substring: Search \a B for substring matches of \a A. \n 2289 /// Bits [5:4]: Determine whether to perform a one's complement on the bit 2290 /// mask of the comparison results. \n 2291 /// 00: No effect. \n 2292 /// 01: Negate the bit mask. \n 2293 /// 10: No effect. \n 2294 /// 11: Negate the bit mask only for bits with an index less than or equal 2295 /// to the size of \a A or \a B. 2296 /// \returns Returns 1 if the length of the string in \a B is less than the 2297 /// maximum, otherwise, returns 0. 2298 #define _mm_cmpestrz(A, LA, B, LB, M) \ 2299 ((int)__builtin_ia32_pcmpestriz128((__v16qi)(__m128i)(A), (int)(LA), \ 2300 (__v16qi)(__m128i)(B), (int)(LB), \ 2301 (int)(M))) 2302 2303 /* SSE4.2 Compare Packed Data -- Greater Than. */ 2304 /// Compares each of the corresponding 64-bit values of the 128-bit 2305 /// integer vectors to determine if the values in the first operand are 2306 /// greater than those in the second operand. 2307 /// 2308 /// \headerfile <x86intrin.h> 2309 /// 2310 /// This intrinsic corresponds to the <c> VPCMPGTQ / PCMPGTQ </c> instruction. 2311 /// 2312 /// \param __V1 2313 /// A 128-bit integer vector. 2314 /// \param __V2 2315 /// A 128-bit integer vector. 2316 /// \returns A 128-bit integer vector containing the comparison results. 
2317 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi64(__m128i __V1, 2318 __m128i __V2) { 2319 return (__m128i)((__v2di)__V1 > (__v2di)__V2); 2320 } 2321 2322 #undef __DEFAULT_FN_ATTRS 2323 2324 #include <popcntintrin.h> 2325 2326 #include <crc32intrin.h> 2327 2328 #endif /* __SMMINTRIN_H */ 2329