/*===---- smmintrin.h - SSE4 intrinsics ------------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __SMMINTRIN_H
#define __SMMINTRIN_H

#if !defined(__i386__) && !defined(__x86_64__)
#error "This header is only meant to be used on x86 and x64 architecture"
#endif

#include <tmmintrin.h>

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("sse4.1"), \
                 __min_vector_width__(128)))

/* SSE4 Rounding macros. */
/* Rounding-control values for bits [2:0] of the immediate taken by the
 * _mm_round_* macros: 0x00-0x03 name an explicit rounding mode, while 0x04
 * (bit 2) selects the current MXCSR rounding mode instead. */
#define _MM_FROUND_TO_NEAREST_INT 0x00
#define _MM_FROUND_TO_NEG_INF 0x01
#define _MM_FROUND_TO_POS_INF 0x02
#define _MM_FROUND_TO_ZERO 0x03
#define _MM_FROUND_CUR_DIRECTION 0x04

/* Precision-exception control (bit 3 of the immediate): when 0x08 is set the
 * PE field is not updated. */
#define _MM_FROUND_RAISE_EXC 0x00
#define _MM_FROUND_NO_EXC 0x08

/* Ready-made combinations of an exception setting and a rounding mode. */
#define _MM_FROUND_NINT (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEAREST_INT)
#define _MM_FROUND_FLOOR (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF)
#define _MM_FROUND_CEIL (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF)
#define _MM_FROUND_TRUNC (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO)
#define _MM_FROUND_RINT (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION)
#define _MM_FROUND_NEARBYINT (_MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION)

/// Rounds up each element of the 128-bit vector of [4 x float] to an
/// integer and returns the rounded values in a 128-bit vector of
/// [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_ceil_ps(__m128 X);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
///
/// Expands to <c> _mm_round_ps(X, _MM_FROUND_CEIL) </c>.
///
/// \param X
///    A 128-bit vector of [4 x float] values to be rounded up.
/// \returns A 128-bit vector of [4 x float] containing the rounded values.
#define _mm_ceil_ps(X) _mm_round_ps((X), _MM_FROUND_CEIL)

/// Rounds up each element of the 128-bit vector of [2 x double] to an
/// integer and returns the rounded values in a 128-bit vector of
/// [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_ceil_pd(__m128d X);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
///
/// Expands to <c> _mm_round_pd(X, _MM_FROUND_CEIL) </c>.
///
/// \param X
///    A 128-bit vector of [2 x double] values to be rounded up.
/// \returns A 128-bit vector of [2 x double] containing the rounded values.
#define _mm_ceil_pd(X) _mm_round_pd((X), _MM_FROUND_CEIL)

/// Copies three upper elements of the first 128-bit vector operand to
/// the corresponding three upper elements of the 128-bit result vector of
/// [4 x float]. Rounds up the lowest element of the second 128-bit vector
/// operand to an integer and copies it to the lowest element of the 128-bit
/// result vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_ceil_ss(__m128 X, __m128 Y);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
///
/// Expands to <c> _mm_round_ss(X, Y, _MM_FROUND_CEIL) </c>.
///
/// \param X
///    A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
///    rounded up to the nearest integer and copied to the corresponding bits
///    of the result.
/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
///    values.
#define _mm_ceil_ss(X, Y) _mm_round_ss((X), (Y), _MM_FROUND_CEIL)

/// Copies the upper element of the first 128-bit vector operand to the
/// corresponding upper element of the 128-bit result vector of [2 x double].
/// Rounds up the lower element of the second 128-bit vector operand to an
/// integer and copies it to the lower element of the 128-bit result vector
/// of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_ceil_sd(__m128d X, __m128d Y);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
///
/// Expands to <c> _mm_round_sd(X, Y, _MM_FROUND_CEIL) </c>.
///
/// \param X
///    A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
///    rounded up to the nearest integer and copied to the corresponding bits
///    of the result.
/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
///    values.
#define _mm_ceil_sd(X, Y) _mm_round_sd((X), (Y), _MM_FROUND_CEIL)

/// Rounds down each element of the 128-bit vector of [4 x float] to an
/// integer and returns the rounded values in a 128-bit vector of
/// [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_floor_ps(__m128 X);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
///
/// Expands to <c> _mm_round_ps(X, _MM_FROUND_FLOOR) </c>.
///
/// \param X
///    A 128-bit vector of [4 x float] values to be rounded down.
/// \returns A 128-bit vector of [4 x float] containing the rounded values.
#define _mm_floor_ps(X) _mm_round_ps((X), _MM_FROUND_FLOOR)

/// Rounds down each element of the 128-bit vector of [2 x double] to an
/// integer and returns the rounded values in a 128-bit vector of
/// [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_floor_pd(__m128d X);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
///
/// Expands to <c> _mm_round_pd(X, _MM_FROUND_FLOOR) </c>.
///
/// \param X
///    A 128-bit vector of [2 x double] values to be rounded down.
/// \returns A 128-bit vector of [2 x double] containing the rounded values.
#define _mm_floor_pd(X) _mm_round_pd((X), _MM_FROUND_FLOOR)

/// Copies three upper elements of the first 128-bit vector operand to
/// the corresponding three upper elements of the 128-bit result vector of
/// [4 x float]. Rounds down the lowest element of the second 128-bit vector
/// operand to an integer and copies it to the lowest element of the 128-bit
/// result vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_floor_ss(__m128 X, __m128 Y);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
///
/// Expands to <c> _mm_round_ss(X, Y, _MM_FROUND_FLOOR) </c>.
///
/// \param X
///    A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
///    rounded down to the nearest integer and copied to the corresponding bits
///    of the result.
/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
///    values.
#define _mm_floor_ss(X, Y) _mm_round_ss((X), (Y), _MM_FROUND_FLOOR)

/// Copies the upper element of the first 128-bit vector operand to the
/// corresponding upper element of the 128-bit result vector of [2 x double].
/// Rounds down the lower element of the second 128-bit vector operand to an
/// integer and copies it to the lower element of the 128-bit result vector
/// of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_floor_sd(__m128d X, __m128d Y);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
///
/// Expands to <c> _mm_round_sd(X, Y, _MM_FROUND_FLOOR) </c>.
///
/// \param X
///    A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
///    rounded down to the nearest integer and copied to the corresponding bits
///    of the result.
/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
///    values.
#define _mm_floor_sd(X, Y) _mm_round_sd((X), (Y), _MM_FROUND_FLOOR)

/// Rounds each element of the 128-bit vector of [4 x float] to an
/// integer value according to the rounding control specified by the second
/// argument and returns the rounded values in a 128-bit vector of
/// [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_round_ps(__m128 X, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float].
/// \param M
///    An integer value that specifies the rounding operation. The
///    _MM_FROUND_* macros in this header name the values of these bits. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used \n
///      1: The PE field is not updated \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M \n
///      1: Use the current MXCSR setting \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest \n
///      01: Downward (toward negative infinity) \n
///      10: Upward (toward positive infinity) \n
///      11: Truncated
/// \returns A 128-bit vector of [4 x float] containing the rounded values.
#define _mm_round_ps(X, M) \
  ((__m128)__builtin_ia32_roundps((__v4sf)(__m128)(X), (M)))

/// Copies three upper elements of the first 128-bit vector operand to
/// the corresponding three upper elements of the 128-bit result vector of
/// [4 x float]. Rounds the lowest element of the second 128-bit vector
/// operand to an integer value according to the rounding control specified
/// by the third argument and copies it to the lowest element of the 128-bit
/// result vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_round_ss(__m128 X, __m128 Y, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
///    rounded to the nearest integer using the specified rounding control and
///    copied to the corresponding bits of the result.
/// \param M
///    An integer value that specifies the rounding operation. The
///    _MM_FROUND_* macros in this header name the values of these bits. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used \n
///      1: The PE field is not updated \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M \n
///      1: Use the current MXCSR setting \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest \n
///      01: Downward (toward negative infinity) \n
///      10: Upward (toward positive infinity) \n
///      11: Truncated
/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
///    values.
#define _mm_round_ss(X, Y, M) \
  ((__m128)__builtin_ia32_roundss((__v4sf)(__m128)(X), \
                                  (__v4sf)(__m128)(Y), (M)))

/// Rounds each element of the 128-bit vector of [2 x double] to an
/// integer value according to the rounding control specified by the second
/// argument and returns the rounded values in a 128-bit vector of
/// [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_round_pd(__m128d X, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double].
/// \param M
///    An integer value that specifies the rounding operation. The
///    _MM_FROUND_* macros in this header name the values of these bits. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used \n
///      1: The PE field is not updated \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M \n
///      1: Use the current MXCSR setting \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest \n
///      01: Downward (toward negative infinity) \n
///      10: Upward (toward positive infinity) \n
///      11: Truncated
/// \returns A 128-bit vector of [2 x double] containing the rounded values.
#define _mm_round_pd(X, M) \
  ((__m128d)__builtin_ia32_roundpd((__v2df)(__m128d)(X), (M)))

/// Copies the upper element of the first 128-bit vector operand to the
/// corresponding upper element of the 128-bit result vector of [2 x double].
/// Rounds the lower element of the second 128-bit vector operand to an
/// integer value according to the rounding control specified by the third
/// argument and copies it to the lower element of the 128-bit result vector
/// of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_round_sd(__m128d X, __m128d Y, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
///    rounded to the nearest integer using the specified rounding control and
///    copied to the corresponding bits of the result.
/// \param M
///    An integer value that specifies the rounding operation. The
///    _MM_FROUND_* macros in this header name the values of these bits. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used \n
///      1: The PE field is not updated \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M \n
///      1: Use the current MXCSR setting \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest \n
///      01: Downward (toward negative infinity) \n
///      10: Upward (toward positive infinity) \n
///      11: Truncated
/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
///    values.
#define _mm_round_sd(X, Y, M) \
  ((__m128d)__builtin_ia32_roundsd((__v2df)(__m128d)(X), \
                                   (__v2df)(__m128d)(Y), (M)))

/* SSE4 Packed Blending Intrinsics. */
/// Returns a 128-bit vector of [2 x double] where the values are
/// selected from either the first or second operand as specified by the
/// third operand, the control mask.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_blend_pd(__m128d V1, __m128d V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction.
///
/// \param V1
///    A 128-bit vector of [2 x double].
/// \param V2
///    A 128-bit vector of [2 x double].
/// \param M
///    An immediate integer operand, with mask bits [1:0] specifying how the
///    values are to be copied. The position of the mask bit corresponds to the
///    index of a copied value. When a mask bit is 0, the corresponding 64-bit
///    element in operand \a V1 is copied to the same position in the result.
///    When a mask bit is 1, the corresponding 64-bit element in operand \a V2
///    is copied to the same position in the result.
/// \returns A 128-bit vector of [2 x double] containing the copied values.
#define _mm_blend_pd(V1, V2, M) \
  ((__m128d) __builtin_ia32_blendpd ((__v2df)(__m128d)(V1), \
                                     (__v2df)(__m128d)(V2), (int)(M)))

/// Returns a 128-bit vector of [4 x float] where the values are selected
/// from either the first or second operand as specified by the third
/// operand, the control mask.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_blend_ps(__m128 V1, __m128 V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VBLENDPS / BLENDPS </c> instruction.
///
/// \param V1
///    A 128-bit vector of [4 x float].
/// \param V2
///    A 128-bit vector of [4 x float].
/// \param M
///    An immediate integer operand, with mask bits [3:0] specifying how the
///    values are to be copied. The position of the mask bit corresponds to the
///    index of a copied value. When a mask bit is 0, the corresponding 32-bit
///    element in operand \a V1 is copied to the same position in the result.
///    When a mask bit is 1, the corresponding 32-bit element in operand \a V2
///    is copied to the same position in the result.
/// \returns A 128-bit vector of [4 x float] containing the copied values.
410 #define _mm_blend_ps(V1, V2, M) \ 411 ((__m128) __builtin_ia32_blendps ((__v4sf)(__m128)(V1), \ 412 (__v4sf)(__m128)(V2), (int)(M))) 413 414 /// Returns a 128-bit vector of [2 x double] where the values are 415 /// selected from either the first or second operand as specified by the 416 /// third operand, the control mask. 417 /// 418 /// \headerfile <x86intrin.h> 419 /// 420 /// This intrinsic corresponds to the <c> VBLENDVPD / BLENDVPD </c> instruction. 421 /// 422 /// \param __V1 423 /// A 128-bit vector of [2 x double]. 424 /// \param __V2 425 /// A 128-bit vector of [2 x double]. 426 /// \param __M 427 /// A 128-bit vector operand, with mask bits 127 and 63 specifying how the 428 /// values are to be copied. The position of the mask bit corresponds to the 429 /// most significant bit of a copied value. When a mask bit is 0, the 430 /// corresponding 64-bit element in operand \a __V1 is copied to the same 431 /// position in the result. When a mask bit is 1, the corresponding 64-bit 432 /// element in operand \a __V2 is copied to the same position in the result. 433 /// \returns A 128-bit vector of [2 x double] containing the copied values. 434 static __inline__ __m128d __DEFAULT_FN_ATTRS 435 _mm_blendv_pd (__m128d __V1, __m128d __V2, __m128d __M) 436 { 437 return (__m128d) __builtin_ia32_blendvpd ((__v2df)__V1, (__v2df)__V2, 438 (__v2df)__M); 439 } 440 441 /// Returns a 128-bit vector of [4 x float] where the values are 442 /// selected from either the first or second operand as specified by the 443 /// third operand, the control mask. 444 /// 445 /// \headerfile <x86intrin.h> 446 /// 447 /// This intrinsic corresponds to the <c> VBLENDVPS / BLENDVPS </c> instruction. 448 /// 449 /// \param __V1 450 /// A 128-bit vector of [4 x float]. 451 /// \param __V2 452 /// A 128-bit vector of [4 x float]. 453 /// \param __M 454 /// A 128-bit vector operand, with mask bits 127, 95, 63, and 31 specifying 455 /// how the values are to be copied. 
The position of the mask bit corresponds 456 /// to the most significant bit of a copied value. When a mask bit is 0, the 457 /// corresponding 32-bit element in operand \a __V1 is copied to the same 458 /// position in the result. When a mask bit is 1, the corresponding 32-bit 459 /// element in operand \a __V2 is copied to the same position in the result. 460 /// \returns A 128-bit vector of [4 x float] containing the copied values. 461 static __inline__ __m128 __DEFAULT_FN_ATTRS 462 _mm_blendv_ps (__m128 __V1, __m128 __V2, __m128 __M) 463 { 464 return (__m128) __builtin_ia32_blendvps ((__v4sf)__V1, (__v4sf)__V2, 465 (__v4sf)__M); 466 } 467 468 /// Returns a 128-bit vector of [16 x i8] where the values are selected 469 /// from either of the first or second operand as specified by the third 470 /// operand, the control mask. 471 /// 472 /// \headerfile <x86intrin.h> 473 /// 474 /// This intrinsic corresponds to the <c> VPBLENDVB / PBLENDVB </c> instruction. 475 /// 476 /// \param __V1 477 /// A 128-bit vector of [16 x i8]. 478 /// \param __V2 479 /// A 128-bit vector of [16 x i8]. 480 /// \param __M 481 /// A 128-bit vector operand, with mask bits 127, 119, 111...7 specifying 482 /// how the values are to be copied. The position of the mask bit corresponds 483 /// to the most significant bit of a copied value. When a mask bit is 0, the 484 /// corresponding 8-bit element in operand \a __V1 is copied to the same 485 /// position in the result. When a mask bit is 1, the corresponding 8-bit 486 /// element in operand \a __V2 is copied to the same position in the result. 487 /// \returns A 128-bit vector of [16 x i8] containing the copied values. 
488 static __inline__ __m128i __DEFAULT_FN_ATTRS 489 _mm_blendv_epi8 (__m128i __V1, __m128i __V2, __m128i __M) 490 { 491 return (__m128i) __builtin_ia32_pblendvb128 ((__v16qi)__V1, (__v16qi)__V2, 492 (__v16qi)__M); 493 } 494 495 /// Returns a 128-bit vector of [8 x i16] where the values are selected 496 /// from either of the first or second operand as specified by the third 497 /// operand, the control mask. 498 /// 499 /// \headerfile <x86intrin.h> 500 /// 501 /// \code 502 /// __m128i _mm_blend_epi16(__m128i V1, __m128i V2, const int M); 503 /// \endcode 504 /// 505 /// This intrinsic corresponds to the <c> VPBLENDW / PBLENDW </c> instruction. 506 /// 507 /// \param V1 508 /// A 128-bit vector of [8 x i16]. 509 /// \param V2 510 /// A 128-bit vector of [8 x i16]. 511 /// \param M 512 /// An immediate integer operand, with mask bits [7:0] specifying how the 513 /// values are to be copied. The position of the mask bit corresponds to the 514 /// index of a copied value. When a mask bit is 0, the corresponding 16-bit 515 /// element in operand \a V1 is copied to the same position in the result. 516 /// When a mask bit is 1, the corresponding 16-bit element in operand \a V2 517 /// is copied to the same position in the result. 518 /// \returns A 128-bit vector of [8 x i16] containing the copied values. 519 #define _mm_blend_epi16(V1, V2, M) \ 520 ((__m128i) __builtin_ia32_pblendw128 ((__v8hi)(__m128i)(V1), \ 521 (__v8hi)(__m128i)(V2), (int)(M))) 522 523 /* SSE4 Dword Multiply Instructions. */ 524 /// Multiples corresponding elements of two 128-bit vectors of [4 x i32] 525 /// and returns the lower 32 bits of the each product in a 128-bit vector of 526 /// [4 x i32]. 527 /// 528 /// \headerfile <x86intrin.h> 529 /// 530 /// This intrinsic corresponds to the <c> VPMULLD / PMULLD </c> instruction. 531 /// 532 /// \param __V1 533 /// A 128-bit integer vector. 534 /// \param __V2 535 /// A 128-bit integer vector. 
536 /// \returns A 128-bit integer vector containing the products of both operands. 537 static __inline__ __m128i __DEFAULT_FN_ATTRS 538 _mm_mullo_epi32 (__m128i __V1, __m128i __V2) 539 { 540 return (__m128i) ((__v4su)__V1 * (__v4su)__V2); 541 } 542 543 /// Multiplies corresponding even-indexed elements of two 128-bit 544 /// vectors of [4 x i32] and returns a 128-bit vector of [2 x i64] 545 /// containing the products. 546 /// 547 /// \headerfile <x86intrin.h> 548 /// 549 /// This intrinsic corresponds to the <c> VPMULDQ / PMULDQ </c> instruction. 550 /// 551 /// \param __V1 552 /// A 128-bit vector of [4 x i32]. 553 /// \param __V2 554 /// A 128-bit vector of [4 x i32]. 555 /// \returns A 128-bit vector of [2 x i64] containing the products of both 556 /// operands. 557 static __inline__ __m128i __DEFAULT_FN_ATTRS 558 _mm_mul_epi32 (__m128i __V1, __m128i __V2) 559 { 560 return (__m128i) __builtin_ia32_pmuldq128 ((__v4si)__V1, (__v4si)__V2); 561 } 562 563 /* SSE4 Floating Point Dot Product Instructions. */ 564 /// Computes the dot product of the two 128-bit vectors of [4 x float] 565 /// and returns it in the elements of the 128-bit result vector of 566 /// [4 x float]. 567 /// 568 /// The immediate integer operand controls which input elements 569 /// will contribute to the dot product, and where the final results are 570 /// returned. 571 /// 572 /// \headerfile <x86intrin.h> 573 /// 574 /// \code 575 /// __m128 _mm_dp_ps(__m128 X, __m128 Y, const int M); 576 /// \endcode 577 /// 578 /// This intrinsic corresponds to the <c> VDPPS / DPPS </c> instruction. 579 /// 580 /// \param X 581 /// A 128-bit vector of [4 x float]. 582 /// \param Y 583 /// A 128-bit vector of [4 x float]. 584 /// \param M 585 /// An immediate integer operand. Mask bits [7:4] determine which elements 586 /// of the input vectors are used, with bit [4] corresponding to the lowest 587 /// element and bit [7] corresponding to the highest element of each [4 x 588 /// float] vector. 
///    If a bit is set, the corresponding elements from the two
///    input vectors are used as an input for dot product; otherwise that input
///    is treated as zero. Bits [3:0] determine which elements of the result
///    will receive a copy of the final dot product, with bit [0] corresponding
///    to the lowest element and bit [3] corresponding to the highest element of
///    each [4 x float] vector. If a bit is set, the dot product is returned
///    in the corresponding element; otherwise that element is set to zero.
/// \returns A 128-bit vector of [4 x float] containing the dot product.
#define _mm_dp_ps(X, Y, M) \
  ((__m128) __builtin_ia32_dpps((__v4sf)(__m128)(X), \
                                (__v4sf)(__m128)(Y), (M)))

/// Computes the dot product of the two 128-bit vectors of [2 x double]
/// and returns it in the elements of the 128-bit result vector of
/// [2 x double].
///
/// The immediate integer operand controls which input
/// elements will contribute to the dot product, and where the final results
/// are returned.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_dp_pd(__m128d X, __m128d Y, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VDPPD / DPPD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double].
/// \param Y
///    A 128-bit vector of [2 x double].
/// \param M
///    An immediate integer operand. Mask bits [5:4] determine which elements
///    of the input vectors are used, with bit [4] corresponding to the lowest
///    element and bit [5] corresponding to the highest element of each [2 x
///    double] vector. If a bit is set, the corresponding elements from the two
///    input vectors are used as an input for dot product; otherwise that input
///    is treated as zero.
Bits [1:0] determine which elements of the result 627 /// will receive a copy of the final dot product, with bit [0] corresponding 628 /// to the lowest element and bit [1] corresponding to the highest element of 629 /// each [2 x double] vector. If a bit is set, the dot product is returned in 630 /// the corresponding element; otherwise that element is set to zero. 631 #define _mm_dp_pd(X, Y, M) \ 632 ((__m128d) __builtin_ia32_dppd((__v2df)(__m128d)(X), \ 633 (__v2df)(__m128d)(Y), (M))) 634 635 /* SSE4 Streaming Load Hint Instruction. */ 636 /// Loads integer values from a 128-bit aligned memory location to a 637 /// 128-bit integer vector. 638 /// 639 /// \headerfile <x86intrin.h> 640 /// 641 /// This intrinsic corresponds to the <c> VMOVNTDQA / MOVNTDQA </c> instruction. 642 /// 643 /// \param __V 644 /// A pointer to a 128-bit aligned memory location that contains the integer 645 /// values. 646 /// \returns A 128-bit integer vector containing the data stored at the 647 /// specified memory location. 648 static __inline__ __m128i __DEFAULT_FN_ATTRS 649 _mm_stream_load_si128 (__m128i const *__V) 650 { 651 return (__m128i) __builtin_nontemporal_load ((const __v2di *) __V); 652 } 653 654 /* SSE4 Packed Integer Min/Max Instructions. */ 655 /// Compares the corresponding elements of two 128-bit vectors of 656 /// [16 x i8] and returns a 128-bit vector of [16 x i8] containing the lesser 657 /// of the two values. 658 /// 659 /// \headerfile <x86intrin.h> 660 /// 661 /// This intrinsic corresponds to the <c> VPMINSB / PMINSB </c> instruction. 662 /// 663 /// \param __V1 664 /// A 128-bit vector of [16 x i8]. 665 /// \param __V2 666 /// A 128-bit vector of [16 x i8] 667 /// \returns A 128-bit vector of [16 x i8] containing the lesser values. 
668 static __inline__ __m128i __DEFAULT_FN_ATTRS 669 _mm_min_epi8 (__m128i __V1, __m128i __V2) 670 { 671 return (__m128i) __builtin_elementwise_min((__v16qs) __V1, (__v16qs) __V2); 672 } 673 674 /// Compares the corresponding elements of two 128-bit vectors of 675 /// [16 x i8] and returns a 128-bit vector of [16 x i8] containing the 676 /// greater value of the two. 677 /// 678 /// \headerfile <x86intrin.h> 679 /// 680 /// This intrinsic corresponds to the <c> VPMAXSB / PMAXSB </c> instruction. 681 /// 682 /// \param __V1 683 /// A 128-bit vector of [16 x i8]. 684 /// \param __V2 685 /// A 128-bit vector of [16 x i8]. 686 /// \returns A 128-bit vector of [16 x i8] containing the greater values. 687 static __inline__ __m128i __DEFAULT_FN_ATTRS 688 _mm_max_epi8 (__m128i __V1, __m128i __V2) 689 { 690 return (__m128i) __builtin_elementwise_max((__v16qs) __V1, (__v16qs) __V2); 691 } 692 693 /// Compares the corresponding elements of two 128-bit vectors of 694 /// [8 x u16] and returns a 128-bit vector of [8 x u16] containing the lesser 695 /// value of the two. 696 /// 697 /// \headerfile <x86intrin.h> 698 /// 699 /// This intrinsic corresponds to the <c> VPMINUW / PMINUW </c> instruction. 700 /// 701 /// \param __V1 702 /// A 128-bit vector of [8 x u16]. 703 /// \param __V2 704 /// A 128-bit vector of [8 x u16]. 705 /// \returns A 128-bit vector of [8 x u16] containing the lesser values. 706 static __inline__ __m128i __DEFAULT_FN_ATTRS 707 _mm_min_epu16 (__m128i __V1, __m128i __V2) 708 { 709 return (__m128i) __builtin_elementwise_min((__v8hu) __V1, (__v8hu) __V2); 710 } 711 712 /// Compares the corresponding elements of two 128-bit vectors of 713 /// [8 x u16] and returns a 128-bit vector of [8 x u16] containing the 714 /// greater value of the two. 715 /// 716 /// \headerfile <x86intrin.h> 717 /// 718 /// This intrinsic corresponds to the <c> VPMAXUW / PMAXUW </c> instruction. 719 /// 720 /// \param __V1 721 /// A 128-bit vector of [8 x u16]. 
722 /// \param __V2 723 /// A 128-bit vector of [8 x u16]. 724 /// \returns A 128-bit vector of [8 x u16] containing the greater values. 725 static __inline__ __m128i __DEFAULT_FN_ATTRS 726 _mm_max_epu16 (__m128i __V1, __m128i __V2) 727 { 728 return (__m128i) __builtin_elementwise_max((__v8hu) __V1, (__v8hu) __V2); 729 } 730 731 /// Compares the corresponding elements of two 128-bit vectors of 732 /// [4 x i32] and returns a 128-bit vector of [4 x i32] containing the lesser 733 /// value of the two. 734 /// 735 /// \headerfile <x86intrin.h> 736 /// 737 /// This intrinsic corresponds to the <c> VPMINSD / PMINSD </c> instruction. 738 /// 739 /// \param __V1 740 /// A 128-bit vector of [4 x i32]. 741 /// \param __V2 742 /// A 128-bit vector of [4 x i32]. 743 /// \returns A 128-bit vector of [4 x i32] containing the lesser values. 744 static __inline__ __m128i __DEFAULT_FN_ATTRS 745 _mm_min_epi32 (__m128i __V1, __m128i __V2) 746 { 747 return (__m128i) __builtin_elementwise_min((__v4si) __V1, (__v4si) __V2); 748 } 749 750 /// Compares the corresponding elements of two 128-bit vectors of 751 /// [4 x i32] and returns a 128-bit vector of [4 x i32] containing the 752 /// greater value of the two. 753 /// 754 /// \headerfile <x86intrin.h> 755 /// 756 /// This intrinsic corresponds to the <c> VPMAXSD / PMAXSD </c> instruction. 757 /// 758 /// \param __V1 759 /// A 128-bit vector of [4 x i32]. 760 /// \param __V2 761 /// A 128-bit vector of [4 x i32]. 762 /// \returns A 128-bit vector of [4 x i32] containing the greater values. 763 static __inline__ __m128i __DEFAULT_FN_ATTRS 764 _mm_max_epi32 (__m128i __V1, __m128i __V2) 765 { 766 return (__m128i) __builtin_elementwise_max((__v4si) __V1, (__v4si) __V2); 767 } 768 769 /// Compares the corresponding elements of two 128-bit vectors of 770 /// [4 x u32] and returns a 128-bit vector of [4 x u32] containing the lesser 771 /// value of the two. 
772 /// 773 /// \headerfile <x86intrin.h> 774 /// 775 /// This intrinsic corresponds to the <c> VPMINUD / PMINUD </c> instruction. 776 /// 777 /// \param __V1 778 /// A 128-bit vector of [4 x u32]. 779 /// \param __V2 780 /// A 128-bit vector of [4 x u32]. 781 /// \returns A 128-bit vector of [4 x u32] containing the lesser values. 782 static __inline__ __m128i __DEFAULT_FN_ATTRS 783 _mm_min_epu32 (__m128i __V1, __m128i __V2) 784 { 785 return (__m128i) __builtin_elementwise_min((__v4su) __V1, (__v4su) __V2); 786 } 787 788 /// Compares the corresponding elements of two 128-bit vectors of 789 /// [4 x u32] and returns a 128-bit vector of [4 x u32] containing the 790 /// greater value of the two. 791 /// 792 /// \headerfile <x86intrin.h> 793 /// 794 /// This intrinsic corresponds to the <c> VPMAXUD / PMAXUD </c> instruction. 795 /// 796 /// \param __V1 797 /// A 128-bit vector of [4 x u32]. 798 /// \param __V2 799 /// A 128-bit vector of [4 x u32]. 800 /// \returns A 128-bit vector of [4 x u32] containing the greater values. 801 static __inline__ __m128i __DEFAULT_FN_ATTRS 802 _mm_max_epu32 (__m128i __V1, __m128i __V2) 803 { 804 return (__m128i) __builtin_elementwise_max((__v4su) __V1, (__v4su) __V2); 805 } 806 807 /* SSE4 Insertion and Extraction from XMM Register Instructions. */ 808 /// Takes the first argument \a X and inserts an element from the second 809 /// argument \a Y as selected by the third argument \a N. That result then 810 /// has elements zeroed out also as selected by the third argument \a N. The 811 /// resulting 128-bit vector of [4 x float] is then returned. 812 /// 813 /// \headerfile <x86intrin.h> 814 /// 815 /// \code 816 /// __m128 _mm_insert_ps(__m128 X, __m128 Y, const int N); 817 /// \endcode 818 /// 819 /// This intrinsic corresponds to the <c> VINSERTPS </c> instruction. 820 /// 821 /// \param X 822 /// A 128-bit vector source operand of [4 x float]. 
With the exception of 823 /// those bits in the result copied from parameter \a Y and zeroed by bits 824 /// [3:0] of \a N, all bits from this parameter are copied to the result. 825 /// \param Y 826 /// A 128-bit vector source operand of [4 x float]. One single-precision 827 /// floating-point element from this source, as determined by the immediate 828 /// parameter, is copied to the result. 829 /// \param N 830 /// Specifies which bits from operand \a Y will be copied, which bits in the 831 /// result they will be be copied to, and which bits in the result will be 832 /// cleared. The following assignments are made: \n 833 /// Bits [7:6] specify the bits to copy from operand \a Y: \n 834 /// 00: Selects bits [31:0] from operand \a Y. \n 835 /// 01: Selects bits [63:32] from operand \a Y. \n 836 /// 10: Selects bits [95:64] from operand \a Y. \n 837 /// 11: Selects bits [127:96] from operand \a Y. \n 838 /// Bits [5:4] specify the bits in the result to which the selected bits 839 /// from operand \a Y are copied: \n 840 /// 00: Copies the selected bits from \a Y to result bits [31:0]. \n 841 /// 01: Copies the selected bits from \a Y to result bits [63:32]. \n 842 /// 10: Copies the selected bits from \a Y to result bits [95:64]. \n 843 /// 11: Copies the selected bits from \a Y to result bits [127:96]. \n 844 /// Bits[3:0]: If any of these bits are set, the corresponding result 845 /// element is cleared. 846 /// \returns A 128-bit vector of [4 x float] containing the copied 847 /// single-precision floating point elements from the operands. 848 #define _mm_insert_ps(X, Y, N) __builtin_ia32_insertps128((X), (Y), (N)) 849 850 /// Extracts a 32-bit integer from a 128-bit vector of [4 x float] and 851 /// returns it, using the immediate value parameter \a N as a selector. 
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_extract_ps(__m128 X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTPS / EXTRACTPS </c>
/// instruction.
///
/// \param X
///    A 128-bit vector of [4 x float].
/// \param N
///    An immediate value. Bits [1:0] determine which bits from the argument
///    \a X are extracted and returned: \n
///    00: Bits [31:0] of parameter \a X are returned. \n
///    01: Bits [63:32] of parameter \a X are returned. \n
///    10: Bits [95:64] of parameter \a X are returned. \n
///    11: Bits [127:96] of parameter \a X are returned.
/// \returns A 32-bit integer containing the extracted 32 bits of float data.
#define _mm_extract_ps(X, N)                                                   \
  __builtin_bit_cast(                                                          \
      int, __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N)))

/* Miscellaneous insert and extract macros.  */
/* Store into D the single-precision float found at index N of X.  */
#define _MM_EXTRACT_FLOAT(D, X, N)                                             \
  do {                                                                         \
    (D) = __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N));          \
  } while (0)

/* Combine a source index (X), a destination index (Y) and the zeroing bits
   (Z) into an immediate suitable for _mm_insert_ps.  */
#define _MM_MK_INSERTPS_NDX(X, Y, Z) (((X) << 6) | ((Y) << 4) | (Z))

/* Move the float at index N of X into element 0 of the result; the 0x0e
   zeroing bits clear the remaining three elements.  */
#define _MM_PICK_OUT_PS(X, N)                                                  \
  _mm_insert_ps(_mm_setzero_ps(), (X), _MM_MK_INSERTPS_NDX((N), 0, 0x0e))

/* Insert int into packed integer array at index.  */
/// Constructs a 128-bit vector of [16 x i8] by first making a copy of
///    the 128-bit integer vector parameter, and then inserting the lower 8 bits
///    of an integer parameter \a I into an offset specified by the immediate
///    value parameter \a N.
893 /// 894 /// \headerfile <x86intrin.h> 895 /// 896 /// \code 897 /// __m128i _mm_insert_epi8(__m128i X, int I, const int N); 898 /// \endcode 899 /// 900 /// This intrinsic corresponds to the <c> VPINSRB / PINSRB </c> instruction. 901 /// 902 /// \param X 903 /// A 128-bit integer vector of [16 x i8]. This vector is copied to the 904 /// result and then one of the sixteen elements in the result vector is 905 /// replaced by the lower 8 bits of \a I. 906 /// \param I 907 /// An integer. The lower 8 bits of this operand are written to the result 908 /// beginning at the offset specified by \a N. 909 /// \param N 910 /// An immediate value. Bits [3:0] specify the bit offset in the result at 911 /// which the lower 8 bits of \a I are written. \n 912 /// 0000: Bits [7:0] of the result are used for insertion. \n 913 /// 0001: Bits [15:8] of the result are used for insertion. \n 914 /// 0010: Bits [23:16] of the result are used for insertion. \n 915 /// 0011: Bits [31:24] of the result are used for insertion. \n 916 /// 0100: Bits [39:32] of the result are used for insertion. \n 917 /// 0101: Bits [47:40] of the result are used for insertion. \n 918 /// 0110: Bits [55:48] of the result are used for insertion. \n 919 /// 0111: Bits [63:56] of the result are used for insertion. \n 920 /// 1000: Bits [71:64] of the result are used for insertion. \n 921 /// 1001: Bits [79:72] of the result are used for insertion. \n 922 /// 1010: Bits [87:80] of the result are used for insertion. \n 923 /// 1011: Bits [95:88] of the result are used for insertion. \n 924 /// 1100: Bits [103:96] of the result are used for insertion. \n 925 /// 1101: Bits [111:104] of the result are used for insertion. \n 926 /// 1110: Bits [119:112] of the result are used for insertion. \n 927 /// 1111: Bits [127:120] of the result are used for insertion. 928 /// \returns A 128-bit integer vector containing the constructed values. 
929 #define _mm_insert_epi8(X, I, N) \ 930 ((__m128i)__builtin_ia32_vec_set_v16qi((__v16qi)(__m128i)(X), \ 931 (int)(I), (int)(N))) 932 933 /// Constructs a 128-bit vector of [4 x i32] by first making a copy of 934 /// the 128-bit integer vector parameter, and then inserting the 32-bit 935 /// integer parameter \a I at the offset specified by the immediate value 936 /// parameter \a N. 937 /// 938 /// \headerfile <x86intrin.h> 939 /// 940 /// \code 941 /// __m128i _mm_insert_epi32(__m128i X, int I, const int N); 942 /// \endcode 943 /// 944 /// This intrinsic corresponds to the <c> VPINSRD / PINSRD </c> instruction. 945 /// 946 /// \param X 947 /// A 128-bit integer vector of [4 x i32]. This vector is copied to the 948 /// result and then one of the four elements in the result vector is 949 /// replaced by \a I. 950 /// \param I 951 /// A 32-bit integer that is written to the result beginning at the offset 952 /// specified by \a N. 953 /// \param N 954 /// An immediate value. Bits [1:0] specify the bit offset in the result at 955 /// which the integer \a I is written. \n 956 /// 00: Bits [31:0] of the result are used for insertion. \n 957 /// 01: Bits [63:32] of the result are used for insertion. \n 958 /// 10: Bits [95:64] of the result are used for insertion. \n 959 /// 11: Bits [127:96] of the result are used for insertion. 960 /// \returns A 128-bit integer vector containing the constructed values. 961 #define _mm_insert_epi32(X, I, N) \ 962 ((__m128i)__builtin_ia32_vec_set_v4si((__v4si)(__m128i)(X), \ 963 (int)(I), (int)(N))) 964 965 #ifdef __x86_64__ 966 /// Constructs a 128-bit vector of [2 x i64] by first making a copy of 967 /// the 128-bit integer vector parameter, and then inserting the 64-bit 968 /// integer parameter \a I, using the immediate value parameter \a N as an 969 /// insertion location selector. 
970 /// 971 /// \headerfile <x86intrin.h> 972 /// 973 /// \code 974 /// __m128i _mm_insert_epi64(__m128i X, long long I, const int N); 975 /// \endcode 976 /// 977 /// This intrinsic corresponds to the <c> VPINSRQ / PINSRQ </c> instruction. 978 /// 979 /// \param X 980 /// A 128-bit integer vector of [2 x i64]. This vector is copied to the 981 /// result and then one of the two elements in the result vector is replaced 982 /// by \a I. 983 /// \param I 984 /// A 64-bit integer that is written to the result beginning at the offset 985 /// specified by \a N. 986 /// \param N 987 /// An immediate value. Bit [0] specifies the bit offset in the result at 988 /// which the integer \a I is written. \n 989 /// 0: Bits [63:0] of the result are used for insertion. \n 990 /// 1: Bits [127:64] of the result are used for insertion. \n 991 /// \returns A 128-bit integer vector containing the constructed values. 992 #define _mm_insert_epi64(X, I, N) \ 993 ((__m128i)__builtin_ia32_vec_set_v2di((__v2di)(__m128i)(X), \ 994 (long long)(I), (int)(N))) 995 #endif /* __x86_64__ */ 996 997 /* Extract int from packed integer array at index. This returns the element 998 * as a zero extended value, so it is unsigned. 999 */ 1000 /// Extracts an 8-bit element from the 128-bit integer vector of 1001 /// [16 x i8], using the immediate value parameter \a N as a selector. 1002 /// 1003 /// \headerfile <x86intrin.h> 1004 /// 1005 /// \code 1006 /// int _mm_extract_epi8(__m128i X, const int N); 1007 /// \endcode 1008 /// 1009 /// This intrinsic corresponds to the <c> VPEXTRB / PEXTRB </c> instruction. 1010 /// 1011 /// \param X 1012 /// A 128-bit integer vector. 1013 /// \param N 1014 /// An immediate value. Bits [3:0] specify which 8-bit vector element from 1015 /// the argument \a X to extract and copy to the result. \n 1016 /// 0000: Bits [7:0] of parameter \a X are extracted. \n 1017 /// 0001: Bits [15:8] of the parameter \a X are extracted. 
\n 1018 /// 0010: Bits [23:16] of the parameter \a X are extracted. \n 1019 /// 0011: Bits [31:24] of the parameter \a X are extracted. \n 1020 /// 0100: Bits [39:32] of the parameter \a X are extracted. \n 1021 /// 0101: Bits [47:40] of the parameter \a X are extracted. \n 1022 /// 0110: Bits [55:48] of the parameter \a X are extracted. \n 1023 /// 0111: Bits [63:56] of the parameter \a X are extracted. \n 1024 /// 1000: Bits [71:64] of the parameter \a X are extracted. \n 1025 /// 1001: Bits [79:72] of the parameter \a X are extracted. \n 1026 /// 1010: Bits [87:80] of the parameter \a X are extracted. \n 1027 /// 1011: Bits [95:88] of the parameter \a X are extracted. \n 1028 /// 1100: Bits [103:96] of the parameter \a X are extracted. \n 1029 /// 1101: Bits [111:104] of the parameter \a X are extracted. \n 1030 /// 1110: Bits [119:112] of the parameter \a X are extracted. \n 1031 /// 1111: Bits [127:120] of the parameter \a X are extracted. 1032 /// \returns An unsigned integer, whose lower 8 bits are selected from the 1033 /// 128-bit integer vector parameter and the remaining bits are assigned 1034 /// zeros. 1035 #define _mm_extract_epi8(X, N) \ 1036 ((int)(unsigned char)__builtin_ia32_vec_ext_v16qi((__v16qi)(__m128i)(X), \ 1037 (int)(N))) 1038 1039 /// Extracts a 32-bit element from the 128-bit integer vector of 1040 /// [4 x i32], using the immediate value parameter \a N as a selector. 1041 /// 1042 /// \headerfile <x86intrin.h> 1043 /// 1044 /// \code 1045 /// int _mm_extract_epi32(__m128i X, const int N); 1046 /// \endcode 1047 /// 1048 /// This intrinsic corresponds to the <c> VPEXTRD / PEXTRD </c> instruction. 1049 /// 1050 /// \param X 1051 /// A 128-bit integer vector. 1052 /// \param N 1053 /// An immediate value. Bits [1:0] specify which 32-bit vector element from 1054 /// the argument \a X to extract and copy to the result. \n 1055 /// 00: Bits [31:0] of the parameter \a X are extracted. 
\n 1056 /// 01: Bits [63:32] of the parameter \a X are extracted. \n 1057 /// 10: Bits [95:64] of the parameter \a X are extracted. \n 1058 /// 11: Bits [127:96] of the parameter \a X are exracted. 1059 /// \returns An integer, whose lower 32 bits are selected from the 128-bit 1060 /// integer vector parameter and the remaining bits are assigned zeros. 1061 #define _mm_extract_epi32(X, N) \ 1062 ((int)__builtin_ia32_vec_ext_v4si((__v4si)(__m128i)(X), (int)(N))) 1063 1064 #ifdef __x86_64__ 1065 /// Extracts a 64-bit element from the 128-bit integer vector of 1066 /// [2 x i64], using the immediate value parameter \a N as a selector. 1067 /// 1068 /// \headerfile <x86intrin.h> 1069 /// 1070 /// \code 1071 /// long long _mm_extract_epi64(__m128i X, const int N); 1072 /// \endcode 1073 /// 1074 /// This intrinsic corresponds to the <c> VPEXTRQ / PEXTRQ </c> instruction. 1075 /// 1076 /// \param X 1077 /// A 128-bit integer vector. 1078 /// \param N 1079 /// An immediate value. Bit [0] specifies which 64-bit vector element from 1080 /// the argument \a X to return. \n 1081 /// 0: Bits [63:0] are returned. \n 1082 /// 1: Bits [127:64] are returned. \n 1083 /// \returns A 64-bit integer. 1084 #define _mm_extract_epi64(X, N) \ 1085 ((long long)__builtin_ia32_vec_ext_v2di((__v2di)(__m128i)(X), (int)(N))) 1086 #endif /* __x86_64 */ 1087 1088 /* SSE4 128-bit Packed Integer Comparisons. */ 1089 /// Tests whether the specified bits in a 128-bit integer vector are all 1090 /// zeros. 1091 /// 1092 /// \headerfile <x86intrin.h> 1093 /// 1094 /// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction. 1095 /// 1096 /// \param __M 1097 /// A 128-bit integer vector containing the bits to be tested. 1098 /// \param __V 1099 /// A 128-bit integer vector selecting which bits to test in operand \a __M. 1100 /// \returns TRUE if the specified bits are all zeros; FALSE otherwise. 
1101 static __inline__ int __DEFAULT_FN_ATTRS 1102 _mm_testz_si128(__m128i __M, __m128i __V) 1103 { 1104 return __builtin_ia32_ptestz128((__v2di)__M, (__v2di)__V); 1105 } 1106 1107 /// Tests whether the specified bits in a 128-bit integer vector are all 1108 /// ones. 1109 /// 1110 /// \headerfile <x86intrin.h> 1111 /// 1112 /// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction. 1113 /// 1114 /// \param __M 1115 /// A 128-bit integer vector containing the bits to be tested. 1116 /// \param __V 1117 /// A 128-bit integer vector selecting which bits to test in operand \a __M. 1118 /// \returns TRUE if the specified bits are all ones; FALSE otherwise. 1119 static __inline__ int __DEFAULT_FN_ATTRS 1120 _mm_testc_si128(__m128i __M, __m128i __V) 1121 { 1122 return __builtin_ia32_ptestc128((__v2di)__M, (__v2di)__V); 1123 } 1124 1125 /// Tests whether the specified bits in a 128-bit integer vector are 1126 /// neither all zeros nor all ones. 1127 /// 1128 /// \headerfile <x86intrin.h> 1129 /// 1130 /// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction. 1131 /// 1132 /// \param __M 1133 /// A 128-bit integer vector containing the bits to be tested. 1134 /// \param __V 1135 /// A 128-bit integer vector selecting which bits to test in operand \a __M. 1136 /// \returns TRUE if the specified bits are neither all zeros nor all ones; 1137 /// FALSE otherwise. 1138 static __inline__ int __DEFAULT_FN_ATTRS 1139 _mm_testnzc_si128(__m128i __M, __m128i __V) 1140 { 1141 return __builtin_ia32_ptestnzc128((__v2di)__M, (__v2di)__V); 1142 } 1143 1144 /// Tests whether the specified bits in a 128-bit integer vector are all 1145 /// ones. 1146 /// 1147 /// \headerfile <x86intrin.h> 1148 /// 1149 /// \code 1150 /// int _mm_test_all_ones(__m128i V); 1151 /// \endcode 1152 /// 1153 /// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction. 1154 /// 1155 /// \param V 1156 /// A 128-bit integer vector containing the bits to be tested. 
/// \returns TRUE if the bits specified in the operand are all set to 1; FALSE
///    otherwise.
/* The all-ones mask is built with _mm_set1_epi32(-1) so that the argument V is
   expanded exactly once; an argument with side effects is therefore evaluated
   a single time. */
#define _mm_test_all_ones(V) _mm_testc_si128((V), _mm_set1_epi32(-1))

/// Tests whether the specified bits in a 128-bit integer vector are
///    neither all zeros nor all ones.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_test_mix_ones_zeros(__m128i M, __m128i V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
///
/// \param M
///    A 128-bit integer vector containing the bits to be tested.
/// \param V
///    A 128-bit integer vector selecting which bits to test in operand \a M.
/// \returns TRUE if the specified bits are neither all zeros nor all ones;
///    FALSE otherwise.
#define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128((M), (V))

/// Tests whether the specified bits in a 128-bit integer vector are all
///    zeros.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_test_all_zeros(__m128i M, __m128i V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
///
/// \param M
///    A 128-bit integer vector containing the bits to be tested.
/// \param V
///    A 128-bit integer vector selecting which bits to test in operand \a M.
/// \returns TRUE if the specified bits are all zeros; FALSE otherwise.
#define _mm_test_all_zeros(M, V) _mm_testz_si128((M), (V))

/* SSE4 64-bit Packed Integer Comparisons.  */
/// Compares each of the corresponding 64-bit values of the 128-bit
///    integer vectors for equality.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPCMPEQQ / PCMPEQQ </c> instruction.
///
/// \param __V1
///    A 128-bit integer vector.
1208 /// \param __V2 1209 /// A 128-bit integer vector. 1210 /// \returns A 128-bit integer vector containing the comparison results. 1211 static __inline__ __m128i __DEFAULT_FN_ATTRS 1212 _mm_cmpeq_epi64(__m128i __V1, __m128i __V2) 1213 { 1214 return (__m128i)((__v2di)__V1 == (__v2di)__V2); 1215 } 1216 1217 /* SSE4 Packed Integer Sign-Extension. */ 1218 /// Sign-extends each of the lower eight 8-bit integer elements of a 1219 /// 128-bit vector of [16 x i8] to 16-bit values and returns them in a 1220 /// 128-bit vector of [8 x i16]. The upper eight elements of the input vector 1221 /// are unused. 1222 /// 1223 /// \headerfile <x86intrin.h> 1224 /// 1225 /// This intrinsic corresponds to the <c> VPMOVSXBW / PMOVSXBW </c> instruction. 1226 /// 1227 /// \param __V 1228 /// A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are sign- 1229 /// extended to 16-bit values. 1230 /// \returns A 128-bit vector of [8 x i16] containing the sign-extended values. 1231 static __inline__ __m128i __DEFAULT_FN_ATTRS 1232 _mm_cvtepi8_epi16(__m128i __V) 1233 { 1234 /* This function always performs a signed extension, but __v16qi is a char 1235 which may be signed or unsigned, so use __v16qs. */ 1236 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8hi); 1237 } 1238 1239 /// Sign-extends each of the lower four 8-bit integer elements of a 1240 /// 128-bit vector of [16 x i8] to 32-bit values and returns them in a 1241 /// 128-bit vector of [4 x i32]. The upper twelve elements of the input 1242 /// vector are unused. 1243 /// 1244 /// \headerfile <x86intrin.h> 1245 /// 1246 /// This intrinsic corresponds to the <c> VPMOVSXBD / PMOVSXBD </c> instruction. 1247 /// 1248 /// \param __V 1249 /// A 128-bit vector of [16 x i8]. The lower four 8-bit elements are 1250 /// sign-extended to 32-bit values. 1251 /// \returns A 128-bit vector of [4 x i32] containing the sign-extended values. 
1252 static __inline__ __m128i __DEFAULT_FN_ATTRS 1253 _mm_cvtepi8_epi32(__m128i __V) 1254 { 1255 /* This function always performs a signed extension, but __v16qi is a char 1256 which may be signed or unsigned, so use __v16qs. */ 1257 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4si); 1258 } 1259 1260 /// Sign-extends each of the lower two 8-bit integer elements of a 1261 /// 128-bit integer vector of [16 x i8] to 64-bit values and returns them in 1262 /// a 128-bit vector of [2 x i64]. The upper fourteen elements of the input 1263 /// vector are unused. 1264 /// 1265 /// \headerfile <x86intrin.h> 1266 /// 1267 /// This intrinsic corresponds to the <c> VPMOVSXBQ / PMOVSXBQ </c> instruction. 1268 /// 1269 /// \param __V 1270 /// A 128-bit vector of [16 x i8]. The lower two 8-bit elements are 1271 /// sign-extended to 64-bit values. 1272 /// \returns A 128-bit vector of [2 x i64] containing the sign-extended values. 1273 static __inline__ __m128i __DEFAULT_FN_ATTRS 1274 _mm_cvtepi8_epi64(__m128i __V) 1275 { 1276 /* This function always performs a signed extension, but __v16qi is a char 1277 which may be signed or unsigned, so use __v16qs. */ 1278 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1), __v2di); 1279 } 1280 1281 /// Sign-extends each of the lower four 16-bit integer elements of a 1282 /// 128-bit integer vector of [8 x i16] to 32-bit values and returns them in 1283 /// a 128-bit vector of [4 x i32]. The upper four elements of the input 1284 /// vector are unused. 1285 /// 1286 /// \headerfile <x86intrin.h> 1287 /// 1288 /// This intrinsic corresponds to the <c> VPMOVSXWD / PMOVSXWD </c> instruction. 1289 /// 1290 /// \param __V 1291 /// A 128-bit vector of [8 x i16]. The lower four 16-bit elements are 1292 /// sign-extended to 32-bit values. 1293 /// \returns A 128-bit vector of [4 x i32] containing the sign-extended values. 
1294 static __inline__ __m128i __DEFAULT_FN_ATTRS 1295 _mm_cvtepi16_epi32(__m128i __V) 1296 { 1297 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4si); 1298 } 1299 1300 /// Sign-extends each of the lower two 16-bit integer elements of a 1301 /// 128-bit integer vector of [8 x i16] to 64-bit values and returns them in 1302 /// a 128-bit vector of [2 x i64]. The upper six elements of the input 1303 /// vector are unused. 1304 /// 1305 /// \headerfile <x86intrin.h> 1306 /// 1307 /// This intrinsic corresponds to the <c> VPMOVSXWQ / PMOVSXWQ </c> instruction. 1308 /// 1309 /// \param __V 1310 /// A 128-bit vector of [8 x i16]. The lower two 16-bit elements are 1311 /// sign-extended to 64-bit values. 1312 /// \returns A 128-bit vector of [2 x i64] containing the sign-extended values. 1313 static __inline__ __m128i __DEFAULT_FN_ATTRS 1314 _mm_cvtepi16_epi64(__m128i __V) 1315 { 1316 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1), __v2di); 1317 } 1318 1319 /// Sign-extends each of the lower two 32-bit integer elements of a 1320 /// 128-bit integer vector of [4 x i32] to 64-bit values and returns them in 1321 /// a 128-bit vector of [2 x i64]. The upper two elements of the input vector 1322 /// are unused. 1323 /// 1324 /// \headerfile <x86intrin.h> 1325 /// 1326 /// This intrinsic corresponds to the <c> VPMOVSXDQ / PMOVSXDQ </c> instruction. 1327 /// 1328 /// \param __V 1329 /// A 128-bit vector of [4 x i32]. The lower two 32-bit elements are 1330 /// sign-extended to 64-bit values. 1331 /// \returns A 128-bit vector of [2 x i64] containing the sign-extended values. 1332 static __inline__ __m128i __DEFAULT_FN_ATTRS 1333 _mm_cvtepi32_epi64(__m128i __V) 1334 { 1335 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v4si)__V, (__v4si)__V, 0, 1), __v2di); 1336 } 1337 1338 /* SSE4 Packed Integer Zero-Extension. 
*/ 1339 /// Zero-extends each of the lower eight 8-bit integer elements of a 1340 /// 128-bit vector of [16 x i8] to 16-bit values and returns them in a 1341 /// 128-bit vector of [8 x i16]. The upper eight elements of the input vector 1342 /// are unused. 1343 /// 1344 /// \headerfile <x86intrin.h> 1345 /// 1346 /// This intrinsic corresponds to the <c> VPMOVZXBW / PMOVZXBW </c> instruction. 1347 /// 1348 /// \param __V 1349 /// A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are 1350 /// zero-extended to 16-bit values. 1351 /// \returns A 128-bit vector of [8 x i16] containing the zero-extended values. 1352 static __inline__ __m128i __DEFAULT_FN_ATTRS 1353 _mm_cvtepu8_epi16(__m128i __V) 1354 { 1355 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8hi); 1356 } 1357 1358 /// Zero-extends each of the lower four 8-bit integer elements of a 1359 /// 128-bit vector of [16 x i8] to 32-bit values and returns them in a 1360 /// 128-bit vector of [4 x i32]. The upper twelve elements of the input 1361 /// vector are unused. 1362 /// 1363 /// \headerfile <x86intrin.h> 1364 /// 1365 /// This intrinsic corresponds to the <c> VPMOVZXBD / PMOVZXBD </c> instruction. 1366 /// 1367 /// \param __V 1368 /// A 128-bit vector of [16 x i8]. The lower four 8-bit elements are 1369 /// zero-extended to 32-bit values. 1370 /// \returns A 128-bit vector of [4 x i32] containing the zero-extended values. 1371 static __inline__ __m128i __DEFAULT_FN_ATTRS 1372 _mm_cvtepu8_epi32(__m128i __V) 1373 { 1374 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4si); 1375 } 1376 1377 /// Zero-extends each of the lower two 8-bit integer elements of a 1378 /// 128-bit integer vector of [16 x i8] to 64-bit values and returns them in 1379 /// a 128-bit vector of [2 x i64]. The upper fourteen elements of the input 1380 /// vector are unused. 
1381 /// 1382 /// \headerfile <x86intrin.h> 1383 /// 1384 /// This intrinsic corresponds to the <c> VPMOVZXBQ / PMOVZXBQ </c> instruction. 1385 /// 1386 /// \param __V 1387 /// A 128-bit vector of [16 x i8]. The lower two 8-bit elements are 1388 /// zero-extended to 64-bit values. 1389 /// \returns A 128-bit vector of [2 x i64] containing the zero-extended values. 1390 static __inline__ __m128i __DEFAULT_FN_ATTRS 1391 _mm_cvtepu8_epi64(__m128i __V) 1392 { 1393 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1), __v2di); 1394 } 1395 1396 /// Zero-extends each of the lower four 16-bit integer elements of a 1397 /// 128-bit integer vector of [8 x i16] to 32-bit values and returns them in 1398 /// a 128-bit vector of [4 x i32]. The upper four elements of the input 1399 /// vector are unused. 1400 /// 1401 /// \headerfile <x86intrin.h> 1402 /// 1403 /// This intrinsic corresponds to the <c> VPMOVZXWD / PMOVZXWD </c> instruction. 1404 /// 1405 /// \param __V 1406 /// A 128-bit vector of [8 x i16]. The lower four 16-bit elements are 1407 /// zero-extended to 32-bit values. 1408 /// \returns A 128-bit vector of [4 x i32] containing the zero-extended values. 1409 static __inline__ __m128i __DEFAULT_FN_ATTRS 1410 _mm_cvtepu16_epi32(__m128i __V) 1411 { 1412 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4si); 1413 } 1414 1415 /// Zero-extends each of the lower two 16-bit integer elements of a 1416 /// 128-bit integer vector of [8 x i16] to 64-bit values and returns them in 1417 /// a 128-bit vector of [2 x i64]. The upper six elements of the input vector 1418 /// are unused. 1419 /// 1420 /// \headerfile <x86intrin.h> 1421 /// 1422 /// This intrinsic corresponds to the <c> VPMOVZXWQ / PMOVZXWQ </c> instruction. 1423 /// 1424 /// \param __V 1425 /// A 128-bit vector of [8 x i16]. The lower two 16-bit elements are 1426 /// zero-extended to 64-bit values. 
1427 /// \returns A 128-bit vector of [2 x i64] containing the zero-extended values. 1428 static __inline__ __m128i __DEFAULT_FN_ATTRS 1429 _mm_cvtepu16_epi64(__m128i __V) 1430 { 1431 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1), __v2di); 1432 } 1433 1434 /// Zero-extends each of the lower two 32-bit integer elements of a 1435 /// 128-bit integer vector of [4 x i32] to 64-bit values and returns them in 1436 /// a 128-bit vector of [2 x i64]. The upper two elements of the input vector 1437 /// are unused. 1438 /// 1439 /// \headerfile <x86intrin.h> 1440 /// 1441 /// This intrinsic corresponds to the <c> VPMOVZXDQ / PMOVZXDQ </c> instruction. 1442 /// 1443 /// \param __V 1444 /// A 128-bit vector of [4 x i32]. The lower two 32-bit elements are 1445 /// zero-extended to 64-bit values. 1446 /// \returns A 128-bit vector of [2 x i64] containing the zero-extended values. 1447 static __inline__ __m128i __DEFAULT_FN_ATTRS 1448 _mm_cvtepu32_epi64(__m128i __V) 1449 { 1450 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v4su)__V, (__v4su)__V, 0, 1), __v2di); 1451 } 1452 1453 /* SSE4 Pack with Unsigned Saturation. */ 1454 /// Converts 32-bit signed integers from both 128-bit integer vector 1455 /// operands into 16-bit unsigned integers, and returns the packed result. 1456 /// Values greater than 0xFFFF are saturated to 0xFFFF. Values less than 1457 /// 0x0000 are saturated to 0x0000. 1458 /// 1459 /// \headerfile <x86intrin.h> 1460 /// 1461 /// This intrinsic corresponds to the <c> VPACKUSDW / PACKUSDW </c> instruction. 1462 /// 1463 /// \param __V1 1464 /// A 128-bit vector of [4 x i32]. Each 32-bit element is treated as a 1465 /// signed integer and is converted to a 16-bit unsigned integer with 1466 /// saturation. Values greater than 0xFFFF are saturated to 0xFFFF. Values 1467 /// less than 0x0000 are saturated to 0x0000. 
The converted [4 x i16] values 1468 /// are written to the lower 64 bits of the result. 1469 /// \param __V2 1470 /// A 128-bit vector of [4 x i32]. Each 32-bit element is treated as a 1471 /// signed integer and is converted to a 16-bit unsigned integer with 1472 /// saturation. Values greater than 0xFFFF are saturated to 0xFFFF. Values 1473 /// less than 0x0000 are saturated to 0x0000. The converted [4 x i16] values 1474 /// are written to the higher 64 bits of the result. 1475 /// \returns A 128-bit vector of [8 x i16] containing the converted values. 1476 static __inline__ __m128i __DEFAULT_FN_ATTRS 1477 _mm_packus_epi32(__m128i __V1, __m128i __V2) 1478 { 1479 return (__m128i) __builtin_ia32_packusdw128((__v4si)__V1, (__v4si)__V2); 1480 } 1481 1482 /* SSE4 Multiple Packed Sums of Absolute Difference. */ 1483 /// Subtracts 8-bit unsigned integer values and computes the absolute 1484 /// values of the differences to the corresponding bits in the destination. 1485 /// Then sums of the absolute differences are returned according to the bit 1486 /// fields in the immediate operand. 1487 /// 1488 /// \headerfile <x86intrin.h> 1489 /// 1490 /// \code 1491 /// __m128i _mm_mpsadbw_epu8(__m128i X, __m128i Y, const int M); 1492 /// \endcode 1493 /// 1494 /// This intrinsic corresponds to the <c> VMPSADBW / MPSADBW </c> instruction. 1495 /// 1496 /// \param X 1497 /// A 128-bit vector of [16 x i8]. 1498 /// \param Y 1499 /// A 128-bit vector of [16 x i8]. 
1500 /// \param M 1501 /// An 8-bit immediate operand specifying how the absolute differences are to 1502 /// be calculated, according to the following algorithm: 1503 /// \code 1504 /// // M2 represents bit 2 of the immediate operand 1505 /// // M10 represents bits [1:0] of the immediate operand 1506 /// i = M2 * 4; 1507 /// j = M10 * 4; 1508 /// for (k = 0; k < 8; k = k + 1) { 1509 /// d0 = abs(X[i + k + 0] - Y[j + 0]); 1510 /// d1 = abs(X[i + k + 1] - Y[j + 1]); 1511 /// d2 = abs(X[i + k + 2] - Y[j + 2]); 1512 /// d3 = abs(X[i + k + 3] - Y[j + 3]); 1513 /// r[k] = d0 + d1 + d2 + d3; 1514 /// } 1515 /// \endcode 1516 /// \returns A 128-bit integer vector containing the sums of the sets of 1517 /// absolute differences between both operands. 1518 #define _mm_mpsadbw_epu8(X, Y, M) \ 1519 ((__m128i) __builtin_ia32_mpsadbw128((__v16qi)(__m128i)(X), \ 1520 (__v16qi)(__m128i)(Y), (M))) 1521 1522 /// Finds the minimum unsigned 16-bit element in the input 128-bit 1523 /// vector of [8 x u16] and returns it and along with its index. 1524 /// 1525 /// \headerfile <x86intrin.h> 1526 /// 1527 /// This intrinsic corresponds to the <c> VPHMINPOSUW / PHMINPOSUW </c> 1528 /// instruction. 1529 /// 1530 /// \param __V 1531 /// A 128-bit vector of [8 x u16]. 1532 /// \returns A 128-bit value where bits [15:0] contain the minimum value found 1533 /// in parameter \a __V, bits [18:16] contain the index of the minimum value 1534 /// and the remaining bits are set to 0. 1535 static __inline__ __m128i __DEFAULT_FN_ATTRS 1536 _mm_minpos_epu16(__m128i __V) 1537 { 1538 return (__m128i) __builtin_ia32_phminposuw128((__v8hi)__V); 1539 } 1540 1541 /* Handle the sse4.2 definitions here. */ 1542 1543 /* These definitions are normally in nmmintrin.h, but gcc puts them in here 1544 so we'll do the same. 
*/ 1545 1546 #undef __DEFAULT_FN_ATTRS 1547 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4.2"))) 1548 1549 /* These specify the type of data that we're comparing. */ 1550 #define _SIDD_UBYTE_OPS 0x00 1551 #define _SIDD_UWORD_OPS 0x01 1552 #define _SIDD_SBYTE_OPS 0x02 1553 #define _SIDD_SWORD_OPS 0x03 1554 1555 /* These specify the type of comparison operation. */ 1556 #define _SIDD_CMP_EQUAL_ANY 0x00 1557 #define _SIDD_CMP_RANGES 0x04 1558 #define _SIDD_CMP_EQUAL_EACH 0x08 1559 #define _SIDD_CMP_EQUAL_ORDERED 0x0c 1560 1561 /* These macros specify the polarity of the operation. */ 1562 #define _SIDD_POSITIVE_POLARITY 0x00 1563 #define _SIDD_NEGATIVE_POLARITY 0x10 1564 #define _SIDD_MASKED_POSITIVE_POLARITY 0x20 1565 #define _SIDD_MASKED_NEGATIVE_POLARITY 0x30 1566 1567 /* These macros are used in _mm_cmpXstri() to specify the return. */ 1568 #define _SIDD_LEAST_SIGNIFICANT 0x00 1569 #define _SIDD_MOST_SIGNIFICANT 0x40 1570 1571 /* These macros are used in _mm_cmpXstri() to specify the return. */ 1572 #define _SIDD_BIT_MASK 0x00 1573 #define _SIDD_UNIT_MASK 0x40 1574 1575 /* SSE4.2 Packed Comparison Intrinsics. */ 1576 /// Uses the immediate operand \a M to perform a comparison of string 1577 /// data with implicitly defined lengths that is contained in source operands 1578 /// \a A and \a B. Returns a 128-bit integer vector representing the result 1579 /// mask of the comparison. 1580 /// 1581 /// \headerfile <x86intrin.h> 1582 /// 1583 /// \code 1584 /// __m128i _mm_cmpistrm(__m128i A, __m128i B, const int M); 1585 /// \endcode 1586 /// 1587 /// This intrinsic corresponds to the <c> VPCMPISTRM / PCMPISTRM </c> 1588 /// instruction. 1589 /// 1590 /// \param A 1591 /// A 128-bit integer vector containing one of the source operands to be 1592 /// compared. 1593 /// \param B 1594 /// A 128-bit integer vector containing one of the source operands to be 1595 /// compared. 
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words, the type of comparison to perform, and the format of the return
///    value. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
///    Bit [6]: Determines whether the result is zero-extended or expanded to 16
///             bytes. \n
///      0: The result is zero-extended to 16 bytes. \n
///      1: The result is expanded to 16 bytes (this expansion is performed by
///         repeating each bit 8 or 16 times).
/// \returns Returns a 128-bit integer vector representing the result mask of
///    the comparison.
#define _mm_cmpistrm(A, B, M) \
  ((__m128i)__builtin_ia32_pcmpistrm128((__v16qi)(__m128i)(A), \
                                        (__v16qi)(__m128i)(B), (int)(M)))

/// Uses the immediate operand \a M to perform a comparison of string
///    data with implicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns an integer representing the result index of the
///    comparison.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpistri(__m128i A, __m128i B, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words, the type of comparison to perform, and the format of the return
///    value. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
///    Bit [6]: Determines whether the index of the lowest set bit or the
///             highest set bit is returned. \n
///      0: The index of the least significant set bit. \n
///      1: The index of the most significant set bit. \n
/// \returns Returns an integer representing the result index of the comparison.
#define _mm_cmpistri(A, B, M) \
  ((int)__builtin_ia32_pcmpistri128((__v16qi)(__m128i)(A), \
                                    (__v16qi)(__m128i)(B), (int)(M)))

/// Uses the immediate operand \a M to perform a comparison of string
///    data with explicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns a 128-bit integer vector representing the result
///    mask of the comparison.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_cmpestrm(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPESTRM / PCMPESTRM </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LA
///    An integer that specifies the length of the string in \a A.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LB
///    An integer that specifies the length of the string in \a B.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words, the type of comparison to perform, and the format of the return
///    value. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
///    Bit [6]: Determines whether the result is zero-extended or expanded to 16
///             bytes. \n
///      0: The result is zero-extended to 16 bytes. \n
///      1: The result is expanded to 16 bytes (this expansion is performed by
///         repeating each bit 8 or 16 times). \n
/// \returns Returns a 128-bit integer vector representing the result mask of
///    the comparison.
#define _mm_cmpestrm(A, LA, B, LB, M) \
  ((__m128i)__builtin_ia32_pcmpestrm128((__v16qi)(__m128i)(A), (int)(LA), \
                                        (__v16qi)(__m128i)(B), (int)(LB), \
                                        (int)(M)))

/// Uses the immediate operand \a M to perform a comparison of string
///    data with explicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns an integer representing the result index of the
///    comparison.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpestri(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LA
///    An integer that specifies the length of the string in \a A.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LB
///    An integer that specifies the length of the string in \a B.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words, the type of comparison to perform, and the format of the return
///    value. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
///    Bit [6]: Determines whether the index of the lowest set bit or the
///             highest set bit is returned. \n
///      0: The index of the least significant set bit. \n
///      1: The index of the most significant set bit. \n
/// \returns Returns an integer representing the result index of the comparison.
#define _mm_cmpestri(A, LA, B, LB, M) \
  ((int)__builtin_ia32_pcmpestri128((__v16qi)(__m128i)(A), (int)(LA), \
                                    (__v16qi)(__m128i)(B), (int)(LB), \
                                    (int)(M)))

/* SSE4.2 Packed Comparison Intrinsics and EFlag Reading.  */
/// Uses the immediate operand \a M to perform a comparison of string
///    data with implicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the bit mask is zero and the length of the
///    string in \a B is the maximum, otherwise, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpistra(__m128i A, __m128i B, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
/// \returns Returns 1 if the bit mask is zero and the length of the string in
///    \a B is the maximum; otherwise, returns 0.
#define _mm_cmpistra(A, B, M) \
  ((int)__builtin_ia32_pcmpistria128((__v16qi)(__m128i)(A), \
                                     (__v16qi)(__m128i)(B), (int)(M)))

/// Uses the immediate operand \a M to perform a comparison of string
///    data with implicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the bit mask is non-zero, otherwise, returns
///    0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpistrc(__m128i A, __m128i B, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B.
/// \returns Returns 1 if the bit mask is non-zero, otherwise, returns 0.
#define _mm_cmpistrc(A, B, M) \
  ((int)__builtin_ia32_pcmpistric128((__v16qi)(__m128i)(A), \
                                     (__v16qi)(__m128i)(B), (int)(M)))

/// Uses the immediate operand \a M to perform a comparison of string
///    data with implicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns bit 0 of the resulting bit mask.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpistro(__m128i A, __m128i B, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
/// \returns Returns bit 0 of the resulting bit mask.
#define _mm_cmpistro(A, B, M) \
  ((int)__builtin_ia32_pcmpistrio128((__v16qi)(__m128i)(A), \
                                     (__v16qi)(__m128i)(B), (int)(M)))

/// Uses the immediate operand \a M to perform a comparison of string
///    data with implicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the length of the string in \a A is less than
///    the maximum, otherwise, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpistrs(__m128i A, __m128i B, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
/// \returns Returns 1 if the length of the string in \a A is less than the
///    maximum, otherwise, returns 0.
#define _mm_cmpistrs(A, B, M) \
  ((int)__builtin_ia32_pcmpistris128((__v16qi)(__m128i)(A), \
                                     (__v16qi)(__m128i)(B), (int)(M)))

/// Uses the immediate operand \a M to perform a comparison of string
///    data with implicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the length of the string in \a B is less than
///    the maximum, otherwise, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpistrz(__m128i A, __m128i B, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B.
/// \returns Returns 1 if the length of the string in \a B is less than the
///    maximum, otherwise, returns 0.
#define _mm_cmpistrz(A, B, M) \
  ((int)__builtin_ia32_pcmpistriz128((__v16qi)(__m128i)(A), \
                                     (__v16qi)(__m128i)(B), (int)(M)))

/// Uses the immediate operand \a M to perform a comparison of string
///    data with explicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the bit mask is zero and the length of the
///    string in \a B is the maximum, otherwise, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpestra(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LA
///    An integer that specifies the length of the string in \a A.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LB
///    An integer that specifies the length of the string in \a B.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B.
/// \returns Returns 1 if the bit mask is zero and the length of the string in
///    \a B is the maximum, otherwise, returns 0.
#define _mm_cmpestra(A, LA, B, LB, M) \
  ((int)__builtin_ia32_pcmpestria128((__v16qi)(__m128i)(A), (int)(LA), \
                                     (__v16qi)(__m128i)(B), (int)(LB), \
                                     (int)(M)))

/// Uses the immediate operand \a M to perform a comparison of string
///    data with explicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the resulting mask is non-zero, otherwise,
///    returns 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpestrc(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LA
///    An integer that specifies the length of the string in \a A.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LB
///    An integer that specifies the length of the string in \a B.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
/// \returns Returns 1 if the resulting mask is non-zero, otherwise, returns 0.
#define _mm_cmpestrc(A, LA, B, LB, M) \
  ((int)__builtin_ia32_pcmpestric128((__v16qi)(__m128i)(A), (int)(LA), \
                                     (__v16qi)(__m128i)(B), (int)(LB), \
                                     (int)(M)))

/// Uses the immediate operand \a M to perform a comparison of string
///    data with explicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns bit 0 of the resulting bit mask.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpestro(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LA
///    An integer that specifies the length of the string in \a A.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LB
///    An integer that specifies the length of the string in \a B.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B.
/// \returns Returns bit 0 of the resulting bit mask.
/* Expands to a single call of __builtin_ia32_pcmpestrio128. A and B are cast
 * through __m128i to __v16qi; LA, LB and M are cast to int, and the builtin's
 * result is cast to int. Each argument appears exactly once in the expansion.
 * M is forwarded as written; the documentation above calls it an immediate
 * operand, so it presumably has to be a compile-time constant. */
#define _mm_cmpestro(A, LA, B, LB, M) \
  ((int)__builtin_ia32_pcmpestrio128((__v16qi)(__m128i)(A), (int)(LA), \
                                     (__v16qi)(__m128i)(B), (int)(LB), \
                                     (int)(M)))

/// Uses the immediate operand \a M to perform a comparison of string
///    data with explicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the length of the string in \a A is less than
///    the maximum, otherwise, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpestrs(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LA
///    An integer that specifies the length of the string in \a A.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LB
///    An integer that specifies the length of the string in \a B.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A.
///          \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
/// \returns Returns 1 if the length of the string in \a A is less than the
///    maximum, otherwise, returns 0.
/* Expands to a single call of __builtin_ia32_pcmpestris128. A and B are cast
 * through __m128i to __v16qi; LA, LB and M are cast to int, and the builtin's
 * result is cast to int. Each argument appears exactly once in the expansion.
 * M is forwarded as written; the documentation above calls it an immediate
 * operand, so it presumably has to be a compile-time constant. */
#define _mm_cmpestrs(A, LA, B, LB, M) \
  ((int)__builtin_ia32_pcmpestris128((__v16qi)(__m128i)(A), (int)(LA), \
                                     (__v16qi)(__m128i)(B), (int)(LB), \
                                     (int)(M)))

/// Uses the immediate operand \a M to perform a comparison of string
///    data with explicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the length of the string in \a B is less than
///    the maximum, otherwise, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpestrz(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LA
///    An integer that specifies the length of the string in \a A.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LB
///    An integer that specifies the length of the string in \a B.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format.
///    \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B.
/// \returns Returns 1 if the length of the string in \a B is less than the
///    maximum, otherwise, returns 0.
/* Expands to a single call of __builtin_ia32_pcmpestriz128. A and B are cast
 * through __m128i to __v16qi; LA, LB and M are cast to int, and the builtin's
 * result is cast to int. Each argument appears exactly once in the expansion.
 * M is forwarded as written; the documentation above calls it an immediate
 * operand, so it presumably has to be a compile-time constant. */
#define _mm_cmpestrz(A, LA, B, LB, M) \
  ((int)__builtin_ia32_pcmpestriz128((__v16qi)(__m128i)(A), (int)(LA), \
                                     (__v16qi)(__m128i)(B), (int)(LB), \
                                     (int)(M)))

/* SSE4.2 Compare Packed Data -- Greater Than. */
/// Compares each of the corresponding 64-bit values of the 128-bit
///    integer vectors to determine if the values in the first operand are
///    greater than those in the second operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPCMPGTQ / PCMPGTQ </c> instruction.
///
/// \param __V1
///    A 128-bit integer vector.
/// \param __V2
///    A 128-bit integer vector.
/// \returns A 128-bit integer vector containing the comparison results.
2339 static __inline__ __m128i __DEFAULT_FN_ATTRS 2340 _mm_cmpgt_epi64(__m128i __V1, __m128i __V2) 2341 { 2342 return (__m128i)((__v2di)__V1 > (__v2di)__V2); 2343 } 2344 2345 #undef __DEFAULT_FN_ATTRS 2346 2347 #include <popcntintrin.h> 2348 2349 #include <crc32intrin.h> 2350 2351 #endif /* __SMMINTRIN_H */ 2352