/*===---- smmintrin.h - SSE4 intrinsics ------------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __SMMINTRIN_H
#define __SMMINTRIN_H

#include <tmmintrin.h>

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4.1"), __min_vector_width__(128)))

/* SSE4 Rounding macros. */
/* Values for bits [2:0] of the immediate operand (M) of the _mm_round_*
 * macros below: 0x00-0x03 name an explicit rounding direction in bits [1:0],
 * and 0x04 sets bit [2], which selects the current MXCSR setting instead. */
#define _MM_FROUND_TO_NEAREST_INT 0x00
#define _MM_FROUND_TO_NEG_INF 0x01
#define _MM_FROUND_TO_POS_INF 0x02
#define _MM_FROUND_TO_ZERO 0x03
#define _MM_FROUND_CUR_DIRECTION 0x04

/* Values for bit [3] of the same immediate: when the bit is set (0x08) the
 * PE field is not updated; when it is clear a normal PE exception is used. */
#define _MM_FROUND_RAISE_EXC 0x00
#define _MM_FROUND_NO_EXC 0x08

/* Ready-made immediates combining one exception setting with one rounding
 * direction. */
#define _MM_FROUND_NINT (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEAREST_INT)
#define _MM_FROUND_FLOOR (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF)
#define _MM_FROUND_CEIL (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF)
#define _MM_FROUND_TRUNC (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO)
#define _MM_FROUND_RINT (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION)
#define _MM_FROUND_NEARBYINT (_MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION)

/// Rounds up each element of the 128-bit vector of [4 x float] to an
///    integer and returns the rounded values in a 128-bit vector of
///    [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_ceil_ps(__m128 X);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float] values to be rounded up.
/// \returns A 128-bit vector of [4 x float] containing the rounded values.
#define _mm_ceil_ps(X) (_mm_round_ps((X), _MM_FROUND_CEIL))

/// Rounds up each element of the 128-bit vector of [2 x double] to an
///    integer and returns the rounded values in a 128-bit vector of
///    [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_ceil_pd(__m128d X);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double] values to be rounded up.
/// \returns A 128-bit vector of [2 x double] containing the rounded values.
#define _mm_ceil_pd(X) (_mm_round_pd((X), _MM_FROUND_CEIL))

/// Copies three upper elements of the first 128-bit vector operand to
///    the corresponding three upper elements of the 128-bit result vector of
///    [4 x float]. Rounds up the lowest element of the second 128-bit vector
///    operand to an integer and copies it to the lowest element of the 128-bit
///    result vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_ceil_ss(__m128 X, __m128 Y);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
///    rounded up to the nearest integer and copied to the corresponding bits
///    of the result.
/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
///    values.
#define _mm_ceil_ss(X, Y) (_mm_round_ss((X), (Y), _MM_FROUND_CEIL))

/// Copies the upper element of the first 128-bit vector operand to the
///    corresponding upper element of the 128-bit result vector of [2 x double].
///    Rounds up the lower element of the second 128-bit vector operand to an
///    integer and copies it to the lower element of the 128-bit result vector
///    of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_ceil_sd(__m128d X, __m128d Y);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
///    rounded up to the nearest integer and copied to the corresponding bits
///    of the result.
/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
///    values.
#define _mm_ceil_sd(X, Y) (_mm_round_sd((X), (Y), _MM_FROUND_CEIL))

/// Rounds down each element of the 128-bit vector of [4 x float] to an
///    integer and returns the rounded values in a 128-bit vector of
///    [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_floor_ps(__m128 X);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float] values to be rounded down.
/// \returns A 128-bit vector of [4 x float] containing the rounded values.
#define _mm_floor_ps(X) (_mm_round_ps((X), _MM_FROUND_FLOOR))

/// Rounds down each element of the 128-bit vector of [2 x double] to an
///    integer and returns the rounded values in a 128-bit vector of
///    [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_floor_pd(__m128d X);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double].
/// \returns A 128-bit vector of [2 x double] containing the rounded values.
#define _mm_floor_pd(X) (_mm_round_pd((X), _MM_FROUND_FLOOR))

/// Copies three upper elements of the first 128-bit vector operand to
///    the corresponding three upper elements of the 128-bit result vector of
///    [4 x float]. Rounds down the lowest element of the second 128-bit vector
///    operand to an integer and copies it to the lowest element of the 128-bit
///    result vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_floor_ss(__m128 X, __m128 Y);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
///    rounded down to the nearest integer and copied to the corresponding bits
///    of the result.
/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
///    values.
#define _mm_floor_ss(X, Y) (_mm_round_ss((X), (Y), _MM_FROUND_FLOOR))

/// Copies the upper element of the first 128-bit vector operand to the
///    corresponding upper element of the 128-bit result vector of [2 x double].
///    Rounds down the lower element of the second 128-bit vector operand to an
///    integer and copies it to the lower element of the 128-bit result vector
///    of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_floor_sd(__m128d X, __m128d Y);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double].
///    The value stored in bits [127:64] is
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
///    rounded down to the nearest integer and copied to the corresponding bits
///    of the result.
/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
///    values.
#define _mm_floor_sd(X, Y) (_mm_round_sd((X), (Y), _MM_FROUND_FLOOR))

/// Rounds each element of the 128-bit vector of [4 x float] to an
///    integer value according to the rounding control specified by the second
///    argument and returns the rounded values in a 128-bit vector of
///    [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_round_ps(__m128 X, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float].
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used \n
///      1: The PE field is not updated \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M \n
///      1: Use the current MXCSR setting \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest \n
///      01: Downward (toward negative infinity) \n
///      10: Upward (toward positive infinity) \n
///      11: Truncated
/// \returns A 128-bit vector of [4 x float] containing the rounded values.
#define _mm_round_ps(X, M) \
  ((__m128)__builtin_ia32_roundps((__v4sf)(__m128)(X), (M)))

/// Copies three upper elements of the first 128-bit vector operand to
///    the corresponding three upper elements of the 128-bit result vector of
///    [4 x float].
///    Rounds the lowest element of the second 128-bit vector
///    operand to an integer value according to the rounding control specified
///    by the third argument and copies it to the lowest element of the 128-bit
///    result vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_round_ss(__m128 X, __m128 Y, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
///    rounded to the nearest integer using the specified rounding control and
///    copied to the corresponding bits of the result.
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used \n
///      1: The PE field is not updated \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M \n
///      1: Use the current MXCSR setting \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest \n
///      01: Downward (toward negative infinity) \n
///      10: Upward (toward positive infinity) \n
///      11: Truncated
/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
///    values.
#define _mm_round_ss(X, Y, M) \
  ((__m128)__builtin_ia32_roundss((__v4sf)(__m128)(X), \
                                  (__v4sf)(__m128)(Y), (M)))

/// Rounds each element of the 128-bit vector of [2 x double] to an
///    integer value according to the rounding control specified by the second
///    argument and returns the rounded values in a 128-bit vector of
///    [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_round_pd(__m128d X, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double].
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used \n
///      1: The PE field is not updated \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M \n
///      1: Use the current MXCSR setting \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest \n
///      01: Downward (toward negative infinity) \n
///      10: Upward (toward positive infinity) \n
///      11: Truncated
/// \returns A 128-bit vector of [2 x double] containing the rounded values.
#define _mm_round_pd(X, M) \
  ((__m128d)__builtin_ia32_roundpd((__v2df)(__m128d)(X), (M)))

/// Copies the upper element of the first 128-bit vector operand to the
///    corresponding upper element of the 128-bit result vector of [2 x double].
///    Rounds the lower element of the second 128-bit vector operand to an
///    integer value according to the rounding control specified by the third
///    argument and copies it to the lower element of the 128-bit result vector
///    of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_round_sd(__m128d X, __m128d Y, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [2 x double].
///    The value stored in bits [63:0] is
///    rounded to the nearest integer using the specified rounding control and
///    copied to the corresponding bits of the result.
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used \n
///      1: The PE field is not updated \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M \n
///      1: Use the current MXCSR setting \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest \n
///      01: Downward (toward negative infinity) \n
///      10: Upward (toward positive infinity) \n
///      11: Truncated
/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
///    values.
#define _mm_round_sd(X, Y, M) \
  ((__m128d)__builtin_ia32_roundsd((__v2df)(__m128d)(X), \
                                   (__v2df)(__m128d)(Y), (M)))

/* SSE4 Packed Blending Intrinsics.  */
/// Returns a 128-bit vector of [2 x double] where the values are
///    selected from either the first or second operand as specified by the
///    third operand, the control mask.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_blend_pd(__m128d V1, __m128d V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction.
///
/// \param V1
///    A 128-bit vector of [2 x double].
/// \param V2
///    A 128-bit vector of [2 x double].
/// \param M
///    An immediate integer operand, with mask bits [1:0] specifying how the
///    values are to be copied. The position of the mask bit corresponds to the
///    index of a copied value. When a mask bit is 0, the corresponding 64-bit
///    element in operand \a V1 is copied to the same position in the result.
///    When a mask bit is 1, the corresponding 64-bit element in operand \a V2
///    is copied to the same position in the result.
/// \returns A 128-bit vector of [2 x double] containing the copied values.
#define _mm_blend_pd(V1, V2, M) \
  ((__m128d)__builtin_ia32_blendpd((__v2df)(__m128d)(V1), \
                                   (__v2df)(__m128d)(V2), (int)(M)))

/// Returns a 128-bit vector of [4 x float] where the values are selected
///    from either the first or second operand as specified by the third
///    operand, the control mask.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_blend_ps(__m128 V1, __m128 V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VBLENDPS / BLENDPS </c> instruction.
///
/// \param V1
///    A 128-bit vector of [4 x float].
/// \param V2
///    A 128-bit vector of [4 x float].
/// \param M
///    An immediate integer operand, with mask bits [3:0] specifying how the
///    values are to be copied. The position of the mask bit corresponds to the
///    index of a copied value. When a mask bit is 0, the corresponding 32-bit
///    element in operand \a V1 is copied to the same position in the result.
///    When a mask bit is 1, the corresponding 32-bit element in operand \a V2
///    is copied to the same position in the result.
/// \returns A 128-bit vector of [4 x float] containing the copied values.
#define _mm_blend_ps(V1, V2, M) \
  ((__m128)__builtin_ia32_blendps((__v4sf)(__m128)(V1), \
                                  (__v4sf)(__m128)(V2), (int)(M)))

/// Returns a 128-bit vector of [2 x double] where the values are
///    selected from either the first or second operand as specified by the
///    third operand, the control mask.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VBLENDVPD / BLENDVPD </c> instruction.
///
/// \param __V1
///    A 128-bit vector of [2 x double].
420 /// \param __V2 421 /// A 128-bit vector of [2 x double]. 422 /// \param __M 423 /// A 128-bit vector operand, with mask bits 127 and 63 specifying how the 424 /// values are to be copied. The position of the mask bit corresponds to the 425 /// most significant bit of a copied value. When a mask bit is 0, the 426 /// corresponding 64-bit element in operand \a __V1 is copied to the same 427 /// position in the result. When a mask bit is 1, the corresponding 64-bit 428 /// element in operand \a __V2 is copied to the same position in the result. 429 /// \returns A 128-bit vector of [2 x double] containing the copied values. 430 static __inline__ __m128d __DEFAULT_FN_ATTRS 431 _mm_blendv_pd (__m128d __V1, __m128d __V2, __m128d __M) 432 { 433 return (__m128d) __builtin_ia32_blendvpd ((__v2df)__V1, (__v2df)__V2, 434 (__v2df)__M); 435 } 436 437 /// Returns a 128-bit vector of [4 x float] where the values are 438 /// selected from either the first or second operand as specified by the 439 /// third operand, the control mask. 440 /// 441 /// \headerfile <x86intrin.h> 442 /// 443 /// This intrinsic corresponds to the <c> VBLENDVPS / BLENDVPS </c> instruction. 444 /// 445 /// \param __V1 446 /// A 128-bit vector of [4 x float]. 447 /// \param __V2 448 /// A 128-bit vector of [4 x float]. 449 /// \param __M 450 /// A 128-bit vector operand, with mask bits 127, 95, 63, and 31 specifying 451 /// how the values are to be copied. The position of the mask bit corresponds 452 /// to the most significant bit of a copied value. When a mask bit is 0, the 453 /// corresponding 32-bit element in operand \a __V1 is copied to the same 454 /// position in the result. When a mask bit is 1, the corresponding 32-bit 455 /// element in operand \a __V2 is copied to the same position in the result. 456 /// \returns A 128-bit vector of [4 x float] containing the copied values. 
457 static __inline__ __m128 __DEFAULT_FN_ATTRS 458 _mm_blendv_ps (__m128 __V1, __m128 __V2, __m128 __M) 459 { 460 return (__m128) __builtin_ia32_blendvps ((__v4sf)__V1, (__v4sf)__V2, 461 (__v4sf)__M); 462 } 463 464 /// Returns a 128-bit vector of [16 x i8] where the values are selected 465 /// from either of the first or second operand as specified by the third 466 /// operand, the control mask. 467 /// 468 /// \headerfile <x86intrin.h> 469 /// 470 /// This intrinsic corresponds to the <c> VPBLENDVB / PBLENDVB </c> instruction. 471 /// 472 /// \param __V1 473 /// A 128-bit vector of [16 x i8]. 474 /// \param __V2 475 /// A 128-bit vector of [16 x i8]. 476 /// \param __M 477 /// A 128-bit vector operand, with mask bits 127, 119, 111...7 specifying 478 /// how the values are to be copied. The position of the mask bit corresponds 479 /// to the most significant bit of a copied value. When a mask bit is 0, the 480 /// corresponding 8-bit element in operand \a __V1 is copied to the same 481 /// position in the result. When a mask bit is 1, the corresponding 8-bit 482 /// element in operand \a __V2 is copied to the same position in the result. 483 /// \returns A 128-bit vector of [16 x i8] containing the copied values. 484 static __inline__ __m128i __DEFAULT_FN_ATTRS 485 _mm_blendv_epi8 (__m128i __V1, __m128i __V2, __m128i __M) 486 { 487 return (__m128i) __builtin_ia32_pblendvb128 ((__v16qi)__V1, (__v16qi)__V2, 488 (__v16qi)__M); 489 } 490 491 /// Returns a 128-bit vector of [8 x i16] where the values are selected 492 /// from either of the first or second operand as specified by the third 493 /// operand, the control mask. 494 /// 495 /// \headerfile <x86intrin.h> 496 /// 497 /// \code 498 /// __m128i _mm_blend_epi16(__m128i V1, __m128i V2, const int M); 499 /// \endcode 500 /// 501 /// This intrinsic corresponds to the <c> VPBLENDW / PBLENDW </c> instruction. 502 /// 503 /// \param V1 504 /// A 128-bit vector of [8 x i16]. 
505 /// \param V2 506 /// A 128-bit vector of [8 x i16]. 507 /// \param M 508 /// An immediate integer operand, with mask bits [7:0] specifying how the 509 /// values are to be copied. The position of the mask bit corresponds to the 510 /// index of a copied value. When a mask bit is 0, the corresponding 16-bit 511 /// element in operand \a V1 is copied to the same position in the result. 512 /// When a mask bit is 1, the corresponding 16-bit element in operand \a V2 513 /// is copied to the same position in the result. 514 /// \returns A 128-bit vector of [8 x i16] containing the copied values. 515 #define _mm_blend_epi16(V1, V2, M) \ 516 (__m128i) __builtin_ia32_pblendw128 ((__v8hi)(__m128i)(V1), \ 517 (__v8hi)(__m128i)(V2), (int)(M)) 518 519 /* SSE4 Dword Multiply Instructions. */ 520 /// Multiples corresponding elements of two 128-bit vectors of [4 x i32] 521 /// and returns the lower 32 bits of the each product in a 128-bit vector of 522 /// [4 x i32]. 523 /// 524 /// \headerfile <x86intrin.h> 525 /// 526 /// This intrinsic corresponds to the <c> VPMULLD / PMULLD </c> instruction. 527 /// 528 /// \param __V1 529 /// A 128-bit integer vector. 530 /// \param __V2 531 /// A 128-bit integer vector. 532 /// \returns A 128-bit integer vector containing the products of both operands. 533 static __inline__ __m128i __DEFAULT_FN_ATTRS 534 _mm_mullo_epi32 (__m128i __V1, __m128i __V2) 535 { 536 return (__m128i) ((__v4su)__V1 * (__v4su)__V2); 537 } 538 539 /// Multiplies corresponding even-indexed elements of two 128-bit 540 /// vectors of [4 x i32] and returns a 128-bit vector of [2 x i64] 541 /// containing the products. 542 /// 543 /// \headerfile <x86intrin.h> 544 /// 545 /// This intrinsic corresponds to the <c> VPMULDQ / PMULDQ </c> instruction. 546 /// 547 /// \param __V1 548 /// A 128-bit vector of [4 x i32]. 549 /// \param __V2 550 /// A 128-bit vector of [4 x i32]. 551 /// \returns A 128-bit vector of [2 x i64] containing the products of both 552 /// operands. 
553 static __inline__ __m128i __DEFAULT_FN_ATTRS 554 _mm_mul_epi32 (__m128i __V1, __m128i __V2) 555 { 556 return (__m128i) __builtin_ia32_pmuldq128 ((__v4si)__V1, (__v4si)__V2); 557 } 558 559 /* SSE4 Floating Point Dot Product Instructions. */ 560 /// Computes the dot product of the two 128-bit vectors of [4 x float] 561 /// and returns it in the elements of the 128-bit result vector of 562 /// [4 x float]. 563 /// 564 /// The immediate integer operand controls which input elements 565 /// will contribute to the dot product, and where the final results are 566 /// returned. 567 /// 568 /// \headerfile <x86intrin.h> 569 /// 570 /// \code 571 /// __m128 _mm_dp_ps(__m128 X, __m128 Y, const int M); 572 /// \endcode 573 /// 574 /// This intrinsic corresponds to the <c> VDPPS / DPPS </c> instruction. 575 /// 576 /// \param X 577 /// A 128-bit vector of [4 x float]. 578 /// \param Y 579 /// A 128-bit vector of [4 x float]. 580 /// \param M 581 /// An immediate integer operand. Mask bits [7:4] determine which elements 582 /// of the input vectors are used, with bit [4] corresponding to the lowest 583 /// element and bit [7] corresponding to the highest element of each [4 x 584 /// float] vector. If a bit is set, the corresponding elements from the two 585 /// input vectors are used as an input for dot product; otherwise that input 586 /// is treated as zero. Bits [3:0] determine which elements of the result 587 /// will receive a copy of the final dot product, with bit [0] corresponding 588 /// to the lowest element and bit [3] corresponding to the highest element of 589 /// each [4 x float] subvector. If a bit is set, the dot product is returned 590 /// in the corresponding element; otherwise that element is set to zero. 591 /// \returns A 128-bit vector of [4 x float] containing the dot product. 
#define _mm_dp_ps(X, Y, M) \
  ((__m128)__builtin_ia32_dpps((__v4sf)(__m128)(X), \
                               (__v4sf)(__m128)(Y), (M)))

/// Computes the dot product of the two 128-bit vectors of [2 x double]
///    and returns it in the elements of the 128-bit result vector of
///    [2 x double].
///
///    The immediate integer operand controls which input
///    elements will contribute to the dot product, and where the final results
///    are returned.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_dp_pd(__m128d X, __m128d Y, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VDPPD / DPPD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double].
/// \param Y
///    A 128-bit vector of [2 x double].
/// \param M
///    An immediate integer operand. Mask bits [5:4] determine which elements
///    of the input vectors are used, with bit [4] corresponding to the lowest
///    element and bit [5] corresponding to the highest element of each [2 x
///    double] vector. If a bit is set, the corresponding elements from the two
///    input vectors are used as an input for dot product; otherwise that input
///    is treated as zero. Bits [1:0] determine which elements of the result
///    will receive a copy of the final dot product, with bit [0] corresponding
///    to the lowest element and bit [1] corresponding to the highest element of
///    each [2 x double] vector. If a bit is set, the dot product is returned in
///    the corresponding element; otherwise that element is set to zero.
/// \returns A 128-bit vector of [2 x double] containing the dot product.
#define _mm_dp_pd(X, Y, M) \
  ((__m128d)__builtin_ia32_dppd((__v2df)(__m128d)(X), \
                                (__v2df)(__m128d)(Y), (M)))

/* SSE4 Streaming Load Hint Instruction.  */
/// Loads integer values from a 128-bit aligned memory location to a
///    128-bit integer vector.
634 /// 635 /// \headerfile <x86intrin.h> 636 /// 637 /// This intrinsic corresponds to the <c> VMOVNTDQA / MOVNTDQA </c> instruction. 638 /// 639 /// \param __V 640 /// A pointer to a 128-bit aligned memory location that contains the integer 641 /// values. 642 /// \returns A 128-bit integer vector containing the data stored at the 643 /// specified memory location. 644 static __inline__ __m128i __DEFAULT_FN_ATTRS 645 _mm_stream_load_si128 (__m128i const *__V) 646 { 647 return (__m128i) __builtin_nontemporal_load ((const __v2di *) __V); 648 } 649 650 /* SSE4 Packed Integer Min/Max Instructions. */ 651 /// Compares the corresponding elements of two 128-bit vectors of 652 /// [16 x i8] and returns a 128-bit vector of [16 x i8] containing the lesser 653 /// of the two values. 654 /// 655 /// \headerfile <x86intrin.h> 656 /// 657 /// This intrinsic corresponds to the <c> VPMINSB / PMINSB </c> instruction. 658 /// 659 /// \param __V1 660 /// A 128-bit vector of [16 x i8]. 661 /// \param __V2 662 /// A 128-bit vector of [16 x i8] 663 /// \returns A 128-bit vector of [16 x i8] containing the lesser values. 664 static __inline__ __m128i __DEFAULT_FN_ATTRS 665 _mm_min_epi8 (__m128i __V1, __m128i __V2) 666 { 667 return (__m128i) __builtin_ia32_pminsb128 ((__v16qi) __V1, (__v16qi) __V2); 668 } 669 670 /// Compares the corresponding elements of two 128-bit vectors of 671 /// [16 x i8] and returns a 128-bit vector of [16 x i8] containing the 672 /// greater value of the two. 673 /// 674 /// \headerfile <x86intrin.h> 675 /// 676 /// This intrinsic corresponds to the <c> VPMAXSB / PMAXSB </c> instruction. 677 /// 678 /// \param __V1 679 /// A 128-bit vector of [16 x i8]. 680 /// \param __V2 681 /// A 128-bit vector of [16 x i8]. 682 /// \returns A 128-bit vector of [16 x i8] containing the greater values. 
683 static __inline__ __m128i __DEFAULT_FN_ATTRS 684 _mm_max_epi8 (__m128i __V1, __m128i __V2) 685 { 686 return (__m128i) __builtin_ia32_pmaxsb128 ((__v16qi) __V1, (__v16qi) __V2); 687 } 688 689 /// Compares the corresponding elements of two 128-bit vectors of 690 /// [8 x u16] and returns a 128-bit vector of [8 x u16] containing the lesser 691 /// value of the two. 692 /// 693 /// \headerfile <x86intrin.h> 694 /// 695 /// This intrinsic corresponds to the <c> VPMINUW / PMINUW </c> instruction. 696 /// 697 /// \param __V1 698 /// A 128-bit vector of [8 x u16]. 699 /// \param __V2 700 /// A 128-bit vector of [8 x u16]. 701 /// \returns A 128-bit vector of [8 x u16] containing the lesser values. 702 static __inline__ __m128i __DEFAULT_FN_ATTRS 703 _mm_min_epu16 (__m128i __V1, __m128i __V2) 704 { 705 return (__m128i) __builtin_ia32_pminuw128 ((__v8hi) __V1, (__v8hi) __V2); 706 } 707 708 /// Compares the corresponding elements of two 128-bit vectors of 709 /// [8 x u16] and returns a 128-bit vector of [8 x u16] containing the 710 /// greater value of the two. 711 /// 712 /// \headerfile <x86intrin.h> 713 /// 714 /// This intrinsic corresponds to the <c> VPMAXUW / PMAXUW </c> instruction. 715 /// 716 /// \param __V1 717 /// A 128-bit vector of [8 x u16]. 718 /// \param __V2 719 /// A 128-bit vector of [8 x u16]. 720 /// \returns A 128-bit vector of [8 x u16] containing the greater values. 721 static __inline__ __m128i __DEFAULT_FN_ATTRS 722 _mm_max_epu16 (__m128i __V1, __m128i __V2) 723 { 724 return (__m128i) __builtin_ia32_pmaxuw128 ((__v8hi) __V1, (__v8hi) __V2); 725 } 726 727 /// Compares the corresponding elements of two 128-bit vectors of 728 /// [4 x i32] and returns a 128-bit vector of [4 x i32] containing the lesser 729 /// value of the two. 730 /// 731 /// \headerfile <x86intrin.h> 732 /// 733 /// This intrinsic corresponds to the <c> VPMINSD / PMINSD </c> instruction. 734 /// 735 /// \param __V1 736 /// A 128-bit vector of [4 x i32]. 
737 /// \param __V2 738 /// A 128-bit vector of [4 x i32]. 739 /// \returns A 128-bit vector of [4 x i32] containing the lesser values. 740 static __inline__ __m128i __DEFAULT_FN_ATTRS 741 _mm_min_epi32 (__m128i __V1, __m128i __V2) 742 { 743 return (__m128i) __builtin_ia32_pminsd128 ((__v4si) __V1, (__v4si) __V2); 744 } 745 746 /// Compares the corresponding elements of two 128-bit vectors of 747 /// [4 x i32] and returns a 128-bit vector of [4 x i32] containing the 748 /// greater value of the two. 749 /// 750 /// \headerfile <x86intrin.h> 751 /// 752 /// This intrinsic corresponds to the <c> VPMAXSD / PMAXSD </c> instruction. 753 /// 754 /// \param __V1 755 /// A 128-bit vector of [4 x i32]. 756 /// \param __V2 757 /// A 128-bit vector of [4 x i32]. 758 /// \returns A 128-bit vector of [4 x i32] containing the greater values. 759 static __inline__ __m128i __DEFAULT_FN_ATTRS 760 _mm_max_epi32 (__m128i __V1, __m128i __V2) 761 { 762 return (__m128i) __builtin_ia32_pmaxsd128 ((__v4si) __V1, (__v4si) __V2); 763 } 764 765 /// Compares the corresponding elements of two 128-bit vectors of 766 /// [4 x u32] and returns a 128-bit vector of [4 x u32] containing the lesser 767 /// value of the two. 768 /// 769 /// \headerfile <x86intrin.h> 770 /// 771 /// This intrinsic corresponds to the <c> VPMINUD / PMINUD </c> instruction. 772 /// 773 /// \param __V1 774 /// A 128-bit vector of [4 x u32]. 775 /// \param __V2 776 /// A 128-bit vector of [4 x u32]. 777 /// \returns A 128-bit vector of [4 x u32] containing the lesser values. 778 static __inline__ __m128i __DEFAULT_FN_ATTRS 779 _mm_min_epu32 (__m128i __V1, __m128i __V2) 780 { 781 return (__m128i) __builtin_ia32_pminud128((__v4si) __V1, (__v4si) __V2); 782 } 783 784 /// Compares the corresponding elements of two 128-bit vectors of 785 /// [4 x u32] and returns a 128-bit vector of [4 x u32] containing the 786 /// greater value of the two. 
787 /// 788 /// \headerfile <x86intrin.h> 789 /// 790 /// This intrinsic corresponds to the <c> VPMAXUD / PMAXUD </c> instruction. 791 /// 792 /// \param __V1 793 /// A 128-bit vector of [4 x u32]. 794 /// \param __V2 795 /// A 128-bit vector of [4 x u32]. 796 /// \returns A 128-bit vector of [4 x u32] containing the greater values. 797 static __inline__ __m128i __DEFAULT_FN_ATTRS 798 _mm_max_epu32 (__m128i __V1, __m128i __V2) 799 { 800 return (__m128i) __builtin_ia32_pmaxud128((__v4si) __V1, (__v4si) __V2); 801 } 802 803 /* SSE4 Insertion and Extraction from XMM Register Instructions. */ 804 /// Takes the first argument \a X and inserts an element from the second 805 /// argument \a Y as selected by the third argument \a N. That result then 806 /// has elements zeroed out also as selected by the third argument \a N. The 807 /// resulting 128-bit vector of [4 x float] is then returned. 808 /// 809 /// \headerfile <x86intrin.h> 810 /// 811 /// \code 812 /// __m128 _mm_insert_ps(__m128 X, __m128 Y, const int N); 813 /// \endcode 814 /// 815 /// This intrinsic corresponds to the <c> VINSERTPS </c> instruction. 816 /// 817 /// \param X 818 /// A 128-bit vector source operand of [4 x float]. With the exception of 819 /// those bits in the result copied from parameter \a Y and zeroed by bits 820 /// [3:0] of \a N, all bits from this parameter are copied to the result. 821 /// \param Y 822 /// A 128-bit vector source operand of [4 x float]. One single-precision 823 /// floating-point element from this source, as determined by the immediate 824 /// parameter, is copied to the result. 825 /// \param N 826 /// Specifies which bits from operand \a Y will be copied, which bits in the 827 /// result they will be be copied to, and which bits in the result will be 828 /// cleared. The following assignments are made: \n 829 /// Bits [7:6] specify the bits to copy from operand \a Y: \n 830 /// 00: Selects bits [31:0] from operand \a Y. 
\n 831 /// 01: Selects bits [63:32] from operand \a Y. \n 832 /// 10: Selects bits [95:64] from operand \a Y. \n 833 /// 11: Selects bits [127:96] from operand \a Y. \n 834 /// Bits [5:4] specify the bits in the result to which the selected bits 835 /// from operand \a Y are copied: \n 836 /// 00: Copies the selected bits from \a Y to result bits [31:0]. \n 837 /// 01: Copies the selected bits from \a Y to result bits [63:32]. \n 838 /// 10: Copies the selected bits from \a Y to result bits [95:64]. \n 839 /// 11: Copies the selected bits from \a Y to result bits [127:96]. \n 840 /// Bits[3:0]: If any of these bits are set, the corresponding result 841 /// element is cleared. 842 /// \returns A 128-bit vector of [4 x float] containing the copied 843 /// single-precision floating point elements from the operands. 844 #define _mm_insert_ps(X, Y, N) __builtin_ia32_insertps128((X), (Y), (N)) 845 846 /// Extracts a 32-bit integer from a 128-bit vector of [4 x float] and 847 /// returns it, using the immediate value parameter \a N as a selector. 848 /// 849 /// \headerfile <x86intrin.h> 850 /// 851 /// \code 852 /// int _mm_extract_ps(__m128 X, const int N); 853 /// \endcode 854 /// 855 /// This intrinsic corresponds to the <c> VEXTRACTPS / EXTRACTPS </c> 856 /// instruction. 857 /// 858 /// \param X 859 /// A 128-bit vector of [4 x float]. 860 /// \param N 861 /// An immediate value. Bits [1:0] determines which bits from the argument 862 /// \a X are extracted and returned: \n 863 /// 00: Bits [31:0] of parameter \a X are returned. \n 864 /// 01: Bits [63:32] of parameter \a X are returned. \n 865 /// 10: Bits [95:64] of parameter \a X are returned. \n 866 /// 11: Bits [127:96] of parameter \a X are returned. 867 /// \returns A 32-bit integer containing the extracted 32 bits of float data. 
#define _mm_extract_ps(X, N) (__extension__                      \
  ({ union { int __i; float __f; } __t;  \
     __t.__f = __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N)); \
     __t.__i;}))

/* Miscellaneous insert and extract macros.  */
/* Extract a single-precision float from X at index N into D.
   The body is wrapped in do { ... } while (0) so that the macro expands to
   exactly one statement and can be followed by ';' in any statement context
   (for example as the body of an if that has an else branch). */
#define _MM_EXTRACT_FLOAT(D, X, N)                                      \
  do {                                                                  \
    (D) = __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N));   \
  } while (0)

/* Or together 2 sets of indexes (X and Y) with the zeroing bits (Z) to create
   an index suitable for _mm_insert_ps.  */
#define _MM_MK_INSERTPS_NDX(X, Y, Z) (((X) << 6) | ((Y) << 4) | (Z))

/* Extract a float from X at index N into the first index of the return.  */
#define _MM_PICK_OUT_PS(X, N) _mm_insert_ps (_mm_setzero_ps(), (X),   \
                                             _MM_MK_INSERTPS_NDX((N), 0, 0x0e))

/* Insert int into packed integer array at index.  */
/// Constructs a 128-bit vector of [16 x i8] by first making a copy of
///    the 128-bit integer vector parameter, and then inserting the lower 8 bits
///    of an integer parameter \a I into an offset specified by the immediate
///    value parameter \a N.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_insert_epi8(__m128i X, int I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPINSRB / PINSRB </c> instruction.
///
/// \param X
///    A 128-bit integer vector of [16 x i8]. This vector is copied to the
///    result and then one of the sixteen elements in the result vector is
///    replaced by the lower 8 bits of \a I.
/// \param I
///    An integer. The lower 8 bits of this operand are written to the result
///    beginning at the offset specified by \a N.
/// \param N
///    An immediate value. Bits [3:0] specify the bit offset in the result at
///    which the lower 8 bits of \a I are written. \n
///    0000: Bits [7:0] of the result are used for insertion. \n
///    0001: Bits [15:8] of the result are used for insertion. \n
///    0010: Bits [23:16] of the result are used for insertion. \n
///    0011: Bits [31:24] of the result are used for insertion. \n
///    0100: Bits [39:32] of the result are used for insertion. \n
///    0101: Bits [47:40] of the result are used for insertion. \n
///    0110: Bits [55:48] of the result are used for insertion. \n
///    0111: Bits [63:56] of the result are used for insertion. \n
///    1000: Bits [71:64] of the result are used for insertion. \n
///    1001: Bits [79:72] of the result are used for insertion. \n
///    1010: Bits [87:80] of the result are used for insertion. \n
///    1011: Bits [95:88] of the result are used for insertion. \n
///    1100: Bits [103:96] of the result are used for insertion. \n
///    1101: Bits [111:104] of the result are used for insertion. \n
///    1110: Bits [119:112] of the result are used for insertion. \n
///    1111: Bits [127:120] of the result are used for insertion.
/// \returns A 128-bit integer vector containing the constructed values.
#define _mm_insert_epi8(X, I, N) \
  ((__m128i)__builtin_ia32_vec_set_v16qi((__v16qi)(__m128i)(X), \
                                         (int)(I), (int)(N)))

/// Constructs a 128-bit vector of [4 x i32] by first making a copy of
///    the 128-bit integer vector parameter, and then inserting the 32-bit
///    integer parameter \a I at the offset specified by the immediate value
///    parameter \a N.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_insert_epi32(__m128i X, int I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPINSRD / PINSRD </c> instruction.
///
/// \param X
///    A 128-bit integer vector of [4 x i32]. This vector is copied to the
///    result and then one of the four elements in the result vector is
///    replaced by \a I.
/// \param I
///    A 32-bit integer that is written to the result beginning at the offset
///    specified by \a N.
/// \param N
///    An immediate value. Bits [1:0] specify the bit offset in the result at
///    which the integer \a I is written. \n
///    00: Bits [31:0] of the result are used for insertion. \n
///    01: Bits [63:32] of the result are used for insertion. \n
///    10: Bits [95:64] of the result are used for insertion. \n
///    11: Bits [127:96] of the result are used for insertion.
/// \returns A 128-bit integer vector containing the constructed values.
#define _mm_insert_epi32(X, I, N) \
  ((__m128i)__builtin_ia32_vec_set_v4si((__v4si)(__m128i)(X), \
                                        (int)(I), (int)(N)))

#ifdef __x86_64__
/// Constructs a 128-bit vector of [2 x i64] by first making a copy of
///    the 128-bit integer vector parameter, and then inserting the 64-bit
///    integer parameter \a I, using the immediate value parameter \a N as an
///    insertion location selector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_insert_epi64(__m128i X, long long I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPINSRQ / PINSRQ </c> instruction.
///
/// \param X
///    A 128-bit integer vector of [2 x i64]. This vector is copied to the
///    result and then one of the two elements in the result vector is replaced
///    by \a I.
/// \param I
///    A 64-bit integer that is written to the result beginning at the offset
///    specified by \a N.
/// \param N
///    An immediate value. Bit [0] specifies the bit offset in the result at
///    which the integer \a I is written. \n
///    0: Bits [63:0] of the result are used for insertion. \n
///    1: Bits [127:64] of the result are used for insertion.
/// \returns A 128-bit integer vector containing the constructed values.
#define _mm_insert_epi64(X, I, N) \
  ((__m128i)__builtin_ia32_vec_set_v2di((__v2di)(__m128i)(X), \
                                        (long long)(I), (int)(N)))
#endif /* __x86_64__ */

/* Extract int from packed integer array at index.  This returns the element
 * as a zero extended value, so it is unsigned.
 */
/// Extracts an 8-bit element from the 128-bit integer vector of
///    [16 x i8], using the immediate value parameter \a N as a selector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_extract_epi8(__m128i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPEXTRB / PEXTRB </c> instruction.
///
/// \param X
///    A 128-bit integer vector.
/// \param N
///    An immediate value. Bits [3:0] specify which 8-bit vector element from
///    the argument \a X to extract and copy to the result. \n
///    0000: Bits [7:0] of parameter \a X are extracted. \n
///    0001: Bits [15:8] of the parameter \a X are extracted. \n
///    0010: Bits [23:16] of the parameter \a X are extracted. \n
///    0011: Bits [31:24] of the parameter \a X are extracted. \n
///    0100: Bits [39:32] of the parameter \a X are extracted. \n
///    0101: Bits [47:40] of the parameter \a X are extracted. \n
///    0110: Bits [55:48] of the parameter \a X are extracted. \n
///    0111: Bits [63:56] of the parameter \a X are extracted. \n
///    1000: Bits [71:64] of the parameter \a X are extracted. \n
///    1001: Bits [79:72] of the parameter \a X are extracted. \n
///    1010: Bits [87:80] of the parameter \a X are extracted. \n
///    1011: Bits [95:88] of the parameter \a X are extracted. \n
///    1100: Bits [103:96] of the parameter \a X are extracted. \n
///    1101: Bits [111:104] of the parameter \a X are extracted. \n
///    1110: Bits [119:112] of the parameter \a X are extracted. \n
///    1111: Bits [127:120] of the parameter \a X are extracted.
1030 /// \returns An unsigned integer, whose lower 8 bits are selected from the 1031 /// 128-bit integer vector parameter and the remaining bits are assigned 1032 /// zeros. 1033 #define _mm_extract_epi8(X, N) \ 1034 (int)(unsigned char)__builtin_ia32_vec_ext_v16qi((__v16qi)(__m128i)(X), \ 1035 (int)(N)) 1036 1037 /// Extracts a 32-bit element from the 128-bit integer vector of 1038 /// [4 x i32], using the immediate value parameter \a N as a selector. 1039 /// 1040 /// \headerfile <x86intrin.h> 1041 /// 1042 /// \code 1043 /// int _mm_extract_epi32(__m128i X, const int N); 1044 /// \endcode 1045 /// 1046 /// This intrinsic corresponds to the <c> VPEXTRD / PEXTRD </c> instruction. 1047 /// 1048 /// \param X 1049 /// A 128-bit integer vector. 1050 /// \param N 1051 /// An immediate value. Bits [1:0] specify which 32-bit vector element from 1052 /// the argument \a X to extract and copy to the result. \n 1053 /// 00: Bits [31:0] of the parameter \a X are extracted. \n 1054 /// 01: Bits [63:32] of the parameter \a X are extracted. \n 1055 /// 10: Bits [95:64] of the parameter \a X are extracted. \n 1056 /// 11: Bits [127:96] of the parameter \a X are exracted. 1057 /// \returns An integer, whose lower 32 bits are selected from the 128-bit 1058 /// integer vector parameter and the remaining bits are assigned zeros. 1059 #define _mm_extract_epi32(X, N) \ 1060 (int)__builtin_ia32_vec_ext_v4si((__v4si)(__m128i)(X), (int)(N)) 1061 1062 #ifdef __x86_64__ 1063 /// Extracts a 64-bit element from the 128-bit integer vector of 1064 /// [2 x i64], using the immediate value parameter \a N as a selector. 1065 /// 1066 /// \headerfile <x86intrin.h> 1067 /// 1068 /// \code 1069 /// long long _mm_extract_epi64(__m128i X, const int N); 1070 /// \endcode 1071 /// 1072 /// This intrinsic corresponds to the <c> VPEXTRQ / PEXTRQ </c> instruction. 1073 /// 1074 /// \param X 1075 /// A 128-bit integer vector. 1076 /// \param N 1077 /// An immediate value. 
Bit [0] specifies which 64-bit vector element from 1078 /// the argument \a X to return. \n 1079 /// 0: Bits [63:0] are returned. \n 1080 /// 1: Bits [127:64] are returned. \n 1081 /// \returns A 64-bit integer. 1082 #define _mm_extract_epi64(X, N) \ 1083 (long long)__builtin_ia32_vec_ext_v2di((__v2di)(__m128i)(X), (int)(N)) 1084 #endif /* __x86_64 */ 1085 1086 /* SSE4 128-bit Packed Integer Comparisons. */ 1087 /// Tests whether the specified bits in a 128-bit integer vector are all 1088 /// zeros. 1089 /// 1090 /// \headerfile <x86intrin.h> 1091 /// 1092 /// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction. 1093 /// 1094 /// \param __M 1095 /// A 128-bit integer vector containing the bits to be tested. 1096 /// \param __V 1097 /// A 128-bit integer vector selecting which bits to test in operand \a __M. 1098 /// \returns TRUE if the specified bits are all zeros; FALSE otherwise. 1099 static __inline__ int __DEFAULT_FN_ATTRS 1100 _mm_testz_si128(__m128i __M, __m128i __V) 1101 { 1102 return __builtin_ia32_ptestz128((__v2di)__M, (__v2di)__V); 1103 } 1104 1105 /// Tests whether the specified bits in a 128-bit integer vector are all 1106 /// ones. 1107 /// 1108 /// \headerfile <x86intrin.h> 1109 /// 1110 /// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction. 1111 /// 1112 /// \param __M 1113 /// A 128-bit integer vector containing the bits to be tested. 1114 /// \param __V 1115 /// A 128-bit integer vector selecting which bits to test in operand \a __M. 1116 /// \returns TRUE if the specified bits are all ones; FALSE otherwise. 1117 static __inline__ int __DEFAULT_FN_ATTRS 1118 _mm_testc_si128(__m128i __M, __m128i __V) 1119 { 1120 return __builtin_ia32_ptestc128((__v2di)__M, (__v2di)__V); 1121 } 1122 1123 /// Tests whether the specified bits in a 128-bit integer vector are 1124 /// neither all zeros nor all ones. 
1125 /// 1126 /// \headerfile <x86intrin.h> 1127 /// 1128 /// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction. 1129 /// 1130 /// \param __M 1131 /// A 128-bit integer vector containing the bits to be tested. 1132 /// \param __V 1133 /// A 128-bit integer vector selecting which bits to test in operand \a __M. 1134 /// \returns TRUE if the specified bits are neither all zeros nor all ones; 1135 /// FALSE otherwise. 1136 static __inline__ int __DEFAULT_FN_ATTRS 1137 _mm_testnzc_si128(__m128i __M, __m128i __V) 1138 { 1139 return __builtin_ia32_ptestnzc128((__v2di)__M, (__v2di)__V); 1140 } 1141 1142 /// Tests whether the specified bits in a 128-bit integer vector are all 1143 /// ones. 1144 /// 1145 /// \headerfile <x86intrin.h> 1146 /// 1147 /// \code 1148 /// int _mm_test_all_ones(__m128i V); 1149 /// \endcode 1150 /// 1151 /// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction. 1152 /// 1153 /// \param V 1154 /// A 128-bit integer vector containing the bits to be tested. 1155 /// \returns TRUE if the bits specified in the operand are all set to 1; FALSE 1156 /// otherwise. 1157 #define _mm_test_all_ones(V) _mm_testc_si128((V), _mm_cmpeq_epi32((V), (V))) 1158 1159 /// Tests whether the specified bits in a 128-bit integer vector are 1160 /// neither all zeros nor all ones. 1161 /// 1162 /// \headerfile <x86intrin.h> 1163 /// 1164 /// \code 1165 /// int _mm_test_mix_ones_zeros(__m128i M, __m128i V); 1166 /// \endcode 1167 /// 1168 /// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction. 1169 /// 1170 /// \param M 1171 /// A 128-bit integer vector containing the bits to be tested. 1172 /// \param V 1173 /// A 128-bit integer vector selecting which bits to test in operand \a M. 1174 /// \returns TRUE if the specified bits are neither all zeros nor all ones; 1175 /// FALSE otherwise. 
1176 #define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128((M), (V)) 1177 1178 /// Tests whether the specified bits in a 128-bit integer vector are all 1179 /// zeros. 1180 /// 1181 /// \headerfile <x86intrin.h> 1182 /// 1183 /// \code 1184 /// int _mm_test_all_zeros(__m128i M, __m128i V); 1185 /// \endcode 1186 /// 1187 /// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction. 1188 /// 1189 /// \param M 1190 /// A 128-bit integer vector containing the bits to be tested. 1191 /// \param V 1192 /// A 128-bit integer vector selecting which bits to test in operand \a M. 1193 /// \returns TRUE if the specified bits are all zeros; FALSE otherwise. 1194 #define _mm_test_all_zeros(M, V) _mm_testz_si128 ((M), (V)) 1195 1196 /* SSE4 64-bit Packed Integer Comparisons. */ 1197 /// Compares each of the corresponding 64-bit values of the 128-bit 1198 /// integer vectors for equality. 1199 /// 1200 /// \headerfile <x86intrin.h> 1201 /// 1202 /// This intrinsic corresponds to the <c> VPCMPEQQ / PCMPEQQ </c> instruction. 1203 /// 1204 /// \param __V1 1205 /// A 128-bit integer vector. 1206 /// \param __V2 1207 /// A 128-bit integer vector. 1208 /// \returns A 128-bit integer vector containing the comparison results. 1209 static __inline__ __m128i __DEFAULT_FN_ATTRS 1210 _mm_cmpeq_epi64(__m128i __V1, __m128i __V2) 1211 { 1212 return (__m128i)((__v2di)__V1 == (__v2di)__V2); 1213 } 1214 1215 /* SSE4 Packed Integer Sign-Extension. */ 1216 /// Sign-extends each of the lower eight 8-bit integer elements of a 1217 /// 128-bit vector of [16 x i8] to 16-bit values and returns them in a 1218 /// 128-bit vector of [8 x i16]. The upper eight elements of the input vector 1219 /// are unused. 1220 /// 1221 /// \headerfile <x86intrin.h> 1222 /// 1223 /// This intrinsic corresponds to the <c> VPMOVSXBW / PMOVSXBW </c> instruction. 1224 /// 1225 /// \param __V 1226 /// A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are sign- 1227 /// extended to 16-bit values. 
1228 /// \returns A 128-bit vector of [8 x i16] containing the sign-extended values. 1229 static __inline__ __m128i __DEFAULT_FN_ATTRS 1230 _mm_cvtepi8_epi16(__m128i __V) 1231 { 1232 /* This function always performs a signed extension, but __v16qi is a char 1233 which may be signed or unsigned, so use __v16qs. */ 1234 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8hi); 1235 } 1236 1237 /// Sign-extends each of the lower four 8-bit integer elements of a 1238 /// 128-bit vector of [16 x i8] to 32-bit values and returns them in a 1239 /// 128-bit vector of [4 x i32]. The upper twelve elements of the input 1240 /// vector are unused. 1241 /// 1242 /// \headerfile <x86intrin.h> 1243 /// 1244 /// This intrinsic corresponds to the <c> VPMOVSXBD / PMOVSXBD </c> instruction. 1245 /// 1246 /// \param __V 1247 /// A 128-bit vector of [16 x i8]. The lower four 8-bit elements are 1248 /// sign-extended to 32-bit values. 1249 /// \returns A 128-bit vector of [4 x i32] containing the sign-extended values. 1250 static __inline__ __m128i __DEFAULT_FN_ATTRS 1251 _mm_cvtepi8_epi32(__m128i __V) 1252 { 1253 /* This function always performs a signed extension, but __v16qi is a char 1254 which may be signed or unsigned, so use __v16qs. */ 1255 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4si); 1256 } 1257 1258 /// Sign-extends each of the lower two 8-bit integer elements of a 1259 /// 128-bit integer vector of [16 x i8] to 64-bit values and returns them in 1260 /// a 128-bit vector of [2 x i64]. The upper fourteen elements of the input 1261 /// vector are unused. 1262 /// 1263 /// \headerfile <x86intrin.h> 1264 /// 1265 /// This intrinsic corresponds to the <c> VPMOVSXBQ / PMOVSXBQ </c> instruction. 1266 /// 1267 /// \param __V 1268 /// A 128-bit vector of [16 x i8]. The lower two 8-bit elements are 1269 /// sign-extended to 64-bit values. 
1270 /// \returns A 128-bit vector of [2 x i64] containing the sign-extended values. 1271 static __inline__ __m128i __DEFAULT_FN_ATTRS 1272 _mm_cvtepi8_epi64(__m128i __V) 1273 { 1274 /* This function always performs a signed extension, but __v16qi is a char 1275 which may be signed or unsigned, so use __v16qs. */ 1276 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1), __v2di); 1277 } 1278 1279 /// Sign-extends each of the lower four 16-bit integer elements of a 1280 /// 128-bit integer vector of [8 x i16] to 32-bit values and returns them in 1281 /// a 128-bit vector of [4 x i32]. The upper four elements of the input 1282 /// vector are unused. 1283 /// 1284 /// \headerfile <x86intrin.h> 1285 /// 1286 /// This intrinsic corresponds to the <c> VPMOVSXWD / PMOVSXWD </c> instruction. 1287 /// 1288 /// \param __V 1289 /// A 128-bit vector of [8 x i16]. The lower four 16-bit elements are 1290 /// sign-extended to 32-bit values. 1291 /// \returns A 128-bit vector of [4 x i32] containing the sign-extended values. 1292 static __inline__ __m128i __DEFAULT_FN_ATTRS 1293 _mm_cvtepi16_epi32(__m128i __V) 1294 { 1295 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4si); 1296 } 1297 1298 /// Sign-extends each of the lower two 16-bit integer elements of a 1299 /// 128-bit integer vector of [8 x i16] to 64-bit values and returns them in 1300 /// a 128-bit vector of [2 x i64]. The upper six elements of the input 1301 /// vector are unused. 1302 /// 1303 /// \headerfile <x86intrin.h> 1304 /// 1305 /// This intrinsic corresponds to the <c> VPMOVSXWQ / PMOVSXWQ </c> instruction. 1306 /// 1307 /// \param __V 1308 /// A 128-bit vector of [8 x i16]. The lower two 16-bit elements are 1309 /// sign-extended to 64-bit values. 1310 /// \returns A 128-bit vector of [2 x i64] containing the sign-extended values. 
1311 static __inline__ __m128i __DEFAULT_FN_ATTRS 1312 _mm_cvtepi16_epi64(__m128i __V) 1313 { 1314 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1), __v2di); 1315 } 1316 1317 /// Sign-extends each of the lower two 32-bit integer elements of a 1318 /// 128-bit integer vector of [4 x i32] to 64-bit values and returns them in 1319 /// a 128-bit vector of [2 x i64]. The upper two elements of the input vector 1320 /// are unused. 1321 /// 1322 /// \headerfile <x86intrin.h> 1323 /// 1324 /// This intrinsic corresponds to the <c> VPMOVSXDQ / PMOVSXDQ </c> instruction. 1325 /// 1326 /// \param __V 1327 /// A 128-bit vector of [4 x i32]. The lower two 32-bit elements are 1328 /// sign-extended to 64-bit values. 1329 /// \returns A 128-bit vector of [2 x i64] containing the sign-extended values. 1330 static __inline__ __m128i __DEFAULT_FN_ATTRS 1331 _mm_cvtepi32_epi64(__m128i __V) 1332 { 1333 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v4si)__V, (__v4si)__V, 0, 1), __v2di); 1334 } 1335 1336 /* SSE4 Packed Integer Zero-Extension. */ 1337 /// Zero-extends each of the lower eight 8-bit integer elements of a 1338 /// 128-bit vector of [16 x i8] to 16-bit values and returns them in a 1339 /// 128-bit vector of [8 x i16]. The upper eight elements of the input vector 1340 /// are unused. 1341 /// 1342 /// \headerfile <x86intrin.h> 1343 /// 1344 /// This intrinsic corresponds to the <c> VPMOVZXBW / PMOVZXBW </c> instruction. 1345 /// 1346 /// \param __V 1347 /// A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are 1348 /// zero-extended to 16-bit values. 1349 /// \returns A 128-bit vector of [8 x i16] containing the zero-extended values. 
1350 static __inline__ __m128i __DEFAULT_FN_ATTRS 1351 _mm_cvtepu8_epi16(__m128i __V) 1352 { 1353 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8hi); 1354 } 1355 1356 /// Zero-extends each of the lower four 8-bit integer elements of a 1357 /// 128-bit vector of [16 x i8] to 32-bit values and returns them in a 1358 /// 128-bit vector of [4 x i32]. The upper twelve elements of the input 1359 /// vector are unused. 1360 /// 1361 /// \headerfile <x86intrin.h> 1362 /// 1363 /// This intrinsic corresponds to the <c> VPMOVZXBD / PMOVZXBD </c> instruction. 1364 /// 1365 /// \param __V 1366 /// A 128-bit vector of [16 x i8]. The lower four 8-bit elements are 1367 /// zero-extended to 32-bit values. 1368 /// \returns A 128-bit vector of [4 x i32] containing the zero-extended values. 1369 static __inline__ __m128i __DEFAULT_FN_ATTRS 1370 _mm_cvtepu8_epi32(__m128i __V) 1371 { 1372 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4si); 1373 } 1374 1375 /// Zero-extends each of the lower two 8-bit integer elements of a 1376 /// 128-bit integer vector of [16 x i8] to 64-bit values and returns them in 1377 /// a 128-bit vector of [2 x i64]. The upper fourteen elements of the input 1378 /// vector are unused. 1379 /// 1380 /// \headerfile <x86intrin.h> 1381 /// 1382 /// This intrinsic corresponds to the <c> VPMOVZXBQ / PMOVZXBQ </c> instruction. 1383 /// 1384 /// \param __V 1385 /// A 128-bit vector of [16 x i8]. The lower two 8-bit elements are 1386 /// zero-extended to 64-bit values. 1387 /// \returns A 128-bit vector of [2 x i64] containing the zero-extended values. 
1388 static __inline__ __m128i __DEFAULT_FN_ATTRS 1389 _mm_cvtepu8_epi64(__m128i __V) 1390 { 1391 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1), __v2di); 1392 } 1393 1394 /// Zero-extends each of the lower four 16-bit integer elements of a 1395 /// 128-bit integer vector of [8 x i16] to 32-bit values and returns them in 1396 /// a 128-bit vector of [4 x i32]. The upper four elements of the input 1397 /// vector are unused. 1398 /// 1399 /// \headerfile <x86intrin.h> 1400 /// 1401 /// This intrinsic corresponds to the <c> VPMOVZXWD / PMOVZXWD </c> instruction. 1402 /// 1403 /// \param __V 1404 /// A 128-bit vector of [8 x i16]. The lower four 16-bit elements are 1405 /// zero-extended to 32-bit values. 1406 /// \returns A 128-bit vector of [4 x i32] containing the zero-extended values. 1407 static __inline__ __m128i __DEFAULT_FN_ATTRS 1408 _mm_cvtepu16_epi32(__m128i __V) 1409 { 1410 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4si); 1411 } 1412 1413 /// Zero-extends each of the lower two 16-bit integer elements of a 1414 /// 128-bit integer vector of [8 x i16] to 64-bit values and returns them in 1415 /// a 128-bit vector of [2 x i64]. The upper six elements of the input vector 1416 /// are unused. 1417 /// 1418 /// \headerfile <x86intrin.h> 1419 /// 1420 /// This intrinsic corresponds to the <c> VPMOVZXWQ / PMOVZXWQ </c> instruction. 1421 /// 1422 /// \param __V 1423 /// A 128-bit vector of [8 x i16]. The lower two 16-bit elements are 1424 /// zero-extended to 64-bit values. 1425 /// \returns A 128-bit vector of [2 x i64] containing the zero-extended values. 
1426 static __inline__ __m128i __DEFAULT_FN_ATTRS 1427 _mm_cvtepu16_epi64(__m128i __V) 1428 { 1429 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1), __v2di); 1430 } 1431 1432 /// Zero-extends each of the lower two 32-bit integer elements of a 1433 /// 128-bit integer vector of [4 x i32] to 64-bit values and returns them in 1434 /// a 128-bit vector of [2 x i64]. The upper two elements of the input vector 1435 /// are unused. 1436 /// 1437 /// \headerfile <x86intrin.h> 1438 /// 1439 /// This intrinsic corresponds to the <c> VPMOVZXDQ / PMOVZXDQ </c> instruction. 1440 /// 1441 /// \param __V 1442 /// A 128-bit vector of [4 x i32]. The lower two 32-bit elements are 1443 /// zero-extended to 64-bit values. 1444 /// \returns A 128-bit vector of [2 x i64] containing the zero-extended values. 1445 static __inline__ __m128i __DEFAULT_FN_ATTRS 1446 _mm_cvtepu32_epi64(__m128i __V) 1447 { 1448 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v4su)__V, (__v4su)__V, 0, 1), __v2di); 1449 } 1450 1451 /* SSE4 Pack with Unsigned Saturation. */ 1452 /// Converts 32-bit signed integers from both 128-bit integer vector 1453 /// operands into 16-bit unsigned integers, and returns the packed result. 1454 /// Values greater than 0xFFFF are saturated to 0xFFFF. Values less than 1455 /// 0x0000 are saturated to 0x0000. 1456 /// 1457 /// \headerfile <x86intrin.h> 1458 /// 1459 /// This intrinsic corresponds to the <c> VPACKUSDW / PACKUSDW </c> instruction. 1460 /// 1461 /// \param __V1 1462 /// A 128-bit vector of [4 x i32]. Each 32-bit element is treated as a 1463 /// signed integer and is converted to a 16-bit unsigned integer with 1464 /// saturation. Values greater than 0xFFFF are saturated to 0xFFFF. Values 1465 /// less than 0x0000 are saturated to 0x0000. The converted [4 x i16] values 1466 /// are written to the lower 64 bits of the result. 1467 /// \param __V2 1468 /// A 128-bit vector of [4 x i32]. 
Each 32-bit element is treated as a 1469 /// signed integer and is converted to a 16-bit unsigned integer with 1470 /// saturation. Values greater than 0xFFFF are saturated to 0xFFFF. Values 1471 /// less than 0x0000 are saturated to 0x0000. The converted [4 x i16] values 1472 /// are written to the higher 64 bits of the result. 1473 /// \returns A 128-bit vector of [8 x i16] containing the converted values. 1474 static __inline__ __m128i __DEFAULT_FN_ATTRS 1475 _mm_packus_epi32(__m128i __V1, __m128i __V2) 1476 { 1477 return (__m128i) __builtin_ia32_packusdw128((__v4si)__V1, (__v4si)__V2); 1478 } 1479 1480 /* SSE4 Multiple Packed Sums of Absolute Difference. */ 1481 /// Subtracts 8-bit unsigned integer values and computes the absolute 1482 /// values of the differences to the corresponding bits in the destination. 1483 /// Then sums of the absolute differences are returned according to the bit 1484 /// fields in the immediate operand. 1485 /// 1486 /// \headerfile <x86intrin.h> 1487 /// 1488 /// \code 1489 /// __m128i _mm_mpsadbw_epu8(__m128i X, __m128i Y, const int M); 1490 /// \endcode 1491 /// 1492 /// This intrinsic corresponds to the <c> VMPSADBW / MPSADBW </c> instruction. 1493 /// 1494 /// \param X 1495 /// A 128-bit vector of [16 x i8]. 1496 /// \param Y 1497 /// A 128-bit vector of [16 x i8]. 
1498 /// \param M 1499 /// An 8-bit immediate operand specifying how the absolute differences are to 1500 /// be calculated, according to the following algorithm: 1501 /// \code 1502 /// // M2 represents bit 2 of the immediate operand 1503 /// // M10 represents bits [1:0] of the immediate operand 1504 /// i = M2 * 4; 1505 /// j = M10 * 4; 1506 /// for (k = 0; k < 8; k = k + 1) { 1507 /// d0 = abs(X[i + k + 0] - Y[j + 0]); 1508 /// d1 = abs(X[i + k + 1] - Y[j + 1]); 1509 /// d2 = abs(X[i + k + 2] - Y[j + 2]); 1510 /// d3 = abs(X[i + k + 3] - Y[j + 3]); 1511 /// r[k] = d0 + d1 + d2 + d3; 1512 /// } 1513 /// \endcode 1514 /// \returns A 128-bit integer vector containing the sums of the sets of 1515 /// absolute differences between both operands. 1516 #define _mm_mpsadbw_epu8(X, Y, M) \ 1517 (__m128i) __builtin_ia32_mpsadbw128((__v16qi)(__m128i)(X), \ 1518 (__v16qi)(__m128i)(Y), (M)) 1519 1520 /// Finds the minimum unsigned 16-bit element in the input 128-bit 1521 /// vector of [8 x u16] and returns it and along with its index. 1522 /// 1523 /// \headerfile <x86intrin.h> 1524 /// 1525 /// This intrinsic corresponds to the <c> VPHMINPOSUW / PHMINPOSUW </c> 1526 /// instruction. 1527 /// 1528 /// \param __V 1529 /// A 128-bit vector of [8 x u16]. 1530 /// \returns A 128-bit value where bits [15:0] contain the minimum value found 1531 /// in parameter \a __V, bits [18:16] contain the index of the minimum value 1532 /// and the remaining bits are set to 0. 1533 static __inline__ __m128i __DEFAULT_FN_ATTRS 1534 _mm_minpos_epu16(__m128i __V) 1535 { 1536 return (__m128i) __builtin_ia32_phminposuw128((__v8hi)__V); 1537 } 1538 1539 /* Handle the sse4.2 definitions here. */ 1540 1541 /* These definitions are normally in nmmintrin.h, but gcc puts them in here 1542 so we'll do the same. 
*/ 1543 1544 #undef __DEFAULT_FN_ATTRS 1545 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4.2"))) 1546 1547 /* These specify the type of data that we're comparing. */ 1548 #define _SIDD_UBYTE_OPS 0x00 1549 #define _SIDD_UWORD_OPS 0x01 1550 #define _SIDD_SBYTE_OPS 0x02 1551 #define _SIDD_SWORD_OPS 0x03 1552 1553 /* These specify the type of comparison operation. */ 1554 #define _SIDD_CMP_EQUAL_ANY 0x00 1555 #define _SIDD_CMP_RANGES 0x04 1556 #define _SIDD_CMP_EQUAL_EACH 0x08 1557 #define _SIDD_CMP_EQUAL_ORDERED 0x0c 1558 1559 /* These macros specify the polarity of the operation. */ 1560 #define _SIDD_POSITIVE_POLARITY 0x00 1561 #define _SIDD_NEGATIVE_POLARITY 0x10 1562 #define _SIDD_MASKED_POSITIVE_POLARITY 0x20 1563 #define _SIDD_MASKED_NEGATIVE_POLARITY 0x30 1564 1565 /* These macros are used in _mm_cmpXstri() to specify the return. */ 1566 #define _SIDD_LEAST_SIGNIFICANT 0x00 1567 #define _SIDD_MOST_SIGNIFICANT 0x40 1568 1569 /* These macros are used in _mm_cmpXstri() to specify the return. */ 1570 #define _SIDD_BIT_MASK 0x00 1571 #define _SIDD_UNIT_MASK 0x40 1572 1573 /* SSE4.2 Packed Comparison Intrinsics. */ 1574 /// Uses the immediate operand \a M to perform a comparison of string 1575 /// data with implicitly defined lengths that is contained in source operands 1576 /// \a A and \a B. Returns a 128-bit integer vector representing the result 1577 /// mask of the comparison. 1578 /// 1579 /// \headerfile <x86intrin.h> 1580 /// 1581 /// \code 1582 /// __m128i _mm_cmpistrm(__m128i A, __m128i B, const int M); 1583 /// \endcode 1584 /// 1585 /// This intrinsic corresponds to the <c> VPCMPISTRM / PCMPISTRM </c> 1586 /// instruction. 1587 /// 1588 /// \param A 1589 /// A 128-bit integer vector containing one of the source operands to be 1590 /// compared. 1591 /// \param B 1592 /// A 128-bit integer vector containing one of the source operands to be 1593 /// compared. 
1594 /// \param M 1595 /// An 8-bit immediate operand specifying whether the characters are bytes or 1596 /// words, the type of comparison to perform, and the format of the return 1597 /// value. \n 1598 /// Bits [1:0]: Determine source data format. \n 1599 /// 00: 16 unsigned bytes \n 1600 /// 01: 8 unsigned words \n 1601 /// 10: 16 signed bytes \n 1602 /// 11: 8 signed words \n 1603 /// Bits [3:2]: Determine comparison type and aggregation method. \n 1604 /// 00: Subset: Each character in \a B is compared for equality with all 1605 /// the characters in \a A. \n 1606 /// 01: Ranges: Each character in \a B is compared to \a A. The comparison 1607 /// basis is greater than or equal for even-indexed elements in \a A, 1608 /// and less than or equal for odd-indexed elements in \a A. \n 1609 /// 10: Match: Compare each pair of corresponding characters in \a A and 1610 /// \a B for equality. \n 1611 /// 11: Substring: Search \a B for substring matches of \a A. \n 1612 /// Bits [5:4]: Determine whether to perform a one's complement on the bit 1613 /// mask of the comparison results. \n 1614 /// 00: No effect. \n 1615 /// 01: Negate the bit mask. \n 1616 /// 10: No effect. \n 1617 /// 11: Negate the bit mask only for bits with an index less than or equal 1618 /// to the size of \a A or \a B. \n 1619 /// Bit [6]: Determines whether the result is zero-extended or expanded to 16 1620 /// bytes. \n 1621 /// 0: The result is zero-extended to 16 bytes. \n 1622 /// 1: The result is expanded to 16 bytes (this expansion is performed by 1623 /// repeating each bit 8 or 16 times). 1624 /// \returns Returns a 128-bit integer vector representing the result mask of 1625 /// the comparison. 
1626 #define _mm_cmpistrm(A, B, M) \ 1627 (__m128i)__builtin_ia32_pcmpistrm128((__v16qi)(__m128i)(A), \ 1628 (__v16qi)(__m128i)(B), (int)(M)) 1629 1630 /// Uses the immediate operand \a M to perform a comparison of string 1631 /// data with implicitly defined lengths that is contained in source operands 1632 /// \a A and \a B. Returns an integer representing the result index of the 1633 /// comparison. 1634 /// 1635 /// \headerfile <x86intrin.h> 1636 /// 1637 /// \code 1638 /// int _mm_cmpistri(__m128i A, __m128i B, const int M); 1639 /// \endcode 1640 /// 1641 /// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c> 1642 /// instruction. 1643 /// 1644 /// \param A 1645 /// A 128-bit integer vector containing one of the source operands to be 1646 /// compared. 1647 /// \param B 1648 /// A 128-bit integer vector containing one of the source operands to be 1649 /// compared. 1650 /// \param M 1651 /// An 8-bit immediate operand specifying whether the characters are bytes or 1652 /// words, the type of comparison to perform, and the format of the return 1653 /// value. \n 1654 /// Bits [1:0]: Determine source data format. \n 1655 /// 00: 16 unsigned bytes \n 1656 /// 01: 8 unsigned words \n 1657 /// 10: 16 signed bytes \n 1658 /// 11: 8 signed words \n 1659 /// Bits [3:2]: Determine comparison type and aggregation method. \n 1660 /// 00: Subset: Each character in \a B is compared for equality with all 1661 /// the characters in \a A. \n 1662 /// 01: Ranges: Each character in \a B is compared to \a A. The comparison 1663 /// basis is greater than or equal for even-indexed elements in \a A, 1664 /// and less than or equal for odd-indexed elements in \a A. \n 1665 /// 10: Match: Compare each pair of corresponding characters in \a A and 1666 /// \a B for equality. \n 1667 /// 11: Substring: Search B for substring matches of \a A. \n 1668 /// Bits [5:4]: Determine whether to perform a one's complement on the bit 1669 /// mask of the comparison results. 
\n 1670 /// 00: No effect. \n 1671 /// 01: Negate the bit mask. \n 1672 /// 10: No effect. \n 1673 /// 11: Negate the bit mask only for bits with an index less than or equal 1674 /// to the size of \a A or \a B. \n 1675 /// Bit [6]: Determines whether the index of the lowest set bit or the 1676 /// highest set bit is returned. \n 1677 /// 0: The index of the least significant set bit. \n 1678 /// 1: The index of the most significant set bit. \n 1679 /// \returns Returns an integer representing the result index of the comparison. 1680 #define _mm_cmpistri(A, B, M) \ 1681 (int)__builtin_ia32_pcmpistri128((__v16qi)(__m128i)(A), \ 1682 (__v16qi)(__m128i)(B), (int)(M)) 1683 1684 /// Uses the immediate operand \a M to perform a comparison of string 1685 /// data with explicitly defined lengths that is contained in source operands 1686 /// \a A and \a B. Returns a 128-bit integer vector representing the result 1687 /// mask of the comparison. 1688 /// 1689 /// \headerfile <x86intrin.h> 1690 /// 1691 /// \code 1692 /// __m128i _mm_cmpestrm(__m128i A, int LA, __m128i B, int LB, const int M); 1693 /// \endcode 1694 /// 1695 /// This intrinsic corresponds to the <c> VPCMPESTRM / PCMPESTRM </c> 1696 /// instruction. 1697 /// 1698 /// \param A 1699 /// A 128-bit integer vector containing one of the source operands to be 1700 /// compared. 1701 /// \param LA 1702 /// An integer that specifies the length of the string in \a A. 1703 /// \param B 1704 /// A 128-bit integer vector containing one of the source operands to be 1705 /// compared. 1706 /// \param LB 1707 /// An integer that specifies the length of the string in \a B. 1708 /// \param M 1709 /// An 8-bit immediate operand specifying whether the characters are bytes or 1710 /// words, the type of comparison to perform, and the format of the return 1711 /// value. \n 1712 /// Bits [1:0]: Determine source data format. 
\n 1713 /// 00: 16 unsigned bytes \n 1714 /// 01: 8 unsigned words \n 1715 /// 10: 16 signed bytes \n 1716 /// 11: 8 signed words \n 1717 /// Bits [3:2]: Determine comparison type and aggregation method. \n 1718 /// 00: Subset: Each character in \a B is compared for equality with all 1719 /// the characters in \a A. \n 1720 /// 01: Ranges: Each character in \a B is compared to \a A. The comparison 1721 /// basis is greater than or equal for even-indexed elements in \a A, 1722 /// and less than or equal for odd-indexed elements in \a A. \n 1723 /// 10: Match: Compare each pair of corresponding characters in \a A and 1724 /// \a B for equality. \n 1725 /// 11: Substring: Search \a B for substring matches of \a A. \n 1726 /// Bits [5:4]: Determine whether to perform a one's complement on the bit 1727 /// mask of the comparison results. \n 1728 /// 00: No effect. \n 1729 /// 01: Negate the bit mask. \n 1730 /// 10: No effect. \n 1731 /// 11: Negate the bit mask only for bits with an index less than or equal 1732 /// to the size of \a A or \a B. \n 1733 /// Bit [6]: Determines whether the result is zero-extended or expanded to 16 1734 /// bytes. \n 1735 /// 0: The result is zero-extended to 16 bytes. \n 1736 /// 1: The result is expanded to 16 bytes (this expansion is performed by 1737 /// repeating each bit 8 or 16 times). \n 1738 /// \returns Returns a 128-bit integer vector representing the result mask of 1739 /// the comparison. 1740 #define _mm_cmpestrm(A, LA, B, LB, M) \ 1741 (__m128i)__builtin_ia32_pcmpestrm128((__v16qi)(__m128i)(A), (int)(LA), \ 1742 (__v16qi)(__m128i)(B), (int)(LB), \ 1743 (int)(M)) 1744 1745 /// Uses the immediate operand \a M to perform a comparison of string 1746 /// data with explicitly defined lengths that is contained in source operands 1747 /// \a A and \a B. Returns an integer representing the result index of the 1748 /// comparison. 
1749 /// 1750 /// \headerfile <x86intrin.h> 1751 /// 1752 /// \code 1753 /// int _mm_cmpestri(__m128i A, int LA, __m128i B, int LB, const int M); 1754 /// \endcode 1755 /// 1756 /// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c> 1757 /// instruction. 1758 /// 1759 /// \param A 1760 /// A 128-bit integer vector containing one of the source operands to be 1761 /// compared. 1762 /// \param LA 1763 /// An integer that specifies the length of the string in \a A. 1764 /// \param B 1765 /// A 128-bit integer vector containing one of the source operands to be 1766 /// compared. 1767 /// \param LB 1768 /// An integer that specifies the length of the string in \a B. 1769 /// \param M 1770 /// An 8-bit immediate operand specifying whether the characters are bytes or 1771 /// words, the type of comparison to perform, and the format of the return 1772 /// value. \n 1773 /// Bits [1:0]: Determine source data format. \n 1774 /// 00: 16 unsigned bytes \n 1775 /// 01: 8 unsigned words \n 1776 /// 10: 16 signed bytes \n 1777 /// 11: 8 signed words \n 1778 /// Bits [3:2]: Determine comparison type and aggregation method. \n 1779 /// 00: Subset: Each character in \a B is compared for equality with all 1780 /// the characters in \a A. \n 1781 /// 01: Ranges: Each character in \a B is compared to \a A. The comparison 1782 /// basis is greater than or equal for even-indexed elements in \a A, 1783 /// and less than or equal for odd-indexed elements in \a A. \n 1784 /// 10: Match: Compare each pair of corresponding characters in \a A and 1785 /// \a B for equality. \n 1786 /// 11: Substring: Search B for substring matches of \a A. \n 1787 /// Bits [5:4]: Determine whether to perform a one's complement on the bit 1788 /// mask of the comparison results. \n 1789 /// 00: No effect. \n 1790 /// 01: Negate the bit mask. \n 1791 /// 10: No effect. \n 1792 /// 11: Negate the bit mask only for bits with an index less than or equal 1793 /// to the size of \a A or \a B. 
\n 1794 /// Bit [6]: Determines whether the index of the lowest set bit or the 1795 /// highest set bit is returned. \n 1796 /// 0: The index of the least significant set bit. \n 1797 /// 1: The index of the most significant set bit. \n 1798 /// \returns Returns an integer representing the result index of the comparison. 1799 #define _mm_cmpestri(A, LA, B, LB, M) \ 1800 (int)__builtin_ia32_pcmpestri128((__v16qi)(__m128i)(A), (int)(LA), \ 1801 (__v16qi)(__m128i)(B), (int)(LB), \ 1802 (int)(M)) 1803 1804 /* SSE4.2 Packed Comparison Intrinsics and EFlag Reading. */ 1805 /// Uses the immediate operand \a M to perform a comparison of string 1806 /// data with implicitly defined lengths that is contained in source operands 1807 /// \a A and \a B. Returns 1 if the bit mask is zero and the length of the 1808 /// string in \a B is the maximum, otherwise, returns 0. 1809 /// 1810 /// \headerfile <x86intrin.h> 1811 /// 1812 /// \code 1813 /// int _mm_cmpistra(__m128i A, __m128i B, const int M); 1814 /// \endcode 1815 /// 1816 /// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c> 1817 /// instruction. 1818 /// 1819 /// \param A 1820 /// A 128-bit integer vector containing one of the source operands to be 1821 /// compared. 1822 /// \param B 1823 /// A 128-bit integer vector containing one of the source operands to be 1824 /// compared. 1825 /// \param M 1826 /// An 8-bit immediate operand specifying whether the characters are bytes or 1827 /// words and the type of comparison to perform. \n 1828 /// Bits [1:0]: Determine source data format. \n 1829 /// 00: 16 unsigned bytes \n 1830 /// 01: 8 unsigned words \n 1831 /// 10: 16 signed bytes \n 1832 /// 11: 8 signed words \n 1833 /// Bits [3:2]: Determine comparison type and aggregation method. \n 1834 /// 00: Subset: Each character in \a B is compared for equality with all 1835 /// the characters in \a A. \n 1836 /// 01: Ranges: Each character in \a B is compared to \a A. 
The comparison 1837 /// basis is greater than or equal for even-indexed elements in \a A, 1838 /// and less than or equal for odd-indexed elements in \a A. \n 1839 /// 10: Match: Compare each pair of corresponding characters in \a A and 1840 /// \a B for equality. \n 1841 /// 11: Substring: Search \a B for substring matches of \a A. \n 1842 /// Bits [5:4]: Determine whether to perform a one's complement on the bit 1843 /// mask of the comparison results. \n 1844 /// 00: No effect. \n 1845 /// 01: Negate the bit mask. \n 1846 /// 10: No effect. \n 1847 /// 11: Negate the bit mask only for bits with an index less than or equal 1848 /// to the size of \a A or \a B. \n 1849 /// \returns Returns 1 if the bit mask is zero and the length of the string in 1850 /// \a B is the maximum; otherwise, returns 0. 1851 #define _mm_cmpistra(A, B, M) \ 1852 (int)__builtin_ia32_pcmpistria128((__v16qi)(__m128i)(A), \ 1853 (__v16qi)(__m128i)(B), (int)(M)) 1854 1855 /// Uses the immediate operand \a M to perform a comparison of string 1856 /// data with implicitly defined lengths that is contained in source operands 1857 /// \a A and \a B. Returns 1 if the bit mask is non-zero, otherwise, returns 1858 /// 0. 1859 /// 1860 /// \headerfile <x86intrin.h> 1861 /// 1862 /// \code 1863 /// int _mm_cmpistrc(__m128i A, __m128i B, const int M); 1864 /// \endcode 1865 /// 1866 /// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c> 1867 /// instruction. 1868 /// 1869 /// \param A 1870 /// A 128-bit integer vector containing one of the source operands to be 1871 /// compared. 1872 /// \param B 1873 /// A 128-bit integer vector containing one of the source operands to be 1874 /// compared. 1875 /// \param M 1876 /// An 8-bit immediate operand specifying whether the characters are bytes or 1877 /// words and the type of comparison to perform. \n 1878 /// Bits [1:0]: Determine source data format. 
\n 1879 /// 00: 16 unsigned bytes \n 1880 /// 01: 8 unsigned words \n 1881 /// 10: 16 signed bytes \n 1882 /// 11: 8 signed words \n 1883 /// Bits [3:2]: Determine comparison type and aggregation method. \n 1884 /// 00: Subset: Each character in \a B is compared for equality with all 1885 /// the characters in \a A. \n 1886 /// 01: Ranges: Each character in \a B is compared to \a A. The comparison 1887 /// basis is greater than or equal for even-indexed elements in \a A, 1888 /// and less than or equal for odd-indexed elements in \a A. \n 1889 /// 10: Match: Compare each pair of corresponding characters in \a A and 1890 /// \a B for equality. \n 1891 /// 11: Substring: Search B for substring matches of \a A. \n 1892 /// Bits [5:4]: Determine whether to perform a one's complement on the bit 1893 /// mask of the comparison results. \n 1894 /// 00: No effect. \n 1895 /// 01: Negate the bit mask. \n 1896 /// 10: No effect. \n 1897 /// 11: Negate the bit mask only for bits with an index less than or equal 1898 /// to the size of \a A or \a B. 1899 /// \returns Returns 1 if the bit mask is non-zero, otherwise, returns 0. 1900 #define _mm_cmpistrc(A, B, M) \ 1901 (int)__builtin_ia32_pcmpistric128((__v16qi)(__m128i)(A), \ 1902 (__v16qi)(__m128i)(B), (int)(M)) 1903 1904 /// Uses the immediate operand \a M to perform a comparison of string 1905 /// data with implicitly defined lengths that is contained in source operands 1906 /// \a A and \a B. Returns bit 0 of the resulting bit mask. 1907 /// 1908 /// \headerfile <x86intrin.h> 1909 /// 1910 /// \code 1911 /// int _mm_cmpistro(__m128i A, __m128i B, const int M); 1912 /// \endcode 1913 /// 1914 /// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c> 1915 /// instruction. 1916 /// 1917 /// \param A 1918 /// A 128-bit integer vector containing one of the source operands to be 1919 /// compared. 1920 /// \param B 1921 /// A 128-bit integer vector containing one of the source operands to be 1922 /// compared. 
1923 /// \param M 1924 /// An 8-bit immediate operand specifying whether the characters are bytes or 1925 /// words and the type of comparison to perform. \n 1926 /// Bits [1:0]: Determine source data format. \n 1927 /// 00: 16 unsigned bytes \n 1928 /// 01: 8 unsigned words \n 1929 /// 10: 16 signed bytes \n 1930 /// 11: 8 signed words \n 1931 /// Bits [3:2]: Determine comparison type and aggregation method. \n 1932 /// 00: Subset: Each character in \a B is compared for equality with all 1933 /// the characters in \a A. \n 1934 /// 01: Ranges: Each character in \a B is compared to \a A. The comparison 1935 /// basis is greater than or equal for even-indexed elements in \a A, 1936 /// and less than or equal for odd-indexed elements in \a A. \n 1937 /// 10: Match: Compare each pair of corresponding characters in \a A and 1938 /// \a B for equality. \n 1939 /// 11: Substring: Search B for substring matches of \a A. \n 1940 /// Bits [5:4]: Determine whether to perform a one's complement on the bit 1941 /// mask of the comparison results. \n 1942 /// 00: No effect. \n 1943 /// 01: Negate the bit mask. \n 1944 /// 10: No effect. \n 1945 /// 11: Negate the bit mask only for bits with an index less than or equal 1946 /// to the size of \a A or \a B. \n 1947 /// \returns Returns bit 0 of the resulting bit mask. 1948 #define _mm_cmpistro(A, B, M) \ 1949 (int)__builtin_ia32_pcmpistrio128((__v16qi)(__m128i)(A), \ 1950 (__v16qi)(__m128i)(B), (int)(M)) 1951 1952 /// Uses the immediate operand \a M to perform a comparison of string 1953 /// data with implicitly defined lengths that is contained in source operands 1954 /// \a A and \a B. Returns 1 if the length of the string in \a A is less than 1955 /// the maximum, otherwise, returns 0. 
1956 /// 1957 /// \headerfile <x86intrin.h> 1958 /// 1959 /// \code 1960 /// int _mm_cmpistrs(__m128i A, __m128i B, const int M); 1961 /// \endcode 1962 /// 1963 /// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c> 1964 /// instruction. 1965 /// 1966 /// \param A 1967 /// A 128-bit integer vector containing one of the source operands to be 1968 /// compared. 1969 /// \param B 1970 /// A 128-bit integer vector containing one of the source operands to be 1971 /// compared. 1972 /// \param M 1973 /// An 8-bit immediate operand specifying whether the characters are bytes or 1974 /// words and the type of comparison to perform. \n 1975 /// Bits [1:0]: Determine source data format. \n 1976 /// 00: 16 unsigned bytes \n 1977 /// 01: 8 unsigned words \n 1978 /// 10: 16 signed bytes \n 1979 /// 11: 8 signed words \n 1980 /// Bits [3:2]: Determine comparison type and aggregation method. \n 1981 /// 00: Subset: Each character in \a B is compared for equality with all 1982 /// the characters in \a A. \n 1983 /// 01: Ranges: Each character in \a B is compared to \a A. The comparison 1984 /// basis is greater than or equal for even-indexed elements in \a A, 1985 /// and less than or equal for odd-indexed elements in \a A. \n 1986 /// 10: Match: Compare each pair of corresponding characters in \a A and 1987 /// \a B for equality. \n 1988 /// 11: Substring: Search \a B for substring matches of \a A. \n 1989 /// Bits [5:4]: Determine whether to perform a one's complement on the bit 1990 /// mask of the comparison results. \n 1991 /// 00: No effect. \n 1992 /// 01: Negate the bit mask. \n 1993 /// 10: No effect. \n 1994 /// 11: Negate the bit mask only for bits with an index less than or equal 1995 /// to the size of \a A or \a B. \n 1996 /// \returns Returns 1 if the length of the string in \a A is less than the 1997 /// maximum, otherwise, returns 0. 
1998 #define _mm_cmpistrs(A, B, M) \ 1999 (int)__builtin_ia32_pcmpistris128((__v16qi)(__m128i)(A), \ 2000 (__v16qi)(__m128i)(B), (int)(M)) 2001 2002 /// Uses the immediate operand \a M to perform a comparison of string 2003 /// data with implicitly defined lengths that is contained in source operands 2004 /// \a A and \a B. Returns 1 if the length of the string in \a B is less than 2005 /// the maximum, otherwise, returns 0. 2006 /// 2007 /// \headerfile <x86intrin.h> 2008 /// 2009 /// \code 2010 /// int _mm_cmpistrz(__m128i A, __m128i B, const int M); 2011 /// \endcode 2012 /// 2013 /// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c> 2014 /// instruction. 2015 /// 2016 /// \param A 2017 /// A 128-bit integer vector containing one of the source operands to be 2018 /// compared. 2019 /// \param B 2020 /// A 128-bit integer vector containing one of the source operands to be 2021 /// compared. 2022 /// \param M 2023 /// An 8-bit immediate operand specifying whether the characters are bytes or 2024 /// words and the type of comparison to perform. \n 2025 /// Bits [1:0]: Determine source data format. \n 2026 /// 00: 16 unsigned bytes \n 2027 /// 01: 8 unsigned words \n 2028 /// 10: 16 signed bytes \n 2029 /// 11: 8 signed words \n 2030 /// Bits [3:2]: Determine comparison type and aggregation method. \n 2031 /// 00: Subset: Each character in \a B is compared for equality with all 2032 /// the characters in \a A. \n 2033 /// 01: Ranges: Each character in \a B is compared to \a A. The comparison 2034 /// basis is greater than or equal for even-indexed elements in \a A, 2035 /// and less than or equal for odd-indexed elements in \a A. \n 2036 /// 10: Match: Compare each pair of corresponding characters in \a A and 2037 /// \a B for equality. \n 2038 /// 11: Substring: Search \a B for substring matches of \a A. \n 2039 /// Bits [5:4]: Determine whether to perform a one's complement on the bit 2040 /// mask of the comparison results. 
\n 2041 /// 00: No effect. \n 2042 /// 01: Negate the bit mask. \n 2043 /// 10: No effect. \n 2044 /// 11: Negate the bit mask only for bits with an index less than or equal 2045 /// to the size of \a A or \a B. 2046 /// \returns Returns 1 if the length of the string in \a B is less than the 2047 /// maximum, otherwise, returns 0. 2048 #define _mm_cmpistrz(A, B, M) \ 2049 (int)__builtin_ia32_pcmpistriz128((__v16qi)(__m128i)(A), \ 2050 (__v16qi)(__m128i)(B), (int)(M)) 2051 2052 /// Uses the immediate operand \a M to perform a comparison of string 2053 /// data with explicitly defined lengths that is contained in source operands 2054 /// \a A and \a B. Returns 1 if the bit mask is zero and the length of the 2055 /// string in \a B is the maximum, otherwise, returns 0. 2056 /// 2057 /// \headerfile <x86intrin.h> 2058 /// 2059 /// \code 2060 /// int _mm_cmpestra(__m128i A, int LA, __m128i B, int LB, const int M); 2061 /// \endcode 2062 /// 2063 /// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c> 2064 /// instruction. 2065 /// 2066 /// \param A 2067 /// A 128-bit integer vector containing one of the source operands to be 2068 /// compared. 2069 /// \param LA 2070 /// An integer that specifies the length of the string in \a A. 2071 /// \param B 2072 /// A 128-bit integer vector containing one of the source operands to be 2073 /// compared. 2074 /// \param LB 2075 /// An integer that specifies the length of the string in \a B. 2076 /// \param M 2077 /// An 8-bit immediate operand specifying whether the characters are bytes or 2078 /// words and the type of comparison to perform. \n 2079 /// Bits [1:0]: Determine source data format. \n 2080 /// 00: 16 unsigned bytes \n 2081 /// 01: 8 unsigned words \n 2082 /// 10: 16 signed bytes \n 2083 /// 11: 8 signed words \n 2084 /// Bits [3:2]: Determine comparison type and aggregation method. \n 2085 /// 00: Subset: Each character in \a B is compared for equality with all 2086 /// the characters in \a A. 
\n 2087 /// 01: Ranges: Each character in \a B is compared to \a A. The comparison 2088 /// basis is greater than or equal for even-indexed elements in \a A, 2089 /// and less than or equal for odd-indexed elements in \a A. \n 2090 /// 10: Match: Compare each pair of corresponding characters in \a A and 2091 /// \a B for equality. \n 2092 /// 11: Substring: Search \a B for substring matches of \a A. \n 2093 /// Bits [5:4]: Determine whether to perform a one's complement on the bit 2094 /// mask of the comparison results. \n 2095 /// 00: No effect. \n 2096 /// 01: Negate the bit mask. \n 2097 /// 10: No effect. \n 2098 /// 11: Negate the bit mask only for bits with an index less than or equal 2099 /// to the size of \a A or \a B. 2100 /// \returns Returns 1 if the bit mask is zero and the length of the string in 2101 /// \a B is the maximum, otherwise, returns 0. 2102 #define _mm_cmpestra(A, LA, B, LB, M) \ 2103 (int)__builtin_ia32_pcmpestria128((__v16qi)(__m128i)(A), (int)(LA), \ 2104 (__v16qi)(__m128i)(B), (int)(LB), \ 2105 (int)(M)) 2106 2107 /// Uses the immediate operand \a M to perform a comparison of string 2108 /// data with explicitly defined lengths that is contained in source operands 2109 /// \a A and \a B. Returns 1 if the resulting mask is non-zero, otherwise, 2110 /// returns 0. 2111 /// 2112 /// \headerfile <x86intrin.h> 2113 /// 2114 /// \code 2115 /// int _mm_cmpestrc(__m128i A, int LA, __m128i B, int LB, const int M); 2116 /// \endcode 2117 /// 2118 /// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c> 2119 /// instruction. 2120 /// 2121 /// \param A 2122 /// A 128-bit integer vector containing one of the source operands to be 2123 /// compared. 2124 /// \param LA 2125 /// An integer that specifies the length of the string in \a A. 2126 /// \param B 2127 /// A 128-bit integer vector containing one of the source operands to be 2128 /// compared. 2129 /// \param LB 2130 /// An integer that specifies the length of the string in \a B. 
2131 /// \param M 2132 /// An 8-bit immediate operand specifying whether the characters are bytes or 2133 /// words and the type of comparison to perform. \n 2134 /// Bits [1:0]: Determine source data format. \n 2135 /// 00: 16 unsigned bytes \n 2136 /// 01: 8 unsigned words \n 2137 /// 10: 16 signed bytes \n 2138 /// 11: 8 signed words \n 2139 /// Bits [3:2]: Determine comparison type and aggregation method. \n 2140 /// 00: Subset: Each character in \a B is compared for equality with all 2141 /// the characters in \a A. \n 2142 /// 01: Ranges: Each character in \a B is compared to \a A. The comparison 2143 /// basis is greater than or equal for even-indexed elements in \a A, 2144 /// and less than or equal for odd-indexed elements in \a A. \n 2145 /// 10: Match: Compare each pair of corresponding characters in \a A and 2146 /// \a B for equality. \n 2147 /// 11: Substring: Search \a B for substring matches of \a A. \n 2148 /// Bits [5:4]: Determine whether to perform a one's complement on the bit 2149 /// mask of the comparison results. \n 2150 /// 00: No effect. \n 2151 /// 01: Negate the bit mask. \n 2152 /// 10: No effect. \n 2153 /// 11: Negate the bit mask only for bits with an index less than or equal 2154 /// to the size of \a A or \a B. \n 2155 /// \returns Returns 1 if the resulting mask is non-zero, otherwise, returns 0. 2156 #define _mm_cmpestrc(A, LA, B, LB, M) \ 2157 (int)__builtin_ia32_pcmpestric128((__v16qi)(__m128i)(A), (int)(LA), \ 2158 (__v16qi)(__m128i)(B), (int)(LB), \ 2159 (int)(M)) 2160 2161 /// Uses the immediate operand \a M to perform a comparison of string 2162 /// data with explicitly defined lengths that is contained in source operands 2163 /// \a A and \a B. Returns bit 0 of the resulting bit mask. 
2164 /// 2165 /// \headerfile <x86intrin.h> 2166 /// 2167 /// \code 2168 /// int _mm_cmpestro(__m128i A, int LA, __m128i B, int LB, const int M); 2169 /// \endcode 2170 /// 2171 /// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c> 2172 /// instruction. 2173 /// 2174 /// \param A 2175 /// A 128-bit integer vector containing one of the source operands to be 2176 /// compared. 2177 /// \param LA 2178 /// An integer that specifies the length of the string in \a A. 2179 /// \param B 2180 /// A 128-bit integer vector containing one of the source operands to be 2181 /// compared. 2182 /// \param LB 2183 /// An integer that specifies the length of the string in \a B. 2184 /// \param M 2185 /// An 8-bit immediate operand specifying whether the characters are bytes or 2186 /// words and the type of comparison to perform. \n 2187 /// Bits [1:0]: Determine source data format. \n 2188 /// 00: 16 unsigned bytes \n 2189 /// 01: 8 unsigned words \n 2190 /// 10: 16 signed bytes \n 2191 /// 11: 8 signed words \n 2192 /// Bits [3:2]: Determine comparison type and aggregation method. \n 2193 /// 00: Subset: Each character in \a B is compared for equality with all 2194 /// the characters in \a A. \n 2195 /// 01: Ranges: Each character in \a B is compared to \a A. The comparison 2196 /// basis is greater than or equal for even-indexed elements in \a A, 2197 /// and less than or equal for odd-indexed elements in \a A. \n 2198 /// 10: Match: Compare each pair of corresponding characters in \a A and 2199 /// \a B for equality. \n 2200 /// 11: Substring: Search \a B for substring matches of \a A. \n 2201 /// Bits [5:4]: Determine whether to perform a one's complement on the bit 2202 /// mask of the comparison results. \n 2203 /// 00: No effect. \n 2204 /// 01: Negate the bit mask. \n 2205 /// 10: No effect. \n 2206 /// 11: Negate the bit mask only for bits with an index less than or equal 2207 /// to the size of \a A or \a B. 
/// \returns Returns bit 0 of the resulting bit mask.
/* The outer parentheses make the cast-plus-call expansion a single operand. */
#define _mm_cmpestro(A, LA, B, LB, M) \
  ((int)__builtin_ia32_pcmpestrio128((__v16qi)(__m128i)(A), (int)(LA), \
                                     (__v16qi)(__m128i)(B), (int)(LB), \
                                     (int)(M)))

/// Uses the immediate operand \a M to perform a comparison of string
///    data with explicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the length of the string in \a A is less than
///    the maximum, otherwise, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpestrs(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LA
///    An integer that specifies the length of the string in \a A.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LB
///    An integer that specifies the length of the string in \a B.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B.
/// \returns Returns 1 if the length of the string in \a A is less than the
///    maximum, otherwise, returns 0.
/* The outer parentheses make the cast-plus-call expansion a single operand. */
#define _mm_cmpestrs(A, LA, B, LB, M) \
  ((int)__builtin_ia32_pcmpestris128((__v16qi)(__m128i)(A), (int)(LA), \
                                     (__v16qi)(__m128i)(B), (int)(LB), \
                                     (int)(M)))

/// Uses the immediate operand \a M to perform a comparison of string
///    data with explicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the length of the string in \a B is less than
///    the maximum, otherwise, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpestrz(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LA
///    An integer that specifies the length of the string in \a A.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LB
///    An integer that specifies the length of the string in \a B.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B.
/// \returns Returns 1 if the length of the string in \a B is less than the
///    maximum, otherwise, returns 0.
/* The outer parentheses make the cast-plus-call expansion a single operand. */
#define _mm_cmpestrz(A, LA, B, LB, M) \
  ((int)__builtin_ia32_pcmpestriz128((__v16qi)(__m128i)(A), (int)(LA), \
                                     (__v16qi)(__m128i)(B), (int)(LB), \
                                     (int)(M)))

/* SSE4.2 Compare Packed Data -- Greater Than. */
/// Compares each of the corresponding 64-bit values of the 128-bit
///    integer vectors to determine if the values in the first operand are
///    greater than those in the second operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPCMPGTQ / PCMPGTQ </c> instruction.
///
/// \param __V1
///    A 128-bit integer vector.
/// \param __V2
///    A 128-bit integer vector.
/// \returns A 128-bit integer vector containing the comparison results.
2337 static __inline__ __m128i __DEFAULT_FN_ATTRS 2338 _mm_cmpgt_epi64(__m128i __V1, __m128i __V2) 2339 { 2340 return (__m128i)((__v2di)__V1 > (__v2di)__V2); 2341 } 2342 2343 /* SSE4.2 Accumulate CRC32. */ 2344 /// Adds the unsigned integer operand to the CRC-32C checksum of the 2345 /// unsigned char operand. 2346 /// 2347 /// \headerfile <x86intrin.h> 2348 /// 2349 /// This intrinsic corresponds to the <c> CRC32B </c> instruction. 2350 /// 2351 /// \param __C 2352 /// An unsigned integer operand to add to the CRC-32C checksum of operand 2353 /// \a __D. 2354 /// \param __D 2355 /// An unsigned 8-bit integer operand used to compute the CRC-32C checksum. 2356 /// \returns The result of adding operand \a __C to the CRC-32C checksum of 2357 /// operand \a __D. 2358 static __inline__ unsigned int __DEFAULT_FN_ATTRS 2359 _mm_crc32_u8(unsigned int __C, unsigned char __D) 2360 { 2361 return __builtin_ia32_crc32qi(__C, __D); 2362 } 2363 2364 /// Adds the unsigned integer operand to the CRC-32C checksum of the 2365 /// unsigned short operand. 2366 /// 2367 /// \headerfile <x86intrin.h> 2368 /// 2369 /// This intrinsic corresponds to the <c> CRC32W </c> instruction. 2370 /// 2371 /// \param __C 2372 /// An unsigned integer operand to add to the CRC-32C checksum of operand 2373 /// \a __D. 2374 /// \param __D 2375 /// An unsigned 16-bit integer operand used to compute the CRC-32C checksum. 2376 /// \returns The result of adding operand \a __C to the CRC-32C checksum of 2377 /// operand \a __D. 2378 static __inline__ unsigned int __DEFAULT_FN_ATTRS 2379 _mm_crc32_u16(unsigned int __C, unsigned short __D) 2380 { 2381 return __builtin_ia32_crc32hi(__C, __D); 2382 } 2383 2384 /// Adds the first unsigned integer operand to the CRC-32C checksum of 2385 /// the second unsigned integer operand. 2386 /// 2387 /// \headerfile <x86intrin.h> 2388 /// 2389 /// This intrinsic corresponds to the <c> CRC32L </c> instruction. 
2390 /// 2391 /// \param __C 2392 /// An unsigned integer operand to add to the CRC-32C checksum of operand 2393 /// \a __D. 2394 /// \param __D 2395 /// An unsigned 32-bit integer operand used to compute the CRC-32C checksum. 2396 /// \returns The result of adding operand \a __C to the CRC-32C checksum of 2397 /// operand \a __D. 2398 static __inline__ unsigned int __DEFAULT_FN_ATTRS 2399 _mm_crc32_u32(unsigned int __C, unsigned int __D) 2400 { 2401 return __builtin_ia32_crc32si(__C, __D); 2402 } 2403 2404 #ifdef __x86_64__ 2405 /// Adds the unsigned integer operand to the CRC-32C checksum of the 2406 /// unsigned 64-bit integer operand. 2407 /// 2408 /// \headerfile <x86intrin.h> 2409 /// 2410 /// This intrinsic corresponds to the <c> CRC32Q </c> instruction. 2411 /// 2412 /// \param __C 2413 /// An unsigned integer operand to add to the CRC-32C checksum of operand 2414 /// \a __D. 2415 /// \param __D 2416 /// An unsigned 64-bit integer operand used to compute the CRC-32C checksum. 2417 /// \returns The result of adding operand \a __C to the CRC-32C checksum of 2418 /// operand \a __D. 2419 static __inline__ unsigned long long __DEFAULT_FN_ATTRS 2420 _mm_crc32_u64(unsigned long long __C, unsigned long long __D) 2421 { 2422 return __builtin_ia32_crc32di(__C, __D); 2423 } 2424 #endif /* __x86_64__ */ 2425 2426 #undef __DEFAULT_FN_ATTRS 2427 2428 #include <popcntintrin.h> 2429 2430 #endif /* __SMMINTRIN_H */ 2431