1 /*===------------- avx512vlvbmi2intrin.h - VBMI2 intrinsics -----------------=== 2 * 3 * 4 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 5 * See https://llvm.org/LICENSE.txt for license information. 6 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 7 * 8 *===-----------------------------------------------------------------------=== 9 */ 10 #ifndef __IMMINTRIN_H 11 #error "Never use <avx512vlvbmi2intrin.h> directly; include <immintrin.h> instead." 12 #endif 13 14 #ifndef __AVX512VLVBMI2INTRIN_H 15 #define __AVX512VLVBMI2INTRIN_H 16 17 /* Define the default attributes for the functions in this file. */ 18 #define __DEFAULT_FN_ATTRS128 \ 19 __attribute__((__always_inline__, __nodebug__, \ 20 __target__("avx512vl,avx512vbmi2,no-evex512"), \ 21 __min_vector_width__(128))) 22 #define __DEFAULT_FN_ATTRS256 \ 23 __attribute__((__always_inline__, __nodebug__, \ 24 __target__("avx512vl,avx512vbmi2,no-evex512"), \ 25 __min_vector_width__(256))) 26 27 static __inline__ __m128i __DEFAULT_FN_ATTRS128 28 _mm_mask_compress_epi16(__m128i __S, __mmask8 __U, __m128i __D) 29 { 30 return (__m128i) __builtin_ia32_compresshi128_mask ((__v8hi) __D, 31 (__v8hi) __S, 32 __U); 33 } 34 35 static __inline__ __m128i __DEFAULT_FN_ATTRS128 36 _mm_maskz_compress_epi16(__mmask8 __U, __m128i __D) 37 { 38 return (__m128i) __builtin_ia32_compresshi128_mask ((__v8hi) __D, 39 (__v8hi) _mm_setzero_si128(), 40 __U); 41 } 42 43 static __inline__ __m128i __DEFAULT_FN_ATTRS128 44 _mm_mask_compress_epi8(__m128i __S, __mmask16 __U, __m128i __D) 45 { 46 return (__m128i) __builtin_ia32_compressqi128_mask ((__v16qi) __D, 47 (__v16qi) __S, 48 __U); 49 } 50 51 static __inline__ __m128i __DEFAULT_FN_ATTRS128 52 _mm_maskz_compress_epi8(__mmask16 __U, __m128i __D) 53 { 54 return (__m128i) __builtin_ia32_compressqi128_mask ((__v16qi) __D, 55 (__v16qi) _mm_setzero_si128(), 56 __U); 57 } 58 59 static __inline__ void __DEFAULT_FN_ATTRS128 60 _mm_mask_compressstoreu_epi16(void *__P, __mmask8 __U, __m128i __D) 61 { 62 __builtin_ia32_compressstorehi128_mask ((__v8hi *) __P, (__v8hi) __D, 63 __U); 64 } 65 66 static __inline__ void __DEFAULT_FN_ATTRS128 67 _mm_mask_compressstoreu_epi8(void *__P, __mmask16 __U, __m128i __D) 68 { 69 __builtin_ia32_compressstoreqi128_mask ((__v16qi *) __P, (__v16qi) __D, 70 __U); 71 } 72 73 static __inline__ __m128i __DEFAULT_FN_ATTRS128 74 _mm_mask_expand_epi16(__m128i __S, __mmask8 __U, __m128i __D) 75 { 76 return (__m128i) __builtin_ia32_expandhi128_mask ((__v8hi) __D, 77 (__v8hi) __S, 78 __U); 79 } 80 81 static __inline__ __m128i __DEFAULT_FN_ATTRS128 82 _mm_maskz_expand_epi16(__mmask8 __U, __m128i __D) 83 { 84 return (__m128i) __builtin_ia32_expandhi128_mask ((__v8hi) __D, 85 (__v8hi) _mm_setzero_si128(), 86 __U); 87 } 88 89 static __inline__ __m128i __DEFAULT_FN_ATTRS128 90 _mm_mask_expand_epi8(__m128i __S, __mmask16 __U, __m128i __D) 91 { 92 return (__m128i) __builtin_ia32_expandqi128_mask ((__v16qi) __D, 93 (__v16qi) __S, 94 __U); 95 } 96 97 static __inline__ __m128i __DEFAULT_FN_ATTRS128 98 _mm_maskz_expand_epi8(__mmask16 __U, __m128i __D) 99 { 100 return (__m128i) __builtin_ia32_expandqi128_mask ((__v16qi) __D, 101 (__v16qi) _mm_setzero_si128(), 102 __U); 103 } 104 105 static __inline__ __m128i __DEFAULT_FN_ATTRS128 106 _mm_mask_expandloadu_epi16(__m128i __S, __mmask8 __U, void const *__P) 107 { 108 return (__m128i) __builtin_ia32_expandloadhi128_mask ((const __v8hi *)__P, 109 (__v8hi) __S, 110 __U); 111 } 112 113 static __inline__ __m128i __DEFAULT_FN_ATTRS128 114 _mm_maskz_expandloadu_epi16(__mmask8 __U, void const *__P) 115 { 116 return (__m128i) __builtin_ia32_expandloadhi128_mask ((const __v8hi *)__P, 117 (__v8hi) _mm_setzero_si128(), 118 __U); 119 } 120 121 static __inline__ __m128i __DEFAULT_FN_ATTRS128 122 _mm_mask_expandloadu_epi8(__m128i __S, __mmask16 __U, void const *__P) 123 { 124 return (__m128i) __builtin_ia32_expandloadqi128_mask ((const __v16qi *)__P, 125 (__v16qi) __S, 126 __U); 127 } 128 129 static __inline__ __m128i __DEFAULT_FN_ATTRS128 130 _mm_maskz_expandloadu_epi8(__mmask16 __U, void const *__P) 131 { 132 return (__m128i) __builtin_ia32_expandloadqi128_mask ((const __v16qi *)__P, 133 (__v16qi) _mm_setzero_si128(), 134 __U); 135 } 136 137 static __inline__ __m256i __DEFAULT_FN_ATTRS256 138 _mm256_mask_compress_epi16(__m256i __S, __mmask16 __U, __m256i __D) 139 { 140 return (__m256i) __builtin_ia32_compresshi256_mask ((__v16hi) __D, 141 (__v16hi) __S, 142 __U); 143 } 144 145 static __inline__ __m256i __DEFAULT_FN_ATTRS256 146 _mm256_maskz_compress_epi16(__mmask16 __U, __m256i __D) 147 { 148 return (__m256i) __builtin_ia32_compresshi256_mask ((__v16hi) __D, 149 (__v16hi) _mm256_setzero_si256(), 150 __U); 151 } 152 153 static __inline__ __m256i __DEFAULT_FN_ATTRS256 154 _mm256_mask_compress_epi8(__m256i __S, __mmask32 __U, __m256i __D) 155 { 156 return (__m256i) __builtin_ia32_compressqi256_mask ((__v32qi) __D, 157 (__v32qi) __S, 158 __U); 159 } 160 161 static __inline__ __m256i __DEFAULT_FN_ATTRS256 162 _mm256_maskz_compress_epi8(__mmask32 __U, __m256i __D) 163 { 164 return (__m256i) __builtin_ia32_compressqi256_mask ((__v32qi) __D, 165 (__v32qi) _mm256_setzero_si256(), 166 __U); 167 } 168 169 static __inline__ void __DEFAULT_FN_ATTRS256 170 _mm256_mask_compressstoreu_epi16(void *__P, __mmask16 __U, __m256i __D) 171 { 172 __builtin_ia32_compressstorehi256_mask ((__v16hi *) __P, (__v16hi) __D, 173 __U); 174 } 175 176 static __inline__ void __DEFAULT_FN_ATTRS256 177 _mm256_mask_compressstoreu_epi8(void *__P, __mmask32 __U, __m256i __D) 178 { 179 __builtin_ia32_compressstoreqi256_mask ((__v32qi *) __P, (__v32qi) __D, 180 __U); 181 } 182 183 static __inline__ __m256i __DEFAULT_FN_ATTRS256 184 _mm256_mask_expand_epi16(__m256i __S, __mmask16 __U, __m256i __D) 185 { 186 return (__m256i) __builtin_ia32_expandhi256_mask ((__v16hi) __D, 187 (__v16hi) __S, 188 __U); 189 } 190 191 static __inline__ __m256i __DEFAULT_FN_ATTRS256 192 _mm256_maskz_expand_epi16(__mmask16 __U, __m256i __D) 193 { 194 return (__m256i) __builtin_ia32_expandhi256_mask ((__v16hi) __D, 195 (__v16hi) _mm256_setzero_si256(), 196 __U); 197 } 198 199 static __inline__ __m256i __DEFAULT_FN_ATTRS256 200 _mm256_mask_expand_epi8(__m256i __S, __mmask32 __U, __m256i __D) 201 { 202 return (__m256i) __builtin_ia32_expandqi256_mask ((__v32qi) __D, 203 (__v32qi) __S, 204 __U); 205 } 206 207 static __inline__ __m256i __DEFAULT_FN_ATTRS256 208 _mm256_maskz_expand_epi8(__mmask32 __U, __m256i __D) 209 { 210 return (__m256i) __builtin_ia32_expandqi256_mask ((__v32qi) __D, 211 (__v32qi) _mm256_setzero_si256(), 212 __U); 213 } 214 215 static __inline__ __m256i __DEFAULT_FN_ATTRS256 216 _mm256_mask_expandloadu_epi16(__m256i __S, __mmask16 __U, void const *__P) 217 { 218 return (__m256i) __builtin_ia32_expandloadhi256_mask ((const __v16hi *)__P, 219 (__v16hi) __S, 220 __U); 221 } 222 223 static __inline__ __m256i __DEFAULT_FN_ATTRS256 224 _mm256_maskz_expandloadu_epi16(__mmask16 __U, void const *__P) 225 { 226 return (__m256i) __builtin_ia32_expandloadhi256_mask ((const __v16hi *)__P, 227 (__v16hi) _mm256_setzero_si256(), 228 __U); 229 } 230 231 static __inline__ __m256i __DEFAULT_FN_ATTRS256 232 _mm256_mask_expandloadu_epi8(__m256i __S, __mmask32 __U, void const *__P) 233 { 234 return (__m256i) __builtin_ia32_expandloadqi256_mask ((const __v32qi *)__P, 235 (__v32qi) __S, 236 __U); 237 } 238 239 static __inline__ __m256i __DEFAULT_FN_ATTRS256 240 _mm256_maskz_expandloadu_epi8(__mmask32 __U, void const *__P) 241 { 242 return (__m256i) __builtin_ia32_expandloadqi256_mask ((const __v32qi *)__P, 243 (__v32qi) _mm256_setzero_si256(), 244 __U); 245 } 246 247 #define _mm256_shldi_epi64(A, B, I) \ 248 ((__m256i)__builtin_ia32_vpshldq256((__v4di)(__m256i)(A), \ 249 (__v4di)(__m256i)(B), (int)(I))) 250 251 #define _mm256_mask_shldi_epi64(S, U, A, B, I) \ 252 ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ 253 (__v4di)_mm256_shldi_epi64((A), (B), (I)), \ 254 (__v4di)(__m256i)(S))) 255 256 #define _mm256_maskz_shldi_epi64(U, A, B, I) \ 257 ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ 258 (__v4di)_mm256_shldi_epi64((A), (B), (I)), \ 259 (__v4di)_mm256_setzero_si256())) 260 261 #define _mm_shldi_epi64(A, B, I) \ 262 ((__m128i)__builtin_ia32_vpshldq128((__v2di)(__m128i)(A), \ 263 (__v2di)(__m128i)(B), (int)(I))) 264 265 #define _mm_mask_shldi_epi64(S, U, A, B, I) \ 266 ((__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \ 267 (__v2di)_mm_shldi_epi64((A), (B), (I)), \ 268 (__v2di)(__m128i)(S))) 269 270 #define _mm_maskz_shldi_epi64(U, A, B, I) \ 271 ((__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \ 272 (__v2di)_mm_shldi_epi64((A), (B), (I)), \ 273 (__v2di)_mm_setzero_si128())) 274 275 #define _mm256_shldi_epi32(A, B, I) \ 276 ((__m256i)__builtin_ia32_vpshldd256((__v8si)(__m256i)(A), \ 277 (__v8si)(__m256i)(B), (int)(I))) 278 279 #define _mm256_mask_shldi_epi32(S, U, A, B, I) \ 280 ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ 281 (__v8si)_mm256_shldi_epi32((A), (B), (I)), \ 282 (__v8si)(__m256i)(S))) 283 284 #define _mm256_maskz_shldi_epi32(U, A, B, I) \ 285 ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ 286 (__v8si)_mm256_shldi_epi32((A), (B), (I)), \ 287 (__v8si)_mm256_setzero_si256())) 288 289 #define _mm_shldi_epi32(A, B, I) \ 290 ((__m128i)__builtin_ia32_vpshldd128((__v4si)(__m128i)(A), \ 291 (__v4si)(__m128i)(B), (int)(I))) 292 293 #define _mm_mask_shldi_epi32(S, U, A, B, I) \ 294 ((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \ 295 (__v4si)_mm_shldi_epi32((A), (B), (I)), \ 296 (__v4si)(__m128i)(S))) 297 298 #define _mm_maskz_shldi_epi32(U, A, B, I) \ 299 ((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \ 300 (__v4si)_mm_shldi_epi32((A), (B), (I)), \ 301 (__v4si)_mm_setzero_si128())) 302 303 #define _mm256_shldi_epi16(A, B, I) \ 304 ((__m256i)__builtin_ia32_vpshldw256((__v16hi)(__m256i)(A), \ 305 (__v16hi)(__m256i)(B), (int)(I))) 306 307 #define _mm256_mask_shldi_epi16(S, U, A, B, I) \ 308 ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ 309 (__v16hi)_mm256_shldi_epi16((A), (B), (I)), \ 310 (__v16hi)(__m256i)(S))) 311 312 #define _mm256_maskz_shldi_epi16(U, A, B, I) \ 313 ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ 314 (__v16hi)_mm256_shldi_epi16((A), (B), (I)), \ 315 (__v16hi)_mm256_setzero_si256())) 316 317 #define _mm_shldi_epi16(A, B, I) \ 318 ((__m128i)__builtin_ia32_vpshldw128((__v8hi)(__m128i)(A), \ 319 (__v8hi)(__m128i)(B), (int)(I))) 320 321 #define _mm_mask_shldi_epi16(S, U, A, B, I) \ 322 ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ 323 (__v8hi)_mm_shldi_epi16((A), (B), (I)), \ 324 (__v8hi)(__m128i)(S))) 325 326 #define _mm_maskz_shldi_epi16(U, A, B, I) \ 327 ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ 328 (__v8hi)_mm_shldi_epi16((A), (B), (I)), \ 329 (__v8hi)_mm_setzero_si128())) 330 331 #define _mm256_shrdi_epi64(A, B, I) \ 332 ((__m256i)__builtin_ia32_vpshrdq256((__v4di)(__m256i)(A), \ 333 (__v4di)(__m256i)(B), (int)(I))) 334 335 #define _mm256_mask_shrdi_epi64(S, U, A, B, I) \ 336 ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ 337 (__v4di)_mm256_shrdi_epi64((A), (B), (I)), \ 338 (__v4di)(__m256i)(S))) 339 340 #define _mm256_maskz_shrdi_epi64(U, A, B, I) \ 341 ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ 342 (__v4di)_mm256_shrdi_epi64((A), (B), (I)), \ 343 (__v4di)_mm256_setzero_si256())) 344 345 #define _mm_shrdi_epi64(A, B, I) \ 346 ((__m128i)__builtin_ia32_vpshrdq128((__v2di)(__m128i)(A), \ 347 (__v2di)(__m128i)(B), (int)(I))) 348 349 #define _mm_mask_shrdi_epi64(S, U, A, B, I) \ 350 ((__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \ 351 (__v2di)_mm_shrdi_epi64((A), (B), (I)), \ 352 (__v2di)(__m128i)(S))) 353 354 #define _mm_maskz_shrdi_epi64(U, A, B, I) \ 355 ((__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \ 356 (__v2di)_mm_shrdi_epi64((A), (B), (I)), \ 357 (__v2di)_mm_setzero_si128())) 358 359 #define _mm256_shrdi_epi32(A, B, I) \ 360 ((__m256i)__builtin_ia32_vpshrdd256((__v8si)(__m256i)(A), \ 361 (__v8si)(__m256i)(B), (int)(I))) 362 363 #define _mm256_mask_shrdi_epi32(S, U, A, B, I) \ 364 ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ 365 (__v8si)_mm256_shrdi_epi32((A), (B), (I)), \ 366 (__v8si)(__m256i)(S))) 367 368 #define _mm256_maskz_shrdi_epi32(U, A, B, I) \ 369 ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ 370 (__v8si)_mm256_shrdi_epi32((A), (B), (I)), \ 371 (__v8si)_mm256_setzero_si256())) 372 373 #define _mm_shrdi_epi32(A, B, I) \ 374 ((__m128i)__builtin_ia32_vpshrdd128((__v4si)(__m128i)(A), \ 375 (__v4si)(__m128i)(B), (int)(I))) 376 377 #define _mm_mask_shrdi_epi32(S, U, A, B, I) \ 378 ((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \ 379 (__v4si)_mm_shrdi_epi32((A), (B), (I)), \ 380 (__v4si)(__m128i)(S))) 381 382 #define _mm_maskz_shrdi_epi32(U, A, B, I) \ 383 ((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \ 384 (__v4si)_mm_shrdi_epi32((A), (B), (I)), \ 385 (__v4si)_mm_setzero_si128())) 386 387 #define _mm256_shrdi_epi16(A, B, I) \ 388 ((__m256i)__builtin_ia32_vpshrdw256((__v16hi)(__m256i)(A), \ 389 (__v16hi)(__m256i)(B), (int)(I))) 390 391 #define _mm256_mask_shrdi_epi16(S, U, A, B, I) \ 392 ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ 393 (__v16hi)_mm256_shrdi_epi16((A), (B), (I)), \ 394 (__v16hi)(__m256i)(S))) 395 396 #define _mm256_maskz_shrdi_epi16(U, A, B, I) \ 397 ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ 398 (__v16hi)_mm256_shrdi_epi16((A), (B), (I)), \ 399 (__v16hi)_mm256_setzero_si256())) 400 401 #define _mm_shrdi_epi16(A, B, I) \ 402 ((__m128i)__builtin_ia32_vpshrdw128((__v8hi)(__m128i)(A), \ 403 (__v8hi)(__m128i)(B), (int)(I))) 404 405 #define _mm_mask_shrdi_epi16(S, U, A, B, I) \ 406 ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ 407 (__v8hi)_mm_shrdi_epi16((A), (B), (I)), \ 408 (__v8hi)(__m128i)(S))) 409 410 #define _mm_maskz_shrdi_epi16(U, A, B, I) \ 411 ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ 412 (__v8hi)_mm_shrdi_epi16((A), (B), (I)), \ 413 (__v8hi)_mm_setzero_si128())) 414 415 static __inline__ __m256i __DEFAULT_FN_ATTRS256 416 _mm256_shldv_epi64(__m256i __A, __m256i __B, __m256i __C) 417 { 418 return (__m256i)__builtin_ia32_vpshldvq256((__v4di)__A, (__v4di)__B, 419 (__v4di)__C); 420 } 421 422 static __inline__ __m256i __DEFAULT_FN_ATTRS256 423 _mm256_mask_shldv_epi64(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) 424 { 425 return (__m256i)__builtin_ia32_selectq_256(__U, 426 (__v4di)_mm256_shldv_epi64(__A, __B, __C), 427 (__v4di)__A); 428 } 429 430 static __inline__ __m256i __DEFAULT_FN_ATTRS256 431 _mm256_maskz_shldv_epi64(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C) 432 { 433 return (__m256i)__builtin_ia32_selectq_256(__U, 434 (__v4di)_mm256_shldv_epi64(__A, __B, __C), 435 (__v4di)_mm256_setzero_si256()); 436 } 437 438 static __inline__ __m128i __DEFAULT_FN_ATTRS128 439 _mm_shldv_epi64(__m128i __A, __m128i __B, __m128i __C) 440 { 441 return (__m128i)__builtin_ia32_vpshldvq128((__v2di)__A, (__v2di)__B, 442 (__v2di)__C); 443 } 444 445 static __inline__ __m128i __DEFAULT_FN_ATTRS128 446 _mm_mask_shldv_epi64(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) 447 { 448 return (__m128i)__builtin_ia32_selectq_128(__U, 449 (__v2di)_mm_shldv_epi64(__A, __B, __C), 450 (__v2di)__A); 451 } 452 453 static __inline__ __m128i __DEFAULT_FN_ATTRS128 454 _mm_maskz_shldv_epi64(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) 455 { 456 return (__m128i)__builtin_ia32_selectq_128(__U, 457 (__v2di)_mm_shldv_epi64(__A, __B, __C), 458 (__v2di)_mm_setzero_si128()); 459 } 460 461 static __inline__ __m256i __DEFAULT_FN_ATTRS256 462 _mm256_shldv_epi32(__m256i __A, __m256i __B, __m256i __C) 463 { 464 return (__m256i)__builtin_ia32_vpshldvd256((__v8si)__A, (__v8si)__B, 465 (__v8si)__C); 466 } 467 468 static __inline__ __m256i __DEFAULT_FN_ATTRS256 469 _mm256_mask_shldv_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) 470 { 471 return (__m256i)__builtin_ia32_selectd_256(__U, 472 (__v8si)_mm256_shldv_epi32(__A, __B, __C), 473 (__v8si)__A); 474 } 475 476 static __inline__ __m256i __DEFAULT_FN_ATTRS256 477 _mm256_maskz_shldv_epi32(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C) 478 { 479 return (__m256i)__builtin_ia32_selectd_256(__U, 480 (__v8si)_mm256_shldv_epi32(__A, __B, __C), 481 (__v8si)_mm256_setzero_si256()); 482 } 483 484 static __inline__ __m128i __DEFAULT_FN_ATTRS128 485 _mm_shldv_epi32(__m128i __A, __m128i __B, __m128i __C) 486 { 487 return (__m128i)__builtin_ia32_vpshldvd128((__v4si)__A, (__v4si)__B, 488 (__v4si)__C); 489 } 490 491 static __inline__ __m128i __DEFAULT_FN_ATTRS128 492 _mm_mask_shldv_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) 493 { 494 return (__m128i)__builtin_ia32_selectd_128(__U, 495 (__v4si)_mm_shldv_epi32(__A, __B, __C), 496 (__v4si)__A); 497 } 498 499 static __inline__ __m128i __DEFAULT_FN_ATTRS128 500 _mm_maskz_shldv_epi32(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) 501 { 502 return (__m128i)__builtin_ia32_selectd_128(__U, 503 (__v4si)_mm_shldv_epi32(__A, __B, __C), 504 (__v4si)_mm_setzero_si128()); 505 } 506 507 static __inline__ __m256i __DEFAULT_FN_ATTRS256 508 _mm256_shldv_epi16(__m256i __A, __m256i __B, __m256i __C) 509 { 510 return (__m256i)__builtin_ia32_vpshldvw256((__v16hi)__A, (__v16hi)__B, 511 (__v16hi)__C); 512 } 513 514 static __inline__ __m256i __DEFAULT_FN_ATTRS256 515 _mm256_mask_shldv_epi16(__m256i __A, __mmask16 __U, __m256i __B, __m256i __C) 516 { 517 return (__m256i)__builtin_ia32_selectw_256(__U, 518 (__v16hi)_mm256_shldv_epi16(__A, __B, __C), 519 (__v16hi)__A); 520 } 521 522 static __inline__ __m256i __DEFAULT_FN_ATTRS256 523 _mm256_maskz_shldv_epi16(__mmask16 __U, __m256i __A, __m256i __B, __m256i __C) 524 { 525 return (__m256i)__builtin_ia32_selectw_256(__U, 526 (__v16hi)_mm256_shldv_epi16(__A, __B, __C), 527 (__v16hi)_mm256_setzero_si256()); 528 } 529 530 static __inline__ __m128i __DEFAULT_FN_ATTRS128 531 _mm_shldv_epi16(__m128i __A, __m128i __B, __m128i __C) 532 { 533 return (__m128i)__builtin_ia32_vpshldvw128((__v8hi)__A, (__v8hi)__B, 534 (__v8hi)__C); 535 } 536 537 static __inline__ __m128i __DEFAULT_FN_ATTRS128 538 _mm_mask_shldv_epi16(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) 539 { 540 return (__m128i)__builtin_ia32_selectw_128(__U, 541 (__v8hi)_mm_shldv_epi16(__A, __B, __C), 542 (__v8hi)__A); 543 } 544 545 static __inline__ __m128i __DEFAULT_FN_ATTRS128 546 _mm_maskz_shldv_epi16(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) 547 { 548 return (__m128i)__builtin_ia32_selectw_128(__U, 549 (__v8hi)_mm_shldv_epi16(__A, __B, __C), 550 (__v8hi)_mm_setzero_si128()); 551 } 552 553 static __inline__ __m256i __DEFAULT_FN_ATTRS256 554 _mm256_shrdv_epi64(__m256i __A, __m256i __B, __m256i __C) 555 { 556 return (__m256i)__builtin_ia32_vpshrdvq256((__v4di)__A, (__v4di)__B, 557 (__v4di)__C); 558 } 559 560 static __inline__ __m256i __DEFAULT_FN_ATTRS256 561 _mm256_mask_shrdv_epi64(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) 562 { 563 return (__m256i)__builtin_ia32_selectq_256(__U, 564 (__v4di)_mm256_shrdv_epi64(__A, __B, __C), 565 (__v4di)__A); 566 } 567 568 static __inline__ __m256i __DEFAULT_FN_ATTRS256 569 _mm256_maskz_shrdv_epi64(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C) 570 { 571 return (__m256i)__builtin_ia32_selectq_256(__U, 572 (__v4di)_mm256_shrdv_epi64(__A, __B, __C), 573 (__v4di)_mm256_setzero_si256()); 574 } 575 576 static __inline__ __m128i __DEFAULT_FN_ATTRS128 577 _mm_shrdv_epi64(__m128i __A, __m128i __B, __m128i __C) 578 { 579 return (__m128i)__builtin_ia32_vpshrdvq128((__v2di)__A, (__v2di)__B, 580 (__v2di)__C); 581 } 582 583 static __inline__ __m128i __DEFAULT_FN_ATTRS128 584 _mm_mask_shrdv_epi64(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) 585 { 586 return (__m128i)__builtin_ia32_selectq_128(__U, 587 (__v2di)_mm_shrdv_epi64(__A, __B, __C), 588 (__v2di)__A); 589 } 590 591 static __inline__ __m128i __DEFAULT_FN_ATTRS128 592 _mm_maskz_shrdv_epi64(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) 593 { 594 return (__m128i)__builtin_ia32_selectq_128(__U, 595 (__v2di)_mm_shrdv_epi64(__A, __B, __C), 596 (__v2di)_mm_setzero_si128()); 597 } 598 599 static __inline__ __m256i __DEFAULT_FN_ATTRS256 600 _mm256_shrdv_epi32(__m256i __A, __m256i __B, __m256i __C) 601 { 602 return (__m256i)__builtin_ia32_vpshrdvd256((__v8si)__A, (__v8si)__B, 603 (__v8si)__C); 604 } 605 606 static __inline__ __m256i __DEFAULT_FN_ATTRS256 607 _mm256_mask_shrdv_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) 608 { 609 return (__m256i)__builtin_ia32_selectd_256(__U, 610 (__v8si)_mm256_shrdv_epi32(__A, __B, __C), 611 (__v8si)__A); 612 } 613 614 static __inline__ __m256i __DEFAULT_FN_ATTRS256 615 _mm256_maskz_shrdv_epi32(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C) 616 { 617 return (__m256i)__builtin_ia32_selectd_256(__U, 618 (__v8si)_mm256_shrdv_epi32(__A, __B, __C), 619 (__v8si)_mm256_setzero_si256()); 620 } 621 622 static __inline__ __m128i __DEFAULT_FN_ATTRS128 623 _mm_shrdv_epi32(__m128i __A, __m128i __B, __m128i __C) 624 { 625 return (__m128i)__builtin_ia32_vpshrdvd128((__v4si)__A, (__v4si)__B, 626 (__v4si)__C); 627 } 628 629 static __inline__ __m128i __DEFAULT_FN_ATTRS128 630 _mm_mask_shrdv_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) 631 { 632 return (__m128i)__builtin_ia32_selectd_128(__U, 633 (__v4si)_mm_shrdv_epi32(__A, __B, __C), 634 (__v4si)__A); 635 } 636 637 static __inline__ __m128i __DEFAULT_FN_ATTRS128 638 _mm_maskz_shrdv_epi32(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) 639 { 640 return (__m128i)__builtin_ia32_selectd_128(__U, 641 (__v4si)_mm_shrdv_epi32(__A, __B, __C), 642 (__v4si)_mm_setzero_si128()); 643 } 644 645 static __inline__ __m256i __DEFAULT_FN_ATTRS256 646 _mm256_shrdv_epi16(__m256i __A, __m256i __B, __m256i __C) 647 { 648 return (__m256i)__builtin_ia32_vpshrdvw256((__v16hi)__A, (__v16hi)__B, 649 (__v16hi)__C); 650 } 651 652 static __inline__ __m256i __DEFAULT_FN_ATTRS256 653 _mm256_mask_shrdv_epi16(__m256i __A, __mmask16 __U, __m256i __B, __m256i __C) 654 { 655 return (__m256i)__builtin_ia32_selectw_256(__U, 656 (__v16hi)_mm256_shrdv_epi16(__A, __B, __C), 657 (__v16hi)__A); 658 } 659 660 static __inline__ __m256i __DEFAULT_FN_ATTRS256 661 _mm256_maskz_shrdv_epi16(__mmask16 __U, __m256i __A, __m256i __B, __m256i __C) 662 { 663 return (__m256i)__builtin_ia32_selectw_256(__U, 664 (__v16hi)_mm256_shrdv_epi16(__A, __B, __C), 665 (__v16hi)_mm256_setzero_si256()); 666 } 667 668 static __inline__ __m128i __DEFAULT_FN_ATTRS128 669 _mm_shrdv_epi16(__m128i __A, __m128i __B, __m128i __C) 670 { 671 return (__m128i)__builtin_ia32_vpshrdvw128((__v8hi)__A, (__v8hi)__B, 672 (__v8hi)__C); 673 } 674 675 static __inline__ __m128i __DEFAULT_FN_ATTRS128 676 _mm_mask_shrdv_epi16(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) 677 { 678 return (__m128i)__builtin_ia32_selectw_128(__U, 679 (__v8hi)_mm_shrdv_epi16(__A, __B, __C), 680 (__v8hi)__A); 681 } 682 683 static __inline__ __m128i __DEFAULT_FN_ATTRS128 684 _mm_maskz_shrdv_epi16(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) 685 { 686 return (__m128i)__builtin_ia32_selectw_128(__U, 687 (__v8hi)_mm_shrdv_epi16(__A, __B, __C), 688 (__v8hi)_mm_setzero_si128()); 689 } 690 691 692 #undef __DEFAULT_FN_ATTRS128 693 #undef __DEFAULT_FN_ATTRS256 694 695 #endif 696