/*===---- avx512vldqintrin.h - AVX512VL and AVX512DQ intrinsics ------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __IMMINTRIN_H
#error "Never use <avx512vldqintrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef __AVX512VLDQINTRIN_H
#define __AVX512VLDQINTRIN_H

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS128 \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("avx512vl,avx512dq,no-evex512"), \
                 __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("avx512vl,avx512dq,no-evex512"), \
                 __min_vector_width__(256)))

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mullo_epi64(__m256i __A, __m256i __B) {
  return (__m256i)((__v4du)__A * (__v4du)__B);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_mullo_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
  return (__m256i)__builtin_ia32_selectq_256(
      (__mmask8)__U, (__v4di)_mm256_mullo_epi64(__A, __B), (__v4di)__W);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_mullo_epi64(__mmask8 __U, __m256i __A, __m256i __B) {
  return (__m256i)__builtin_ia32_selectq_256(
      (__mmask8)__U, (__v4di)_mm256_mullo_epi64(__A, __B),
      (__v4di)_mm256_setzero_si256());
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mullo_epi64(__m128i __A, __m128i __B) {
  return (__m128i)((__v2du)__A * (__v2du)__B);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_mullo_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
  return (__m128i)__builtin_ia32_selectq_128(
      (__mmask8)__U, (__v2di)_mm_mullo_epi64(__A, __B), (__v2di)__W);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_mullo_epi64(__mmask8 __U, __m128i __A, __m128i __B) {
  return (__m128i)__builtin_ia32_selectq_128(
      (__mmask8)__U, (__v2di)_mm_mullo_epi64(__A, __B),
      (__v2di)_mm_setzero_si128());
}
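/* The *_mullo_epi64 intrinsics above multiply packed 64-bit integers and keep
 * the low 64 bits of each product (VPMULLQ). The _mask_ forms copy lanes from
 * __W where the corresponding mask bit is 0; the _maskz_ forms zero those
 * lanes. Illustrative sketch ('a', 'b', and 'src' are hypothetical vectors):
 *
 *   __m256i all    = _mm256_mullo_epi64(a, b);                // every lane
 *   __m256i merged = _mm256_mask_mullo_epi64(src, 0x5, a, b); // lanes 0 and 2
 *   __m256i zeroed = _mm256_maskz_mullo_epi64(0x5, a, b);     // others zeroed
 */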
static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_mask_andnot_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
  return (__m256d)__builtin_ia32_selectpd_256(
      (__mmask8)__U, (__v4df)_mm256_andnot_pd(__A, __B), (__v4df)__W);
}

static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_maskz_andnot_pd(__mmask8 __U, __m256d __A, __m256d __B) {
  return (__m256d)__builtin_ia32_selectpd_256(
      (__mmask8)__U, (__v4df)_mm256_andnot_pd(__A, __B),
      (__v4df)_mm256_setzero_pd());
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_mask_andnot_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
  return (__m128d)__builtin_ia32_selectpd_128(
      (__mmask8)__U, (__v2df)_mm_andnot_pd(__A, __B), (__v2df)__W);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_maskz_andnot_pd(__mmask8 __U, __m128d __A, __m128d __B) {
  return (__m128d)__builtin_ia32_selectpd_128(
      (__mmask8)__U, (__v2df)_mm_andnot_pd(__A, __B),
      (__v2df)_mm_setzero_pd());
}

static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_mask_andnot_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
  return (__m256)__builtin_ia32_selectps_256(
      (__mmask8)__U, (__v8sf)_mm256_andnot_ps(__A, __B), (__v8sf)__W);
}

static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_maskz_andnot_ps(__mmask8 __U, __m256 __A, __m256 __B) {
  return (__m256)__builtin_ia32_selectps_256(
      (__mmask8)__U, (__v8sf)_mm256_andnot_ps(__A, __B),
      (__v8sf)_mm256_setzero_ps());
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask_andnot_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
  return (__m128)__builtin_ia32_selectps_128(
      (__mmask8)__U, (__v4sf)_mm_andnot_ps(__A, __B), (__v4sf)__W);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maskz_andnot_ps(__mmask8 __U, __m128 __A, __m128 __B) {
  return (__m128)__builtin_ia32_selectps_128(
      (__mmask8)__U, (__v4sf)_mm_andnot_ps(__A, __B),
      (__v4sf)_mm_setzero_ps());
}

static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_mask_and_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
  return (__m256d)__builtin_ia32_selectpd_256(
      (__mmask8)__U, (__v4df)_mm256_and_pd(__A, __B), (__v4df)__W);
}

static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_maskz_and_pd(__mmask8 __U, __m256d __A, __m256d __B) {
  return (__m256d)__builtin_ia32_selectpd_256(
      (__mmask8)__U, (__v4df)_mm256_and_pd(__A, __B),
      (__v4df)_mm256_setzero_pd());
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_mask_and_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
  return (__m128d)__builtin_ia32_selectpd_128(
      (__mmask8)__U, (__v2df)_mm_and_pd(__A, __B), (__v2df)__W);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_maskz_and_pd(__mmask8 __U, __m128d __A, __m128d __B) {
  return (__m128d)__builtin_ia32_selectpd_128(
      (__mmask8)__U, (__v2df)_mm_and_pd(__A, __B),
      (__v2df)_mm_setzero_pd());
}

static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_mask_and_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
  return (__m256)__builtin_ia32_selectps_256(
      (__mmask8)__U, (__v8sf)_mm256_and_ps(__A, __B), (__v8sf)__W);
}

static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_maskz_and_ps(__mmask8 __U, __m256 __A, __m256 __B) {
  return (__m256)__builtin_ia32_selectps_256(
      (__mmask8)__U, (__v8sf)_mm256_and_ps(__A, __B),
      (__v8sf)_mm256_setzero_ps());
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask_and_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
  return (__m128)__builtin_ia32_selectps_128(
      (__mmask8)__U, (__v4sf)_mm_and_ps(__A, __B), (__v4sf)__W);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maskz_and_ps(__mmask8 __U, __m128 __A, __m128 __B) {
  return (__m128)__builtin_ia32_selectps_128(
      (__mmask8)__U, (__v4sf)_mm_and_ps(__A, __B),
      (__v4sf)_mm_setzero_ps());
}

static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_mask_xor_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
  return (__m256d)__builtin_ia32_selectpd_256(
      (__mmask8)__U, (__v4df)_mm256_xor_pd(__A, __B), (__v4df)__W);
}

static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_maskz_xor_pd(__mmask8 __U, __m256d __A, __m256d __B) {
  return (__m256d)__builtin_ia32_selectpd_256(
      (__mmask8)__U, (__v4df)_mm256_xor_pd(__A, __B),
      (__v4df)_mm256_setzero_pd());
}
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_mask_xor_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
  return (__m128d)__builtin_ia32_selectpd_128(
      (__mmask8)__U, (__v2df)_mm_xor_pd(__A, __B), (__v2df)__W);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_maskz_xor_pd(__mmask8 __U, __m128d __A, __m128d __B) {
  return (__m128d)__builtin_ia32_selectpd_128(
      (__mmask8)__U, (__v2df)_mm_xor_pd(__A, __B),
      (__v2df)_mm_setzero_pd());
}

static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_mask_xor_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
  return (__m256)__builtin_ia32_selectps_256(
      (__mmask8)__U, (__v8sf)_mm256_xor_ps(__A, __B), (__v8sf)__W);
}

static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_maskz_xor_ps(__mmask8 __U, __m256 __A, __m256 __B) {
  return (__m256)__builtin_ia32_selectps_256(
      (__mmask8)__U, (__v8sf)_mm256_xor_ps(__A, __B),
      (__v8sf)_mm256_setzero_ps());
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask_xor_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
  return (__m128)__builtin_ia32_selectps_128(
      (__mmask8)__U, (__v4sf)_mm_xor_ps(__A, __B), (__v4sf)__W);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maskz_xor_ps(__mmask8 __U, __m128 __A, __m128 __B) {
  return (__m128)__builtin_ia32_selectps_128(
      (__mmask8)__U, (__v4sf)_mm_xor_ps(__A, __B),
      (__v4sf)_mm_setzero_ps());
}

static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_mask_or_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
  return (__m256d)__builtin_ia32_selectpd_256(
      (__mmask8)__U, (__v4df)_mm256_or_pd(__A, __B), (__v4df)__W);
}

static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_maskz_or_pd(__mmask8 __U, __m256d __A, __m256d __B) {
  return (__m256d)__builtin_ia32_selectpd_256(
      (__mmask8)__U, (__v4df)_mm256_or_pd(__A, __B),
      (__v4df)_mm256_setzero_pd());
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_mask_or_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
  return (__m128d)__builtin_ia32_selectpd_128(
      (__mmask8)__U, (__v2df)_mm_or_pd(__A, __B), (__v2df)__W);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_maskz_or_pd(__mmask8 __U, __m128d __A, __m128d __B) {
  return (__m128d)__builtin_ia32_selectpd_128(
      (__mmask8)__U, (__v2df)_mm_or_pd(__A, __B),
      (__v2df)_mm_setzero_pd());
}

static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_mask_or_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
  return (__m256)__builtin_ia32_selectps_256(
      (__mmask8)__U, (__v8sf)_mm256_or_ps(__A, __B), (__v8sf)__W);
}

static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_maskz_or_ps(__mmask8 __U, __m256 __A, __m256 __B) {
  return (__m256)__builtin_ia32_selectps_256(
      (__mmask8)__U, (__v8sf)_mm256_or_ps(__A, __B),
      (__v8sf)_mm256_setzero_ps());
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask_or_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
  return (__m128)__builtin_ia32_selectps_128(
      (__mmask8)__U, (__v4sf)_mm_or_ps(__A, __B), (__v4sf)__W);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maskz_or_ps(__mmask8 __U, __m128 __A, __m128 __B) {
  return (__m128)__builtin_ia32_selectps_128(
      (__mmask8)__U, (__v4sf)_mm_or_ps(__A, __B),
      (__v4sf)_mm_setzero_ps());
}
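/* The conversions below move between floating-point elements and 64-bit
 * integer elements (AVX512DQ VCVT{PD,PS}2{QQ,UQQ} and the reverse forms).
 * The _mm*_cvt* variants round according to the current MXCSR rounding mode,
 * while the _mm*_cvtt* variants further down truncate toward zero. */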
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_cvtpd_epi64(__m128d __A) {
  return (__m128i)__builtin_ia32_cvtpd2qq128_mask(
      (__v2df)__A, (__v2di)_mm_setzero_si128(), (__mmask8)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvtpd_epi64(__m128i __W, __mmask8 __U, __m128d __A) {
  return (__m128i)__builtin_ia32_cvtpd2qq128_mask(
      (__v2df)__A, (__v2di)__W, (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvtpd_epi64(__mmask8 __U, __m128d __A) {
  return (__m128i)__builtin_ia32_cvtpd2qq128_mask(
      (__v2df)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvtpd_epi64(__m256d __A) {
  return (__m256i)__builtin_ia32_cvtpd2qq256_mask(
      (__v4df)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)-1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_cvtpd_epi64(__m256i __W, __mmask8 __U, __m256d __A) {
  return (__m256i)__builtin_ia32_cvtpd2qq256_mask(
      (__v4df)__A, (__v4di)__W, (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtpd_epi64(__mmask8 __U, __m256d __A) {
  return (__m256i)__builtin_ia32_cvtpd2qq256_mask(
      (__v4df)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_cvtpd_epu64(__m128d __A) {
  return (__m128i)__builtin_ia32_cvtpd2uqq128_mask(
      (__v2df)__A, (__v2di)_mm_setzero_si128(), (__mmask8)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvtpd_epu64(__m128i __W, __mmask8 __U, __m128d __A) {
  return (__m128i)__builtin_ia32_cvtpd2uqq128_mask(
      (__v2df)__A, (__v2di)__W, (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvtpd_epu64(__mmask8 __U, __m128d __A) {
  return (__m128i)__builtin_ia32_cvtpd2uqq128_mask(
      (__v2df)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvtpd_epu64(__m256d __A) {
  return (__m256i)__builtin_ia32_cvtpd2uqq256_mask(
      (__v4df)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)-1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_cvtpd_epu64(__m256i __W, __mmask8 __U, __m256d __A) {
  return (__m256i)__builtin_ia32_cvtpd2uqq256_mask(
      (__v4df)__A, (__v4di)__W, (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtpd_epu64(__mmask8 __U, __m256d __A) {
  return (__m256i)__builtin_ia32_cvtpd2uqq256_mask(
      (__v4df)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_cvtps_epi64(__m128 __A) {
  return (__m128i)__builtin_ia32_cvtps2qq128_mask(
      (__v4sf)__A, (__v2di)_mm_setzero_si128(), (__mmask8)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvtps_epi64(__m128i __W, __mmask8 __U, __m128 __A) {
  return (__m128i)__builtin_ia32_cvtps2qq128_mask(
      (__v4sf)__A, (__v2di)__W, (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvtps_epi64(__mmask8 __U, __m128 __A) {
  return (__m128i)__builtin_ia32_cvtps2qq128_mask(
      (__v4sf)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U);
}
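/* Note that the 256-bit float-to-64-bit-integer conversions below take a
 * __m128 source: four single-precision elements widen to four 64-bit
 * integers in the __m256i result. */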
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvtps_epi64(__m128 __A) {
  return (__m256i)__builtin_ia32_cvtps2qq256_mask(
      (__v4sf)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)-1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_cvtps_epi64(__m256i __W, __mmask8 __U, __m128 __A) {
  return (__m256i)__builtin_ia32_cvtps2qq256_mask(
      (__v4sf)__A, (__v4di)__W, (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtps_epi64(__mmask8 __U, __m128 __A) {
  return (__m256i)__builtin_ia32_cvtps2qq256_mask(
      (__v4sf)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_cvtps_epu64(__m128 __A) {
  return (__m128i)__builtin_ia32_cvtps2uqq128_mask(
      (__v4sf)__A, (__v2di)_mm_setzero_si128(), (__mmask8)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvtps_epu64(__m128i __W, __mmask8 __U, __m128 __A) {
  return (__m128i)__builtin_ia32_cvtps2uqq128_mask(
      (__v4sf)__A, (__v2di)__W, (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvtps_epu64(__mmask8 __U, __m128 __A) {
  return (__m128i)__builtin_ia32_cvtps2uqq128_mask(
      (__v4sf)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvtps_epu64(__m128 __A) {
  return (__m256i)__builtin_ia32_cvtps2uqq256_mask(
      (__v4sf)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)-1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_cvtps_epu64(__m256i __W, __mmask8 __U, __m128 __A) {
  return (__m256i)__builtin_ia32_cvtps2uqq256_mask(
      (__v4sf)__A, (__v4di)__W, (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtps_epu64(__mmask8 __U, __m128 __A) {
  return (__m256i)__builtin_ia32_cvtps2uqq256_mask(
      (__v4sf)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_cvtepi64_pd(__m128i __A) {
  return (__m128d)__builtin_convertvector((__v2di)__A, __v2df);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_mask_cvtepi64_pd(__m128d __W, __mmask8 __U, __m128i __A) {
  return (__m128d)__builtin_ia32_selectpd_128(
      (__mmask8)__U, (__v2df)_mm_cvtepi64_pd(__A), (__v2df)__W);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_maskz_cvtepi64_pd(__mmask8 __U, __m128i __A) {
  return (__m128d)__builtin_ia32_selectpd_128(
      (__mmask8)__U, (__v2df)_mm_cvtepi64_pd(__A), (__v2df)_mm_setzero_pd());
}

static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_cvtepi64_pd(__m256i __A) {
  return (__m256d)__builtin_convertvector((__v4di)__A, __v4df);
}

static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_mask_cvtepi64_pd(__m256d __W, __mmask8 __U, __m256i __A) {
  return (__m256d)__builtin_ia32_selectpd_256(
      (__mmask8)__U, (__v4df)_mm256_cvtepi64_pd(__A), (__v4df)__W);
}

static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtepi64_pd(__mmask8 __U, __m256i __A) {
  return (__m256d)__builtin_ia32_selectpd_256(
      (__mmask8)__U, (__v4df)_mm256_cvtepi64_pd(__A),
      (__v4df)_mm256_setzero_pd());
}
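/* The epi64-to-ps conversions that follow narrow the element type: the
 * 128-bit form converts two 64-bit integers into the low two float elements
 * (the upper two elements of the result are zeroed), and the 256-bit form
 * returns a __m128 holding four converted floats. */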
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_cvtepi64_ps(__m128i __A) {
  return (__m128)__builtin_ia32_cvtqq2ps128_mask(
      (__v2di)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)-1);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask_cvtepi64_ps(__m128 __W, __mmask8 __U, __m128i __A) {
  return (__m128)__builtin_ia32_cvtqq2ps128_mask(
      (__v2di)__A, (__v4sf)__W, (__mmask8)__U);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maskz_cvtepi64_ps(__mmask8 __U, __m128i __A) {
  return (__m128)__builtin_ia32_cvtqq2ps128_mask(
      (__v2di)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)__U);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS256
_mm256_cvtepi64_ps(__m256i __A) {
  return (__m128)__builtin_convertvector((__v4di)__A, __v4sf);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS256
_mm256_mask_cvtepi64_ps(__m128 __W, __mmask8 __U, __m256i __A) {
  return (__m128)__builtin_ia32_selectps_128(
      (__mmask8)__U, (__v4sf)_mm256_cvtepi64_ps(__A), (__v4sf)__W);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtepi64_ps(__mmask8 __U, __m256i __A) {
  return (__m128)__builtin_ia32_selectps_128(
      (__mmask8)__U, (__v4sf)_mm256_cvtepi64_ps(__A), (__v4sf)_mm_setzero_ps());
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_cvttpd_epi64(__m128d __A) {
  return (__m128i)__builtin_ia32_cvttpd2qq128_mask(
      (__v2df)__A, (__v2di)_mm_setzero_si128(), (__mmask8)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvttpd_epi64(__m128i __W, __mmask8 __U, __m128d __A) {
  return (__m128i)__builtin_ia32_cvttpd2qq128_mask(
      (__v2df)__A, (__v2di)__W, (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvttpd_epi64(__mmask8 __U, __m128d __A) {
  return (__m128i)__builtin_ia32_cvttpd2qq128_mask(
      (__v2df)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvttpd_epi64(__m256d __A) {
  return (__m256i)__builtin_ia32_cvttpd2qq256_mask(
      (__v4df)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)-1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_cvttpd_epi64(__m256i __W, __mmask8 __U, __m256d __A) {
  return (__m256i)__builtin_ia32_cvttpd2qq256_mask(
      (__v4df)__A, (__v4di)__W, (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvttpd_epi64(__mmask8 __U, __m256d __A) {
  return (__m256i)__builtin_ia32_cvttpd2qq256_mask(
      (__v4df)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_cvttpd_epu64(__m128d __A) {
  return (__m128i)__builtin_ia32_cvttpd2uqq128_mask(
      (__v2df)__A, (__v2di)_mm_setzero_si128(), (__mmask8)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvttpd_epu64(__m128i __W, __mmask8 __U, __m128d __A) {
  return (__m128i)__builtin_ia32_cvttpd2uqq128_mask(
      (__v2df)__A, (__v2di)__W, (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvttpd_epu64(__mmask8 __U, __m128d __A) {
  return (__m128i)__builtin_ia32_cvttpd2uqq128_mask(
      (__v2df)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvttpd_epu64(__m256d __A) {
  return (__m256i)__builtin_ia32_cvttpd2uqq256_mask(
      (__v4df)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)-1);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_cvttpd_epu64(__m256i __W, __mmask8 __U, __m256d __A) {
  return (__m256i)__builtin_ia32_cvttpd2uqq256_mask(
      (__v4df)__A, (__v4di)__W, (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvttpd_epu64(__mmask8 __U, __m256d __A) {
  return (__m256i)__builtin_ia32_cvttpd2uqq256_mask(
      (__v4df)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_cvttps_epi64(__m128 __A) {
  return (__m128i)__builtin_ia32_cvttps2qq128_mask(
      (__v4sf)__A, (__v2di)_mm_setzero_si128(), (__mmask8)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvttps_epi64(__m128i __W, __mmask8 __U, __m128 __A) {
  return (__m128i)__builtin_ia32_cvttps2qq128_mask(
      (__v4sf)__A, (__v2di)__W, (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvttps_epi64(__mmask8 __U, __m128 __A) {
  return (__m128i)__builtin_ia32_cvttps2qq128_mask(
      (__v4sf)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvttps_epi64(__m128 __A) {
  return (__m256i)__builtin_ia32_cvttps2qq256_mask(
      (__v4sf)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)-1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_cvttps_epi64(__m256i __W, __mmask8 __U, __m128 __A) {
  return (__m256i)__builtin_ia32_cvttps2qq256_mask(
      (__v4sf)__A, (__v4di)__W, (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvttps_epi64(__mmask8 __U, __m128 __A) {
  return (__m256i)__builtin_ia32_cvttps2qq256_mask(
      (__v4sf)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_cvttps_epu64(__m128 __A) {
  return (__m128i)__builtin_ia32_cvttps2uqq128_mask(
      (__v4sf)__A, (__v2di)_mm_setzero_si128(), (__mmask8)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvttps_epu64(__m128i __W, __mmask8 __U, __m128 __A) {
  return (__m128i)__builtin_ia32_cvttps2uqq128_mask(
      (__v4sf)__A, (__v2di)__W, (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvttps_epu64(__mmask8 __U, __m128 __A) {
  return (__m128i)__builtin_ia32_cvttps2uqq128_mask(
      (__v4sf)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvttps_epu64(__m128 __A) {
  return (__m256i)__builtin_ia32_cvttps2uqq256_mask(
      (__v4sf)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)-1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_cvttps_epu64(__m256i __W, __mmask8 __U, __m128 __A) {
  return (__m256i)__builtin_ia32_cvttps2uqq256_mask(
      (__v4sf)__A, (__v4di)__W, (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvttps_epu64(__mmask8 __U, __m128 __A) {
  return (__m256i)__builtin_ia32_cvttps2uqq256_mask(
      (__v4sf)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U);
}
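/* The remaining conversions treat the 64-bit source elements as unsigned
 * (VCVTUQQ2PD/VCVTUQQ2PS); they otherwise mirror the signed forms above. */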
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_cvtepu64_pd(__m128i __A) {
  return (__m128d)__builtin_convertvector((__v2du)__A, __v2df);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_mask_cvtepu64_pd(__m128d __W, __mmask8 __U, __m128i __A) {
  return (__m128d)__builtin_ia32_selectpd_128(
      (__mmask8)__U, (__v2df)_mm_cvtepu64_pd(__A), (__v2df)__W);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_maskz_cvtepu64_pd(__mmask8 __U, __m128i __A) {
  return (__m128d)__builtin_ia32_selectpd_128(
      (__mmask8)__U, (__v2df)_mm_cvtepu64_pd(__A), (__v2df)_mm_setzero_pd());
}

static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_cvtepu64_pd(__m256i __A) {
  return (__m256d)__builtin_convertvector((__v4du)__A, __v4df);
}

static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_mask_cvtepu64_pd(__m256d __W, __mmask8 __U, __m256i __A) {
  return (__m256d)__builtin_ia32_selectpd_256(
      (__mmask8)__U, (__v4df)_mm256_cvtepu64_pd(__A), (__v4df)__W);
}

static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtepu64_pd(__mmask8 __U, __m256i __A) {
  return (__m256d)__builtin_ia32_selectpd_256(
      (__mmask8)__U, (__v4df)_mm256_cvtepu64_pd(__A),
      (__v4df)_mm256_setzero_pd());
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_cvtepu64_ps(__m128i __A) {
  return (__m128)__builtin_ia32_cvtuqq2ps128_mask(
      (__v2di)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)-1);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask_cvtepu64_ps(__m128 __W, __mmask8 __U, __m128i __A) {
  return (__m128)__builtin_ia32_cvtuqq2ps128_mask(
      (__v2di)__A, (__v4sf)__W, (__mmask8)__U);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maskz_cvtepu64_ps(__mmask8 __U, __m128i __A) {
  return (__m128)__builtin_ia32_cvtuqq2ps128_mask(
      (__v2di)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)__U);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS256
_mm256_cvtepu64_ps(__m256i __A) {
  return (__m128)__builtin_convertvector((__v4du)__A, __v4sf);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS256
_mm256_mask_cvtepu64_ps(__m128 __W, __mmask8 __U, __m256i __A) {
  return (__m128)__builtin_ia32_selectps_128(
      (__mmask8)__U, (__v4sf)_mm256_cvtepu64_ps(__A), (__v4sf)__W);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtepu64_ps(__mmask8 __U, __m256i __A) {
  return (__m128)__builtin_ia32_selectps_128(
      (__mmask8)__U, (__v4sf)_mm256_cvtepu64_ps(__A), (__v4sf)_mm_setzero_ps());
}
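/* The _mm*_range_* macros below wrap VRANGEPD/VRANGEPS: the immediate selects
 * a minimum/maximum (or absolute-value minimum/maximum) operation and how the
 * sign of the result is chosen. The _mm*_reduce_* macros wrap
 * VREDUCEPD/VREDUCEPS, which compute the reduced argument of each element
 * (the difference between the element and its value rounded to the precision
 * encoded in the immediate). Illustrative sketch ('a' and 'b' are
 * hypothetical vectors; an immediate of 0 selects a minimum operation):
 *
 *   __m128d lo = _mm_range_pd(a, b, 0);
 */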
#define _mm_range_pd(A, B, C) \
  ((__m128d)__builtin_ia32_rangepd128_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), \
      (__v2df)_mm_setzero_pd(), (__mmask8)-1))

#define _mm_mask_range_pd(W, U, A, B, C) \
  ((__m128d)__builtin_ia32_rangepd128_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), \
      (__v2df)(__m128d)(W), (__mmask8)(U)))

#define _mm_maskz_range_pd(U, A, B, C) \
  ((__m128d)__builtin_ia32_rangepd128_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), \
      (__v2df)_mm_setzero_pd(), (__mmask8)(U)))

#define _mm256_range_pd(A, B, C) \
  ((__m256d)__builtin_ia32_rangepd256_mask( \
      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(C), \
      (__v4df)_mm256_setzero_pd(), (__mmask8)-1))

#define _mm256_mask_range_pd(W, U, A, B, C) \
  ((__m256d)__builtin_ia32_rangepd256_mask( \
      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(C), \
      (__v4df)(__m256d)(W), (__mmask8)(U)))

#define _mm256_maskz_range_pd(U, A, B, C) \
  ((__m256d)__builtin_ia32_rangepd256_mask( \
      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(C), \
      (__v4df)_mm256_setzero_pd(), (__mmask8)(U)))

#define _mm_range_ps(A, B, C) \
  ((__m128)__builtin_ia32_rangeps128_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), \
      (__v4sf)_mm_setzero_ps(), (__mmask8)-1))

#define _mm_mask_range_ps(W, U, A, B, C) \
  ((__m128)__builtin_ia32_rangeps128_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), \
      (__v4sf)(__m128)(W), (__mmask8)(U)))

#define _mm_maskz_range_ps(U, A, B, C) \
  ((__m128)__builtin_ia32_rangeps128_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), \
      (__v4sf)_mm_setzero_ps(), (__mmask8)(U)))

#define _mm256_range_ps(A, B, C) \
  ((__m256)__builtin_ia32_rangeps256_mask( \
      (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(C), \
      (__v8sf)_mm256_setzero_ps(), (__mmask8)-1))

#define _mm256_mask_range_ps(W, U, A, B, C) \
  ((__m256)__builtin_ia32_rangeps256_mask( \
      (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(C), \
      (__v8sf)(__m256)(W), (__mmask8)(U)))

#define _mm256_maskz_range_ps(U, A, B, C) \
  ((__m256)__builtin_ia32_rangeps256_mask( \
      (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(C), \
      (__v8sf)_mm256_setzero_ps(), (__mmask8)(U)))

#define _mm_reduce_pd(A, B) \
  ((__m128d)__builtin_ia32_reducepd128_mask((__v2df)(__m128d)(A), (int)(B), \
                                            (__v2df)_mm_setzero_pd(), \
                                            (__mmask8)-1))

#define _mm_mask_reduce_pd(W, U, A, B) \
  ((__m128d)__builtin_ia32_reducepd128_mask((__v2df)(__m128d)(A), (int)(B), \
                                            (__v2df)(__m128d)(W), \
                                            (__mmask8)(U)))

#define _mm_maskz_reduce_pd(U, A, B) \
  ((__m128d)__builtin_ia32_reducepd128_mask((__v2df)(__m128d)(A), (int)(B), \
                                            (__v2df)_mm_setzero_pd(), \
                                            (__mmask8)(U)))

#define _mm256_reduce_pd(A, B) \
  ((__m256d)__builtin_ia32_reducepd256_mask((__v4df)(__m256d)(A), (int)(B), \
                                            (__v4df)_mm256_setzero_pd(), \
                                            (__mmask8)-1))

#define _mm256_mask_reduce_pd(W, U, A, B) \
  ((__m256d)__builtin_ia32_reducepd256_mask((__v4df)(__m256d)(A), (int)(B), \
                                            (__v4df)(__m256d)(W), \
                                            (__mmask8)(U)))

#define _mm256_maskz_reduce_pd(U, A, B) \
  ((__m256d)__builtin_ia32_reducepd256_mask((__v4df)(__m256d)(A), (int)(B), \
                                            (__v4df)_mm256_setzero_pd(), \
                                            (__mmask8)(U)))

#define _mm_reduce_ps(A, B) \
  ((__m128)__builtin_ia32_reduceps128_mask((__v4sf)(__m128)(A), (int)(B), \
                                           (__v4sf)_mm_setzero_ps(), \
                                           (__mmask8)-1))

#define _mm_mask_reduce_ps(W, U, A, B) \
  ((__m128)__builtin_ia32_reduceps128_mask((__v4sf)(__m128)(A), (int)(B), \
                                           (__v4sf)(__m128)(W), \
                                           (__mmask8)(U)))

#define _mm_maskz_reduce_ps(U, A, B) \
  ((__m128)__builtin_ia32_reduceps128_mask((__v4sf)(__m128)(A), (int)(B), \
                                           (__v4sf)_mm_setzero_ps(), \
                                           (__mmask8)(U)))

#define _mm256_reduce_ps(A, B) \
  ((__m256)__builtin_ia32_reduceps256_mask((__v8sf)(__m256)(A), (int)(B), \
                                           (__v8sf)_mm256_setzero_ps(), \
                                           (__mmask8)-1))

#define _mm256_mask_reduce_ps(W, U, A, B) \
  ((__m256)__builtin_ia32_reduceps256_mask((__v8sf)(__m256)(A), (int)(B), \
                                           (__v8sf)(__m256)(W), \
                                           (__mmask8)(U)))

#define _mm256_maskz_reduce_ps(U, A, B) \
  ((__m256)__builtin_ia32_reduceps256_mask((__v8sf)(__m256)(A), (int)(B), \
                                           (__v8sf)_mm256_setzero_ps(), \
                                           (__mmask8)(U)))
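/* The following intrinsics move between vector registers and mask registers:
 * _mm*_movepi32_mask/_mm*_movepi64_mask collect the most significant (sign)
 * bit of each element into a __mmask8, and _mm*_movm_epi32/_mm*_movm_epi64
 * expand each mask bit into an all-ones or all-zeros element. */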
static __inline__ __mmask8 __DEFAULT_FN_ATTRS128
_mm_movepi32_mask(__m128i __A)
{
  return (__mmask8)__builtin_ia32_cvtd2mask128((__v4si)__A);
}

static __inline__ __mmask8 __DEFAULT_FN_ATTRS256
_mm256_movepi32_mask(__m256i __A)
{
  return (__mmask8)__builtin_ia32_cvtd2mask256((__v8si)__A);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_movm_epi32(__mmask8 __A)
{
  return (__m128i)__builtin_ia32_cvtmask2d128(__A);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_movm_epi32(__mmask8 __A)
{
  return (__m256i)__builtin_ia32_cvtmask2d256(__A);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_movm_epi64(__mmask8 __A)
{
  return (__m128i)__builtin_ia32_cvtmask2q128(__A);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_movm_epi64(__mmask8 __A)
{
  return (__m256i)__builtin_ia32_cvtmask2q256(__A);
}

static __inline__ __mmask8 __DEFAULT_FN_ATTRS128
_mm_movepi64_mask(__m128i __A)
{
  return (__mmask8)__builtin_ia32_cvtq2mask128((__v2di)__A);
}

static __inline__ __mmask8 __DEFAULT_FN_ATTRS256
_mm256_movepi64_mask(__m256i __A)
{
  return (__mmask8)__builtin_ia32_cvtq2mask256((__v4di)__A);
}
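/* The broadcast intrinsics below replicate the low elements of a 128-bit
 * source: the *32x2 forms repeat the low pair of 32-bit elements across the
 * destination, and the *64x2 forms repeat the whole 128-bit source into each
 * 128-bit half of the 256-bit destination. */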
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_broadcast_f32x2(__m128 __A)
{
  return (__m256)__builtin_shufflevector((__v4sf)__A, (__v4sf)__A,
                                         0, 1, 0, 1, 0, 1, 0, 1);
}

static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_mask_broadcast_f32x2(__m256 __O, __mmask8 __M, __m128 __A)
{
  return (__m256)__builtin_ia32_selectps_256(
      (__mmask8)__M, (__v8sf)_mm256_broadcast_f32x2(__A), (__v8sf)__O);
}

static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_maskz_broadcast_f32x2(__mmask8 __M, __m128 __A)
{
  return (__m256)__builtin_ia32_selectps_256(
      (__mmask8)__M, (__v8sf)_mm256_broadcast_f32x2(__A),
      (__v8sf)_mm256_setzero_ps());
}

static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_broadcast_f64x2(__m128d __A)
{
  return (__m256d)__builtin_shufflevector((__v2df)__A, (__v2df)__A,
                                          0, 1, 0, 1);
}

static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_mask_broadcast_f64x2(__m256d __O, __mmask8 __M, __m128d __A)
{
  return (__m256d)__builtin_ia32_selectpd_256(
      (__mmask8)__M, (__v4df)_mm256_broadcast_f64x2(__A), (__v4df)__O);
}

static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_maskz_broadcast_f64x2(__mmask8 __M, __m128d __A)
{
  return (__m256d)__builtin_ia32_selectpd_256(
      (__mmask8)__M, (__v4df)_mm256_broadcast_f64x2(__A),
      (__v4df)_mm256_setzero_pd());
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_broadcast_i32x2(__m128i __A)
{
  return (__m128i)__builtin_shufflevector((__v4si)__A, (__v4si)__A,
                                          0, 1, 0, 1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_broadcast_i32x2(__m128i __O, __mmask8 __M, __m128i __A)
{
  return (__m128i)__builtin_ia32_selectd_128(
      (__mmask8)__M, (__v4si)_mm_broadcast_i32x2(__A), (__v4si)__O);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_broadcast_i32x2(__mmask8 __M, __m128i __A)
{
  return (__m128i)__builtin_ia32_selectd_128(
      (__mmask8)__M, (__v4si)_mm_broadcast_i32x2(__A),
      (__v4si)_mm_setzero_si128());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_broadcast_i32x2(__m128i __A)
{
  return (__m256i)__builtin_shufflevector((__v4si)__A, (__v4si)__A,
                                          0, 1, 0, 1, 0, 1, 0, 1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_broadcast_i32x2(__m256i __O, __mmask8 __M, __m128i __A)
{
  return (__m256i)__builtin_ia32_selectd_256(
      (__mmask8)__M, (__v8si)_mm256_broadcast_i32x2(__A), (__v8si)__O);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_broadcast_i32x2(__mmask8 __M, __m128i __A)
{
  return (__m256i)__builtin_ia32_selectd_256(
      (__mmask8)__M, (__v8si)_mm256_broadcast_i32x2(__A),
      (__v8si)_mm256_setzero_si256());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_broadcast_i64x2(__m128i __A)
{
  return (__m256i)__builtin_shufflevector((__v2di)__A, (__v2di)__A,
                                          0, 1, 0, 1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_broadcast_i64x2(__m256i __O, __mmask8 __M, __m128i __A)
{
  return (__m256i)__builtin_ia32_selectq_256(
      (__mmask8)__M, (__v4di)_mm256_broadcast_i64x2(__A), (__v4di)__O);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_broadcast_i64x2(__mmask8 __M, __m128i __A)
{
  return (__m256i)__builtin_ia32_selectq_256(
      (__mmask8)__M, (__v4di)_mm256_broadcast_i64x2(__A),
      (__v4di)_mm256_setzero_si256());
}

#define _mm256_extractf64x2_pd(A, imm) \
  ((__m128d)__builtin_ia32_extractf64x2_256_mask( \
      (__v4df)(__m256d)(A), (int)(imm), (__v2df)_mm_undefined_pd(), \
      (__mmask8)-1))

#define _mm256_mask_extractf64x2_pd(W, U, A, imm) \
  ((__m128d)__builtin_ia32_extractf64x2_256_mask( \
      (__v4df)(__m256d)(A), (int)(imm), (__v2df)(__m128d)(W), \
      (__mmask8)(U)))

#define _mm256_maskz_extractf64x2_pd(U, A, imm) \
  ((__m128d)__builtin_ia32_extractf64x2_256_mask( \
      (__v4df)(__m256d)(A), (int)(imm), (__v2df)_mm_setzero_pd(), \
      (__mmask8)(U)))

#define _mm256_extracti64x2_epi64(A, imm) \
  ((__m128i)__builtin_ia32_extracti64x2_256_mask( \
      (__v4di)(__m256i)(A), (int)(imm), (__v2di)_mm_undefined_si128(), \
      (__mmask8)-1))

#define _mm256_mask_extracti64x2_epi64(W, U, A, imm) \
  ((__m128i)__builtin_ia32_extracti64x2_256_mask( \
      (__v4di)(__m256i)(A), (int)(imm), (__v2di)(__m128i)(W), \
      (__mmask8)(U)))

#define _mm256_maskz_extracti64x2_epi64(U, A, imm) \
  ((__m128i)__builtin_ia32_extracti64x2_256_mask( \
      (__v4di)(__m256i)(A), (int)(imm), (__v2di)_mm_setzero_si128(), \
      (__mmask8)(U)))

#define _mm256_insertf64x2(A, B, imm) \
  ((__m256d)__builtin_ia32_insertf64x2_256((__v4df)(__m256d)(A), \
                                           (__v2df)(__m128d)(B), (int)(imm)))

#define _mm256_mask_insertf64x2(W, U, A, B, imm) \
  ((__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
                                        (__v4df)_mm256_insertf64x2((A), (B), (imm)), \
                                        (__v4df)(__m256d)(W)))

#define _mm256_maskz_insertf64x2(U, A, B, imm) \
  ((__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
                                        (__v4df)_mm256_insertf64x2((A), (B), (imm)), \
                                        (__v4df)_mm256_setzero_pd()))

#define _mm256_inserti64x2(A, B, imm) \
  ((__m256i)__builtin_ia32_inserti64x2_256((__v4di)(__m256i)(A), \
                                           (__v2di)(__m128i)(B), (int)(imm)))

#define _mm256_mask_inserti64x2(W, U, A, B, imm) \
  ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
                                       (__v4di)_mm256_inserti64x2((A), (B), (imm)), \
                                       (__v4di)(__m256i)(W)))

#define _mm256_maskz_inserti64x2(U, A, B, imm) \
  ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
                                       (__v4di)_mm256_inserti64x2((A), (B), (imm)), \
                                       (__v4di)_mm256_setzero_si256()))
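/* The _mm*_fpclass_* macros below wrap VFPCLASSPD/VFPCLASSPS: each bit of the
 * immediate enables a floating-point category (such as NaN, positive or
 * negative zero, positive or negative infinity, denormal, or negative), and
 * the returned mask flags the elements that fall into any enabled category. */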
#define _mm_mask_fpclass_pd_mask(U, A, imm) \
  ((__mmask8)__builtin_ia32_fpclasspd128_mask((__v2df)(__m128d)(A), (int)(imm), \
                                              (__mmask8)(U)))

#define _mm_fpclass_pd_mask(A, imm) \
  ((__mmask8)__builtin_ia32_fpclasspd128_mask((__v2df)(__m128d)(A), (int)(imm), \
                                              (__mmask8)-1))

#define _mm256_mask_fpclass_pd_mask(U, A, imm) \
  ((__mmask8)__builtin_ia32_fpclasspd256_mask((__v4df)(__m256d)(A), (int)(imm), \
                                              (__mmask8)(U)))

#define _mm256_fpclass_pd_mask(A, imm) \
  ((__mmask8)__builtin_ia32_fpclasspd256_mask((__v4df)(__m256d)(A), (int)(imm), \
                                              (__mmask8)-1))

#define _mm_mask_fpclass_ps_mask(U, A, imm) \
  ((__mmask8)__builtin_ia32_fpclassps128_mask((__v4sf)(__m128)(A), (int)(imm), \
                                              (__mmask8)(U)))

#define _mm_fpclass_ps_mask(A, imm) \
  ((__mmask8)__builtin_ia32_fpclassps128_mask((__v4sf)(__m128)(A), (int)(imm), \
                                              (__mmask8)-1))

#define _mm256_mask_fpclass_ps_mask(U, A, imm) \
  ((__mmask8)__builtin_ia32_fpclassps256_mask((__v8sf)(__m256)(A), (int)(imm), \
                                              (__mmask8)(U)))

#define _mm256_fpclass_ps_mask(A, imm) \
  ((__mmask8)__builtin_ia32_fpclassps256_mask((__v8sf)(__m256)(A), (int)(imm), \
                                              (__mmask8)-1))

#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256

#endif