1 /*===----------- avx512fp16intrin.h - AVX512-FP16 intrinsics ---------------=== 2 * 3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 * See https://llvm.org/LICENSE.txt for license information. 5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 * 7 *===-----------------------------------------------------------------------=== 8 */ 9 #ifndef __IMMINTRIN_H 10 #error "Never use <avx512fp16intrin.h> directly; include <immintrin.h> instead." 11 #endif 12 13 #ifdef __SSE2__ 14 15 #ifndef __AVX512FP16INTRIN_H 16 #define __AVX512FP16INTRIN_H 17 18 /* Define the default attributes for the functions in this file. */ 19 typedef _Float16 __v32hf __attribute__((__vector_size__(64), __aligned__(64))); 20 typedef _Float16 __m512h __attribute__((__vector_size__(64), __aligned__(64))); 21 typedef _Float16 __m512h_u __attribute__((__vector_size__(64), __aligned__(1))); 22 23 /* Define the default attributes for the functions in this file. */ 24 #define __DEFAULT_FN_ATTRS512 \ 25 __attribute__((__always_inline__, __nodebug__, \ 26 __target__("avx512fp16,evex512"), __min_vector_width__(512))) 27 #define __DEFAULT_FN_ATTRS256 \ 28 __attribute__((__always_inline__, __nodebug__, \ 29 __target__("avx512fp16,no-evex512"), \ 30 __min_vector_width__(256))) 31 #define __DEFAULT_FN_ATTRS128 \ 32 __attribute__((__always_inline__, __nodebug__, \ 33 __target__("avx512fp16,no-evex512"), \ 34 __min_vector_width__(128))) 35 36 static __inline__ _Float16 __DEFAULT_FN_ATTRS512 _mm512_cvtsh_h(__m512h __a) { 37 return __a[0]; 38 } 39 40 static __inline __m128h __DEFAULT_FN_ATTRS128 _mm_setzero_ph(void) { 41 return (__m128h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; 42 } 43 44 static __inline __m256h __DEFAULT_FN_ATTRS256 _mm256_setzero_ph(void) { 45 return (__m256h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 46 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; 47 } 48 49 static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_undefined_ph(void) { 50 return (__m256h)__builtin_ia32_undef256(); 51 } 52 53 static __inline __m512h __DEFAULT_FN_ATTRS512 _mm512_setzero_ph(void) { 54 return (__m512h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 55 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 56 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; 57 } 58 59 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_undefined_ph(void) { 60 return (__m128h)__builtin_ia32_undef128(); 61 } 62 63 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_undefined_ph(void) { 64 return (__m512h)__builtin_ia32_undef512(); 65 } 66 67 static __inline __m512h __DEFAULT_FN_ATTRS512 _mm512_set1_ph(_Float16 __h) { 68 return (__m512h)(__v32hf){__h, __h, __h, __h, __h, __h, __h, __h, 69 __h, __h, __h, __h, __h, __h, __h, __h, 70 __h, __h, __h, __h, __h, __h, __h, __h, 71 __h, __h, __h, __h, __h, __h, __h, __h}; 72 } 73 74 static __inline __m512h __DEFAULT_FN_ATTRS512 75 _mm512_set_ph(_Float16 __h1, _Float16 __h2, _Float16 __h3, _Float16 __h4, 76 _Float16 __h5, _Float16 __h6, _Float16 __h7, _Float16 __h8, 77 _Float16 __h9, _Float16 __h10, _Float16 __h11, _Float16 __h12, 78 _Float16 __h13, _Float16 __h14, _Float16 __h15, _Float16 __h16, 79 _Float16 __h17, _Float16 __h18, _Float16 __h19, _Float16 __h20, 80 _Float16 __h21, _Float16 __h22, _Float16 __h23, _Float16 __h24, 81 _Float16 __h25, _Float16 __h26, _Float16 __h27, _Float16 __h28, 82 _Float16 __h29, _Float16 __h30, _Float16 __h31, _Float16 __h32) { 83 return (__m512h)(__v32hf){__h32, __h31, __h30, __h29, __h28, __h27, __h26, 84 __h25, __h24, __h23, __h22, __h21, __h20, __h19, 85 __h18, __h17, __h16, __h15, __h14, __h13, __h12, 86 __h11, __h10, __h9, __h8, __h7, __h6, __h5, 87 __h4, __h3, __h2, __h1}; 88 } 89 90 #define _mm512_setr_ph(h1, h2, h3, h4, h5, h6, h7, h8, h9, h10, h11, h12, h13, \ 91 h14, h15, h16, h17, h18, h19, h20, h21, h22, h23, h24, \ 92 h25, h26, h27, h28, h29, h30, h31, h32) \ 93 _mm512_set_ph((h32), (h31), (h30), (h29), (h28), (h27), (h26), (h25), (h24), \ 94 (h23), (h22), (h21), (h20), (h19), (h18), (h17), (h16), (h15), \ 95 (h14), (h13), (h12), (h11), (h10), (h9), (h8), (h7), (h6), \ 96 (h5), (h4), (h3), (h2), (h1)) 97 98 static __inline __m512h __DEFAULT_FN_ATTRS512 99 _mm512_set1_pch(_Float16 _Complex h) { 100 return (__m512h)_mm512_set1_ps(__builtin_bit_cast(float, h)); 101 } 102 103 static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_castph_ps(__m128h __a) { 104 return (__m128)__a; 105 } 106 107 static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_castph_ps(__m256h __a) { 108 return (__m256)__a; 109 } 110 111 static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_castph_ps(__m512h __a) { 112 return (__m512)__a; 113 } 114 115 static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_castph_pd(__m128h __a) { 116 return (__m128d)__a; 117 } 118 119 static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_castph_pd(__m256h __a) { 120 return (__m256d)__a; 121 } 122 123 static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_castph_pd(__m512h __a) { 124 return (__m512d)__a; 125 } 126 127 static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_castph_si128(__m128h __a) { 128 return (__m128i)__a; 129 } 130 131 static __inline__ __m256i __DEFAULT_FN_ATTRS256 132 _mm256_castph_si256(__m256h __a) { 133 return (__m256i)__a; 134 } 135 136 static __inline__ __m512i __DEFAULT_FN_ATTRS512 137 _mm512_castph_si512(__m512h __a) { 138 return (__m512i)__a; 139 } 140 141 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_castps_ph(__m128 __a) { 142 return (__m128h)__a; 143 } 144 145 static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_castps_ph(__m256 __a) { 146 return (__m256h)__a; 147 } 148 149 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_castps_ph(__m512 __a) { 150 return (__m512h)__a; 151 } 152 153 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_castpd_ph(__m128d __a) { 154 return (__m128h)__a; 155 } 156 157 static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_castpd_ph(__m256d __a) { 158 return (__m256h)__a; 159 } 160 161 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_castpd_ph(__m512d __a) { 162 return (__m512h)__a; 163 } 164 165 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_castsi128_ph(__m128i __a) { 166 return (__m128h)__a; 167 } 168 169 static __inline__ __m256h __DEFAULT_FN_ATTRS256 170 _mm256_castsi256_ph(__m256i __a) { 171 return (__m256h)__a; 172 } 173 174 static __inline__ __m512h __DEFAULT_FN_ATTRS512 175 _mm512_castsi512_ph(__m512i __a) { 176 return (__m512h)__a; 177 } 178 179 static __inline__ __m128h __DEFAULT_FN_ATTRS256 180 _mm256_castph256_ph128(__m256h __a) { 181 return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7); 182 } 183 184 static __inline__ __m128h __DEFAULT_FN_ATTRS512 185 _mm512_castph512_ph128(__m512h __a) { 186 return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7); 187 } 188 189 static __inline__ __m256h __DEFAULT_FN_ATTRS512 190 _mm512_castph512_ph256(__m512h __a) { 191 return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 192 12, 13, 14, 15); 193 } 194 195 static __inline__ __m256h __DEFAULT_FN_ATTRS256 196 _mm256_castph128_ph256(__m128h __a) { 197 return __builtin_shufflevector(__a, __builtin_nondeterministic_value(__a), 198 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); 199 } 200 201 static __inline__ __m512h __DEFAULT_FN_ATTRS512 202 _mm512_castph128_ph512(__m128h __a) { 203 __m256h __b = __builtin_nondeterministic_value(__b); 204 return __builtin_shufflevector( 205 __builtin_shufflevector(__a, __builtin_nondeterministic_value(__a), 206 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), 207 __b, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 208 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); 209 } 210 211 static __inline__ __m512h __DEFAULT_FN_ATTRS512 212 _mm512_castph256_ph512(__m256h __a) { 213 return __builtin_shufflevector(__a, __builtin_nondeterministic_value(__a), 0, 214 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 215 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 216 27, 28, 29, 30, 31); 217 } 218 219 /// Constructs a 256-bit floating-point vector of [16 x half] from a 220 /// 128-bit floating-point vector of [8 x half]. The lower 128 bits 221 /// contain the value of the source vector. The upper 384 bits are set 222 /// to zero. 223 /// 224 /// \headerfile <x86intrin.h> 225 /// 226 /// This intrinsic has no corresponding instruction. 227 /// 228 /// \param __a 229 /// A 128-bit vector of [8 x half]. 230 /// \returns A 512-bit floating-point vector of [16 x half]. The lower 128 bits 231 /// contain the value of the parameter. The upper 384 bits are set to zero. 232 static __inline__ __m256h __DEFAULT_FN_ATTRS256 233 _mm256_zextph128_ph256(__m128h __a) { 234 return __builtin_shufflevector(__a, (__v8hf)_mm_setzero_ph(), 0, 1, 2, 3, 4, 235 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); 236 } 237 238 /// Constructs a 512-bit floating-point vector of [32 x half] from a 239 /// 128-bit floating-point vector of [8 x half]. The lower 128 bits 240 /// contain the value of the source vector. The upper 384 bits are set 241 /// to zero. 242 /// 243 /// \headerfile <x86intrin.h> 244 /// 245 /// This intrinsic has no corresponding instruction. 246 /// 247 /// \param __a 248 /// A 128-bit vector of [8 x half]. 249 /// \returns A 512-bit floating-point vector of [32 x half]. The lower 128 bits 250 /// contain the value of the parameter. The upper 384 bits are set to zero. 251 static __inline__ __m512h __DEFAULT_FN_ATTRS512 252 _mm512_zextph128_ph512(__m128h __a) { 253 return __builtin_shufflevector( 254 __a, (__v8hf)_mm_setzero_ph(), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 255 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15); 256 } 257 258 /// Constructs a 512-bit floating-point vector of [32 x half] from a 259 /// 256-bit floating-point vector of [16 x half]. The lower 256 bits 260 /// contain the value of the source vector. The upper 256 bits are set 261 /// to zero. 262 /// 263 /// \headerfile <x86intrin.h> 264 /// 265 /// This intrinsic has no corresponding instruction. 266 /// 267 /// \param __a 268 /// A 256-bit vector of [16 x half]. 269 /// \returns A 512-bit floating-point vector of [32 x half]. The lower 256 bits 270 /// contain the value of the parameter. The upper 256 bits are set to zero. 271 static __inline__ __m512h __DEFAULT_FN_ATTRS512 272 _mm512_zextph256_ph512(__m256h __a) { 273 return __builtin_shufflevector(__a, (__v16hf)_mm256_setzero_ph(), 0, 1, 2, 3, 274 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 275 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 276 29, 30, 31); 277 } 278 279 #define _mm_comi_round_sh(A, B, P, R) \ 280 __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, (int)(P), (int)(R)) 281 282 #define _mm_comi_sh(A, B, pred) \ 283 _mm_comi_round_sh((A), (B), (pred), _MM_FROUND_CUR_DIRECTION) 284 285 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comieq_sh(__m128h A, 286 __m128h B) { 287 return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_EQ_OS, 288 _MM_FROUND_CUR_DIRECTION); 289 } 290 291 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comilt_sh(__m128h A, 292 __m128h B) { 293 return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_LT_OS, 294 _MM_FROUND_CUR_DIRECTION); 295 } 296 297 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comile_sh(__m128h A, 298 __m128h B) { 299 return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_LE_OS, 300 _MM_FROUND_CUR_DIRECTION); 301 } 302 303 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comigt_sh(__m128h A, 304 __m128h B) { 305 return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_GT_OS, 306 _MM_FROUND_CUR_DIRECTION); 307 } 308 309 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comige_sh(__m128h A, 310 __m128h B) { 311 return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_GE_OS, 312 _MM_FROUND_CUR_DIRECTION); 313 } 314 315 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comineq_sh(__m128h A, 316 __m128h B) { 317 return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_NEQ_US, 318 _MM_FROUND_CUR_DIRECTION); 319 } 320 321 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomieq_sh(__m128h A, 322 __m128h B) { 323 return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_EQ_OQ, 324 _MM_FROUND_CUR_DIRECTION); 325 } 326 327 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomilt_sh(__m128h A, 328 __m128h B) { 329 return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_LT_OQ, 330 _MM_FROUND_CUR_DIRECTION); 331 } 332 333 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomile_sh(__m128h A, 334 __m128h B) { 335 return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_LE_OQ, 336 _MM_FROUND_CUR_DIRECTION); 337 } 338 339 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomigt_sh(__m128h A, 340 __m128h B) { 341 return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_GT_OQ, 342 _MM_FROUND_CUR_DIRECTION); 343 } 344 345 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomige_sh(__m128h A, 346 __m128h B) { 347 return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_GE_OQ, 348 _MM_FROUND_CUR_DIRECTION); 349 } 350 351 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomineq_sh(__m128h A, 352 __m128h B) { 353 return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_NEQ_UQ, 354 _MM_FROUND_CUR_DIRECTION); 355 } 356 357 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_add_ph(__m512h __A, 358 __m512h __B) { 359 return (__m512h)((__v32hf)__A + (__v32hf)__B); 360 } 361 362 static __inline__ __m512h __DEFAULT_FN_ATTRS512 363 _mm512_mask_add_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) { 364 return (__m512h)__builtin_ia32_selectph_512( 365 (__mmask32)__U, (__v32hf)_mm512_add_ph(__A, __B), (__v32hf)__W); 366 } 367 368 static __inline__ __m512h __DEFAULT_FN_ATTRS512 369 _mm512_maskz_add_ph(__mmask32 __U, __m512h __A, __m512h __B) { 370 return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U, 371 (__v32hf)_mm512_add_ph(__A, __B), 372 (__v32hf)_mm512_setzero_ph()); 373 } 374 375 #define _mm512_add_round_ph(A, B, R) \ 376 ((__m512h)__builtin_ia32_addph512((__v32hf)(__m512h)(A), \ 377 (__v32hf)(__m512h)(B), (int)(R))) 378 379 #define _mm512_mask_add_round_ph(W, U, A, B, R) \ 380 ((__m512h)__builtin_ia32_selectph_512( \ 381 (__mmask32)(U), (__v32hf)_mm512_add_round_ph((A), (B), (R)), \ 382 (__v32hf)(__m512h)(W))) 383 384 #define _mm512_maskz_add_round_ph(U, A, B, R) \ 385 ((__m512h)__builtin_ia32_selectph_512( \ 386 (__mmask32)(U), (__v32hf)_mm512_add_round_ph((A), (B), (R)), \ 387 (__v32hf)_mm512_setzero_ph())) 388 389 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_sub_ph(__m512h __A, 390 __m512h __B) { 391 return (__m512h)((__v32hf)__A - (__v32hf)__B); 392 } 393 394 static __inline__ __m512h __DEFAULT_FN_ATTRS512 395 _mm512_mask_sub_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) { 396 return (__m512h)__builtin_ia32_selectph_512( 397 (__mmask32)__U, (__v32hf)_mm512_sub_ph(__A, __B), (__v32hf)__W); 398 } 399 400 static __inline__ __m512h __DEFAULT_FN_ATTRS512 401 _mm512_maskz_sub_ph(__mmask32 __U, __m512h __A, __m512h __B) { 402 return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U, 403 (__v32hf)_mm512_sub_ph(__A, __B), 404 (__v32hf)_mm512_setzero_ph()); 405 } 406 407 #define _mm512_sub_round_ph(A, B, R) \ 408 ((__m512h)__builtin_ia32_subph512((__v32hf)(__m512h)(A), \ 409 (__v32hf)(__m512h)(B), (int)(R))) 410 411 #define _mm512_mask_sub_round_ph(W, U, A, B, R) \ 412 ((__m512h)__builtin_ia32_selectph_512( \ 413 (__mmask32)(U), (__v32hf)_mm512_sub_round_ph((A), (B), (R)), \ 414 (__v32hf)(__m512h)(W))) 415 416 #define _mm512_maskz_sub_round_ph(U, A, B, R) \ 417 ((__m512h)__builtin_ia32_selectph_512( \ 418 (__mmask32)(U), (__v32hf)_mm512_sub_round_ph((A), (B), (R)), \ 419 (__v32hf)_mm512_setzero_ph())) 420 421 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_mul_ph(__m512h __A, 422 __m512h __B) { 423 return (__m512h)((__v32hf)__A * (__v32hf)__B); 424 } 425 426 static __inline__ __m512h __DEFAULT_FN_ATTRS512 427 _mm512_mask_mul_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) { 428 return (__m512h)__builtin_ia32_selectph_512( 429 (__mmask32)__U, (__v32hf)_mm512_mul_ph(__A, __B), (__v32hf)__W); 430 } 431 432 static __inline__ __m512h __DEFAULT_FN_ATTRS512 433 _mm512_maskz_mul_ph(__mmask32 __U, __m512h __A, __m512h __B) { 434 return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U, 435 (__v32hf)_mm512_mul_ph(__A, __B), 436 (__v32hf)_mm512_setzero_ph()); 437 } 438 439 #define _mm512_mul_round_ph(A, B, R) \ 440 ((__m512h)__builtin_ia32_mulph512((__v32hf)(__m512h)(A), \ 441 (__v32hf)(__m512h)(B), (int)(R))) 442 443 #define _mm512_mask_mul_round_ph(W, U, A, B, R) \ 444 ((__m512h)__builtin_ia32_selectph_512( \ 445 (__mmask32)(U), (__v32hf)_mm512_mul_round_ph((A), (B), (R)), \ 446 (__v32hf)(__m512h)(W))) 447 448 #define _mm512_maskz_mul_round_ph(U, A, B, R) \ 449 ((__m512h)__builtin_ia32_selectph_512( \ 450 (__mmask32)(U), (__v32hf)_mm512_mul_round_ph((A), (B), (R)), \ 451 (__v32hf)_mm512_setzero_ph())) 452 453 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_div_ph(__m512h __A, 454 __m512h __B) { 455 return (__m512h)((__v32hf)__A / (__v32hf)__B); 456 } 457 458 static __inline__ __m512h __DEFAULT_FN_ATTRS512 459 _mm512_mask_div_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) { 460 return (__m512h)__builtin_ia32_selectph_512( 461 (__mmask32)__U, (__v32hf)_mm512_div_ph(__A, __B), (__v32hf)__W); 462 } 463 464 static __inline__ __m512h __DEFAULT_FN_ATTRS512 465 _mm512_maskz_div_ph(__mmask32 __U, __m512h __A, __m512h __B) { 466 return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U, 467 (__v32hf)_mm512_div_ph(__A, __B), 468 (__v32hf)_mm512_setzero_ph()); 469 } 470 471 #define _mm512_div_round_ph(A, B, R) \ 472 ((__m512h)__builtin_ia32_divph512((__v32hf)(__m512h)(A), \ 473 (__v32hf)(__m512h)(B), (int)(R))) 474 475 #define _mm512_mask_div_round_ph(W, U, A, B, R) \ 476 ((__m512h)__builtin_ia32_selectph_512( \ 477 (__mmask32)(U), (__v32hf)_mm512_div_round_ph((A), (B), (R)), \ 478 (__v32hf)(__m512h)(W))) 479 480 #define _mm512_maskz_div_round_ph(U, A, B, R) \ 481 ((__m512h)__builtin_ia32_selectph_512( \ 482 (__mmask32)(U), (__v32hf)_mm512_div_round_ph((A), (B), (R)), \ 483 (__v32hf)_mm512_setzero_ph())) 484 485 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_min_ph(__m512h __A, 486 __m512h __B) { 487 return (__m512h)__builtin_ia32_minph512((__v32hf)__A, (__v32hf)__B, 488 _MM_FROUND_CUR_DIRECTION); 489 } 490 491 static __inline__ __m512h __DEFAULT_FN_ATTRS512 492 _mm512_mask_min_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) { 493 return (__m512h)__builtin_ia32_selectph_512( 494 (__mmask32)__U, (__v32hf)_mm512_min_ph(__A, __B), (__v32hf)__W); 495 } 496 497 static __inline__ __m512h __DEFAULT_FN_ATTRS512 498 _mm512_maskz_min_ph(__mmask32 __U, __m512h __A, __m512h __B) { 499 return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U, 500 (__v32hf)_mm512_min_ph(__A, __B), 501 (__v32hf)_mm512_setzero_ph()); 502 } 503 504 #define _mm512_min_round_ph(A, B, R) \ 505 ((__m512h)__builtin_ia32_minph512((__v32hf)(__m512h)(A), \ 506 (__v32hf)(__m512h)(B), (int)(R))) 507 508 #define _mm512_mask_min_round_ph(W, U, A, B, R) \ 509 ((__m512h)__builtin_ia32_selectph_512( \ 510 (__mmask32)(U), (__v32hf)_mm512_min_round_ph((A), (B), (R)), \ 511 (__v32hf)(__m512h)(W))) 512 513 #define _mm512_maskz_min_round_ph(U, A, B, R) \ 514 ((__m512h)__builtin_ia32_selectph_512( \ 515 (__mmask32)(U), (__v32hf)_mm512_min_round_ph((A), (B), (R)), \ 516 (__v32hf)_mm512_setzero_ph())) 517 518 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_max_ph(__m512h __A, 519 __m512h __B) { 520 return (__m512h)__builtin_ia32_maxph512((__v32hf)__A, (__v32hf)__B, 521 _MM_FROUND_CUR_DIRECTION); 522 } 523 524 static __inline__ __m512h __DEFAULT_FN_ATTRS512 525 _mm512_mask_max_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) { 526 return (__m512h)__builtin_ia32_selectph_512( 527 (__mmask32)__U, (__v32hf)_mm512_max_ph(__A, __B), (__v32hf)__W); 528 } 529 530 static __inline__ __m512h __DEFAULT_FN_ATTRS512 531 _mm512_maskz_max_ph(__mmask32 __U, __m512h __A, __m512h __B) { 532 return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U, 533 (__v32hf)_mm512_max_ph(__A, __B), 534 (__v32hf)_mm512_setzero_ph()); 535 } 536 537 #define _mm512_max_round_ph(A, B, R) \ 538 ((__m512h)__builtin_ia32_maxph512((__v32hf)(__m512h)(A), \ 539 (__v32hf)(__m512h)(B), (int)(R))) 540 541 #define _mm512_mask_max_round_ph(W, U, A, B, R) \ 542 ((__m512h)__builtin_ia32_selectph_512( \ 543 (__mmask32)(U), (__v32hf)_mm512_max_round_ph((A), (B), (R)), \ 544 (__v32hf)(__m512h)(W))) 545 546 #define _mm512_maskz_max_round_ph(U, A, B, R) \ 547 ((__m512h)__builtin_ia32_selectph_512( \ 548 (__mmask32)(U), (__v32hf)_mm512_max_round_ph((A), (B), (R)), \ 549 (__v32hf)_mm512_setzero_ph())) 550 551 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_abs_ph(__m512h __A) { 552 return (__m512h)_mm512_and_epi32(_mm512_set1_epi32(0x7FFF7FFF), (__m512i)__A); 553 } 554 555 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_conj_pch(__m512h __A) { 556 return (__m512h)_mm512_xor_ps((__m512)__A, _mm512_set1_ps(-0.0f)); 557 } 558 559 static __inline__ __m512h __DEFAULT_FN_ATTRS512 560 _mm512_mask_conj_pch(__m512h __W, __mmask16 __U, __m512h __A) { 561 return (__m512h)__builtin_ia32_selectps_512( 562 (__mmask16)__U, (__v16sf)_mm512_conj_pch(__A), (__v16sf)__W); 563 } 564 565 static __inline__ __m512h __DEFAULT_FN_ATTRS512 566 _mm512_maskz_conj_pch(__mmask16 __U, __m512h __A) { 567 return (__m512h)__builtin_ia32_selectps_512((__mmask16)__U, 568 (__v16sf)_mm512_conj_pch(__A), 569 (__v16sf)_mm512_setzero_ps()); 570 } 571 572 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_add_sh(__m128h __A, 573 __m128h __B) { 574 __A[0] += __B[0]; 575 return __A; 576 } 577 578 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_add_sh(__m128h __W, 579 __mmask8 __U, 580 __m128h __A, 581 __m128h __B) { 582 __A = _mm_add_sh(__A, __B); 583 return __builtin_ia32_selectsh_128(__U, __A, __W); 584 } 585 586 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_add_sh(__mmask8 __U, 587 __m128h __A, 588 __m128h __B) { 589 __A = _mm_add_sh(__A, __B); 590 return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph()); 591 } 592 593 #define _mm_add_round_sh(A, B, R) \ 594 ((__m128h)__builtin_ia32_addsh_round_mask( \ 595 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ 596 (__mmask8)-1, (int)(R))) 597 598 #define _mm_mask_add_round_sh(W, U, A, B, R) \ 599 ((__m128h)__builtin_ia32_addsh_round_mask( \ 600 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \ 601 (__mmask8)(U), (int)(R))) 602 603 #define _mm_maskz_add_round_sh(U, A, B, R) \ 604 ((__m128h)__builtin_ia32_addsh_round_mask( \ 605 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ 606 (__mmask8)(U), (int)(R))) 607 608 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sub_sh(__m128h __A, 609 __m128h __B) { 610 __A[0] -= __B[0]; 611 return __A; 612 } 613 614 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sub_sh(__m128h __W, 615 __mmask8 __U, 616 __m128h __A, 617 __m128h __B) { 618 __A = _mm_sub_sh(__A, __B); 619 return __builtin_ia32_selectsh_128(__U, __A, __W); 620 } 621 622 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sub_sh(__mmask8 __U, 623 __m128h __A, 624 __m128h __B) { 625 __A = _mm_sub_sh(__A, __B); 626 return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph()); 627 } 628 629 #define _mm_sub_round_sh(A, B, R) \ 630 ((__m128h)__builtin_ia32_subsh_round_mask( \ 631 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ 632 (__mmask8)-1, (int)(R))) 633 634 #define _mm_mask_sub_round_sh(W, U, A, B, R) \ 635 ((__m128h)__builtin_ia32_subsh_round_mask( \ 636 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \ 637 (__mmask8)(U), (int)(R))) 638 639 #define _mm_maskz_sub_round_sh(U, A, B, R) \ 640 ((__m128h)__builtin_ia32_subsh_round_mask( \ 641 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ 642 (__mmask8)(U), (int)(R))) 643 644 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mul_sh(__m128h __A, 645 __m128h __B) { 646 __A[0] *= __B[0]; 647 return __A; 648 } 649 650 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_mul_sh(__m128h __W, 651 __mmask8 __U, 652 __m128h __A, 653 __m128h __B) { 654 __A = _mm_mul_sh(__A, __B); 655 return __builtin_ia32_selectsh_128(__U, __A, __W); 656 } 657 658 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_mul_sh(__mmask8 __U, 659 __m128h __A, 660 __m128h __B) { 661 __A = _mm_mul_sh(__A, __B); 662 return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph()); 663 } 664 665 #define _mm_mul_round_sh(A, B, R) \ 666 ((__m128h)__builtin_ia32_mulsh_round_mask( \ 667 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ 668 (__mmask8)-1, (int)(R))) 669 670 #define _mm_mask_mul_round_sh(W, U, A, B, R) \ 671 ((__m128h)__builtin_ia32_mulsh_round_mask( \ 672 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \ 673 (__mmask8)(U), (int)(R))) 674 675 #define _mm_maskz_mul_round_sh(U, A, B, R) \ 676 ((__m128h)__builtin_ia32_mulsh_round_mask( \ 677 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ 678 (__mmask8)(U), (int)(R))) 679 680 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_div_sh(__m128h __A, 681 __m128h __B) { 682 __A[0] /= __B[0]; 683 return __A; 684 } 685 686 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_div_sh(__m128h __W, 687 __mmask8 __U, 688 __m128h __A, 689 __m128h __B) { 690 __A = _mm_div_sh(__A, __B); 691 return __builtin_ia32_selectsh_128(__U, __A, __W); 692 } 693 694 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_div_sh(__mmask8 __U, 695 __m128h __A, 696 __m128h __B) { 697 __A = _mm_div_sh(__A, __B); 698 return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph()); 699 } 700 701 #define _mm_div_round_sh(A, B, R) \ 702 ((__m128h)__builtin_ia32_divsh_round_mask( \ 703 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ 704 (__mmask8)-1, (int)(R))) 705 706 #define _mm_mask_div_round_sh(W, U, A, B, R) \ 707 ((__m128h)__builtin_ia32_divsh_round_mask( \ 708 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \ 709 (__mmask8)(U), (int)(R))) 710 711 #define _mm_maskz_div_round_sh(U, A, B, R) \ 712 ((__m128h)__builtin_ia32_divsh_round_mask( \ 713 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ 714 (__mmask8)(U), (int)(R))) 715 716 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_min_sh(__m128h __A, 717 __m128h __B) { 718 return (__m128h)__builtin_ia32_minsh_round_mask( 719 (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1, 720 _MM_FROUND_CUR_DIRECTION); 721 } 722 723 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_min_sh(__m128h __W, 724 __mmask8 __U, 725 __m128h __A, 726 __m128h __B) { 727 return (__m128h)__builtin_ia32_minsh_round_mask((__v8hf)__A, (__v8hf)__B, 728 (__v8hf)__W, (__mmask8)__U, 729 _MM_FROUND_CUR_DIRECTION); 730 } 731 732 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_min_sh(__mmask8 __U, 733 __m128h __A, 734 __m128h __B) { 735 return (__m128h)__builtin_ia32_minsh_round_mask( 736 (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U, 737 _MM_FROUND_CUR_DIRECTION); 738 } 739 740 #define _mm_min_round_sh(A, B, R) \ 741 ((__m128h)__builtin_ia32_minsh_round_mask( \ 742 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ 743 (__mmask8)-1, (int)(R))) 744 745 #define _mm_mask_min_round_sh(W, U, A, B, R) \ 746 ((__m128h)__builtin_ia32_minsh_round_mask( \ 747 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \ 748 (__mmask8)(U), (int)(R))) 749 750 #define _mm_maskz_min_round_sh(U, A, B, R) \ 751 ((__m128h)__builtin_ia32_minsh_round_mask( \ 752 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ 753 (__mmask8)(U), (int)(R))) 754 755 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_max_sh(__m128h __A, 756 __m128h __B) { 757 return (__m128h)__builtin_ia32_maxsh_round_mask( 758 (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1, 759 _MM_FROUND_CUR_DIRECTION); 760 } 761 762 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_max_sh(__m128h __W, 763 __mmask8 __U, 764 __m128h __A, 765 __m128h __B) { 766 return (__m128h)__builtin_ia32_maxsh_round_mask((__v8hf)__A, (__v8hf)__B, 767 (__v8hf)__W, (__mmask8)__U, 768 _MM_FROUND_CUR_DIRECTION); 769 } 770 771 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_max_sh(__mmask8 __U, 772 __m128h __A, 773 __m128h __B) { 774 return (__m128h)__builtin_ia32_maxsh_round_mask( 775 (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U, 776 _MM_FROUND_CUR_DIRECTION); 777 } 778 779 #define _mm_max_round_sh(A, B, R) \ 780 ((__m128h)__builtin_ia32_maxsh_round_mask( \ 781 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ 782 (__mmask8)-1, (int)(R))) 783 784 #define _mm_mask_max_round_sh(W, U, A, B, R) \ 785 ((__m128h)__builtin_ia32_maxsh_round_mask( \ 786 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \ 787 (__mmask8)(U), (int)(R))) 788 789 #define _mm_maskz_max_round_sh(U, A, B, R) \ 790 ((__m128h)__builtin_ia32_maxsh_round_mask( \ 791 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ 792 (__mmask8)(U), (int)(R))) 793 794 #define _mm512_cmp_round_ph_mask(A, B, P, R) \ 795 ((__mmask32)__builtin_ia32_cmpph512_mask((__v32hf)(__m512h)(A), \ 796 (__v32hf)(__m512h)(B), (int)(P), \ 797 (__mmask32)-1, (int)(R))) 798 799 #define _mm512_mask_cmp_round_ph_mask(U, A, B, P, R) \ 800 ((__mmask32)__builtin_ia32_cmpph512_mask((__v32hf)(__m512h)(A), \ 801 (__v32hf)(__m512h)(B), (int)(P), \ 802 (__mmask32)(U), (int)(R))) 803 804 #define _mm512_cmp_ph_mask(A, B, P) \ 805 _mm512_cmp_round_ph_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION) 806 807 #define _mm512_mask_cmp_ph_mask(U, A, B, P) \ 808 _mm512_mask_cmp_round_ph_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION) 809 810 #define _mm_cmp_round_sh_mask(X, Y, P, R) \ 811 ((__mmask8)__builtin_ia32_cmpsh_mask((__v8hf)(__m128h)(X), \ 812 (__v8hf)(__m128h)(Y), (int)(P), \ 813 (__mmask8)-1, (int)(R))) 814 815 #define _mm_mask_cmp_round_sh_mask(M, X, Y, P, R) \ 816 ((__mmask8)__builtin_ia32_cmpsh_mask((__v8hf)(__m128h)(X), \ 817 (__v8hf)(__m128h)(Y), (int)(P), \ 818 (__mmask8)(M), (int)(R))) 819 820 #define _mm_cmp_sh_mask(X, Y, P) \ 821 ((__mmask8)__builtin_ia32_cmpsh_mask( \ 822 (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), (int)(P), (__mmask8)-1, \ 823 _MM_FROUND_CUR_DIRECTION)) 824 825 #define _mm_mask_cmp_sh_mask(M, X, Y, P) \ 826 ((__mmask8)__builtin_ia32_cmpsh_mask( \ 827 (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), (int)(P), (__mmask8)(M), \ 828 _MM_FROUND_CUR_DIRECTION)) 829 // loads with vmovsh: 830 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_load_sh(void const *__dp) { 831 struct __mm_load_sh_struct { 832 _Float16 __u; 833 } __attribute__((__packed__, __may_alias__)); 834 _Float16 __u = ((const struct __mm_load_sh_struct *)__dp)->__u; 835 return (__m128h){__u, 0, 0, 0, 0, 0, 0, 0}; 836 } 837 838 static __inline__ __m128h __DEFAULT_FN_ATTRS128 839 _mm_mask_load_sh(__m128h __W, __mmask8 __U, const void *__A) { 840 __m128h src = (__v8hf)__builtin_shufflevector( 841 (__v8hf)__W, (__v8hf)_mm_setzero_ph(), 0, 8, 8, 8, 8, 8, 8, 8); 842 843 return (__m128h)__builtin_ia32_loadsh128_mask((const __v8hf *)__A, src, __U & 1); 844 } 845 846 static __inline__ __m128h __DEFAULT_FN_ATTRS128 847 _mm_maskz_load_sh(__mmask8 __U, const void *__A) { 848 return (__m128h)__builtin_ia32_loadsh128_mask( 849 (const __v8hf *)__A, (__v8hf)_mm_setzero_ph(), __U & 1); 850 } 851 852 static __inline__ __m512h __DEFAULT_FN_ATTRS512 853 _mm512_load_ph(void const *__p) { 854 return *(const __m512h *)__p; 855 } 856 857 static __inline__ __m256h __DEFAULT_FN_ATTRS256 858 _mm256_load_ph(void const *__p) { 859 return *(const __m256h *)__p; 860 } 861 862 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_load_ph(void const *__p) { 863 return *(const __m128h *)__p; 864 } 865 866 static __inline__ __m512h __DEFAULT_FN_ATTRS512 867 _mm512_loadu_ph(void const *__p) { 868 struct __loadu_ph { 869 __m512h_u __v; 870 } __attribute__((__packed__, __may_alias__)); 871 return ((const struct __loadu_ph *)__p)->__v; 872 } 873 874 static __inline__ __m256h __DEFAULT_FN_ATTRS256 875 _mm256_loadu_ph(void const *__p) { 876 struct __loadu_ph { 877 __m256h_u __v; 878 } __attribute__((__packed__, __may_alias__)); 879 return ((const struct __loadu_ph *)__p)->__v; 880 } 881 882 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_loadu_ph(void const *__p) { 883 struct __loadu_ph { 884 __m128h_u __v; 885 } __attribute__((__packed__, __may_alias__)); 886 return ((const struct __loadu_ph *)__p)->__v; 887 } 888 889 // stores with vmovsh: 890 static __inline__ void __DEFAULT_FN_ATTRS128 _mm_store_sh(void *__dp, 891 __m128h __a) { 892 struct __mm_store_sh_struct { 893 _Float16 __u; 894 } __attribute__((__packed__, __may_alias__)); 895 ((struct __mm_store_sh_struct *)__dp)->__u = __a[0]; 896 } 897 898 static __inline__ void __DEFAULT_FN_ATTRS128 _mm_mask_store_sh(void *__W, 899 __mmask8 __U, 900 __m128h __A) { 901 __builtin_ia32_storesh128_mask((__v8hf *)__W, __A, __U & 1); 902 } 903 904 static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_store_ph(void *__P, 905 __m512h __A) { 906 *(__m512h *)__P = __A; 907 } 908 909 static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_store_ph(void *__P, 910 __m256h __A) { 911 *(__m256h *)__P = __A; 912 } 913 914 static __inline__ void __DEFAULT_FN_ATTRS128 _mm_store_ph(void *__P, 915 __m128h __A) { 916 *(__m128h *)__P = __A; 917 } 918 919 static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_storeu_ph(void *__P, 920 __m512h __A) { 921 struct __storeu_ph { 922 __m512h_u __v; 923 } __attribute__((__packed__, __may_alias__)); 924 ((struct __storeu_ph *)__P)->__v = __A; 925 } 926 927 static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_storeu_ph(void *__P, 928 __m256h __A) { 929 struct __storeu_ph { 930 __m256h_u __v; 931 } __attribute__((__packed__, __may_alias__)); 932 ((struct __storeu_ph *)__P)->__v = __A; 933 } 934 935 static __inline__ void __DEFAULT_FN_ATTRS128 _mm_storeu_ph(void *__P, 936 __m128h __A) { 937 struct __storeu_ph { 938 __m128h_u __v; 939 } __attribute__((__packed__, __may_alias__)); 940 ((struct __storeu_ph *)__P)->__v = __A; 941 } 942 943 // moves with vmovsh: 944 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_move_sh(__m128h __a, 945 __m128h __b) { 946 __a[0] = __b[0]; 947 return __a; 948 } 949 950 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_move_sh(__m128h __W, 951 __mmask8 __U, 952 __m128h __A, 953 __m128h __B) { 954 return __builtin_ia32_selectsh_128(__U, _mm_move_sh(__A, __B), __W); 955 } 956 957 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_move_sh(__mmask8 __U, 958 __m128h __A, 959 __m128h __B) { 960 return __builtin_ia32_selectsh_128(__U, _mm_move_sh(__A, __B), 961 _mm_setzero_ph()); 962 } 963 964 // vmovw: 965 static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtsi16_si128(short __a) { 966 return (__m128i)(__v8hi){__a, 0, 0, 0, 0, 0, 0, 0}; 967 } 968 969 static __inline__ short __DEFAULT_FN_ATTRS128 _mm_cvtsi128_si16(__m128i __a) { 970 __v8hi __b = (__v8hi)__a; 971 return __b[0]; 972 } 973 974 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_rcp_ph(__m512h __A) { 975 return (__m512h)__builtin_ia32_rcpph512_mask( 976 (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1); 977 } 978 979 static __inline__ __m512h __DEFAULT_FN_ATTRS512 980 _mm512_mask_rcp_ph(__m512h __W, __mmask32 __U, __m512h __A) { 981 return (__m512h)__builtin_ia32_rcpph512_mask((__v32hf)__A, (__v32hf)__W, 982 (__mmask32)__U); 983 } 984 985 static __inline__ __m512h __DEFAULT_FN_ATTRS512 986 _mm512_maskz_rcp_ph(__mmask32 __U, __m512h __A) { 987 return (__m512h)__builtin_ia32_rcpph512_mask( 988 (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U); 989 } 990 991 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_rsqrt_ph(__m512h __A) { 992 return (__m512h)__builtin_ia32_rsqrtph512_mask( 993 (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1); 994 } 995 996 static __inline__ __m512h __DEFAULT_FN_ATTRS512 997 _mm512_mask_rsqrt_ph(__m512h __W, __mmask32 __U, __m512h __A) { 998 return (__m512h)__builtin_ia32_rsqrtph512_mask((__v32hf)__A, (__v32hf)__W, 999 (__mmask32)__U); 1000 } 1001 1002 static __inline__ __m512h __DEFAULT_FN_ATTRS512 1003 _mm512_maskz_rsqrt_ph(__mmask32 __U, __m512h __A) { 1004 return (__m512h)__builtin_ia32_rsqrtph512_mask( 1005 (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U); 1006 } 1007 1008 #define _mm512_getmant_ph(A, B, C) \ 1009 ((__m512h)__builtin_ia32_getmantph512_mask( \ 1010 (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), \ 1011 (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, \ 1012 _MM_FROUND_CUR_DIRECTION)) 1013 1014 #define _mm512_mask_getmant_ph(W, U, A, B, C) \ 1015 ((__m512h)__builtin_ia32_getmantph512_mask( \ 1016 (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), (__v32hf)(__m512h)(W), \ 1017 (__mmask32)(U), _MM_FROUND_CUR_DIRECTION)) 1018 1019 #define _mm512_maskz_getmant_ph(U, A, B, C) \ 1020 ((__m512h)__builtin_ia32_getmantph512_mask( \ 1021 (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), \ 1022 (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), _MM_FROUND_CUR_DIRECTION)) 1023 1024 #define _mm512_getmant_round_ph(A, B, C, R) \ 1025 ((__m512h)__builtin_ia32_getmantph512_mask( \ 1026 (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), \ 1027 (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, (int)(R))) 1028 1029 #define _mm512_mask_getmant_round_ph(W, U, A, B, C, R) \ 1030 ((__m512h)__builtin_ia32_getmantph512_mask( \ 1031 (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), (__v32hf)(__m512h)(W), \ 1032 (__mmask32)(U), (int)(R))) 1033 1034 #define _mm512_maskz_getmant_round_ph(U, A, B, C, R) \ 1035 ((__m512h)__builtin_ia32_getmantph512_mask( \ 1036 (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), \ 1037 (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R))) 1038 1039 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_getexp_ph(__m512h __A) { 1040 return (__m512h)__builtin_ia32_getexpph512_mask( 1041 (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, 1042 _MM_FROUND_CUR_DIRECTION); 1043 } 1044 1045 static __inline__ __m512h __DEFAULT_FN_ATTRS512 1046 _mm512_mask_getexp_ph(__m512h __W, __mmask32 __U, __m512h __A) { 1047 return (__m512h)__builtin_ia32_getexpph512_mask( 1048 (__v32hf)__A, (__v32hf)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION); 1049 } 1050 1051 static __inline__ __m512h __DEFAULT_FN_ATTRS512 1052 _mm512_maskz_getexp_ph(__mmask32 __U, __m512h __A) { 1053 return (__m512h)__builtin_ia32_getexpph512_mask( 1054 (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U, 1055 _MM_FROUND_CUR_DIRECTION); 1056 } 1057 1058 #define _mm512_getexp_round_ph(A, R) \ 1059 ((__m512h)__builtin_ia32_getexpph512_mask((__v32hf)(__m512h)(A), \ 1060 (__v32hf)_mm512_undefined_ph(), \ 1061 (__mmask32)-1, (int)(R))) 1062 1063 #define _mm512_mask_getexp_round_ph(W, U, A, R) \ 1064 ((__m512h)__builtin_ia32_getexpph512_mask( \ 1065 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(W), (__mmask32)(U), (int)(R))) 1066 1067 #define _mm512_maskz_getexp_round_ph(U, A, R) \ 1068 ((__m512h)__builtin_ia32_getexpph512_mask((__v32hf)(__m512h)(A), \ 1069 (__v32hf)_mm512_setzero_ph(), \ 1070 (__mmask32)(U), (int)(R))) 1071 1072 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_scalef_ph(__m512h __A, 1073 __m512h __B) { 1074 return (__m512h)__builtin_ia32_scalefph512_mask( 1075 (__v32hf)__A, (__v32hf)__B, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, 1076 _MM_FROUND_CUR_DIRECTION); 1077 } 1078 1079 static __inline__ __m512h __DEFAULT_FN_ATTRS512 1080 _mm512_mask_scalef_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) { 1081 return (__m512h)__builtin_ia32_scalefph512_mask((__v32hf)__A, (__v32hf)__B, 1082 (__v32hf)__W, (__mmask32)__U, 1083 _MM_FROUND_CUR_DIRECTION); 1084 } 1085 1086 static __inline__ __m512h __DEFAULT_FN_ATTRS512 1087 _mm512_maskz_scalef_ph(__mmask32 __U, __m512h __A, __m512h __B) { 1088 return (__m512h)__builtin_ia32_scalefph512_mask( 1089 (__v32hf)__A, (__v32hf)__B, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U, 1090 _MM_FROUND_CUR_DIRECTION); 1091 } 1092 1093 #define _mm512_scalef_round_ph(A, B, R) \ 1094 ((__m512h)__builtin_ia32_scalefph512_mask( \ 1095 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), \ 1096 (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, (int)(R))) 1097 1098 #define _mm512_mask_scalef_round_ph(W, U, A, B, R) \ 1099 ((__m512h)__builtin_ia32_scalefph512_mask( \ 1100 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(W), \ 1101 (__mmask32)(U), (int)(R))) 1102 1103 #define _mm512_maskz_scalef_round_ph(U, A, B, R) \ 1104 ((__m512h)__builtin_ia32_scalefph512_mask( \ 1105 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), \ 1106 (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R))) 1107 1108 #define _mm512_roundscale_ph(A, B) \ 1109 ((__m512h)__builtin_ia32_rndscaleph_mask( \ 1110 (__v32hf)(__m512h)(A), (int)(B), (__v32hf)(__m512h)(A), (__mmask32)-1, \ 1111 _MM_FROUND_CUR_DIRECTION)) 1112 1113 #define _mm512_mask_roundscale_ph(A, B, C, imm) \ 1114 ((__m512h)__builtin_ia32_rndscaleph_mask( \ 1115 (__v32hf)(__m512h)(C), (int)(imm), (__v32hf)(__m512h)(A), \ 1116 (__mmask32)(B), _MM_FROUND_CUR_DIRECTION)) 1117 1118 #define _mm512_maskz_roundscale_ph(A, B, imm) \ 1119 ((__m512h)__builtin_ia32_rndscaleph_mask( \ 1120 (__v32hf)(__m512h)(B), (int)(imm), (__v32hf)_mm512_setzero_ph(), \ 1121 (__mmask32)(A), _MM_FROUND_CUR_DIRECTION)) 1122 1123 #define _mm512_mask_roundscale_round_ph(A, B, C, imm, R) \ 1124 ((__m512h)__builtin_ia32_rndscaleph_mask((__v32hf)(__m512h)(C), (int)(imm), \ 1125 (__v32hf)(__m512h)(A), \ 1126 (__mmask32)(B), (int)(R))) 1127 1128 #define _mm512_maskz_roundscale_round_ph(A, B, imm, R) \ 1129 ((__m512h)__builtin_ia32_rndscaleph_mask((__v32hf)(__m512h)(B), (int)(imm), \ 1130 (__v32hf)_mm512_setzero_ph(), \ 1131 (__mmask32)(A), (int)(R))) 1132 1133 #define _mm512_roundscale_round_ph(A, imm, R) \ 1134 ((__m512h)__builtin_ia32_rndscaleph_mask((__v32hf)(__m512h)(A), (int)(imm), \ 1135 (__v32hf)_mm512_undefined_ph(), \ 1136 (__mmask32)-1, (int)(R))) 1137 1138 #define _mm512_reduce_ph(A, imm) \ 1139 ((__m512h)__builtin_ia32_reduceph512_mask( \ 1140 (__v32hf)(__m512h)(A), (int)(imm), (__v32hf)_mm512_undefined_ph(), \ 1141 (__mmask32)-1, _MM_FROUND_CUR_DIRECTION)) 1142 1143 #define _mm512_mask_reduce_ph(W, U, A, imm) \ 1144 ((__m512h)__builtin_ia32_reduceph512_mask( \ 1145 (__v32hf)(__m512h)(A), (int)(imm), (__v32hf)(__m512h)(W), \ 1146 (__mmask32)(U), _MM_FROUND_CUR_DIRECTION)) 1147 1148 #define _mm512_maskz_reduce_ph(U, A, imm) \ 1149 ((__m512h)__builtin_ia32_reduceph512_mask( \ 1150 (__v32hf)(__m512h)(A), (int)(imm), (__v32hf)_mm512_setzero_ph(), \ 1151 (__mmask32)(U), _MM_FROUND_CUR_DIRECTION)) 1152 1153 #define _mm512_mask_reduce_round_ph(W, U, A, imm, R) \ 1154 ((__m512h)__builtin_ia32_reduceph512_mask((__v32hf)(__m512h)(A), (int)(imm), \ 1155 (__v32hf)(__m512h)(W), \ 1156 (__mmask32)(U), (int)(R))) 1157 1158 #define _mm512_maskz_reduce_round_ph(U, A, imm, R) \ 1159 ((__m512h)__builtin_ia32_reduceph512_mask((__v32hf)(__m512h)(A), (int)(imm), \ 1160 (__v32hf)_mm512_setzero_ph(), \ 1161 (__mmask32)(U), (int)(R))) 1162 1163 #define _mm512_reduce_round_ph(A, imm, R) \ 1164 ((__m512h)__builtin_ia32_reduceph512_mask((__v32hf)(__m512h)(A), (int)(imm), \ 1165 (__v32hf)_mm512_undefined_ph(), \ 1166 (__mmask32)-1, (int)(R))) 1167 1168 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_rcp_sh(__m128h __A, 1169 __m128h __B) { 1170 return (__m128h)__builtin_ia32_rcpsh_mask( 1171 (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1); 1172 } 1173 1174 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_rcp_sh(__m128h __W, 1175 __mmask8 __U, 1176 __m128h __A, 1177 __m128h __B) { 1178 return (__m128h)__builtin_ia32_rcpsh_mask((__v8hf)__A, (__v8hf)__B, 1179 (__v8hf)__W, (__mmask8)__U); 1180 } 1181 1182 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_rcp_sh(__mmask8 __U, 1183 __m128h __A, 1184 __m128h __B) { 1185 return (__m128h)__builtin_ia32_rcpsh_mask( 1186 (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); 1187 } 1188 1189 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_rsqrt_sh(__m128h __A, 1190 __m128h __B) { 1191 return (__m128h)__builtin_ia32_rsqrtsh_mask( 1192 (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1); 1193 } 1194 1195 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_rsqrt_sh(__m128h __W, 1196 __mmask8 __U, 1197 __m128h __A, 1198 __m128h __B) { 1199 return (__m128h)__builtin_ia32_rsqrtsh_mask((__v8hf)__A, (__v8hf)__B, 1200 (__v8hf)__W, (__mmask8)__U); 1201 } 1202 1203 static __inline__ __m128h __DEFAULT_FN_ATTRS128 1204 _mm_maskz_rsqrt_sh(__mmask8 __U, __m128h __A, __m128h __B) { 1205 return (__m128h)__builtin_ia32_rsqrtsh_mask( 1206 (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); 1207 } 1208 1209 #define _mm_getmant_round_sh(A, B, C, D, R) \ 1210 ((__m128h)__builtin_ia32_getmantsh_round_mask( \ 1211 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \ 1212 (__v8hf)_mm_setzero_ph(), (__mmask8)-1, (int)(R))) 1213 1214 #define _mm_getmant_sh(A, B, C, D) \ 1215 ((__m128h)__builtin_ia32_getmantsh_round_mask( \ 1216 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \ 1217 (__v8hf)_mm_setzero_ph(), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION)) 1218 1219 #define _mm_mask_getmant_sh(W, U, A, B, C, D) \ 1220 ((__m128h)__builtin_ia32_getmantsh_round_mask( \ 1221 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \ 1222 (__v8hf)(__m128h)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) 1223 1224 #define _mm_mask_getmant_round_sh(W, U, A, B, C, D, R) \ 1225 ((__m128h)__builtin_ia32_getmantsh_round_mask( \ 1226 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \ 1227 (__v8hf)(__m128h)(W), (__mmask8)(U), (int)(R))) 1228 1229 #define _mm_maskz_getmant_sh(U, A, B, C, D) \ 1230 ((__m128h)__builtin_ia32_getmantsh_round_mask( \ 1231 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \ 1232 (__v8hf)_mm_setzero_ph(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) 1233 1234 #define _mm_maskz_getmant_round_sh(U, A, B, C, D, R) \ 1235 ((__m128h)__builtin_ia32_getmantsh_round_mask( \ 1236 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \ 1237 (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R))) 1238 1239 #define _mm_getexp_round_sh(A, B, R) \ 1240 ((__m128h)__builtin_ia32_getexpsh128_round_mask( \ 1241 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ 1242 (__mmask8)-1, (int)(R))) 1243 1244 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_getexp_sh(__m128h __A, 1245 __m128h __B) { 1246 return (__m128h)__builtin_ia32_getexpsh128_round_mask( 1247 (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1, 1248 _MM_FROUND_CUR_DIRECTION); 1249 } 1250 1251 static __inline__ __m128h __DEFAULT_FN_ATTRS128 1252 _mm_mask_getexp_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { 1253 return (__m128h)__builtin_ia32_getexpsh128_round_mask( 1254 (__v8hf)__A, (__v8hf)__B, (__v8hf)__W, (__mmask8)__U, 1255 _MM_FROUND_CUR_DIRECTION); 1256 } 1257 1258 #define _mm_mask_getexp_round_sh(W, U, A, B, R) \ 1259 ((__m128h)__builtin_ia32_getexpsh128_round_mask( \ 1260 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \ 1261 (__mmask8)(U), (int)(R))) 1262 1263 static __inline__ __m128h __DEFAULT_FN_ATTRS128 1264 _mm_maskz_getexp_sh(__mmask8 __U, __m128h __A, __m128h __B) { 1265 return (__m128h)__builtin_ia32_getexpsh128_round_mask( 1266 (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U, 1267 _MM_FROUND_CUR_DIRECTION); 1268 } 1269 1270 #define _mm_maskz_getexp_round_sh(U, A, B, R) \ 1271 ((__m128h)__builtin_ia32_getexpsh128_round_mask( \ 1272 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ 1273 (__mmask8)(U), (int)(R))) 1274 1275 #define _mm_scalef_round_sh(A, B, R) \ 1276 ((__m128h)__builtin_ia32_scalefsh_round_mask( \ 1277 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ 1278 (__mmask8)-1, (int)(R))) 1279 1280 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_scalef_sh(__m128h __A, 1281 __m128h __B) { 1282 return (__m128h)__builtin_ia32_scalefsh_round_mask( 1283 (__v8hf)__A, (__v8hf)(__B), (__v8hf)_mm_setzero_ph(), (__mmask8)-1, 1284 _MM_FROUND_CUR_DIRECTION); 1285 } 1286 1287 static __inline__ __m128h __DEFAULT_FN_ATTRS128 1288 _mm_mask_scalef_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { 1289 return (__m128h)__builtin_ia32_scalefsh_round_mask((__v8hf)__A, (__v8hf)__B, 1290 (__v8hf)__W, (__mmask8)__U, 1291 _MM_FROUND_CUR_DIRECTION); 1292 } 1293 1294 #define _mm_mask_scalef_round_sh(W, U, A, B, R) \ 1295 ((__m128h)__builtin_ia32_scalefsh_round_mask( \ 1296 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \ 1297 (__mmask8)(U), (int)(R))) 1298 1299 static __inline__ __m128h __DEFAULT_FN_ATTRS128 1300 _mm_maskz_scalef_sh(__mmask8 __U, __m128h __A, __m128h __B) { 1301 return (__m128h)__builtin_ia32_scalefsh_round_mask( 1302 (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U, 1303 _MM_FROUND_CUR_DIRECTION); 1304 } 1305 1306 #define _mm_maskz_scalef_round_sh(U, A, B, R) \ 1307 ((__m128h)__builtin_ia32_scalefsh_round_mask( \ 1308 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ 1309 (__mmask8)(U), (int)(R))) 1310 1311 #define _mm_roundscale_round_sh(A, B, imm, R) \ 1312 ((__m128h)__builtin_ia32_rndscalesh_round_mask( \ 1313 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ 1314 (__mmask8)-1, (int)(imm), (int)(R))) 1315 1316 #define _mm_roundscale_sh(A, B, imm) \ 1317 ((__m128h)__builtin_ia32_rndscalesh_round_mask( \ 1318 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ 1319 (__mmask8)-1, (int)(imm), _MM_FROUND_CUR_DIRECTION)) 1320 1321 #define _mm_mask_roundscale_sh(W, U, A, B, I) \ 1322 ((__m128h)__builtin_ia32_rndscalesh_round_mask( \ 1323 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \ 1324 (__mmask8)(U), (int)(I), _MM_FROUND_CUR_DIRECTION)) 1325 1326 #define _mm_mask_roundscale_round_sh(W, U, A, B, I, R) \ 1327 ((__m128h)__builtin_ia32_rndscalesh_round_mask( \ 1328 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \ 1329 (__mmask8)(U), (int)(I), (int)(R))) 1330 1331 #define _mm_maskz_roundscale_sh(U, A, B, I) \ 1332 ((__m128h)__builtin_ia32_rndscalesh_round_mask( \ 1333 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ 1334 (__mmask8)(U), (int)(I), _MM_FROUND_CUR_DIRECTION)) 1335 1336 #define _mm_maskz_roundscale_round_sh(U, A, B, I, R) \ 1337 ((__m128h)__builtin_ia32_rndscalesh_round_mask( \ 1338 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ 1339 (__mmask8)(U), (int)(I), (int)(R))) 1340 1341 #define _mm_reduce_sh(A, B, C) \ 1342 ((__m128h)__builtin_ia32_reducesh_mask( \ 1343 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ 1344 (__mmask8)-1, (int)(C), _MM_FROUND_CUR_DIRECTION)) 1345 1346 #define _mm_mask_reduce_sh(W, U, A, B, C) \ 1347 ((__m128h)__builtin_ia32_reducesh_mask( \ 1348 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \ 1349 (__mmask8)(U), (int)(C), _MM_FROUND_CUR_DIRECTION)) 1350 1351 #define _mm_maskz_reduce_sh(U, A, B, C) \ 1352 ((__m128h)__builtin_ia32_reducesh_mask( \ 1353 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ 1354 (__mmask8)(U), (int)(C), _MM_FROUND_CUR_DIRECTION)) 1355 1356 #define _mm_reduce_round_sh(A, B, C, R) \ 1357 ((__m128h)__builtin_ia32_reducesh_mask( \ 1358 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ 1359 (__mmask8)-1, (int)(C), (int)(R))) 1360 1361 #define _mm_mask_reduce_round_sh(W, U, A, B, C, R) \ 1362 ((__m128h)__builtin_ia32_reducesh_mask( \ 1363 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \ 1364 (__mmask8)(U), (int)(C), (int)(R))) 1365 1366 #define _mm_maskz_reduce_round_sh(U, A, B, C, R) \ 1367 ((__m128h)__builtin_ia32_reducesh_mask( \ 1368 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ 1369 (__mmask8)(U), (int)(C), (int)(R))) 1370 1371 #define _mm512_sqrt_round_ph(A, R) \ 1372 ((__m512h)__builtin_ia32_sqrtph512((__v32hf)(__m512h)(A), (int)(R))) 1373 1374 #define _mm512_mask_sqrt_round_ph(W, U, A, R) \ 1375 ((__m512h)__builtin_ia32_selectph_512( \ 1376 (__mmask32)(U), (__v32hf)_mm512_sqrt_round_ph((A), (R)), \ 1377 (__v32hf)(__m512h)(W))) 1378 1379 #define _mm512_maskz_sqrt_round_ph(U, A, R) \ 1380 ((__m512h)__builtin_ia32_selectph_512( \ 1381 (__mmask32)(U), (__v32hf)_mm512_sqrt_round_ph((A), (R)), \ 1382 (__v32hf)_mm512_setzero_ph())) 1383 1384 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_sqrt_ph(__m512h __A) { 1385 return (__m512h)__builtin_ia32_sqrtph512((__v32hf)__A, 1386 _MM_FROUND_CUR_DIRECTION); 1387 } 1388 1389 static __inline__ __m512h __DEFAULT_FN_ATTRS512 1390 _mm512_mask_sqrt_ph(__m512h __W, __mmask32 __U, __m512h __A) { 1391 return (__m512h)__builtin_ia32_selectph_512( 1392 (__mmask32)(__U), 1393 (__v32hf)__builtin_ia32_sqrtph512((__A), (_MM_FROUND_CUR_DIRECTION)), 1394 (__v32hf)(__m512h)(__W)); 1395 } 1396 1397 static __inline__ __m512h __DEFAULT_FN_ATTRS512 1398 _mm512_maskz_sqrt_ph(__mmask32 __U, __m512h __A) { 1399 return (__m512h)__builtin_ia32_selectph_512( 1400 (__mmask32)(__U), 1401 (__v32hf)__builtin_ia32_sqrtph512((__A), (_MM_FROUND_CUR_DIRECTION)), 1402 (__v32hf)_mm512_setzero_ph()); 1403 } 1404 1405 #define _mm_sqrt_round_sh(A, B, R) \ 1406 ((__m128h)__builtin_ia32_sqrtsh_round_mask( \ 1407 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ 1408 (__mmask8)-1, (int)(R))) 1409 1410 #define _mm_mask_sqrt_round_sh(W, U, A, B, R) \ 1411 ((__m128h)__builtin_ia32_sqrtsh_round_mask( \ 1412 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \ 1413 (__mmask8)(U), (int)(R))) 1414 1415 #define _mm_maskz_sqrt_round_sh(U, A, B, R) \ 1416 ((__m128h)__builtin_ia32_sqrtsh_round_mask( \ 1417 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ 1418 (__mmask8)(U), (int)(R))) 1419 1420 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sqrt_sh(__m128h __A, 1421 __m128h __B) { 1422 return (__m128h)__builtin_ia32_sqrtsh_round_mask( 1423 (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)_mm_setzero_ph(), 1424 (__mmask8)-1, _MM_FROUND_CUR_DIRECTION); 1425 } 1426 1427 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sqrt_sh(__m128h __W, 1428 __mmask32 __U, 1429 __m128h __A, 1430 __m128h __B) { 1431 return (__m128h)__builtin_ia32_sqrtsh_round_mask( 1432 (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)(__m128h)(__W), 1433 (__mmask8)(__U), _MM_FROUND_CUR_DIRECTION); 1434 } 1435 1436 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sqrt_sh(__mmask32 __U, 1437 __m128h __A, 1438 __m128h __B) { 1439 return (__m128h)__builtin_ia32_sqrtsh_round_mask( 1440 (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)_mm_setzero_ph(), 1441 (__mmask8)(__U), _MM_FROUND_CUR_DIRECTION); 1442 } 1443 1444 #define _mm512_mask_fpclass_ph_mask(U, A, imm) \ 1445 ((__mmask32)__builtin_ia32_fpclassph512_mask((__v32hf)(__m512h)(A), \ 1446 (int)(imm), (__mmask32)(U))) 1447 1448 #define _mm512_fpclass_ph_mask(A, imm) \ 1449 ((__mmask32)__builtin_ia32_fpclassph512_mask((__v32hf)(__m512h)(A), \ 1450 (int)(imm), (__mmask32)-1)) 1451 1452 #define _mm_fpclass_sh_mask(A, imm) \ 1453 ((__mmask8)__builtin_ia32_fpclasssh_mask((__v8hf)(__m128h)(A), (int)(imm), \ 1454 (__mmask8)-1)) 1455 1456 #define _mm_mask_fpclass_sh_mask(U, A, imm) \ 1457 ((__mmask8)__builtin_ia32_fpclasssh_mask((__v8hf)(__m128h)(A), (int)(imm), \ 1458 (__mmask8)(U))) 1459 1460 #define _mm512_cvt_roundpd_ph(A, R) \ 1461 ((__m128h)__builtin_ia32_vcvtpd2ph512_mask( \ 1462 (__v8df)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R))) 1463 1464 #define _mm512_mask_cvt_roundpd_ph(W, U, A, R) \ 1465 ((__m128h)__builtin_ia32_vcvtpd2ph512_mask((__v8df)(A), (__v8hf)(W), \ 1466 (__mmask8)(U), (int)(R))) 1467 1468 #define _mm512_maskz_cvt_roundpd_ph(U, A, R) \ 1469 ((__m128h)__builtin_ia32_vcvtpd2ph512_mask( \ 1470 (__v8df)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R))) 1471 1472 static __inline__ __m128h __DEFAULT_FN_ATTRS512 _mm512_cvtpd_ph(__m512d __A) { 1473 return (__m128h)__builtin_ia32_vcvtpd2ph512_mask( 1474 (__v8df)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1, 1475 _MM_FROUND_CUR_DIRECTION); 1476 } 1477 1478 static __inline__ __m128h __DEFAULT_FN_ATTRS512 1479 _mm512_mask_cvtpd_ph(__m128h __W, __mmask8 __U, __m512d __A) { 1480 return (__m128h)__builtin_ia32_vcvtpd2ph512_mask( 1481 (__v8df)__A, (__v8hf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); 1482 } 1483 1484 static __inline__ __m128h __DEFAULT_FN_ATTRS512 1485 _mm512_maskz_cvtpd_ph(__mmask8 __U, __m512d __A) { 1486 return (__m128h)__builtin_ia32_vcvtpd2ph512_mask( 1487 (__v8df)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U, 1488 _MM_FROUND_CUR_DIRECTION); 1489 } 1490 1491 #define _mm512_cvt_roundph_pd(A, R) \ 1492 ((__m512d)__builtin_ia32_vcvtph2pd512_mask( \ 1493 (__v8hf)(A), (__v8df)_mm512_undefined_pd(), (__mmask8)(-1), (int)(R))) 1494 1495 #define _mm512_mask_cvt_roundph_pd(W, U, A, R) \ 1496 ((__m512d)__builtin_ia32_vcvtph2pd512_mask((__v8hf)(A), (__v8df)(W), \ 1497 (__mmask8)(U), (int)(R))) 1498 1499 #define _mm512_maskz_cvt_roundph_pd(U, A, R) \ 1500 ((__m512d)__builtin_ia32_vcvtph2pd512_mask( \ 1501 (__v8hf)(A), (__v8df)_mm512_setzero_pd(), (__mmask8)(U), (int)(R))) 1502 1503 static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_cvtph_pd(__m128h __A) { 1504 return (__m512d)__builtin_ia32_vcvtph2pd512_mask( 1505 (__v8hf)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)-1, 1506 _MM_FROUND_CUR_DIRECTION); 1507 } 1508 1509 static __inline__ __m512d __DEFAULT_FN_ATTRS512 1510 _mm512_mask_cvtph_pd(__m512d __W, __mmask8 __U, __m128h __A) { 1511 return (__m512d)__builtin_ia32_vcvtph2pd512_mask( 1512 (__v8hf)__A, (__v8df)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); 1513 } 1514 1515 static __inline__ __m512d __DEFAULT_FN_ATTRS512 1516 _mm512_maskz_cvtph_pd(__mmask8 __U, __m128h __A) { 1517 return (__m512d)__builtin_ia32_vcvtph2pd512_mask( 1518 (__v8hf)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U, 1519 _MM_FROUND_CUR_DIRECTION); 1520 } 1521 1522 #define _mm_cvt_roundsh_ss(A, B, R) \ 1523 ((__m128)__builtin_ia32_vcvtsh2ss_round_mask((__v4sf)(A), (__v8hf)(B), \ 1524 (__v4sf)_mm_undefined_ps(), \ 1525 (__mmask8)(-1), (int)(R))) 1526 1527 #define _mm_mask_cvt_roundsh_ss(W, U, A, B, R) \ 1528 ((__m128)__builtin_ia32_vcvtsh2ss_round_mask( \ 1529 (__v4sf)(A), (__v8hf)(B), (__v4sf)(W), (__mmask8)(U), (int)(R))) 1530 1531 #define _mm_maskz_cvt_roundsh_ss(U, A, B, R) \ 1532 ((__m128)__builtin_ia32_vcvtsh2ss_round_mask((__v4sf)(A), (__v8hf)(B), \ 1533 (__v4sf)_mm_setzero_ps(), \ 1534 (__mmask8)(U), (int)(R))) 1535 1536 static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtsh_ss(__m128 __A, 1537 __m128h __B) { 1538 return (__m128)__builtin_ia32_vcvtsh2ss_round_mask( 1539 (__v4sf)__A, (__v8hf)__B, (__v4sf)_mm_undefined_ps(), (__mmask8)-1, 1540 _MM_FROUND_CUR_DIRECTION); 1541 } 1542 1543 static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_cvtsh_ss(__m128 __W, 1544 __mmask8 __U, 1545 __m128 __A, 1546 __m128h __B) { 1547 return (__m128)__builtin_ia32_vcvtsh2ss_round_mask((__v4sf)__A, (__v8hf)__B, 1548 (__v4sf)__W, (__mmask8)__U, 1549 _MM_FROUND_CUR_DIRECTION); 1550 } 1551 1552 static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_cvtsh_ss(__mmask8 __U, 1553 __m128 __A, 1554 __m128h __B) { 1555 return (__m128)__builtin_ia32_vcvtsh2ss_round_mask( 1556 (__v4sf)__A, (__v8hf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U, 1557 _MM_FROUND_CUR_DIRECTION); 1558 } 1559 1560 #define _mm_cvt_roundss_sh(A, B, R) \ 1561 ((__m128h)__builtin_ia32_vcvtss2sh_round_mask((__v8hf)(A), (__v4sf)(B), \ 1562 (__v8hf)_mm_undefined_ph(), \ 1563 (__mmask8)(-1), (int)(R))) 1564 1565 #define _mm_mask_cvt_roundss_sh(W, U, A, B, R) \ 1566 ((__m128h)__builtin_ia32_vcvtss2sh_round_mask( \ 1567 (__v8hf)(A), (__v4sf)(B), (__v8hf)(W), (__mmask8)(U), (int)(R))) 1568 1569 #define _mm_maskz_cvt_roundss_sh(U, A, B, R) \ 1570 ((__m128h)__builtin_ia32_vcvtss2sh_round_mask((__v8hf)(A), (__v4sf)(B), \ 1571 (__v8hf)_mm_setzero_ph(), \ 1572 (__mmask8)(U), (int)(R))) 1573 1574 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtss_sh(__m128h __A, 1575 __m128 __B) { 1576 return (__m128h)__builtin_ia32_vcvtss2sh_round_mask( 1577 (__v8hf)__A, (__v4sf)__B, (__v8hf)_mm_undefined_ph(), (__mmask8)-1, 1578 _MM_FROUND_CUR_DIRECTION); 1579 } 1580 1581 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_cvtss_sh(__m128h __W, 1582 __mmask8 __U, 1583 __m128h __A, 1584 __m128 __B) { 1585 return (__m128h)__builtin_ia32_vcvtss2sh_round_mask( 1586 (__v8hf)__A, (__v4sf)__B, (__v8hf)__W, (__mmask8)__U, 1587 _MM_FROUND_CUR_DIRECTION); 1588 } 1589 1590 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_cvtss_sh(__mmask8 __U, 1591 __m128h __A, 1592 __m128 __B) { 1593 return (__m128h)__builtin_ia32_vcvtss2sh_round_mask( 1594 (__v8hf)__A, (__v4sf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U, 1595 _MM_FROUND_CUR_DIRECTION); 1596 } 1597 1598 #define _mm_cvt_roundsd_sh(A, B, R) \ 1599 ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask((__v8hf)(A), (__v2df)(B), \ 1600 (__v8hf)_mm_undefined_ph(), \ 1601 (__mmask8)(-1), (int)(R))) 1602 1603 #define _mm_mask_cvt_roundsd_sh(W, U, A, B, R) \ 1604 ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask( \ 1605 (__v8hf)(A), (__v2df)(B), (__v8hf)(W), (__mmask8)(U), (int)(R))) 1606 1607 #define _mm_maskz_cvt_roundsd_sh(U, A, B, R) \ 1608 ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask((__v8hf)(A), (__v2df)(B), \ 1609 (__v8hf)_mm_setzero_ph(), \ 1610 (__mmask8)(U), (int)(R))) 1611 1612 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtsd_sh(__m128h __A, 1613 __m128d __B) { 1614 return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask( 1615 (__v8hf)__A, (__v2df)__B, (__v8hf)_mm_undefined_ph(), (__mmask8)-1, 1616 _MM_FROUND_CUR_DIRECTION); 1617 } 1618 1619 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_cvtsd_sh(__m128h __W, 1620 __mmask8 __U, 1621 __m128h __A, 1622 __m128d __B) { 1623 return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask( 1624 (__v8hf)__A, (__v2df)__B, (__v8hf)__W, (__mmask8)__U, 1625 _MM_FROUND_CUR_DIRECTION); 1626 } 1627 1628 static __inline__ __m128h __DEFAULT_FN_ATTRS128 1629 _mm_maskz_cvtsd_sh(__mmask8 __U, __m128h __A, __m128d __B) { 1630 return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask( 1631 (__v8hf)__A, (__v2df)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U, 1632 _MM_FROUND_CUR_DIRECTION); 1633 } 1634 1635 #define _mm_cvt_roundsh_sd(A, B, R) \ 1636 ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask((__v2df)(A), (__v8hf)(B), \ 1637 (__v2df)_mm_undefined_pd(), \ 1638 (__mmask8)(-1), (int)(R))) 1639 1640 #define _mm_mask_cvt_roundsh_sd(W, U, A, B, R) \ 1641 ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask( \ 1642 (__v2df)(A), (__v8hf)(B), (__v2df)(W), (__mmask8)(U), (int)(R))) 1643 1644 #define _mm_maskz_cvt_roundsh_sd(U, A, B, R) \ 1645 ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask((__v2df)(A), (__v8hf)(B), \ 1646 (__v2df)_mm_setzero_pd(), \ 1647 (__mmask8)(U), (int)(R))) 1648 1649 static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_cvtsh_sd(__m128d __A, 1650 __m128h __B) { 1651 return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask( 1652 (__v2df)__A, (__v8hf)__B, (__v2df)_mm_undefined_pd(), (__mmask8)-1, 1653 _MM_FROUND_CUR_DIRECTION); 1654 } 1655 1656 static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_cvtsh_sd(__m128d __W, 1657 __mmask8 __U, 1658 __m128d __A, 1659 __m128h __B) { 1660 return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask( 1661 (__v2df)__A, (__v8hf)__B, (__v2df)__W, (__mmask8)__U, 1662 _MM_FROUND_CUR_DIRECTION); 1663 } 1664 1665 static __inline__ __m128d __DEFAULT_FN_ATTRS128 1666 _mm_maskz_cvtsh_sd(__mmask8 __U, __m128d __A, __m128h __B) { 1667 return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask( 1668 (__v2df)__A, (__v8hf)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U, 1669 _MM_FROUND_CUR_DIRECTION); 1670 } 1671 1672 #define _mm512_cvt_roundph_epi16(A, R) \ 1673 ((__m512i)__builtin_ia32_vcvtph2w512_mask((__v32hf)(A), \ 1674 (__v32hi)_mm512_undefined_epi32(), \ 1675 (__mmask32)(-1), (int)(R))) 1676 1677 #define _mm512_mask_cvt_roundph_epi16(W, U, A, R) \ 1678 ((__m512i)__builtin_ia32_vcvtph2w512_mask((__v32hf)(A), (__v32hi)(W), \ 1679 (__mmask32)(U), (int)(R))) 1680 1681 #define _mm512_maskz_cvt_roundph_epi16(U, A, R) \ 1682 ((__m512i)__builtin_ia32_vcvtph2w512_mask((__v32hf)(A), \ 1683 (__v32hi)_mm512_setzero_epi32(), \ 1684 (__mmask32)(U), (int)(R))) 1685 1686 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1687 _mm512_cvtph_epi16(__m512h __A) { 1688 return (__m512i)__builtin_ia32_vcvtph2w512_mask( 1689 (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)-1, 1690 _MM_FROUND_CUR_DIRECTION); 1691 } 1692 1693 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1694 _mm512_mask_cvtph_epi16(__m512i __W, __mmask32 __U, __m512h __A) { 1695 return (__m512i)__builtin_ia32_vcvtph2w512_mask( 1696 (__v32hf)__A, (__v32hi)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION); 1697 } 1698 1699 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1700 _mm512_maskz_cvtph_epi16(__mmask32 __U, __m512h __A) { 1701 return (__m512i)__builtin_ia32_vcvtph2w512_mask( 1702 (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)__U, 1703 _MM_FROUND_CUR_DIRECTION); 1704 } 1705 1706 #define _mm512_cvtt_roundph_epi16(A, R) \ 1707 ((__m512i)__builtin_ia32_vcvttph2w512_mask( \ 1708 (__v32hf)(A), (__v32hi)_mm512_undefined_epi32(), (__mmask32)(-1), \ 1709 (int)(R))) 1710 1711 #define _mm512_mask_cvtt_roundph_epi16(W, U, A, R) \ 1712 ((__m512i)__builtin_ia32_vcvttph2w512_mask((__v32hf)(A), (__v32hi)(W), \ 1713 (__mmask32)(U), (int)(R))) 1714 1715 #define _mm512_maskz_cvtt_roundph_epi16(U, A, R) \ 1716 ((__m512i)__builtin_ia32_vcvttph2w512_mask((__v32hf)(A), \ 1717 (__v32hi)_mm512_setzero_epi32(), \ 1718 (__mmask32)(U), (int)(R))) 1719 1720 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1721 _mm512_cvttph_epi16(__m512h __A) { 1722 return (__m512i)__builtin_ia32_vcvttph2w512_mask( 1723 (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)-1, 1724 _MM_FROUND_CUR_DIRECTION); 1725 } 1726 1727 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1728 _mm512_mask_cvttph_epi16(__m512i __W, __mmask32 __U, __m512h __A) { 1729 return (__m512i)__builtin_ia32_vcvttph2w512_mask( 1730 (__v32hf)__A, (__v32hi)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION); 1731 } 1732 1733 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1734 _mm512_maskz_cvttph_epi16(__mmask32 __U, __m512h __A) { 1735 return (__m512i)__builtin_ia32_vcvttph2w512_mask( 1736 (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)__U, 1737 _MM_FROUND_CUR_DIRECTION); 1738 } 1739 1740 #define _mm512_cvt_roundepi16_ph(A, R) \ 1741 ((__m512h)__builtin_ia32_vcvtw2ph512_mask((__v32hi)(A), \ 1742 (__v32hf)_mm512_undefined_ph(), \ 1743 (__mmask32)(-1), (int)(R))) 1744 1745 #define _mm512_mask_cvt_roundepi16_ph(W, U, A, R) \ 1746 ((__m512h)__builtin_ia32_vcvtw2ph512_mask((__v32hi)(A), (__v32hf)(W), \ 1747 (__mmask32)(U), (int)(R))) 1748 1749 #define _mm512_maskz_cvt_roundepi16_ph(U, A, R) \ 1750 ((__m512h)__builtin_ia32_vcvtw2ph512_mask( \ 1751 (__v32hi)(A), (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R))) 1752 1753 static __inline__ __m512h __DEFAULT_FN_ATTRS512 1754 _mm512_cvtepi16_ph(__m512i __A) { 1755 return (__m512h)__builtin_ia32_vcvtw2ph512_mask( 1756 (__v32hi)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)-1, 1757 _MM_FROUND_CUR_DIRECTION); 1758 } 1759 1760 static __inline__ __m512h __DEFAULT_FN_ATTRS512 1761 _mm512_mask_cvtepi16_ph(__m512h __W, __mmask32 __U, __m512i __A) { 1762 return (__m512h)__builtin_ia32_vcvtw2ph512_mask( 1763 (__v32hi)__A, (__v32hf)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION); 1764 } 1765 1766 static __inline__ __m512h __DEFAULT_FN_ATTRS512 1767 _mm512_maskz_cvtepi16_ph(__mmask32 __U, __m512i __A) { 1768 return (__m512h)__builtin_ia32_vcvtw2ph512_mask( 1769 (__v32hi)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U, 1770 _MM_FROUND_CUR_DIRECTION); 1771 } 1772 1773 #define _mm512_cvt_roundph_epu16(A, R) \ 1774 ((__m512i)__builtin_ia32_vcvtph2uw512_mask( \ 1775 (__v32hf)(A), (__v32hu)_mm512_undefined_epi32(), (__mmask32)(-1), \ 1776 (int)(R))) 1777 1778 #define _mm512_mask_cvt_roundph_epu16(W, U, A, R) \ 1779 ((__m512i)__builtin_ia32_vcvtph2uw512_mask((__v32hf)(A), (__v32hu)(W), \ 1780 (__mmask32)(U), (int)(R))) 1781 1782 #define _mm512_maskz_cvt_roundph_epu16(U, A, R) \ 1783 ((__m512i)__builtin_ia32_vcvtph2uw512_mask((__v32hf)(A), \ 1784 (__v32hu)_mm512_setzero_epi32(), \ 1785 (__mmask32)(U), (int)(R))) 1786 1787 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1788 _mm512_cvtph_epu16(__m512h __A) { 1789 return (__m512i)__builtin_ia32_vcvtph2uw512_mask( 1790 (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)-1, 1791 _MM_FROUND_CUR_DIRECTION); 1792 } 1793 1794 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1795 _mm512_mask_cvtph_epu16(__m512i __W, __mmask32 __U, __m512h __A) { 1796 return (__m512i)__builtin_ia32_vcvtph2uw512_mask( 1797 (__v32hf)__A, (__v32hu)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION); 1798 } 1799 1800 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1801 _mm512_maskz_cvtph_epu16(__mmask32 __U, __m512h __A) { 1802 return (__m512i)__builtin_ia32_vcvtph2uw512_mask( 1803 (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)__U, 1804 _MM_FROUND_CUR_DIRECTION); 1805 } 1806 1807 #define _mm512_cvtt_roundph_epu16(A, R) \ 1808 ((__m512i)__builtin_ia32_vcvttph2uw512_mask( \ 1809 (__v32hf)(A), (__v32hu)_mm512_undefined_epi32(), (__mmask32)(-1), \ 1810 (int)(R))) 1811 1812 #define _mm512_mask_cvtt_roundph_epu16(W, U, A, R) \ 1813 ((__m512i)__builtin_ia32_vcvttph2uw512_mask((__v32hf)(A), (__v32hu)(W), \ 1814 (__mmask32)(U), (int)(R))) 1815 1816 #define _mm512_maskz_cvtt_roundph_epu16(U, A, R) \ 1817 ((__m512i)__builtin_ia32_vcvttph2uw512_mask((__v32hf)(A), \ 1818 (__v32hu)_mm512_setzero_epi32(), \ 1819 (__mmask32)(U), (int)(R))) 1820 1821 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1822 _mm512_cvttph_epu16(__m512h __A) { 1823 return (__m512i)__builtin_ia32_vcvttph2uw512_mask( 1824 (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)-1, 1825 _MM_FROUND_CUR_DIRECTION); 1826 } 1827 1828 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1829 _mm512_mask_cvttph_epu16(__m512i __W, __mmask32 __U, __m512h __A) { 1830 return (__m512i)__builtin_ia32_vcvttph2uw512_mask( 1831 (__v32hf)__A, (__v32hu)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION); 1832 } 1833 1834 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1835 _mm512_maskz_cvttph_epu16(__mmask32 __U, __m512h __A) { 1836 return (__m512i)__builtin_ia32_vcvttph2uw512_mask( 1837 (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)__U, 1838 _MM_FROUND_CUR_DIRECTION); 1839 } 1840 1841 #define _mm512_cvt_roundepu16_ph(A, R) \ 1842 ((__m512h)__builtin_ia32_vcvtuw2ph512_mask((__v32hu)(A), \ 1843 (__v32hf)_mm512_undefined_ph(), \ 1844 (__mmask32)(-1), (int)(R))) 1845 1846 #define _mm512_mask_cvt_roundepu16_ph(W, U, A, R) \ 1847 ((__m512h)__builtin_ia32_vcvtuw2ph512_mask((__v32hu)(A), (__v32hf)(W), \ 1848 (__mmask32)(U), (int)(R))) 1849 1850 #define _mm512_maskz_cvt_roundepu16_ph(U, A, R) \ 1851 ((__m512h)__builtin_ia32_vcvtuw2ph512_mask( \ 1852 (__v32hu)(A), (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R))) 1853 1854 static __inline__ __m512h __DEFAULT_FN_ATTRS512 1855 _mm512_cvtepu16_ph(__m512i __A) { 1856 return (__m512h)__builtin_ia32_vcvtuw2ph512_mask( 1857 (__v32hu)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)-1, 1858 _MM_FROUND_CUR_DIRECTION); 1859 } 1860 1861 static __inline__ __m512h __DEFAULT_FN_ATTRS512 1862 _mm512_mask_cvtepu16_ph(__m512h __W, __mmask32 __U, __m512i __A) { 1863 return (__m512h)__builtin_ia32_vcvtuw2ph512_mask( 1864 (__v32hu)__A, (__v32hf)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION); 1865 } 1866 1867 static __inline__ __m512h __DEFAULT_FN_ATTRS512 1868 _mm512_maskz_cvtepu16_ph(__mmask32 __U, __m512i __A) { 1869 return (__m512h)__builtin_ia32_vcvtuw2ph512_mask( 1870 (__v32hu)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U, 1871 _MM_FROUND_CUR_DIRECTION); 1872 } 1873 1874 #define _mm512_cvt_roundph_epi32(A, R) \ 1875 ((__m512i)__builtin_ia32_vcvtph2dq512_mask( \ 1876 (__v16hf)(A), (__v16si)_mm512_undefined_epi32(), (__mmask16)(-1), \ 1877 (int)(R))) 1878 1879 #define _mm512_mask_cvt_roundph_epi32(W, U, A, R) \ 1880 ((__m512i)__builtin_ia32_vcvtph2dq512_mask((__v16hf)(A), (__v16si)(W), \ 1881 (__mmask16)(U), (int)(R))) 1882 1883 #define _mm512_maskz_cvt_roundph_epi32(U, A, R) \ 1884 ((__m512i)__builtin_ia32_vcvtph2dq512_mask((__v16hf)(A), \ 1885 (__v16si)_mm512_setzero_epi32(), \ 1886 (__mmask16)(U), (int)(R))) 1887 1888 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1889 _mm512_cvtph_epi32(__m256h __A) { 1890 return (__m512i)__builtin_ia32_vcvtph2dq512_mask( 1891 (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)-1, 1892 _MM_FROUND_CUR_DIRECTION); 1893 } 1894 1895 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1896 _mm512_mask_cvtph_epi32(__m512i __W, __mmask16 __U, __m256h __A) { 1897 return (__m512i)__builtin_ia32_vcvtph2dq512_mask( 1898 (__v16hf)__A, (__v16si)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION); 1899 } 1900 1901 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1902 _mm512_maskz_cvtph_epi32(__mmask16 __U, __m256h __A) { 1903 return (__m512i)__builtin_ia32_vcvtph2dq512_mask( 1904 (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)__U, 1905 _MM_FROUND_CUR_DIRECTION); 1906 } 1907 1908 #define _mm512_cvt_roundph_epu32(A, R) \ 1909 ((__m512i)__builtin_ia32_vcvtph2udq512_mask( \ 1910 (__v16hf)(A), (__v16su)_mm512_undefined_epi32(), (__mmask16)(-1), \ 1911 (int)(R))) 1912 1913 #define _mm512_mask_cvt_roundph_epu32(W, U, A, R) \ 1914 ((__m512i)__builtin_ia32_vcvtph2udq512_mask((__v16hf)(A), (__v16su)(W), \ 1915 (__mmask16)(U), (int)(R))) 1916 1917 #define _mm512_maskz_cvt_roundph_epu32(U, A, R) \ 1918 ((__m512i)__builtin_ia32_vcvtph2udq512_mask((__v16hf)(A), \ 1919 (__v16su)_mm512_setzero_epi32(), \ 1920 (__mmask16)(U), (int)(R))) 1921 1922 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1923 _mm512_cvtph_epu32(__m256h __A) { 1924 return (__m512i)__builtin_ia32_vcvtph2udq512_mask( 1925 (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)-1, 1926 _MM_FROUND_CUR_DIRECTION); 1927 } 1928 1929 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1930 _mm512_mask_cvtph_epu32(__m512i __W, __mmask16 __U, __m256h __A) { 1931 return (__m512i)__builtin_ia32_vcvtph2udq512_mask( 1932 (__v16hf)__A, (__v16su)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION); 1933 } 1934 1935 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1936 _mm512_maskz_cvtph_epu32(__mmask16 __U, __m256h __A) { 1937 return (__m512i)__builtin_ia32_vcvtph2udq512_mask( 1938 (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)__U, 1939 _MM_FROUND_CUR_DIRECTION); 1940 } 1941 1942 #define _mm512_cvt_roundepi32_ph(A, R) \ 1943 ((__m256h)__builtin_ia32_vcvtdq2ph512_mask((__v16si)(A), \ 1944 (__v16hf)_mm256_undefined_ph(), \ 1945 (__mmask16)(-1), (int)(R))) 1946 1947 #define _mm512_mask_cvt_roundepi32_ph(W, U, A, R) \ 1948 ((__m256h)__builtin_ia32_vcvtdq2ph512_mask((__v16si)(A), (__v16hf)(W), \ 1949 (__mmask16)(U), (int)(R))) 1950 1951 #define _mm512_maskz_cvt_roundepi32_ph(U, A, R) \ 1952 ((__m256h)__builtin_ia32_vcvtdq2ph512_mask( \ 1953 (__v16si)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R))) 1954 1955 static __inline__ __m256h __DEFAULT_FN_ATTRS512 1956 _mm512_cvtepi32_ph(__m512i __A) { 1957 return (__m256h)__builtin_ia32_vcvtdq2ph512_mask( 1958 (__v16si)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1, 1959 _MM_FROUND_CUR_DIRECTION); 1960 } 1961 1962 static __inline__ __m256h __DEFAULT_FN_ATTRS512 1963 _mm512_mask_cvtepi32_ph(__m256h __W, __mmask16 __U, __m512i __A) { 1964 return (__m256h)__builtin_ia32_vcvtdq2ph512_mask( 1965 (__v16si)__A, (__v16hf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION); 1966 } 1967 1968 static __inline__ __m256h __DEFAULT_FN_ATTRS512 1969 _mm512_maskz_cvtepi32_ph(__mmask16 __U, __m512i __A) { 1970 return (__m256h)__builtin_ia32_vcvtdq2ph512_mask( 1971 (__v16si)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U, 1972 _MM_FROUND_CUR_DIRECTION); 1973 } 1974 1975 #define _mm512_cvt_roundepu32_ph(A, R) \ 1976 ((__m256h)__builtin_ia32_vcvtudq2ph512_mask((__v16su)(A), \ 1977 (__v16hf)_mm256_undefined_ph(), \ 1978 (__mmask16)(-1), (int)(R))) 1979 1980 #define _mm512_mask_cvt_roundepu32_ph(W, U, A, R) \ 1981 ((__m256h)__builtin_ia32_vcvtudq2ph512_mask((__v16su)(A), (__v16hf)(W), \ 1982 (__mmask16)(U), (int)(R))) 1983 1984 #define _mm512_maskz_cvt_roundepu32_ph(U, A, R) \ 1985 ((__m256h)__builtin_ia32_vcvtudq2ph512_mask( \ 1986 (__v16su)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R))) 1987 1988 static __inline__ __m256h __DEFAULT_FN_ATTRS512 1989 _mm512_cvtepu32_ph(__m512i __A) { 1990 return (__m256h)__builtin_ia32_vcvtudq2ph512_mask( 1991 (__v16su)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1, 1992 _MM_FROUND_CUR_DIRECTION); 1993 } 1994 1995 static __inline__ __m256h __DEFAULT_FN_ATTRS512 1996 _mm512_mask_cvtepu32_ph(__m256h __W, __mmask16 __U, __m512i __A) { 1997 return (__m256h)__builtin_ia32_vcvtudq2ph512_mask( 1998 (__v16su)__A, (__v16hf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION); 1999 } 2000 2001 static __inline__ __m256h __DEFAULT_FN_ATTRS512 2002 _mm512_maskz_cvtepu32_ph(__mmask16 __U, __m512i __A) { 2003 return (__m256h)__builtin_ia32_vcvtudq2ph512_mask( 2004 (__v16su)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U, 2005 _MM_FROUND_CUR_DIRECTION); 2006 } 2007 2008 #define _mm512_cvtt_roundph_epi32(A, R) \ 2009 ((__m512i)__builtin_ia32_vcvttph2dq512_mask( \ 2010 (__v16hf)(A), (__v16si)_mm512_undefined_epi32(), (__mmask16)(-1), \ 2011 (int)(R))) 2012 2013 #define _mm512_mask_cvtt_roundph_epi32(W, U, A, R) \ 2014 ((__m512i)__builtin_ia32_vcvttph2dq512_mask((__v16hf)(A), (__v16si)(W), \ 2015 (__mmask16)(U), (int)(R))) 2016 2017 #define _mm512_maskz_cvtt_roundph_epi32(U, A, R) \ 2018 ((__m512i)__builtin_ia32_vcvttph2dq512_mask((__v16hf)(A), \ 2019 (__v16si)_mm512_setzero_epi32(), \ 2020 (__mmask16)(U), (int)(R))) 2021 2022 static __inline__ __m512i __DEFAULT_FN_ATTRS512 2023 _mm512_cvttph_epi32(__m256h __A) { 2024 return (__m512i)__builtin_ia32_vcvttph2dq512_mask( 2025 (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)-1, 2026 _MM_FROUND_CUR_DIRECTION); 2027 } 2028 2029 static __inline__ __m512i __DEFAULT_FN_ATTRS512 2030 _mm512_mask_cvttph_epi32(__m512i __W, __mmask16 __U, __m256h __A) { 2031 return (__m512i)__builtin_ia32_vcvttph2dq512_mask( 2032 (__v16hf)__A, (__v16si)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION); 2033 } 2034 2035 static __inline__ __m512i __DEFAULT_FN_ATTRS512 2036 _mm512_maskz_cvttph_epi32(__mmask16 __U, __m256h __A) { 2037 return (__m512i)__builtin_ia32_vcvttph2dq512_mask( 2038 (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)__U, 2039 _MM_FROUND_CUR_DIRECTION); 2040 } 2041 2042 #define _mm512_cvtt_roundph_epu32(A, R) \ 2043 ((__m512i)__builtin_ia32_vcvttph2udq512_mask( \ 2044 (__v16hf)(A), (__v16su)_mm512_undefined_epi32(), (__mmask16)(-1), \ 2045 (int)(R))) 2046 2047 #define _mm512_mask_cvtt_roundph_epu32(W, U, A, R) \ 2048 ((__m512i)__builtin_ia32_vcvttph2udq512_mask((__v16hf)(A), (__v16su)(W), \ 2049 (__mmask16)(U), (int)(R))) 2050 2051 #define _mm512_maskz_cvtt_roundph_epu32(U, A, R) \ 2052 ((__m512i)__builtin_ia32_vcvttph2udq512_mask( \ 2053 (__v16hf)(A), (__v16su)_mm512_setzero_epi32(), (__mmask16)(U), \ 2054 (int)(R))) 2055 2056 static __inline__ __m512i __DEFAULT_FN_ATTRS512 2057 _mm512_cvttph_epu32(__m256h __A) { 2058 return (__m512i)__builtin_ia32_vcvttph2udq512_mask( 2059 (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)-1, 2060 _MM_FROUND_CUR_DIRECTION); 2061 } 2062 2063 static __inline__ __m512i __DEFAULT_FN_ATTRS512 2064 _mm512_mask_cvttph_epu32(__m512i __W, __mmask16 __U, __m256h __A) { 2065 return (__m512i)__builtin_ia32_vcvttph2udq512_mask( 2066 (__v16hf)__A, (__v16su)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION); 2067 } 2068 2069 static __inline__ __m512i __DEFAULT_FN_ATTRS512 2070 _mm512_maskz_cvttph_epu32(__mmask16 __U, __m256h __A) { 2071 return (__m512i)__builtin_ia32_vcvttph2udq512_mask( 2072 (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)__U, 2073 _MM_FROUND_CUR_DIRECTION); 2074 } 2075 2076 #define _mm512_cvt_roundepi64_ph(A, R) \ 2077 ((__m128h)__builtin_ia32_vcvtqq2ph512_mask( \ 2078 (__v8di)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R))) 2079 2080 #define _mm512_mask_cvt_roundepi64_ph(W, U, A, R) \ 2081 ((__m128h)__builtin_ia32_vcvtqq2ph512_mask((__v8di)(A), (__v8hf)(W), \ 2082 (__mmask8)(U), (int)(R))) 2083 2084 #define _mm512_maskz_cvt_roundepi64_ph(U, A, R) \ 2085 ((__m128h)__builtin_ia32_vcvtqq2ph512_mask( \ 2086 (__v8di)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R))) 2087 2088 static __inline__ __m128h __DEFAULT_FN_ATTRS512 2089 _mm512_cvtepi64_ph(__m512i __A) { 2090 return (__m128h)__builtin_ia32_vcvtqq2ph512_mask( 2091 (__v8di)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1, 2092 _MM_FROUND_CUR_DIRECTION); 2093 } 2094 2095 static __inline__ __m128h __DEFAULT_FN_ATTRS512 2096 _mm512_mask_cvtepi64_ph(__m128h __W, __mmask8 __U, __m512i __A) { 2097 return (__m128h)__builtin_ia32_vcvtqq2ph512_mask( 2098 (__v8di)__A, (__v8hf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); 2099 } 2100 2101 static __inline__ __m128h __DEFAULT_FN_ATTRS512 2102 _mm512_maskz_cvtepi64_ph(__mmask8 __U, __m512i __A) { 2103 return (__m128h)__builtin_ia32_vcvtqq2ph512_mask( 2104 (__v8di)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U, 2105 _MM_FROUND_CUR_DIRECTION); 2106 } 2107 2108 #define _mm512_cvt_roundph_epi64(A, R) \ 2109 ((__m512i)__builtin_ia32_vcvtph2qq512_mask((__v8hf)(A), \ 2110 (__v8di)_mm512_undefined_epi32(), \ 2111 (__mmask8)(-1), (int)(R))) 2112 2113 #define _mm512_mask_cvt_roundph_epi64(W, U, A, R) \ 2114 ((__m512i)__builtin_ia32_vcvtph2qq512_mask((__v8hf)(A), (__v8di)(W), \ 2115 (__mmask8)(U), (int)(R))) 2116 2117 #define _mm512_maskz_cvt_roundph_epi64(U, A, R) \ 2118 ((__m512i)__builtin_ia32_vcvtph2qq512_mask( \ 2119 (__v8hf)(A), (__v8di)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R))) 2120 2121 static __inline__ __m512i __DEFAULT_FN_ATTRS512 2122 _mm512_cvtph_epi64(__m128h __A) { 2123 return (__m512i)__builtin_ia32_vcvtph2qq512_mask( 2124 (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)-1, 2125 _MM_FROUND_CUR_DIRECTION); 2126 } 2127 2128 static __inline__ __m512i __DEFAULT_FN_ATTRS512 2129 _mm512_mask_cvtph_epi64(__m512i __W, __mmask8 __U, __m128h __A) { 2130 return (__m512i)__builtin_ia32_vcvtph2qq512_mask( 2131 (__v8hf)__A, (__v8di)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); 2132 } 2133 2134 static __inline__ __m512i __DEFAULT_FN_ATTRS512 2135 _mm512_maskz_cvtph_epi64(__mmask8 __U, __m128h __A) { 2136 return (__m512i)__builtin_ia32_vcvtph2qq512_mask( 2137 (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)__U, 2138 _MM_FROUND_CUR_DIRECTION); 2139 } 2140 2141 #define _mm512_cvt_roundepu64_ph(A, R) \ 2142 ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask( \ 2143 (__v8du)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R))) 2144 2145 #define _mm512_mask_cvt_roundepu64_ph(W, U, A, R) \ 2146 ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask((__v8du)(A), (__v8hf)(W), \ 2147 (__mmask8)(U), (int)(R))) 2148 2149 #define _mm512_maskz_cvt_roundepu64_ph(U, A, R) \ 2150 ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask( \ 2151 (__v8du)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R))) 2152 2153 static __inline__ __m128h __DEFAULT_FN_ATTRS512 2154 _mm512_cvtepu64_ph(__m512i __A) { 2155 return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask( 2156 (__v8du)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1, 2157 _MM_FROUND_CUR_DIRECTION); 2158 } 2159 2160 static __inline__ __m128h __DEFAULT_FN_ATTRS512 2161 _mm512_mask_cvtepu64_ph(__m128h __W, __mmask8 __U, __m512i __A) { 2162 return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask( 2163 (__v8du)__A, (__v8hf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); 2164 } 2165 2166 static __inline__ __m128h __DEFAULT_FN_ATTRS512 2167 _mm512_maskz_cvtepu64_ph(__mmask8 __U, __m512i __A) { 2168 return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask( 2169 (__v8du)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U, 2170 _MM_FROUND_CUR_DIRECTION); 2171 } 2172 2173 #define _mm512_cvt_roundph_epu64(A, R) \ 2174 ((__m512i)__builtin_ia32_vcvtph2uqq512_mask( \ 2175 (__v8hf)(A), (__v8du)_mm512_undefined_epi32(), (__mmask8)(-1), \ 2176 (int)(R))) 2177 2178 #define _mm512_mask_cvt_roundph_epu64(W, U, A, R) \ 2179 ((__m512i)__builtin_ia32_vcvtph2uqq512_mask((__v8hf)(A), (__v8du)(W), \ 2180 (__mmask8)(U), (int)(R))) 2181 2182 #define _mm512_maskz_cvt_roundph_epu64(U, A, R) \ 2183 ((__m512i)__builtin_ia32_vcvtph2uqq512_mask( \ 2184 (__v8hf)(A), (__v8du)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R))) 2185 2186 static __inline__ __m512i __DEFAULT_FN_ATTRS512 2187 _mm512_cvtph_epu64(__m128h __A) { 2188 return (__m512i)__builtin_ia32_vcvtph2uqq512_mask( 2189 (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)-1, 2190 _MM_FROUND_CUR_DIRECTION); 2191 } 2192 2193 static __inline__ __m512i __DEFAULT_FN_ATTRS512 2194 _mm512_mask_cvtph_epu64(__m512i __W, __mmask8 __U, __m128h __A) { 2195 return (__m512i)__builtin_ia32_vcvtph2uqq512_mask( 2196 (__v8hf)__A, (__v8du)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); 2197 } 2198 2199 static __inline__ __m512i __DEFAULT_FN_ATTRS512 2200 _mm512_maskz_cvtph_epu64(__mmask8 __U, __m128h __A) { 2201 return (__m512i)__builtin_ia32_vcvtph2uqq512_mask( 2202 (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)__U, 2203 _MM_FROUND_CUR_DIRECTION); 2204 } 2205 2206 #define _mm512_cvtt_roundph_epi64(A, R) \ 2207 ((__m512i)__builtin_ia32_vcvttph2qq512_mask( \ 2208 (__v8hf)(A), (__v8di)_mm512_undefined_epi32(), (__mmask8)(-1), \ 2209 (int)(R))) 2210 2211 #define _mm512_mask_cvtt_roundph_epi64(W, U, A, R) \ 2212 ((__m512i)__builtin_ia32_vcvttph2qq512_mask((__v8hf)(A), (__v8di)(W), \ 2213 (__mmask8)(U), (int)(R))) 2214 2215 #define _mm512_maskz_cvtt_roundph_epi64(U, A, R) \ 2216 ((__m512i)__builtin_ia32_vcvttph2qq512_mask( \ 2217 (__v8hf)(A), (__v8di)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R))) 2218 2219 static __inline__ __m512i __DEFAULT_FN_ATTRS512 2220 _mm512_cvttph_epi64(__m128h __A) { 2221 return (__m512i)__builtin_ia32_vcvttph2qq512_mask( 2222 (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)-1, 2223 _MM_FROUND_CUR_DIRECTION); 2224 } 2225 2226 static __inline__ __m512i __DEFAULT_FN_ATTRS512 2227 _mm512_mask_cvttph_epi64(__m512i __W, __mmask8 __U, __m128h __A) { 2228 return (__m512i)__builtin_ia32_vcvttph2qq512_mask( 2229 (__v8hf)__A, (__v8di)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); 2230 } 2231 2232 static __inline__ __m512i __DEFAULT_FN_ATTRS512 2233 _mm512_maskz_cvttph_epi64(__mmask8 __U, __m128h __A) { 2234 return (__m512i)__builtin_ia32_vcvttph2qq512_mask( 2235 (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)__U, 2236 _MM_FROUND_CUR_DIRECTION); 2237 } 2238 2239 #define _mm512_cvtt_roundph_epu64(A, R) \ 2240 ((__m512i)__builtin_ia32_vcvttph2uqq512_mask( \ 2241 (__v8hf)(A), (__v8du)_mm512_undefined_epi32(), (__mmask8)(-1), \ 2242 (int)(R))) 2243 2244 #define _mm512_mask_cvtt_roundph_epu64(W, U, A, R) \ 2245 ((__m512i)__builtin_ia32_vcvttph2uqq512_mask((__v8hf)(A), (__v8du)(W), \ 2246 (__mmask8)(U), (int)(R))) 2247 2248 #define _mm512_maskz_cvtt_roundph_epu64(U, A, R) \ 2249 ((__m512i)__builtin_ia32_vcvttph2uqq512_mask( \ 2250 (__v8hf)(A), (__v8du)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R))) 2251 2252 static __inline__ __m512i __DEFAULT_FN_ATTRS512 2253 _mm512_cvttph_epu64(__m128h __A) { 2254 return (__m512i)__builtin_ia32_vcvttph2uqq512_mask( 2255 (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)-1, 2256 _MM_FROUND_CUR_DIRECTION); 2257 } 2258 2259 static __inline__ __m512i __DEFAULT_FN_ATTRS512 2260 _mm512_mask_cvttph_epu64(__m512i __W, __mmask8 __U, __m128h __A) { 2261 return (__m512i)__builtin_ia32_vcvttph2uqq512_mask( 2262 (__v8hf)__A, (__v8du)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); 2263 } 2264 2265 static __inline__ __m512i __DEFAULT_FN_ATTRS512 2266 _mm512_maskz_cvttph_epu64(__mmask8 __U, __m128h __A) { 2267 return (__m512i)__builtin_ia32_vcvttph2uqq512_mask( 2268 (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)__U, 2269 _MM_FROUND_CUR_DIRECTION); 2270 } 2271 2272 #define _mm_cvt_roundsh_i32(A, R) \ 2273 ((int)__builtin_ia32_vcvtsh2si32((__v8hf)(A), (int)(R))) 2274 2275 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_cvtsh_i32(__m128h __A) { 2276 return (int)__builtin_ia32_vcvtsh2si32((__v8hf)__A, _MM_FROUND_CUR_DIRECTION); 2277 } 2278 2279 #define _mm_cvt_roundsh_u32(A, R) \ 2280 ((unsigned int)__builtin_ia32_vcvtsh2usi32((__v8hf)(A), (int)(R))) 2281 2282 static __inline__ unsigned int __DEFAULT_FN_ATTRS128 2283 _mm_cvtsh_u32(__m128h __A) { 2284 return (unsigned int)__builtin_ia32_vcvtsh2usi32((__v8hf)__A, 2285 _MM_FROUND_CUR_DIRECTION); 2286 } 2287 2288 #ifdef __x86_64__ 2289 #define _mm_cvt_roundsh_i64(A, R) \ 2290 ((long long)__builtin_ia32_vcvtsh2si64((__v8hf)(A), (int)(R))) 2291 2292 static __inline__ long long __DEFAULT_FN_ATTRS128 _mm_cvtsh_i64(__m128h __A) { 2293 return (long long)__builtin_ia32_vcvtsh2si64((__v8hf)__A, 2294 _MM_FROUND_CUR_DIRECTION); 2295 } 2296 2297 #define _mm_cvt_roundsh_u64(A, R) \ 2298 ((unsigned long long)__builtin_ia32_vcvtsh2usi64((__v8hf)(A), (int)(R))) 2299 2300 static __inline__ unsigned long long __DEFAULT_FN_ATTRS128 2301 _mm_cvtsh_u64(__m128h __A) { 2302 return (unsigned long long)__builtin_ia32_vcvtsh2usi64( 2303 (__v8hf)__A, _MM_FROUND_CUR_DIRECTION); 2304 } 2305 #endif // __x86_64__ 2306 2307 #define _mm_cvt_roundu32_sh(A, B, R) \ 2308 ((__m128h)__builtin_ia32_vcvtusi2sh((__v8hf)(A), (unsigned int)(B), (int)(R))) 2309 2310 static __inline__ __m128h __DEFAULT_FN_ATTRS128 2311 _mm_cvtu32_sh(__m128h __A, unsigned int __B) { 2312 __A[0] = __B; 2313 return __A; 2314 } 2315 2316 #ifdef __x86_64__ 2317 #define _mm_cvt_roundu64_sh(A, B, R) \ 2318 ((__m128h)__builtin_ia32_vcvtusi642sh((__v8hf)(A), (unsigned long long)(B), \ 2319 (int)(R))) 2320 2321 static __inline__ __m128h __DEFAULT_FN_ATTRS128 2322 _mm_cvtu64_sh(__m128h __A, unsigned long long __B) { 2323 __A[0] = __B; 2324 return __A; 2325 } 2326 #endif 2327 2328 #define _mm_cvt_roundi32_sh(A, B, R) \ 2329 ((__m128h)__builtin_ia32_vcvtsi2sh((__v8hf)(A), (int)(B), (int)(R))) 2330 2331 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvti32_sh(__m128h __A, 2332 int __B) { 2333 __A[0] = __B; 2334 return __A; 2335 } 2336 2337 #ifdef __x86_64__ 2338 #define _mm_cvt_roundi64_sh(A, B, R) \ 2339 ((__m128h)__builtin_ia32_vcvtsi642sh((__v8hf)(A), (long long)(B), (int)(R))) 2340 2341 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvti64_sh(__m128h __A, 2342 long long __B) { 2343 __A[0] = __B; 2344 return __A; 2345 } 2346 #endif 2347 2348 #define _mm_cvtt_roundsh_i32(A, R) \ 2349 ((int)__builtin_ia32_vcvttsh2si32((__v8hf)(A), (int)(R))) 2350 2351 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_cvttsh_i32(__m128h __A) { 2352 return (int)__builtin_ia32_vcvttsh2si32((__v8hf)__A, 2353 _MM_FROUND_CUR_DIRECTION); 2354 } 2355 2356 #ifdef __x86_64__ 2357 #define _mm_cvtt_roundsh_i64(A, R) \ 2358 ((long long)__builtin_ia32_vcvttsh2si64((__v8hf)(A), (int)(R))) 2359 2360 static __inline__ long long __DEFAULT_FN_ATTRS128 _mm_cvttsh_i64(__m128h __A) { 2361 return (long long)__builtin_ia32_vcvttsh2si64((__v8hf)__A, 2362 _MM_FROUND_CUR_DIRECTION); 2363 } 2364 #endif 2365 2366 #define _mm_cvtt_roundsh_u32(A, R) \ 2367 ((unsigned int)__builtin_ia32_vcvttsh2usi32((__v8hf)(A), (int)(R))) 2368 2369 static __inline__ unsigned int __DEFAULT_FN_ATTRS128 2370 _mm_cvttsh_u32(__m128h __A) { 2371 return (unsigned int)__builtin_ia32_vcvttsh2usi32((__v8hf)__A, 2372 _MM_FROUND_CUR_DIRECTION); 2373 } 2374 2375 #ifdef __x86_64__ 2376 #define _mm_cvtt_roundsh_u64(A, R) \ 2377 ((unsigned long long)__builtin_ia32_vcvttsh2usi64((__v8hf)(A), (int)(R))) 2378 2379 static __inline__ unsigned long long __DEFAULT_FN_ATTRS128 2380 _mm_cvttsh_u64(__m128h __A) { 2381 return (unsigned long long)__builtin_ia32_vcvttsh2usi64( 2382 (__v8hf)__A, _MM_FROUND_CUR_DIRECTION); 2383 } 2384 #endif 2385 2386 #define _mm512_cvtx_roundph_ps(A, R) \ 2387 ((__m512)__builtin_ia32_vcvtph2psx512_mask((__v16hf)(A), \ 2388 (__v16sf)_mm512_undefined_ps(), \ 2389 (__mmask16)(-1), (int)(R))) 2390 2391 #define _mm512_mask_cvtx_roundph_ps(W, U, A, R) \ 2392 ((__m512)__builtin_ia32_vcvtph2psx512_mask((__v16hf)(A), (__v16sf)(W), \ 2393 (__mmask16)(U), (int)(R))) 2394 2395 #define _mm512_maskz_cvtx_roundph_ps(U, A, R) \ 2396 ((__m512)__builtin_ia32_vcvtph2psx512_mask( \ 2397 (__v16hf)(A), (__v16sf)_mm512_setzero_ps(), (__mmask16)(U), (int)(R))) 2398 2399 static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_cvtxph_ps(__m256h __A) { 2400 return (__m512)__builtin_ia32_vcvtph2psx512_mask( 2401 (__v16hf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)-1, 2402 _MM_FROUND_CUR_DIRECTION); 2403 } 2404 2405 static __inline__ __m512 __DEFAULT_FN_ATTRS512 2406 _mm512_mask_cvtxph_ps(__m512 __W, __mmask16 __U, __m256h __A) { 2407 return (__m512)__builtin_ia32_vcvtph2psx512_mask( 2408 (__v16hf)__A, (__v16sf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION); 2409 } 2410 2411 static __inline__ __m512 __DEFAULT_FN_ATTRS512 2412 _mm512_maskz_cvtxph_ps(__mmask16 __U, __m256h __A) { 2413 return (__m512)__builtin_ia32_vcvtph2psx512_mask( 2414 (__v16hf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U, 2415 _MM_FROUND_CUR_DIRECTION); 2416 } 2417 2418 #define _mm512_cvtx_roundps_ph(A, R) \ 2419 ((__m256h)__builtin_ia32_vcvtps2phx512_mask((__v16sf)(A), \ 2420 (__v16hf)_mm256_undefined_ph(), \ 2421 (__mmask16)(-1), (int)(R))) 2422 2423 #define _mm512_mask_cvtx_roundps_ph(W, U, A, R) \ 2424 ((__m256h)__builtin_ia32_vcvtps2phx512_mask((__v16sf)(A), (__v16hf)(W), \ 2425 (__mmask16)(U), (int)(R))) 2426 2427 #define _mm512_maskz_cvtx_roundps_ph(U, A, R) \ 2428 ((__m256h)__builtin_ia32_vcvtps2phx512_mask( \ 2429 (__v16sf)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R))) 2430 2431 static __inline__ __m256h __DEFAULT_FN_ATTRS512 _mm512_cvtxps_ph(__m512 __A) { 2432 return (__m256h)__builtin_ia32_vcvtps2phx512_mask( 2433 (__v16sf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1, 2434 _MM_FROUND_CUR_DIRECTION); 2435 } 2436 2437 static __inline__ __m256h __DEFAULT_FN_ATTRS512 2438 _mm512_mask_cvtxps_ph(__m256h __W, __mmask16 __U, __m512 __A) { 2439 return (__m256h)__builtin_ia32_vcvtps2phx512_mask( 2440 (__v16sf)__A, (__v16hf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION); 2441 } 2442 2443 static __inline__ __m256h __DEFAULT_FN_ATTRS512 2444 _mm512_maskz_cvtxps_ph(__mmask16 __U, __m512 __A) { 2445 return (__m256h)__builtin_ia32_vcvtps2phx512_mask( 2446 (__v16sf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U, 2447 _MM_FROUND_CUR_DIRECTION); 2448 } 2449 2450 #define _mm512_fmadd_round_ph(A, B, C, R) \ 2451 ((__m512h)__builtin_ia32_vfmaddph512_mask( \ 2452 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \ 2453 (__mmask32)-1, (int)(R))) 2454 2455 #define _mm512_mask_fmadd_round_ph(A, U, B, C, R) \ 2456 ((__m512h)__builtin_ia32_vfmaddph512_mask( \ 2457 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \ 2458 (__mmask32)(U), (int)(R))) 2459 2460 #define _mm512_mask3_fmadd_round_ph(A, B, C, U, R) \ 2461 ((__m512h)__builtin_ia32_vfmaddph512_mask3( \ 2462 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \ 2463 (__mmask32)(U), (int)(R))) 2464 2465 #define _mm512_maskz_fmadd_round_ph(U, A, B, C, R) \ 2466 ((__m512h)__builtin_ia32_vfmaddph512_maskz( \ 2467 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \ 2468 (__mmask32)(U), (int)(R))) 2469 2470 #define _mm512_fmsub_round_ph(A, B, C, R) \ 2471 ((__m512h)__builtin_ia32_vfmaddph512_mask( \ 2472 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \ 2473 (__mmask32)-1, (int)(R))) 2474 2475 #define _mm512_mask_fmsub_round_ph(A, U, B, C, R) \ 2476 ((__m512h)__builtin_ia32_vfmaddph512_mask( \ 2477 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \ 2478 (__mmask32)(U), (int)(R))) 2479 2480 #define _mm512_maskz_fmsub_round_ph(U, A, B, C, R) \ 2481 ((__m512h)__builtin_ia32_vfmaddph512_maskz( \ 2482 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \ 2483 (__mmask32)(U), (int)(R))) 2484 2485 #define _mm512_fnmadd_round_ph(A, B, C, R) \ 2486 ((__m512h)__builtin_ia32_vfmaddph512_mask( \ 2487 (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \ 2488 (__mmask32)-1, (int)(R))) 2489 2490 #define _mm512_mask3_fnmadd_round_ph(A, B, C, U, R) \ 2491 ((__m512h)__builtin_ia32_vfmaddph512_mask3( \ 2492 -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \ 2493 (__mmask32)(U), (int)(R))) 2494 2495 #define _mm512_maskz_fnmadd_round_ph(U, A, B, C, R) \ 2496 ((__m512h)__builtin_ia32_vfmaddph512_maskz( \ 2497 -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \ 2498 (__mmask32)(U), (int)(R))) 2499 2500 #define _mm512_fnmsub_round_ph(A, B, C, R) \ 2501 ((__m512h)__builtin_ia32_vfmaddph512_mask( \ 2502 (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \ 2503 (__mmask32)-1, (int)(R))) 2504 2505 #define _mm512_maskz_fnmsub_round_ph(U, A, B, C, R) \ 2506 ((__m512h)__builtin_ia32_vfmaddph512_maskz( \ 2507 -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \ 2508 (__mmask32)(U), (int)(R))) 2509 2510 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmadd_ph(__m512h __A, 2511 __m512h __B, 2512 __m512h __C) { 2513 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B, 2514 (__v32hf)__C, (__mmask32)-1, 2515 _MM_FROUND_CUR_DIRECTION); 2516 } 2517 2518 static __inline__ __m512h __DEFAULT_FN_ATTRS512 2519 _mm512_mask_fmadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) { 2520 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B, 2521 (__v32hf)__C, (__mmask32)__U, 2522 _MM_FROUND_CUR_DIRECTION); 2523 } 2524 2525 static __inline__ __m512h __DEFAULT_FN_ATTRS512 2526 _mm512_mask3_fmadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) { 2527 return (__m512h)__builtin_ia32_vfmaddph512_mask3((__v32hf)__A, (__v32hf)__B, 2528 (__v32hf)__C, (__mmask32)__U, 2529 _MM_FROUND_CUR_DIRECTION); 2530 } 2531 2532 static __inline__ __m512h __DEFAULT_FN_ATTRS512 2533 _mm512_maskz_fmadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) { 2534 return (__m512h)__builtin_ia32_vfmaddph512_maskz((__v32hf)__A, (__v32hf)__B, 2535 (__v32hf)__C, (__mmask32)__U, 2536 _MM_FROUND_CUR_DIRECTION); 2537 } 2538 2539 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmsub_ph(__m512h __A, 2540 __m512h __B, 2541 __m512h __C) { 2542 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B, 2543 -(__v32hf)__C, (__mmask32)-1, 2544 _MM_FROUND_CUR_DIRECTION); 2545 } 2546 2547 static __inline__ __m512h __DEFAULT_FN_ATTRS512 2548 _mm512_mask_fmsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) { 2549 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B, 2550 -(__v32hf)__C, (__mmask32)__U, 2551 _MM_FROUND_CUR_DIRECTION); 2552 } 2553 2554 static __inline__ __m512h __DEFAULT_FN_ATTRS512 2555 _mm512_maskz_fmsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) { 2556 return (__m512h)__builtin_ia32_vfmaddph512_maskz( 2557 (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U, 2558 _MM_FROUND_CUR_DIRECTION); 2559 } 2560 2561 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fnmadd_ph(__m512h __A, 2562 __m512h __B, 2563 __m512h __C) { 2564 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B, 2565 (__v32hf)__C, (__mmask32)-1, 2566 _MM_FROUND_CUR_DIRECTION); 2567 } 2568 2569 static __inline__ __m512h __DEFAULT_FN_ATTRS512 2570 _mm512_mask3_fnmadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) { 2571 return (__m512h)__builtin_ia32_vfmaddph512_mask3(-(__v32hf)__A, (__v32hf)__B, 2572 (__v32hf)__C, (__mmask32)__U, 2573 _MM_FROUND_CUR_DIRECTION); 2574 } 2575 2576 static __inline__ __m512h __DEFAULT_FN_ATTRS512 2577 _mm512_maskz_fnmadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) { 2578 return (__m512h)__builtin_ia32_vfmaddph512_maskz(-(__v32hf)__A, (__v32hf)__B, 2579 (__v32hf)__C, (__mmask32)__U, 2580 _MM_FROUND_CUR_DIRECTION); 2581 } 2582 2583 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fnmsub_ph(__m512h __A, 2584 __m512h __B, 2585 __m512h __C) { 2586 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B, 2587 -(__v32hf)__C, (__mmask32)-1, 2588 _MM_FROUND_CUR_DIRECTION); 2589 } 2590 2591 static __inline__ __m512h __DEFAULT_FN_ATTRS512 2592 _mm512_maskz_fnmsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) { 2593 return (__m512h)__builtin_ia32_vfmaddph512_maskz( 2594 -(__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U, 2595 _MM_FROUND_CUR_DIRECTION); 2596 } 2597 2598 #define _mm512_fmaddsub_round_ph(A, B, C, R) \ 2599 ((__m512h)__builtin_ia32_vfmaddsubph512_mask( \ 2600 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \ 2601 (__mmask32)-1, (int)(R))) 2602 2603 #define _mm512_mask_fmaddsub_round_ph(A, U, B, C, R) \ 2604 ((__m512h)__builtin_ia32_vfmaddsubph512_mask( \ 2605 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \ 2606 (__mmask32)(U), (int)(R))) 2607 2608 #define _mm512_mask3_fmaddsub_round_ph(A, B, C, U, R) \ 2609 ((__m512h)__builtin_ia32_vfmaddsubph512_mask3( \ 2610 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \ 2611 (__mmask32)(U), (int)(R))) 2612 2613 #define _mm512_maskz_fmaddsub_round_ph(U, A, B, C, R) \ 2614 ((__m512h)__builtin_ia32_vfmaddsubph512_maskz( \ 2615 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \ 2616 (__mmask32)(U), (int)(R))) 2617 2618 #define _mm512_fmsubadd_round_ph(A, B, C, R) \ 2619 ((__m512h)__builtin_ia32_vfmaddsubph512_mask( \ 2620 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \ 2621 (__mmask32)-1, (int)(R))) 2622 2623 #define _mm512_mask_fmsubadd_round_ph(A, U, B, C, R) \ 2624 ((__m512h)__builtin_ia32_vfmaddsubph512_mask( \ 2625 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \ 2626 (__mmask32)(U), (int)(R))) 2627 2628 #define _mm512_maskz_fmsubadd_round_ph(U, A, B, C, R) \ 2629 ((__m512h)__builtin_ia32_vfmaddsubph512_maskz( \ 2630 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \ 2631 (__mmask32)(U), (int)(R))) 2632 2633 static __inline__ __m512h __DEFAULT_FN_ATTRS512 2634 _mm512_fmaddsub_ph(__m512h __A, __m512h __B, __m512h __C) { 2635 return (__m512h)__builtin_ia32_vfmaddsubph512_mask( 2636 (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)-1, 2637 _MM_FROUND_CUR_DIRECTION); 2638 } 2639 2640 static __inline__ __m512h __DEFAULT_FN_ATTRS512 2641 _mm512_mask_fmaddsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) { 2642 return (__m512h)__builtin_ia32_vfmaddsubph512_mask( 2643 (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U, 2644 _MM_FROUND_CUR_DIRECTION); 2645 } 2646 2647 static __inline__ __m512h __DEFAULT_FN_ATTRS512 2648 _mm512_mask3_fmaddsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) { 2649 return (__m512h)__builtin_ia32_vfmaddsubph512_mask3( 2650 (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U, 2651 _MM_FROUND_CUR_DIRECTION); 2652 } 2653 2654 static __inline__ __m512h __DEFAULT_FN_ATTRS512 2655 _mm512_maskz_fmaddsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) { 2656 return (__m512h)__builtin_ia32_vfmaddsubph512_maskz( 2657 (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U, 2658 _MM_FROUND_CUR_DIRECTION); 2659 } 2660 2661 static __inline__ __m512h __DEFAULT_FN_ATTRS512 2662 _mm512_fmsubadd_ph(__m512h __A, __m512h __B, __m512h __C) { 2663 return (__m512h)__builtin_ia32_vfmaddsubph512_mask( 2664 (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)-1, 2665 _MM_FROUND_CUR_DIRECTION); 2666 } 2667 2668 static __inline__ __m512h __DEFAULT_FN_ATTRS512 2669 _mm512_mask_fmsubadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) { 2670 return (__m512h)__builtin_ia32_vfmaddsubph512_mask( 2671 (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U, 2672 _MM_FROUND_CUR_DIRECTION); 2673 } 2674 2675 static __inline__ __m512h __DEFAULT_FN_ATTRS512 2676 _mm512_maskz_fmsubadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) { 2677 return (__m512h)__builtin_ia32_vfmaddsubph512_maskz( 2678 (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U, 2679 _MM_FROUND_CUR_DIRECTION); 2680 } 2681 2682 #define _mm512_mask3_fmsub_round_ph(A, B, C, U, R) \ 2683 ((__m512h)__builtin_ia32_vfmsubph512_mask3( \ 2684 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \ 2685 (__mmask32)(U), (int)(R))) 2686 2687 static __inline__ __m512h __DEFAULT_FN_ATTRS512 2688 _mm512_mask3_fmsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) { 2689 return (__m512h)__builtin_ia32_vfmsubph512_mask3((__v32hf)__A, (__v32hf)__B, 2690 (__v32hf)__C, (__mmask32)__U, 2691 _MM_FROUND_CUR_DIRECTION); 2692 } 2693 2694 #define _mm512_mask3_fmsubadd_round_ph(A, B, C, U, R) \ 2695 ((__m512h)__builtin_ia32_vfmsubaddph512_mask3( \ 2696 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \ 2697 (__mmask32)(U), (int)(R))) 2698 2699 static __inline__ __m512h __DEFAULT_FN_ATTRS512 2700 _mm512_mask3_fmsubadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) { 2701 return (__m512h)__builtin_ia32_vfmsubaddph512_mask3( 2702 (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U, 2703 _MM_FROUND_CUR_DIRECTION); 2704 } 2705 2706 #define _mm512_mask_fnmadd_round_ph(A, U, B, C, R) \ 2707 ((__m512h)__builtin_ia32_vfmaddph512_mask( \ 2708 (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \ 2709 (__mmask32)(U), (int)(R))) 2710 2711 static __inline__ __m512h __DEFAULT_FN_ATTRS512 2712 _mm512_mask_fnmadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) { 2713 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B, 2714 (__v32hf)__C, (__mmask32)__U, 2715 _MM_FROUND_CUR_DIRECTION); 2716 } 2717 2718 #define _mm512_mask_fnmsub_round_ph(A, U, B, C, R) \ 2719 ((__m512h)__builtin_ia32_vfmaddph512_mask( \ 2720 (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \ 2721 (__mmask32)(U), (int)(R))) 2722 2723 #define _mm512_mask3_fnmsub_round_ph(A, B, C, U, R) \ 2724 ((__m512h)__builtin_ia32_vfmsubph512_mask3( \ 2725 -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \ 2726 (__mmask32)(U), (int)(R))) 2727 2728 static __inline__ __m512h __DEFAULT_FN_ATTRS512 2729 _mm512_mask_fnmsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) { 2730 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B, 2731 -(__v32hf)__C, (__mmask32)__U, 2732 _MM_FROUND_CUR_DIRECTION); 2733 } 2734 2735 static __inline__ __m512h __DEFAULT_FN_ATTRS512 2736 _mm512_mask3_fnmsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) { 2737 return (__m512h)__builtin_ia32_vfmsubph512_mask3(-(__v32hf)__A, (__v32hf)__B, 2738 (__v32hf)__C, (__mmask32)__U, 2739 _MM_FROUND_CUR_DIRECTION); 2740 } 2741 2742 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_sh(__m128h __W, 2743 __m128h __A, 2744 __m128h __B) { 2745 return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A, (__v8hf)__B, 2746 (__mmask8)-1, _MM_FROUND_CUR_DIRECTION); 2747 } 2748 2749 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmadd_sh(__m128h __W, 2750 __mmask8 __U, 2751 __m128h __A, 2752 __m128h __B) { 2753 return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A, (__v8hf)__B, 2754 (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); 2755 } 2756 2757 #define _mm_fmadd_round_sh(A, B, C, R) \ 2758 ((__m128h)__builtin_ia32_vfmaddsh3_mask( \ 2759 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C), \ 2760 (__mmask8)-1, (int)(R))) 2761 2762 #define _mm_mask_fmadd_round_sh(W, U, A, B, R) \ 2763 ((__m128h)__builtin_ia32_vfmaddsh3_mask( \ 2764 (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), \ 2765 (__mmask8)(U), (int)(R))) 2766 2767 static __inline__ __m128h __DEFAULT_FN_ATTRS128 2768 _mm_maskz_fmadd_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { 2769 return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, (__v8hf)__B, (__v8hf)__C, 2770 (__mmask8)__U, 2771 _MM_FROUND_CUR_DIRECTION); 2772 } 2773 2774 #define _mm_maskz_fmadd_round_sh(U, A, B, C, R) \ 2775 ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \ 2776 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C), \ 2777 (__mmask8)(U), (int)(R))) 2778 2779 static __inline__ __m128h __DEFAULT_FN_ATTRS128 2780 _mm_mask3_fmadd_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) { 2781 return __builtin_ia32_vfmaddsh3_mask3((__v8hf)__W, (__v8hf)__X, (__v8hf)__Y, 2782 (__mmask8)__U, 2783 _MM_FROUND_CUR_DIRECTION); 2784 } 2785 2786 #define _mm_mask3_fmadd_round_sh(W, X, Y, U, R) \ 2787 ((__m128h)__builtin_ia32_vfmaddsh3_mask3( \ 2788 (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), \ 2789 (__mmask8)(U), (int)(R))) 2790 2791 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmsub_sh(__m128h __W, 2792 __m128h __A, 2793 __m128h __B) { 2794 return (__m128h)__builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A, 2795 -(__v8hf)__B, (__mmask8)-1, 2796 _MM_FROUND_CUR_DIRECTION); 2797 } 2798 2799 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmsub_sh(__m128h __W, 2800 __mmask8 __U, 2801 __m128h __A, 2802 __m128h __B) { 2803 return (__m128h)__builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A, 2804 -(__v8hf)__B, (__mmask8)__U, 2805 _MM_FROUND_CUR_DIRECTION); 2806 } 2807 2808 #define _mm_fmsub_round_sh(A, B, C, R) \ 2809 ((__m128h)__builtin_ia32_vfmaddsh3_mask( \ 2810 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C), \ 2811 (__mmask8)-1, (int)(R))) 2812 2813 #define _mm_mask_fmsub_round_sh(W, U, A, B, R) \ 2814 ((__m128h)__builtin_ia32_vfmaddsh3_mask( \ 2815 (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), \ 2816 (__mmask8)(U), (int)(R))) 2817 2818 static __inline__ __m128h __DEFAULT_FN_ATTRS128 2819 _mm_maskz_fmsub_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { 2820 return (__m128h)__builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, (__v8hf)__B, 2821 -(__v8hf)__C, (__mmask8)__U, 2822 _MM_FROUND_CUR_DIRECTION); 2823 } 2824 2825 #define _mm_maskz_fmsub_round_sh(U, A, B, C, R) \ 2826 ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \ 2827 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C), \ 2828 (__mmask8)(U), (int)R)) 2829 2830 static __inline__ __m128h __DEFAULT_FN_ATTRS128 2831 _mm_mask3_fmsub_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) { 2832 return __builtin_ia32_vfmsubsh3_mask3((__v8hf)__W, (__v8hf)__X, (__v8hf)__Y, 2833 (__mmask8)__U, 2834 _MM_FROUND_CUR_DIRECTION); 2835 } 2836 2837 #define _mm_mask3_fmsub_round_sh(W, X, Y, U, R) \ 2838 ((__m128h)__builtin_ia32_vfmsubsh3_mask3( \ 2839 (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), \ 2840 (__mmask8)(U), (int)(R))) 2841 2842 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmadd_sh(__m128h __W, 2843 __m128h __A, 2844 __m128h __B) { 2845 return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, (__v8hf)__B, 2846 (__mmask8)-1, _MM_FROUND_CUR_DIRECTION); 2847 } 2848 2849 static __inline__ __m128h __DEFAULT_FN_ATTRS128 2850 _mm_mask_fnmadd_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { 2851 return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, (__v8hf)__B, 2852 (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); 2853 } 2854 2855 #define _mm_fnmadd_round_sh(A, B, C, R) \ 2856 ((__m128h)__builtin_ia32_vfmaddsh3_mask( \ 2857 (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C), \ 2858 (__mmask8)-1, (int)(R))) 2859 2860 #define _mm_mask_fnmadd_round_sh(W, U, A, B, R) \ 2861 ((__m128h)__builtin_ia32_vfmaddsh3_mask( \ 2862 (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), \ 2863 (__mmask8)(U), (int)(R))) 2864 2865 static __inline__ __m128h __DEFAULT_FN_ATTRS128 2866 _mm_maskz_fnmadd_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { 2867 return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, -(__v8hf)__B, (__v8hf)__C, 2868 (__mmask8)__U, 2869 _MM_FROUND_CUR_DIRECTION); 2870 } 2871 2872 #define _mm_maskz_fnmadd_round_sh(U, A, B, C, R) \ 2873 ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \ 2874 (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C), \ 2875 (__mmask8)(U), (int)(R))) 2876 2877 static __inline__ __m128h __DEFAULT_FN_ATTRS128 2878 _mm_mask3_fnmadd_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) { 2879 return __builtin_ia32_vfmaddsh3_mask3((__v8hf)__W, -(__v8hf)__X, (__v8hf)__Y, 2880 (__mmask8)__U, 2881 _MM_FROUND_CUR_DIRECTION); 2882 } 2883 2884 #define _mm_mask3_fnmadd_round_sh(W, X, Y, U, R) \ 2885 ((__m128h)__builtin_ia32_vfmaddsh3_mask3( \ 2886 (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), \ 2887 (__mmask8)(U), (int)(R))) 2888 2889 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmsub_sh(__m128h __W, 2890 __m128h __A, 2891 __m128h __B) { 2892 return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, -(__v8hf)__B, 2893 (__mmask8)-1, _MM_FROUND_CUR_DIRECTION); 2894 } 2895 2896 static __inline__ __m128h __DEFAULT_FN_ATTRS128 2897 _mm_mask_fnmsub_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { 2898 return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, -(__v8hf)__B, 2899 (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); 2900 } 2901 2902 #define _mm_fnmsub_round_sh(A, B, C, R) \ 2903 ((__m128h)__builtin_ia32_vfmaddsh3_mask( \ 2904 (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C), \ 2905 (__mmask8)-1, (int)(R))) 2906 2907 #define _mm_mask_fnmsub_round_sh(W, U, A, B, R) \ 2908 ((__m128h)__builtin_ia32_vfmaddsh3_mask( \ 2909 (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), \ 2910 (__mmask8)(U), (int)(R))) 2911 2912 static __inline__ __m128h __DEFAULT_FN_ATTRS128 2913 _mm_maskz_fnmsub_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { 2914 return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, -(__v8hf)__B, -(__v8hf)__C, 2915 (__mmask8)__U, 2916 _MM_FROUND_CUR_DIRECTION); 2917 } 2918 2919 #define _mm_maskz_fnmsub_round_sh(U, A, B, C, R) \ 2920 ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \ 2921 (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C), \ 2922 (__mmask8)(U), (int)(R))) 2923 2924 static __inline__ __m128h __DEFAULT_FN_ATTRS128 2925 _mm_mask3_fnmsub_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) { 2926 return __builtin_ia32_vfmsubsh3_mask3((__v8hf)__W, -(__v8hf)__X, (__v8hf)__Y, 2927 (__mmask8)__U, 2928 _MM_FROUND_CUR_DIRECTION); 2929 } 2930 2931 #define _mm_mask3_fnmsub_round_sh(W, X, Y, U, R) \ 2932 ((__m128h)__builtin_ia32_vfmsubsh3_mask3( \ 2933 (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), \ 2934 (__mmask8)(U), (int)(R))) 2935 2936 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmadd_sch(__m128h __A, 2937 __m128h __B, 2938 __m128h __C) { 2939 return (__m128h)__builtin_ia32_vfcmaddcsh_mask((__v4sf)__A, (__v4sf)__B, 2940 (__v4sf)__C, (__mmask8)-1, 2941 _MM_FROUND_CUR_DIRECTION); 2942 } 2943 2944 static __inline__ __m128h __DEFAULT_FN_ATTRS128 2945 _mm_mask_fcmadd_sch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) { 2946 return (__m128h)__builtin_ia32_vfcmaddcsh_round_mask( 2947 (__v4sf)__A, (__v4sf)(__B), (__v4sf)(__C), __U, _MM_FROUND_CUR_DIRECTION); 2948 } 2949 2950 static __inline__ __m128h __DEFAULT_FN_ATTRS128 2951 _mm_maskz_fcmadd_sch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { 2952 return (__m128h)__builtin_ia32_vfcmaddcsh_maskz((__v4sf)__A, (__v4sf)__B, 2953 (__v4sf)__C, (__mmask8)__U, 2954 _MM_FROUND_CUR_DIRECTION); 2955 } 2956 2957 static __inline__ __m128h __DEFAULT_FN_ATTRS128 2958 _mm_mask3_fcmadd_sch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) { 2959 return (__m128h)__builtin_ia32_vfcmaddcsh_round_mask3( 2960 (__v4sf)__A, (__v4sf)__B, (__v4sf)__C, __U, _MM_FROUND_CUR_DIRECTION); 2961 } 2962 2963 #define _mm_fcmadd_round_sch(A, B, C, R) \ 2964 ((__m128h)__builtin_ia32_vfcmaddcsh_mask( \ 2965 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \ 2966 (__mmask8)-1, (int)(R))) 2967 2968 #define _mm_mask_fcmadd_round_sch(A, U, B, C, R) \ 2969 ((__m128h)__builtin_ia32_vfcmaddcsh_round_mask( \ 2970 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \ 2971 (__mmask8)(U), (int)(R))) 2972 2973 #define _mm_maskz_fcmadd_round_sch(U, A, B, C, R) \ 2974 ((__m128h)__builtin_ia32_vfcmaddcsh_maskz( \ 2975 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \ 2976 (__mmask8)(U), (int)(R))) 2977 2978 #define _mm_mask3_fcmadd_round_sch(A, B, C, U, R) \ 2979 ((__m128h)__builtin_ia32_vfcmaddcsh_round_mask3( \ 2980 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \ 2981 (__mmask8)(U), (int)(R))) 2982 2983 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_sch(__m128h __A, 2984 __m128h __B, 2985 __m128h __C) { 2986 return (__m128h)__builtin_ia32_vfmaddcsh_mask((__v4sf)__A, (__v4sf)__B, 2987 (__v4sf)__C, (__mmask8)-1, 2988 _MM_FROUND_CUR_DIRECTION); 2989 } 2990 2991 static __inline__ __m128h __DEFAULT_FN_ATTRS128 2992 _mm_mask_fmadd_sch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) { 2993 return (__m128h)__builtin_ia32_vfmaddcsh_round_mask( 2994 (__v4sf)__A, (__v4sf)(__B), (__v4sf)(__C), __U, _MM_FROUND_CUR_DIRECTION); 2995 } 2996 2997 static __inline__ __m128h __DEFAULT_FN_ATTRS128 2998 _mm_maskz_fmadd_sch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { 2999 return (__m128h)__builtin_ia32_vfmaddcsh_maskz((__v4sf)__A, (__v4sf)__B, 3000 (__v4sf)__C, (__mmask8)__U, 3001 _MM_FROUND_CUR_DIRECTION); 3002 } 3003 3004 static __inline__ __m128h __DEFAULT_FN_ATTRS128 3005 _mm_mask3_fmadd_sch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) { 3006 return (__m128h)__builtin_ia32_vfmaddcsh_round_mask3( 3007 (__v4sf)__A, (__v4sf)__B, (__v4sf)__C, __U, _MM_FROUND_CUR_DIRECTION); 3008 } 3009 3010 #define _mm_fmadd_round_sch(A, B, C, R) \ 3011 ((__m128h)__builtin_ia32_vfmaddcsh_mask( \ 3012 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \ 3013 (__mmask8)-1, (int)(R))) 3014 3015 #define _mm_mask_fmadd_round_sch(A, U, B, C, R) \ 3016 ((__m128h)__builtin_ia32_vfmaddcsh_round_mask( \ 3017 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \ 3018 (__mmask8)(U), (int)(R))) 3019 3020 #define _mm_maskz_fmadd_round_sch(U, A, B, C, R) \ 3021 ((__m128h)__builtin_ia32_vfmaddcsh_maskz( \ 3022 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \ 3023 (__mmask8)(U), (int)(R))) 3024 3025 #define _mm_mask3_fmadd_round_sch(A, B, C, U, R) \ 3026 ((__m128h)__builtin_ia32_vfmaddcsh_round_mask3( \ 3027 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \ 3028 (__mmask8)(U), (int)(R))) 3029 3030 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmul_sch(__m128h __A, 3031 __m128h __B) { 3032 return (__m128h)__builtin_ia32_vfcmulcsh_mask( 3033 (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1, 3034 _MM_FROUND_CUR_DIRECTION); 3035 } 3036 3037 static __inline__ __m128h __DEFAULT_FN_ATTRS128 3038 _mm_mask_fcmul_sch(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { 3039 return (__m128h)__builtin_ia32_vfcmulcsh_mask((__v4sf)__A, (__v4sf)__B, 3040 (__v4sf)__W, (__mmask8)__U, 3041 _MM_FROUND_CUR_DIRECTION); 3042 } 3043 3044 static __inline__ __m128h __DEFAULT_FN_ATTRS128 3045 _mm_maskz_fcmul_sch(__mmask8 __U, __m128h __A, __m128h __B) { 3046 return (__m128h)__builtin_ia32_vfcmulcsh_mask( 3047 (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U, 3048 _MM_FROUND_CUR_DIRECTION); 3049 } 3050 3051 #define _mm_fcmul_round_sch(A, B, R) \ 3052 ((__m128h)__builtin_ia32_vfcmulcsh_mask( \ 3053 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \ 3054 (__v4sf)(__m128h)_mm_undefined_ph(), (__mmask8)-1, (int)(R))) 3055 3056 #define _mm_mask_fcmul_round_sch(W, U, A, B, R) \ 3057 ((__m128h)__builtin_ia32_vfcmulcsh_mask( \ 3058 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(W), \ 3059 (__mmask8)(U), (int)(R))) 3060 3061 #define _mm_maskz_fcmul_round_sch(U, A, B, R) \ 3062 ((__m128h)__builtin_ia32_vfcmulcsh_mask( \ 3063 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \ 3064 (__v4sf)(__m128h)_mm_setzero_ph(), (__mmask8)(U), (int)(R))) 3065 3066 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmul_sch(__m128h __A, 3067 __m128h __B) { 3068 return (__m128h)__builtin_ia32_vfmulcsh_mask( 3069 (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1, 3070 _MM_FROUND_CUR_DIRECTION); 3071 } 3072 3073 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmul_sch(__m128h __W, 3074 __mmask8 __U, 3075 __m128h __A, 3076 __m128h __B) { 3077 return (__m128h)__builtin_ia32_vfmulcsh_mask((__v4sf)__A, (__v4sf)__B, 3078 (__v4sf)__W, (__mmask8)__U, 3079 _MM_FROUND_CUR_DIRECTION); 3080 } 3081 3082 static __inline__ __m128h __DEFAULT_FN_ATTRS128 3083 _mm_maskz_fmul_sch(__mmask8 __U, __m128h __A, __m128h __B) { 3084 return (__m128h)__builtin_ia32_vfmulcsh_mask( 3085 (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U, 3086 _MM_FROUND_CUR_DIRECTION); 3087 } 3088 3089 #define _mm_fmul_round_sch(A, B, R) \ 3090 ((__m128h)__builtin_ia32_vfmulcsh_mask( \ 3091 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \ 3092 (__v4sf)(__m128h)_mm_undefined_ph(), (__mmask8)-1, (int)(R))) 3093 3094 #define _mm_mask_fmul_round_sch(W, U, A, B, R) \ 3095 ((__m128h)__builtin_ia32_vfmulcsh_mask( \ 3096 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(W), \ 3097 (__mmask8)(U), (int)(R))) 3098 3099 #define _mm_maskz_fmul_round_sch(U, A, B, R) \ 3100 ((__m128h)__builtin_ia32_vfmulcsh_mask( \ 3101 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \ 3102 (__v4sf)(__m128h)_mm_setzero_ph(), (__mmask8)(U), (int)(R))) 3103 3104 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fcmul_pch(__m512h __A, 3105 __m512h __B) { 3106 return (__m512h)__builtin_ia32_vfcmulcph512_mask( 3107 (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_undefined_ph(), (__mmask16)-1, 3108 _MM_FROUND_CUR_DIRECTION); 3109 } 3110 3111 static __inline__ __m512h __DEFAULT_FN_ATTRS512 3112 _mm512_mask_fcmul_pch(__m512h __W, __mmask16 __U, __m512h __A, __m512h __B) { 3113 return (__m512h)__builtin_ia32_vfcmulcph512_mask((__v16sf)__A, (__v16sf)__B, 3114 (__v16sf)__W, (__mmask16)__U, 3115 _MM_FROUND_CUR_DIRECTION); 3116 } 3117 3118 static __inline__ __m512h __DEFAULT_FN_ATTRS512 3119 _mm512_maskz_fcmul_pch(__mmask16 __U, __m512h __A, __m512h __B) { 3120 return (__m512h)__builtin_ia32_vfcmulcph512_mask( 3121 (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ph(), (__mmask16)__U, 3122 _MM_FROUND_CUR_DIRECTION); 3123 } 3124 3125 #define _mm512_fcmul_round_pch(A, B, R) \ 3126 ((__m512h)__builtin_ia32_vfcmulcph512_mask( \ 3127 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \ 3128 (__v16sf)(__m512h)_mm512_undefined_ph(), (__mmask16)-1, (int)(R))) 3129 3130 #define _mm512_mask_fcmul_round_pch(W, U, A, B, R) \ 3131 ((__m512h)__builtin_ia32_vfcmulcph512_mask( \ 3132 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(W), \ 3133 (__mmask16)(U), (int)(R))) 3134 3135 #define _mm512_maskz_fcmul_round_pch(U, A, B, R) \ 3136 ((__m512h)__builtin_ia32_vfcmulcph512_mask( \ 3137 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \ 3138 (__v16sf)(__m512h)_mm512_setzero_ph(), (__mmask16)(U), (int)(R))) 3139 3140 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmul_pch(__m512h __A, 3141 __m512h __B) { 3142 return (__m512h)__builtin_ia32_vfmulcph512_mask( 3143 (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_undefined_ph(), (__mmask16)-1, 3144 _MM_FROUND_CUR_DIRECTION); 3145 } 3146 3147 static __inline__ __m512h __DEFAULT_FN_ATTRS512 3148 _mm512_mask_fmul_pch(__m512h __W, __mmask16 __U, __m512h __A, __m512h __B) { 3149 return (__m512h)__builtin_ia32_vfmulcph512_mask((__v16sf)__A, (__v16sf)__B, 3150 (__v16sf)__W, (__mmask16)__U, 3151 _MM_FROUND_CUR_DIRECTION); 3152 } 3153 3154 static __inline__ __m512h __DEFAULT_FN_ATTRS512 3155 _mm512_maskz_fmul_pch(__mmask16 __U, __m512h __A, __m512h __B) { 3156 return (__m512h)__builtin_ia32_vfmulcph512_mask( 3157 (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ph(), (__mmask16)__U, 3158 _MM_FROUND_CUR_DIRECTION); 3159 } 3160 3161 #define _mm512_fmul_round_pch(A, B, R) \ 3162 ((__m512h)__builtin_ia32_vfmulcph512_mask( \ 3163 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \ 3164 (__v16sf)(__m512h)_mm512_undefined_ph(), (__mmask16)-1, (int)(R))) 3165 3166 #define _mm512_mask_fmul_round_pch(W, U, A, B, R) \ 3167 ((__m512h)__builtin_ia32_vfmulcph512_mask( \ 3168 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(W), \ 3169 (__mmask16)(U), (int)(R))) 3170 3171 #define _mm512_maskz_fmul_round_pch(U, A, B, R) \ 3172 ((__m512h)__builtin_ia32_vfmulcph512_mask( \ 3173 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \ 3174 (__v16sf)(__m512h)_mm512_setzero_ph(), (__mmask16)(U), (int)(R))) 3175 3176 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fcmadd_pch(__m512h __A, 3177 __m512h __B, 3178 __m512h __C) { 3179 return (__m512h)__builtin_ia32_vfcmaddcph512_mask3( 3180 (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)-1, 3181 _MM_FROUND_CUR_DIRECTION); 3182 } 3183 3184 static __inline__ __m512h __DEFAULT_FN_ATTRS512 3185 _mm512_mask_fcmadd_pch(__m512h __A, __mmask16 __U, __m512h __B, __m512h __C) { 3186 return (__m512h)__builtin_ia32_vfcmaddcph512_mask( 3187 (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, 3188 _MM_FROUND_CUR_DIRECTION); 3189 } 3190 3191 static __inline__ __m512h __DEFAULT_FN_ATTRS512 3192 _mm512_mask3_fcmadd_pch(__m512h __A, __m512h __B, __m512h __C, __mmask16 __U) { 3193 return (__m512h)__builtin_ia32_vfcmaddcph512_mask3( 3194 (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, 3195 _MM_FROUND_CUR_DIRECTION); 3196 } 3197 3198 static __inline__ __m512h __DEFAULT_FN_ATTRS512 3199 _mm512_maskz_fcmadd_pch(__mmask16 __U, __m512h __A, __m512h __B, __m512h __C) { 3200 return (__m512h)__builtin_ia32_vfcmaddcph512_maskz( 3201 (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, 3202 _MM_FROUND_CUR_DIRECTION); 3203 } 3204 3205 #define _mm512_fcmadd_round_pch(A, B, C, R) \ 3206 ((__m512h)__builtin_ia32_vfcmaddcph512_mask3( \ 3207 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \ 3208 (__mmask16)-1, (int)(R))) 3209 3210 #define _mm512_mask_fcmadd_round_pch(A, U, B, C, R) \ 3211 ((__m512h)__builtin_ia32_vfcmaddcph512_mask( \ 3212 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \ 3213 (__mmask16)(U), (int)(R))) 3214 3215 #define _mm512_mask3_fcmadd_round_pch(A, B, C, U, R) \ 3216 ((__m512h)__builtin_ia32_vfcmaddcph512_mask3( \ 3217 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \ 3218 (__mmask16)(U), (int)(R))) 3219 3220 #define _mm512_maskz_fcmadd_round_pch(U, A, B, C, R) \ 3221 ((__m512h)__builtin_ia32_vfcmaddcph512_maskz( \ 3222 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \ 3223 (__mmask16)(U), (int)(R))) 3224 3225 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmadd_pch(__m512h __A, 3226 __m512h __B, 3227 __m512h __C) { 3228 return (__m512h)__builtin_ia32_vfmaddcph512_mask3((__v16sf)__A, (__v16sf)__B, 3229 (__v16sf)__C, (__mmask16)-1, 3230 _MM_FROUND_CUR_DIRECTION); 3231 } 3232 3233 static __inline__ __m512h __DEFAULT_FN_ATTRS512 3234 _mm512_mask_fmadd_pch(__m512h __A, __mmask16 __U, __m512h __B, __m512h __C) { 3235 return (__m512h)__builtin_ia32_vfmaddcph512_mask((__v16sf)__A, (__v16sf)__B, 3236 (__v16sf)__C, (__mmask16)__U, 3237 _MM_FROUND_CUR_DIRECTION); 3238 } 3239 3240 static __inline__ __m512h __DEFAULT_FN_ATTRS512 3241 _mm512_mask3_fmadd_pch(__m512h __A, __m512h __B, __m512h __C, __mmask16 __U) { 3242 return (__m512h)__builtin_ia32_vfmaddcph512_mask3( 3243 (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, 3244 _MM_FROUND_CUR_DIRECTION); 3245 } 3246 3247 static __inline__ __m512h __DEFAULT_FN_ATTRS512 3248 _mm512_maskz_fmadd_pch(__mmask16 __U, __m512h __A, __m512h __B, __m512h __C) { 3249 return (__m512h)__builtin_ia32_vfmaddcph512_maskz( 3250 (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, 3251 _MM_FROUND_CUR_DIRECTION); 3252 } 3253 3254 #define _mm512_fmadd_round_pch(A, B, C, R) \ 3255 ((__m512h)__builtin_ia32_vfmaddcph512_mask3( \ 3256 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \ 3257 (__mmask16)-1, (int)(R))) 3258 3259 #define _mm512_mask_fmadd_round_pch(A, U, B, C, R) \ 3260 ((__m512h)__builtin_ia32_vfmaddcph512_mask( \ 3261 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \ 3262 (__mmask16)(U), (int)(R))) 3263 3264 #define _mm512_mask3_fmadd_round_pch(A, B, C, U, R) \ 3265 ((__m512h)__builtin_ia32_vfmaddcph512_mask3( \ 3266 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \ 3267 (__mmask16)(U), (int)(R))) 3268 3269 #define _mm512_maskz_fmadd_round_pch(U, A, B, C, R) \ 3270 ((__m512h)__builtin_ia32_vfmaddcph512_maskz( \ 3271 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \ 3272 (__mmask16)(U), (int)(R))) 3273 3274 static __inline__ _Float16 __DEFAULT_FN_ATTRS512 3275 _mm512_reduce_add_ph(__m512h __W) { 3276 return __builtin_ia32_reduce_fadd_ph512(-0.0f16, __W); 3277 } 3278 3279 static __inline__ _Float16 __DEFAULT_FN_ATTRS512 3280 _mm512_reduce_mul_ph(__m512h __W) { 3281 return __builtin_ia32_reduce_fmul_ph512(1.0f16, __W); 3282 } 3283 3284 static __inline__ _Float16 __DEFAULT_FN_ATTRS512 3285 _mm512_reduce_max_ph(__m512h __V) { 3286 return __builtin_ia32_reduce_fmax_ph512(__V); 3287 } 3288 3289 static __inline__ _Float16 __DEFAULT_FN_ATTRS512 3290 _mm512_reduce_min_ph(__m512h __V) { 3291 return __builtin_ia32_reduce_fmin_ph512(__V); 3292 } 3293 3294 static __inline__ __m512h __DEFAULT_FN_ATTRS512 3295 _mm512_mask_blend_ph(__mmask32 __U, __m512h __A, __m512h __W) { 3296 return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U, (__v32hf)__W, 3297 (__v32hf)__A); 3298 } 3299 3300 static __inline__ __m512h __DEFAULT_FN_ATTRS512 3301 _mm512_permutex2var_ph(__m512h __A, __m512i __I, __m512h __B) { 3302 return (__m512h)__builtin_ia32_vpermi2varhi512((__v32hi)__A, (__v32hi)__I, 3303 (__v32hi)__B); 3304 } 3305 3306 static __inline__ __m512h __DEFAULT_FN_ATTRS512 3307 _mm512_permutexvar_ph(__m512i __A, __m512h __B) { 3308 return (__m512h)__builtin_ia32_permvarhi512((__v32hi)__B, (__v32hi)__A); 3309 } 3310 3311 // intrinsics below are alias for f*mul_*ch 3312 #define _mm512_mul_pch(A, B) _mm512_fmul_pch(A, B) 3313 #define _mm512_mask_mul_pch(W, U, A, B) _mm512_mask_fmul_pch(W, U, A, B) 3314 #define _mm512_maskz_mul_pch(U, A, B) _mm512_maskz_fmul_pch(U, A, B) 3315 #define _mm512_mul_round_pch(A, B, R) _mm512_fmul_round_pch(A, B, R) 3316 #define _mm512_mask_mul_round_pch(W, U, A, B, R) \ 3317 _mm512_mask_fmul_round_pch(W, U, A, B, R) 3318 #define _mm512_maskz_mul_round_pch(U, A, B, R) \ 3319 _mm512_maskz_fmul_round_pch(U, A, B, R) 3320 3321 #define _mm512_cmul_pch(A, B) _mm512_fcmul_pch(A, B) 3322 #define _mm512_mask_cmul_pch(W, U, A, B) _mm512_mask_fcmul_pch(W, U, A, B) 3323 #define _mm512_maskz_cmul_pch(U, A, B) _mm512_maskz_fcmul_pch(U, A, B) 3324 #define _mm512_cmul_round_pch(A, B, R) _mm512_fcmul_round_pch(A, B, R) 3325 #define _mm512_mask_cmul_round_pch(W, U, A, B, R) \ 3326 _mm512_mask_fcmul_round_pch(W, U, A, B, R) 3327 #define _mm512_maskz_cmul_round_pch(U, A, B, R) \ 3328 _mm512_maskz_fcmul_round_pch(U, A, B, R) 3329 3330 #define _mm_mul_sch(A, B) _mm_fmul_sch(A, B) 3331 #define _mm_mask_mul_sch(W, U, A, B) _mm_mask_fmul_sch(W, U, A, B) 3332 #define _mm_maskz_mul_sch(U, A, B) _mm_maskz_fmul_sch(U, A, B) 3333 #define _mm_mul_round_sch(A, B, R) _mm_fmul_round_sch(A, B, R) 3334 #define _mm_mask_mul_round_sch(W, U, A, B, R) \ 3335 _mm_mask_fmul_round_sch(W, U, A, B, R) 3336 #define _mm_maskz_mul_round_sch(U, A, B, R) _mm_maskz_fmul_round_sch(U, A, B, R) 3337 3338 #define _mm_cmul_sch(A, B) _mm_fcmul_sch(A, B) 3339 #define _mm_mask_cmul_sch(W, U, A, B) _mm_mask_fcmul_sch(W, U, A, B) 3340 #define _mm_maskz_cmul_sch(U, A, B) _mm_maskz_fcmul_sch(U, A, B) 3341 #define _mm_cmul_round_sch(A, B, R) _mm_fcmul_round_sch(A, B, R) 3342 #define _mm_mask_cmul_round_sch(W, U, A, B, R) \ 3343 _mm_mask_fcmul_round_sch(W, U, A, B, R) 3344 #define _mm_maskz_cmul_round_sch(U, A, B, R) \ 3345 _mm_maskz_fcmul_round_sch(U, A, B, R) 3346 3347 #undef __DEFAULT_FN_ATTRS128 3348 #undef __DEFAULT_FN_ATTRS256 3349 #undef __DEFAULT_FN_ATTRS512 3350 3351 #endif 3352 #endif 3353