1 /*===----------- avx512fp16intrin.h - AVX512-FP16 intrinsics ---------------=== 2 * 3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 * See https://llvm.org/LICENSE.txt for license information. 5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 * 7 *===-----------------------------------------------------------------------=== 8 */ 9 #ifndef __IMMINTRIN_H 10 #error "Never use <avx512fp16intrin.h> directly; include <immintrin.h> instead." 11 #endif 12 13 #ifdef __SSE2__ 14 15 #ifndef __AVX512FP16INTRIN_H 16 #define __AVX512FP16INTRIN_H 17 18 /* Define the default attributes for the functions in this file. */ 19 typedef _Float16 __v32hf __attribute__((__vector_size__(64), __aligned__(64))); 20 typedef _Float16 __m512h __attribute__((__vector_size__(64), __aligned__(64))); 21 typedef _Float16 __m512h_u __attribute__((__vector_size__(64), __aligned__(1))); 22 23 /* Define the default attributes for the functions in this file. */ 24 #define __DEFAULT_FN_ATTRS512 \ 25 __attribute__((__always_inline__, __nodebug__, __target__("avx512fp16"), \ 26 __min_vector_width__(512))) 27 #define __DEFAULT_FN_ATTRS256 \ 28 __attribute__((__always_inline__, __nodebug__, __target__("avx512fp16"), \ 29 __min_vector_width__(256))) 30 #define __DEFAULT_FN_ATTRS128 \ 31 __attribute__((__always_inline__, __nodebug__, __target__("avx512fp16"), \ 32 __min_vector_width__(128))) 33 34 static __inline__ _Float16 __DEFAULT_FN_ATTRS512 _mm512_cvtsh_h(__m512h __a) { 35 return __a[0]; 36 } 37 38 static __inline __m128h __DEFAULT_FN_ATTRS128 _mm_setzero_ph(void) { 39 return (__m128h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; 40 } 41 42 static __inline __m256h __DEFAULT_FN_ATTRS256 _mm256_setzero_ph(void) { 43 return (__m256h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 44 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; 45 } 46 47 static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_undefined_ph(void) { 48 return (__m256h)__builtin_ia32_undef256(); 49 } 50 51 static __inline __m512h __DEFAULT_FN_ATTRS512 _mm512_setzero_ph(void) { 52 return (__m512h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 53 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 54 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; 55 } 56 57 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_undefined_ph(void) { 58 return (__m128h)__builtin_ia32_undef128(); 59 } 60 61 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_undefined_ph(void) { 62 return (__m512h)__builtin_ia32_undef512(); 63 } 64 65 static __inline __m512h __DEFAULT_FN_ATTRS512 _mm512_set1_ph(_Float16 __h) { 66 return (__m512h)(__v32hf){__h, __h, __h, __h, __h, __h, __h, __h, 67 __h, __h, __h, __h, __h, __h, __h, __h, 68 __h, __h, __h, __h, __h, __h, __h, __h, 69 __h, __h, __h, __h, __h, __h, __h, __h}; 70 } 71 72 static __inline __m512h __DEFAULT_FN_ATTRS512 73 _mm512_set_ph(_Float16 __h1, _Float16 __h2, _Float16 __h3, _Float16 __h4, 74 _Float16 __h5, _Float16 __h6, _Float16 __h7, _Float16 __h8, 75 _Float16 __h9, _Float16 __h10, _Float16 __h11, _Float16 __h12, 76 _Float16 __h13, _Float16 __h14, _Float16 __h15, _Float16 __h16, 77 _Float16 __h17, _Float16 __h18, _Float16 __h19, _Float16 __h20, 78 _Float16 __h21, _Float16 __h22, _Float16 __h23, _Float16 __h24, 79 _Float16 __h25, _Float16 __h26, _Float16 __h27, _Float16 __h28, 80 _Float16 __h29, _Float16 __h30, _Float16 __h31, _Float16 __h32) { 81 return (__m512h)(__v32hf){__h32, __h31, __h30, __h29, __h28, __h27, __h26, 82 __h25, __h24, __h23, __h22, __h21, __h20, __h19, 83 __h18, __h17, __h16, __h15, __h14, __h13, __h12, 84 __h11, __h10, __h9, __h8, __h7, __h6, __h5, 85 __h4, __h3, __h2, __h1}; 86 } 87 88 #define _mm512_setr_ph(h1, h2, h3, h4, h5, h6, h7, h8, h9, h10, h11, h12, h13, \ 89 h14, h15, h16, h17, h18, h19, h20, h21, h22, h23, h24, \ 90 h25, h26, h27, h28, h29, h30, h31, h32) \ 91 _mm512_set_ph((h32), (h31), (h30), (h29), (h28), (h27), (h26), (h25), (h24), \ 92 (h23), (h22), (h21), (h20), (h19), (h18), (h17), (h16), (h15), \ 93 (h14), (h13), (h12), (h11), (h10), (h9), (h8), (h7), (h6), \ 94 (h5), (h4), (h3), (h2), (h1)) 95 96 static __inline __m512h __DEFAULT_FN_ATTRS512 97 _mm512_set1_pch(_Float16 _Complex h) { 98 return (__m512h)_mm512_set1_ps(__builtin_bit_cast(float, h)); 99 } 100 101 static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_castph_ps(__m128h __a) { 102 return (__m128)__a; 103 } 104 105 static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_castph_ps(__m256h __a) { 106 return (__m256)__a; 107 } 108 109 static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_castph_ps(__m512h __a) { 110 return (__m512)__a; 111 } 112 113 static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_castph_pd(__m128h __a) { 114 return (__m128d)__a; 115 } 116 117 static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_castph_pd(__m256h __a) { 118 return (__m256d)__a; 119 } 120 121 static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_castph_pd(__m512h __a) { 122 return (__m512d)__a; 123 } 124 125 static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_castph_si128(__m128h __a) { 126 return (__m128i)__a; 127 } 128 129 static __inline__ __m256i __DEFAULT_FN_ATTRS256 130 _mm256_castph_si256(__m256h __a) { 131 return (__m256i)__a; 132 } 133 134 static __inline__ __m512i __DEFAULT_FN_ATTRS512 135 _mm512_castph_si512(__m512h __a) { 136 return (__m512i)__a; 137 } 138 139 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_castps_ph(__m128 __a) { 140 return (__m128h)__a; 141 } 142 143 static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_castps_ph(__m256 __a) { 144 return (__m256h)__a; 145 } 146 147 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_castps_ph(__m512 __a) { 148 return (__m512h)__a; 149 } 150 151 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_castpd_ph(__m128d __a) { 152 return (__m128h)__a; 153 } 154 155 static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_castpd_ph(__m256d __a) { 156 return (__m256h)__a; 157 } 158 159 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_castpd_ph(__m512d __a) { 160 return (__m512h)__a; 161 } 162 163 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_castsi128_ph(__m128i __a) { 164 return (__m128h)__a; 165 } 166 167 static __inline__ __m256h __DEFAULT_FN_ATTRS256 168 _mm256_castsi256_ph(__m256i __a) { 169 return (__m256h)__a; 170 } 171 172 static __inline__ __m512h __DEFAULT_FN_ATTRS512 173 _mm512_castsi512_ph(__m512i __a) { 174 return (__m512h)__a; 175 } 176 177 static __inline__ __m128h __DEFAULT_FN_ATTRS256 178 _mm256_castph256_ph128(__m256h __a) { 179 return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7); 180 } 181 182 static __inline__ __m128h __DEFAULT_FN_ATTRS512 183 _mm512_castph512_ph128(__m512h __a) { 184 return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7); 185 } 186 187 static __inline__ __m256h __DEFAULT_FN_ATTRS512 188 _mm512_castph512_ph256(__m512h __a) { 189 return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 190 12, 13, 14, 15); 191 } 192 193 static __inline__ __m256h __DEFAULT_FN_ATTRS256 194 _mm256_castph128_ph256(__m128h __a) { 195 return __builtin_shufflevector(__a, __builtin_nondeterministic_value(__a), 196 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); 197 } 198 199 static __inline__ __m512h __DEFAULT_FN_ATTRS512 200 _mm512_castph128_ph512(__m128h __a) { 201 __m256h __b = __builtin_nondeterministic_value(__b); 202 return __builtin_shufflevector( 203 __builtin_shufflevector(__a, __builtin_nondeterministic_value(__a), 204 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), 205 __b, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 206 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); 207 } 208 209 static __inline__ __m512h __DEFAULT_FN_ATTRS512 210 _mm512_castph256_ph512(__m256h __a) { 211 return __builtin_shufflevector(__a, __builtin_nondeterministic_value(__a), 0, 212 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 213 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 214 27, 28, 29, 30, 31); 215 } 216 217 /// Constructs a 256-bit floating-point vector of [16 x half] from a 218 /// 128-bit floating-point vector of [8 x half]. The lower 128 bits 219 /// contain the value of the source vector. The upper 384 bits are set 220 /// to zero. 221 /// 222 /// \headerfile <x86intrin.h> 223 /// 224 /// This intrinsic has no corresponding instruction. 225 /// 226 /// \param __a 227 /// A 128-bit vector of [8 x half]. 228 /// \returns A 512-bit floating-point vector of [16 x half]. The lower 128 bits 229 /// contain the value of the parameter. The upper 384 bits are set to zero. 230 static __inline__ __m256h __DEFAULT_FN_ATTRS256 231 _mm256_zextph128_ph256(__m128h __a) { 232 return __builtin_shufflevector(__a, (__v8hf)_mm_setzero_ph(), 0, 1, 2, 3, 4, 233 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); 234 } 235 236 /// Constructs a 512-bit floating-point vector of [32 x half] from a 237 /// 128-bit floating-point vector of [8 x half]. The lower 128 bits 238 /// contain the value of the source vector. The upper 384 bits are set 239 /// to zero. 240 /// 241 /// \headerfile <x86intrin.h> 242 /// 243 /// This intrinsic has no corresponding instruction. 244 /// 245 /// \param __a 246 /// A 128-bit vector of [8 x half]. 247 /// \returns A 512-bit floating-point vector of [32 x half]. The lower 128 bits 248 /// contain the value of the parameter. The upper 384 bits are set to zero. 249 static __inline__ __m512h __DEFAULT_FN_ATTRS512 250 _mm512_zextph128_ph512(__m128h __a) { 251 return __builtin_shufflevector( 252 __a, (__v8hf)_mm_setzero_ph(), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 253 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15); 254 } 255 256 /// Constructs a 512-bit floating-point vector of [32 x half] from a 257 /// 256-bit floating-point vector of [16 x half]. The lower 256 bits 258 /// contain the value of the source vector. The upper 256 bits are set 259 /// to zero. 260 /// 261 /// \headerfile <x86intrin.h> 262 /// 263 /// This intrinsic has no corresponding instruction. 264 /// 265 /// \param __a 266 /// A 256-bit vector of [16 x half]. 267 /// \returns A 512-bit floating-point vector of [32 x half]. The lower 256 bits 268 /// contain the value of the parameter. The upper 256 bits are set to zero. 269 static __inline__ __m512h __DEFAULT_FN_ATTRS512 270 _mm512_zextph256_ph512(__m256h __a) { 271 return __builtin_shufflevector(__a, (__v16hf)_mm256_setzero_ph(), 0, 1, 2, 3, 272 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 273 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 274 29, 30, 31); 275 } 276 277 #define _mm_comi_round_sh(A, B, P, R) \ 278 __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, (int)(P), (int)(R)) 279 280 #define _mm_comi_sh(A, B, pred) \ 281 _mm_comi_round_sh((A), (B), (pred), _MM_FROUND_CUR_DIRECTION) 282 283 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comieq_sh(__m128h A, 284 __m128h B) { 285 return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_EQ_OS, 286 _MM_FROUND_CUR_DIRECTION); 287 } 288 289 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comilt_sh(__m128h A, 290 __m128h B) { 291 return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_LT_OS, 292 _MM_FROUND_CUR_DIRECTION); 293 } 294 295 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comile_sh(__m128h A, 296 __m128h B) { 297 return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_LE_OS, 298 _MM_FROUND_CUR_DIRECTION); 299 } 300 301 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comigt_sh(__m128h A, 302 __m128h B) { 303 return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_GT_OS, 304 _MM_FROUND_CUR_DIRECTION); 305 } 306 307 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comige_sh(__m128h A, 308 __m128h B) { 309 return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_GE_OS, 310 _MM_FROUND_CUR_DIRECTION); 311 } 312 313 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comineq_sh(__m128h A, 314 __m128h B) { 315 return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_NEQ_US, 316 _MM_FROUND_CUR_DIRECTION); 317 } 318 319 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomieq_sh(__m128h A, 320 __m128h B) { 321 return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_EQ_OQ, 322 _MM_FROUND_CUR_DIRECTION); 323 } 324 325 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomilt_sh(__m128h A, 326 __m128h B) { 327 return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_LT_OQ, 328 _MM_FROUND_CUR_DIRECTION); 329 } 330 331 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomile_sh(__m128h A, 332 __m128h B) { 333 return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_LE_OQ, 334 _MM_FROUND_CUR_DIRECTION); 335 } 336 337 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomigt_sh(__m128h A, 338 __m128h B) { 339 return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_GT_OQ, 340 _MM_FROUND_CUR_DIRECTION); 341 } 342 343 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomige_sh(__m128h A, 344 __m128h B) { 345 return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_GE_OQ, 346 _MM_FROUND_CUR_DIRECTION); 347 } 348 349 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomineq_sh(__m128h A, 350 __m128h B) { 351 return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_NEQ_UQ, 352 _MM_FROUND_CUR_DIRECTION); 353 } 354 355 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_add_ph(__m512h __A, 356 __m512h __B) { 357 return (__m512h)((__v32hf)__A + (__v32hf)__B); 358 } 359 360 static __inline__ __m512h __DEFAULT_FN_ATTRS512 361 _mm512_mask_add_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) { 362 return (__m512h)__builtin_ia32_selectph_512( 363 (__mmask32)__U, (__v32hf)_mm512_add_ph(__A, __B), (__v32hf)__W); 364 } 365 366 static __inline__ __m512h __DEFAULT_FN_ATTRS512 367 _mm512_maskz_add_ph(__mmask32 __U, __m512h __A, __m512h __B) { 368 return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U, 369 (__v32hf)_mm512_add_ph(__A, __B), 370 (__v32hf)_mm512_setzero_ph()); 371 } 372 373 #define _mm512_add_round_ph(A, B, R) \ 374 ((__m512h)__builtin_ia32_addph512((__v32hf)(__m512h)(A), \ 375 (__v32hf)(__m512h)(B), (int)(R))) 376 377 #define _mm512_mask_add_round_ph(W, U, A, B, R) \ 378 ((__m512h)__builtin_ia32_selectph_512( \ 379 (__mmask32)(U), (__v32hf)_mm512_add_round_ph((A), (B), (R)), \ 380 (__v32hf)(__m512h)(W))) 381 382 #define _mm512_maskz_add_round_ph(U, A, B, R) \ 383 ((__m512h)__builtin_ia32_selectph_512( \ 384 (__mmask32)(U), (__v32hf)_mm512_add_round_ph((A), (B), (R)), \ 385 (__v32hf)_mm512_setzero_ph())) 386 387 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_sub_ph(__m512h __A, 388 __m512h __B) { 389 return (__m512h)((__v32hf)__A - (__v32hf)__B); 390 } 391 392 static __inline__ __m512h __DEFAULT_FN_ATTRS512 393 _mm512_mask_sub_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) { 394 return (__m512h)__builtin_ia32_selectph_512( 395 (__mmask32)__U, (__v32hf)_mm512_sub_ph(__A, __B), (__v32hf)__W); 396 } 397 398 static __inline__ __m512h __DEFAULT_FN_ATTRS512 399 _mm512_maskz_sub_ph(__mmask32 __U, __m512h __A, __m512h __B) { 400 return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U, 401 (__v32hf)_mm512_sub_ph(__A, __B), 402 (__v32hf)_mm512_setzero_ph()); 403 } 404 405 #define _mm512_sub_round_ph(A, B, R) \ 406 ((__m512h)__builtin_ia32_subph512((__v32hf)(__m512h)(A), \ 407 (__v32hf)(__m512h)(B), (int)(R))) 408 409 #define _mm512_mask_sub_round_ph(W, U, A, B, R) \ 410 ((__m512h)__builtin_ia32_selectph_512( \ 411 (__mmask32)(U), (__v32hf)_mm512_sub_round_ph((A), (B), (R)), \ 412 (__v32hf)(__m512h)(W))) 413 414 #define _mm512_maskz_sub_round_ph(U, A, B, R) \ 415 ((__m512h)__builtin_ia32_selectph_512( \ 416 (__mmask32)(U), (__v32hf)_mm512_sub_round_ph((A), (B), (R)), \ 417 (__v32hf)_mm512_setzero_ph())) 418 419 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_mul_ph(__m512h __A, 420 __m512h __B) { 421 return (__m512h)((__v32hf)__A * (__v32hf)__B); 422 } 423 424 static __inline__ __m512h __DEFAULT_FN_ATTRS512 425 _mm512_mask_mul_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) { 426 return (__m512h)__builtin_ia32_selectph_512( 427 (__mmask32)__U, (__v32hf)_mm512_mul_ph(__A, __B), (__v32hf)__W); 428 } 429 430 static __inline__ __m512h __DEFAULT_FN_ATTRS512 431 _mm512_maskz_mul_ph(__mmask32 __U, __m512h __A, __m512h __B) { 432 return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U, 433 (__v32hf)_mm512_mul_ph(__A, __B), 434 (__v32hf)_mm512_setzero_ph()); 435 } 436 437 #define _mm512_mul_round_ph(A, B, R) \ 438 ((__m512h)__builtin_ia32_mulph512((__v32hf)(__m512h)(A), \ 439 (__v32hf)(__m512h)(B), (int)(R))) 440 441 #define _mm512_mask_mul_round_ph(W, U, A, B, R) \ 442 ((__m512h)__builtin_ia32_selectph_512( \ 443 (__mmask32)(U), (__v32hf)_mm512_mul_round_ph((A), (B), (R)), \ 444 (__v32hf)(__m512h)(W))) 445 446 #define _mm512_maskz_mul_round_ph(U, A, B, R) \ 447 ((__m512h)__builtin_ia32_selectph_512( \ 448 (__mmask32)(U), (__v32hf)_mm512_mul_round_ph((A), (B), (R)), \ 449 (__v32hf)_mm512_setzero_ph())) 450 451 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_div_ph(__m512h __A, 452 __m512h __B) { 453 return (__m512h)((__v32hf)__A / (__v32hf)__B); 454 } 455 456 static __inline__ __m512h __DEFAULT_FN_ATTRS512 457 _mm512_mask_div_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) { 458 return (__m512h)__builtin_ia32_selectph_512( 459 (__mmask32)__U, (__v32hf)_mm512_div_ph(__A, __B), (__v32hf)__W); 460 } 461 462 static __inline__ __m512h __DEFAULT_FN_ATTRS512 463 _mm512_maskz_div_ph(__mmask32 __U, __m512h __A, __m512h __B) { 464 return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U, 465 (__v32hf)_mm512_div_ph(__A, __B), 466 (__v32hf)_mm512_setzero_ph()); 467 } 468 469 #define _mm512_div_round_ph(A, B, R) \ 470 ((__m512h)__builtin_ia32_divph512((__v32hf)(__m512h)(A), \ 471 (__v32hf)(__m512h)(B), (int)(R))) 472 473 #define _mm512_mask_div_round_ph(W, U, A, B, R) \ 474 ((__m512h)__builtin_ia32_selectph_512( \ 475 (__mmask32)(U), (__v32hf)_mm512_div_round_ph((A), (B), (R)), \ 476 (__v32hf)(__m512h)(W))) 477 478 #define _mm512_maskz_div_round_ph(U, A, B, R) \ 479 ((__m512h)__builtin_ia32_selectph_512( \ 480 (__mmask32)(U), (__v32hf)_mm512_div_round_ph((A), (B), (R)), \ 481 (__v32hf)_mm512_setzero_ph())) 482 483 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_min_ph(__m512h __A, 484 __m512h __B) { 485 return (__m512h)__builtin_ia32_minph512((__v32hf)__A, (__v32hf)__B, 486 _MM_FROUND_CUR_DIRECTION); 487 } 488 489 static __inline__ __m512h __DEFAULT_FN_ATTRS512 490 _mm512_mask_min_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) { 491 return (__m512h)__builtin_ia32_selectph_512( 492 (__mmask32)__U, (__v32hf)_mm512_min_ph(__A, __B), (__v32hf)__W); 493 } 494 495 static __inline__ __m512h __DEFAULT_FN_ATTRS512 496 _mm512_maskz_min_ph(__mmask32 __U, __m512h __A, __m512h __B) { 497 return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U, 498 (__v32hf)_mm512_min_ph(__A, __B), 499 (__v32hf)_mm512_setzero_ph()); 500 } 501 502 #define _mm512_min_round_ph(A, B, R) \ 503 ((__m512h)__builtin_ia32_minph512((__v32hf)(__m512h)(A), \ 504 (__v32hf)(__m512h)(B), (int)(R))) 505 506 #define _mm512_mask_min_round_ph(W, U, A, B, R) \ 507 ((__m512h)__builtin_ia32_selectph_512( \ 508 (__mmask32)(U), (__v32hf)_mm512_min_round_ph((A), (B), (R)), \ 509 (__v32hf)(__m512h)(W))) 510 511 #define _mm512_maskz_min_round_ph(U, A, B, R) \ 512 ((__m512h)__builtin_ia32_selectph_512( \ 513 (__mmask32)(U), (__v32hf)_mm512_min_round_ph((A), (B), (R)), \ 514 (__v32hf)_mm512_setzero_ph())) 515 516 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_max_ph(__m512h __A, 517 __m512h __B) { 518 return (__m512h)__builtin_ia32_maxph512((__v32hf)__A, (__v32hf)__B, 519 _MM_FROUND_CUR_DIRECTION); 520 } 521 522 static __inline__ __m512h __DEFAULT_FN_ATTRS512 523 _mm512_mask_max_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) { 524 return (__m512h)__builtin_ia32_selectph_512( 525 (__mmask32)__U, (__v32hf)_mm512_max_ph(__A, __B), (__v32hf)__W); 526 } 527 528 static __inline__ __m512h __DEFAULT_FN_ATTRS512 529 _mm512_maskz_max_ph(__mmask32 __U, __m512h __A, __m512h __B) { 530 return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U, 531 (__v32hf)_mm512_max_ph(__A, __B), 532 (__v32hf)_mm512_setzero_ph()); 533 } 534 535 #define _mm512_max_round_ph(A, B, R) \ 536 ((__m512h)__builtin_ia32_maxph512((__v32hf)(__m512h)(A), \ 537 (__v32hf)(__m512h)(B), (int)(R))) 538 539 #define _mm512_mask_max_round_ph(W, U, A, B, R) \ 540 ((__m512h)__builtin_ia32_selectph_512( \ 541 (__mmask32)(U), (__v32hf)_mm512_max_round_ph((A), (B), (R)), \ 542 (__v32hf)(__m512h)(W))) 543 544 #define _mm512_maskz_max_round_ph(U, A, B, R) \ 545 ((__m512h)__builtin_ia32_selectph_512( \ 546 (__mmask32)(U), (__v32hf)_mm512_max_round_ph((A), (B), (R)), \ 547 (__v32hf)_mm512_setzero_ph())) 548 549 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_abs_ph(__m512h __A) { 550 return (__m512h)_mm512_and_epi32(_mm512_set1_epi32(0x7FFF7FFF), (__m512i)__A); 551 } 552 553 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_conj_pch(__m512h __A) { 554 return (__m512h)_mm512_xor_ps((__m512)__A, _mm512_set1_ps(-0.0f)); 555 } 556 557 static __inline__ __m512h __DEFAULT_FN_ATTRS512 558 _mm512_mask_conj_pch(__m512h __W, __mmask16 __U, __m512h __A) { 559 return (__m512h)__builtin_ia32_selectps_512( 560 (__mmask16)__U, (__v16sf)_mm512_conj_pch(__A), (__v16sf)__W); 561 } 562 563 static __inline__ __m512h __DEFAULT_FN_ATTRS512 564 _mm512_maskz_conj_pch(__mmask16 __U, __m512h __A) { 565 return (__m512h)__builtin_ia32_selectps_512((__mmask16)__U, 566 (__v16sf)_mm512_conj_pch(__A), 567 (__v16sf)_mm512_setzero_ps()); 568 } 569 570 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_add_sh(__m128h __A, 571 __m128h __B) { 572 __A[0] += __B[0]; 573 return __A; 574 } 575 576 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_add_sh(__m128h __W, 577 __mmask8 __U, 578 __m128h __A, 579 __m128h __B) { 580 __A = _mm_add_sh(__A, __B); 581 return __builtin_ia32_selectsh_128(__U, __A, __W); 582 } 583 584 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_add_sh(__mmask8 __U, 585 __m128h __A, 586 __m128h __B) { 587 __A = _mm_add_sh(__A, __B); 588 return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph()); 589 } 590 591 #define _mm_add_round_sh(A, B, R) \ 592 ((__m128h)__builtin_ia32_addsh_round_mask( \ 593 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ 594 (__mmask8)-1, (int)(R))) 595 596 #define _mm_mask_add_round_sh(W, U, A, B, R) \ 597 ((__m128h)__builtin_ia32_addsh_round_mask( \ 598 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \ 599 (__mmask8)(U), (int)(R))) 600 601 #define _mm_maskz_add_round_sh(U, A, B, R) \ 602 ((__m128h)__builtin_ia32_addsh_round_mask( \ 603 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ 604 (__mmask8)(U), (int)(R))) 605 606 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sub_sh(__m128h __A, 607 __m128h __B) { 608 __A[0] -= __B[0]; 609 return __A; 610 } 611 612 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sub_sh(__m128h __W, 613 __mmask8 __U, 614 __m128h __A, 615 __m128h __B) { 616 __A = _mm_sub_sh(__A, __B); 617 return __builtin_ia32_selectsh_128(__U, __A, __W); 618 } 619 620 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sub_sh(__mmask8 __U, 621 __m128h __A, 622 __m128h __B) { 623 __A = _mm_sub_sh(__A, __B); 624 return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph()); 625 } 626 627 #define _mm_sub_round_sh(A, B, R) \ 628 ((__m128h)__builtin_ia32_subsh_round_mask( \ 629 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ 630 (__mmask8)-1, (int)(R))) 631 632 #define _mm_mask_sub_round_sh(W, U, A, B, R) \ 633 ((__m128h)__builtin_ia32_subsh_round_mask( \ 634 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \ 635 (__mmask8)(U), (int)(R))) 636 637 #define _mm_maskz_sub_round_sh(U, A, B, R) \ 638 ((__m128h)__builtin_ia32_subsh_round_mask( \ 639 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ 640 (__mmask8)(U), (int)(R))) 641 642 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mul_sh(__m128h __A, 643 __m128h __B) { 644 __A[0] *= __B[0]; 645 return __A; 646 } 647 648 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_mul_sh(__m128h __W, 649 __mmask8 __U, 650 __m128h __A, 651 __m128h __B) { 652 __A = _mm_mul_sh(__A, __B); 653 return __builtin_ia32_selectsh_128(__U, __A, __W); 654 } 655 656 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_mul_sh(__mmask8 __U, 657 __m128h __A, 658 __m128h __B) { 659 __A = _mm_mul_sh(__A, __B); 660 return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph()); 661 } 662 663 #define _mm_mul_round_sh(A, B, R) \ 664 ((__m128h)__builtin_ia32_mulsh_round_mask( \ 665 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ 666 (__mmask8)-1, (int)(R))) 667 668 #define _mm_mask_mul_round_sh(W, U, A, B, R) \ 669 ((__m128h)__builtin_ia32_mulsh_round_mask( \ 670 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \ 671 (__mmask8)(U), (int)(R))) 672 673 #define _mm_maskz_mul_round_sh(U, A, B, R) \ 674 ((__m128h)__builtin_ia32_mulsh_round_mask( \ 675 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ 676 (__mmask8)(U), (int)(R))) 677 678 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_div_sh(__m128h __A, 679 __m128h __B) { 680 __A[0] /= __B[0]; 681 return __A; 682 } 683 684 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_div_sh(__m128h __W, 685 __mmask8 __U, 686 __m128h __A, 687 __m128h __B) { 688 __A = _mm_div_sh(__A, __B); 689 return __builtin_ia32_selectsh_128(__U, __A, __W); 690 } 691 692 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_div_sh(__mmask8 __U, 693 __m128h __A, 694 __m128h __B) { 695 __A = _mm_div_sh(__A, __B); 696 return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph()); 697 } 698 699 #define _mm_div_round_sh(A, B, R) \ 700 ((__m128h)__builtin_ia32_divsh_round_mask( \ 701 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ 702 (__mmask8)-1, (int)(R))) 703 704 #define _mm_mask_div_round_sh(W, U, A, B, R) \ 705 ((__m128h)__builtin_ia32_divsh_round_mask( \ 706 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \ 707 (__mmask8)(U), (int)(R))) 708 709 #define _mm_maskz_div_round_sh(U, A, B, R) \ 710 ((__m128h)__builtin_ia32_divsh_round_mask( \ 711 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ 712 (__mmask8)(U), (int)(R))) 713 714 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_min_sh(__m128h __A, 715 __m128h __B) { 716 return (__m128h)__builtin_ia32_minsh_round_mask( 717 (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1, 718 _MM_FROUND_CUR_DIRECTION); 719 } 720 721 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_min_sh(__m128h __W, 722 __mmask8 __U, 723 __m128h __A, 724 __m128h __B) { 725 return (__m128h)__builtin_ia32_minsh_round_mask((__v8hf)__A, (__v8hf)__B, 726 (__v8hf)__W, (__mmask8)__U, 727 _MM_FROUND_CUR_DIRECTION); 728 } 729 730 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_min_sh(__mmask8 __U, 731 __m128h __A, 732 __m128h __B) { 733 return (__m128h)__builtin_ia32_minsh_round_mask( 734 (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U, 735 _MM_FROUND_CUR_DIRECTION); 736 } 737 738 #define _mm_min_round_sh(A, B, R) \ 739 ((__m128h)__builtin_ia32_minsh_round_mask( \ 740 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ 741 (__mmask8)-1, (int)(R))) 742 743 #define _mm_mask_min_round_sh(W, U, A, B, R) \ 744 ((__m128h)__builtin_ia32_minsh_round_mask( \ 745 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \ 746 (__mmask8)(U), (int)(R))) 747 748 #define _mm_maskz_min_round_sh(U, A, B, R) \ 749 ((__m128h)__builtin_ia32_minsh_round_mask( \ 750 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ 751 (__mmask8)(U), (int)(R))) 752 753 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_max_sh(__m128h __A, 754 __m128h __B) { 755 return (__m128h)__builtin_ia32_maxsh_round_mask( 756 (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1, 757 _MM_FROUND_CUR_DIRECTION); 758 } 759 760 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_max_sh(__m128h __W, 761 __mmask8 __U, 762 __m128h __A, 763 __m128h __B) { 764 return (__m128h)__builtin_ia32_maxsh_round_mask((__v8hf)__A, (__v8hf)__B, 765 (__v8hf)__W, (__mmask8)__U, 766 _MM_FROUND_CUR_DIRECTION); 767 } 768 769 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_max_sh(__mmask8 __U, 770 __m128h __A, 771 __m128h __B) { 772 return (__m128h)__builtin_ia32_maxsh_round_mask( 773 (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U, 774 _MM_FROUND_CUR_DIRECTION); 775 } 776 777 #define _mm_max_round_sh(A, B, R) \ 778 ((__m128h)__builtin_ia32_maxsh_round_mask( \ 779 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ 780 (__mmask8)-1, (int)(R))) 781 782 #define _mm_mask_max_round_sh(W, U, A, B, R) \ 783 ((__m128h)__builtin_ia32_maxsh_round_mask( \ 784 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \ 785 (__mmask8)(U), (int)(R))) 786 787 #define _mm_maskz_max_round_sh(U, A, B, R) \ 788 ((__m128h)__builtin_ia32_maxsh_round_mask( \ 789 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ 790 (__mmask8)(U), (int)(R))) 791 792 #define _mm512_cmp_round_ph_mask(A, B, P, R) \ 793 ((__mmask32)__builtin_ia32_cmpph512_mask((__v32hf)(__m512h)(A), \ 794 (__v32hf)(__m512h)(B), (int)(P), \ 795 (__mmask32)-1, (int)(R))) 796 797 #define _mm512_mask_cmp_round_ph_mask(U, A, B, P, R) \ 798 ((__mmask32)__builtin_ia32_cmpph512_mask((__v32hf)(__m512h)(A), \ 799 (__v32hf)(__m512h)(B), (int)(P), \ 800 (__mmask32)(U), (int)(R))) 801 802 #define _mm512_cmp_ph_mask(A, B, P) \ 803 _mm512_cmp_round_ph_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION) 804 805 #define _mm512_mask_cmp_ph_mask(U, A, B, P) \ 806 _mm512_mask_cmp_round_ph_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION) 807 808 #define _mm_cmp_round_sh_mask(X, Y, P, R) \ 809 ((__mmask8)__builtin_ia32_cmpsh_mask((__v8hf)(__m128h)(X), \ 810 (__v8hf)(__m128h)(Y), (int)(P), \ 811 (__mmask8)-1, (int)(R))) 812 813 #define _mm_mask_cmp_round_sh_mask(M, X, Y, P, R) \ 814 ((__mmask8)__builtin_ia32_cmpsh_mask((__v8hf)(__m128h)(X), \ 815 (__v8hf)(__m128h)(Y), (int)(P), \ 816 (__mmask8)(M), (int)(R))) 817 818 #define _mm_cmp_sh_mask(X, Y, P) \ 819 ((__mmask8)__builtin_ia32_cmpsh_mask( \ 820 (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), (int)(P), (__mmask8)-1, \ 821 _MM_FROUND_CUR_DIRECTION)) 822 823 #define _mm_mask_cmp_sh_mask(M, X, Y, P) \ 824 ((__mmask8)__builtin_ia32_cmpsh_mask( \ 825 (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), (int)(P), (__mmask8)(M), \ 826 _MM_FROUND_CUR_DIRECTION)) 827 // loads with vmovsh: 828 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_load_sh(void const *__dp) { 829 struct __mm_load_sh_struct { 830 _Float16 __u; 831 } __attribute__((__packed__, __may_alias__)); 832 _Float16 __u = ((const struct __mm_load_sh_struct *)__dp)->__u; 833 return (__m128h){__u, 0, 0, 0, 0, 0, 0, 0}; 834 } 835 836 static __inline__ __m128h __DEFAULT_FN_ATTRS128 837 _mm_mask_load_sh(__m128h __W, __mmask8 __U, const void *__A) { 838 __m128h src = (__v8hf)__builtin_shufflevector( 839 (__v8hf)__W, (__v8hf)_mm_setzero_ph(), 0, 8, 8, 8, 8, 8, 8, 8); 840 841 return (__m128h)__builtin_ia32_loadsh128_mask((const __v8hf *)__A, src, __U & 1); 842 } 843 844 static __inline__ __m128h __DEFAULT_FN_ATTRS128 845 _mm_maskz_load_sh(__mmask8 __U, const void *__A) { 846 return (__m128h)__builtin_ia32_loadsh128_mask( 847 (const __v8hf *)__A, (__v8hf)_mm_setzero_ph(), __U & 1); 848 } 849 850 static __inline__ __m512h __DEFAULT_FN_ATTRS512 851 _mm512_load_ph(void const *__p) { 852 return *(const __m512h *)__p; 853 } 854 855 static __inline__ __m256h __DEFAULT_FN_ATTRS256 856 _mm256_load_ph(void const *__p) { 857 return *(const __m256h *)__p; 858 } 859 860 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_load_ph(void const *__p) { 861 return *(const __m128h *)__p; 862 } 863 864 static __inline__ __m512h __DEFAULT_FN_ATTRS512 865 _mm512_loadu_ph(void const *__p) { 866 struct __loadu_ph { 867 __m512h_u __v; 868 } __attribute__((__packed__, __may_alias__)); 869 return ((const struct __loadu_ph *)__p)->__v; 870 } 871 872 static __inline__ __m256h __DEFAULT_FN_ATTRS256 873 _mm256_loadu_ph(void const *__p) { 874 struct __loadu_ph { 875 __m256h_u __v; 876 } __attribute__((__packed__, __may_alias__)); 877 return ((const struct __loadu_ph *)__p)->__v; 878 } 879 880 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_loadu_ph(void const *__p) { 881 struct __loadu_ph { 882 __m128h_u __v; 883 } __attribute__((__packed__, __may_alias__)); 884 return ((const struct __loadu_ph *)__p)->__v; 885 } 886 887 // stores with vmovsh: 888 static __inline__ void __DEFAULT_FN_ATTRS128 _mm_store_sh(void *__dp, 889 __m128h __a) { 890 struct __mm_store_sh_struct { 891 _Float16 __u; 892 } __attribute__((__packed__, __may_alias__)); 893 ((struct __mm_store_sh_struct *)__dp)->__u = __a[0]; 894 } 895 896 static __inline__ void __DEFAULT_FN_ATTRS128 _mm_mask_store_sh(void *__W, 897 __mmask8 __U, 898 __m128h __A) { 899 __builtin_ia32_storesh128_mask((__v8hf *)__W, __A, __U & 1); 900 } 901 902 static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_store_ph(void *__P, 903 __m512h __A) { 904 *(__m512h *)__P = __A; 905 } 906 907 static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_store_ph(void *__P, 908 __m256h __A) { 909 *(__m256h *)__P = __A; 910 } 911 912 static __inline__ void __DEFAULT_FN_ATTRS128 _mm_store_ph(void *__P, 913 __m128h __A) { 914 *(__m128h *)__P = __A; 915 } 916 917 static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_storeu_ph(void *__P, 918 __m512h __A) { 919 struct __storeu_ph { 920 __m512h_u __v; 921 } __attribute__((__packed__, __may_alias__)); 922 ((struct __storeu_ph *)__P)->__v = __A; 923 } 924 925 static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_storeu_ph(void *__P, 926 __m256h __A) { 927 struct __storeu_ph { 928 __m256h_u __v; 929 } __attribute__((__packed__, __may_alias__)); 930 ((struct __storeu_ph *)__P)->__v = __A; 931 } 932 933 static __inline__ void __DEFAULT_FN_ATTRS128 _mm_storeu_ph(void *__P, 934 __m128h __A) { 935 struct __storeu_ph { 936 __m128h_u __v; 937 } __attribute__((__packed__, __may_alias__)); 938 ((struct __storeu_ph *)__P)->__v = __A; 939 } 940 941 // moves with vmovsh: 942 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_move_sh(__m128h __a, 943 __m128h __b) { 944 __a[0] = __b[0]; 945 return __a; 946 } 947 948 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_move_sh(__m128h __W, 949 __mmask8 __U, 950 __m128h __A, 951 __m128h __B) { 952 return __builtin_ia32_selectsh_128(__U, _mm_move_sh(__A, __B), __W); 953 } 954 955 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_move_sh(__mmask8 __U, 956 __m128h __A, 957 __m128h __B) { 958 return __builtin_ia32_selectsh_128(__U, _mm_move_sh(__A, __B), 959 _mm_setzero_ph()); 960 } 961 962 // vmovw: 963 static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtsi16_si128(short __a) { 964 return (__m128i)(__v8hi){__a, 0, 0, 0, 0, 0, 0, 0}; 965 } 966 967 static __inline__ short __DEFAULT_FN_ATTRS128 _mm_cvtsi128_si16(__m128i __a) { 968 __v8hi __b = (__v8hi)__a; 969 return __b[0]; 970 } 971 972 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_rcp_ph(__m512h __A) { 973 return (__m512h)__builtin_ia32_rcpph512_mask( 974 (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1); 975 } 976 977 static __inline__ __m512h __DEFAULT_FN_ATTRS512 978 _mm512_mask_rcp_ph(__m512h __W, __mmask32 __U, __m512h __A) { 979 return (__m512h)__builtin_ia32_rcpph512_mask((__v32hf)__A, (__v32hf)__W, 980 (__mmask32)__U); 981 } 982 983 static __inline__ __m512h __DEFAULT_FN_ATTRS512 984 _mm512_maskz_rcp_ph(__mmask32 __U, __m512h __A) { 985 return (__m512h)__builtin_ia32_rcpph512_mask( 986 (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U); 987 } 988 989 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_rsqrt_ph(__m512h __A) { 990 return (__m512h)__builtin_ia32_rsqrtph512_mask( 991 (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1); 992 } 993 994 static __inline__ __m512h __DEFAULT_FN_ATTRS512 995 _mm512_mask_rsqrt_ph(__m512h __W, __mmask32 __U, __m512h __A) { 996 return (__m512h)__builtin_ia32_rsqrtph512_mask((__v32hf)__A, (__v32hf)__W, 997 (__mmask32)__U); 998 } 999 1000 static __inline__ __m512h __DEFAULT_FN_ATTRS512 1001 _mm512_maskz_rsqrt_ph(__mmask32 __U, __m512h __A) { 1002 return (__m512h)__builtin_ia32_rsqrtph512_mask( 1003 (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U); 1004 } 1005 1006 #define _mm512_getmant_ph(A, B, C) \ 1007 ((__m512h)__builtin_ia32_getmantph512_mask( \ 1008 (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), \ 1009 (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, \ 1010 _MM_FROUND_CUR_DIRECTION)) 1011 1012 #define _mm512_mask_getmant_ph(W, U, A, B, C) \ 1013 ((__m512h)__builtin_ia32_getmantph512_mask( \ 1014 (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), (__v32hf)(__m512h)(W), \ 1015 (__mmask32)(U), _MM_FROUND_CUR_DIRECTION)) 1016 1017 #define _mm512_maskz_getmant_ph(U, A, B, C) \ 1018 ((__m512h)__builtin_ia32_getmantph512_mask( \ 1019 (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), \ 1020 (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), _MM_FROUND_CUR_DIRECTION)) 1021 1022 #define _mm512_getmant_round_ph(A, B, C, R) \ 1023 ((__m512h)__builtin_ia32_getmantph512_mask( \ 1024 (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), \ 1025 (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, (int)(R))) 1026 1027 #define _mm512_mask_getmant_round_ph(W, U, A, B, C, R) \ 1028 ((__m512h)__builtin_ia32_getmantph512_mask( \ 1029 (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), (__v32hf)(__m512h)(W), \ 1030 (__mmask32)(U), (int)(R))) 1031 1032 #define _mm512_maskz_getmant_round_ph(U, A, B, C, R) \ 1033 ((__m512h)__builtin_ia32_getmantph512_mask( \ 1034 (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), \ 1035 (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R))) 1036 1037 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_getexp_ph(__m512h __A) { 1038 return (__m512h)__builtin_ia32_getexpph512_mask( 1039 (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, 1040 _MM_FROUND_CUR_DIRECTION); 1041 } 1042 1043 static __inline__ __m512h __DEFAULT_FN_ATTRS512 1044 _mm512_mask_getexp_ph(__m512h __W, __mmask32 __U, __m512h __A) { 1045 return (__m512h)__builtin_ia32_getexpph512_mask( 1046 (__v32hf)__A, (__v32hf)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION); 1047 } 1048 1049 static __inline__ __m512h __DEFAULT_FN_ATTRS512 1050 _mm512_maskz_getexp_ph(__mmask32 __U, __m512h __A) { 1051 return (__m512h)__builtin_ia32_getexpph512_mask( 1052 (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U, 1053 _MM_FROUND_CUR_DIRECTION); 1054 } 1055 1056 #define _mm512_getexp_round_ph(A, R) \ 1057 ((__m512h)__builtin_ia32_getexpph512_mask((__v32hf)(__m512h)(A), \ 1058 (__v32hf)_mm512_undefined_ph(), \ 1059 (__mmask32)-1, (int)(R))) 1060 1061 #define _mm512_mask_getexp_round_ph(W, U, A, R) \ 1062 ((__m512h)__builtin_ia32_getexpph512_mask( \ 1063 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(W), (__mmask32)(U), (int)(R))) 1064 1065 #define _mm512_maskz_getexp_round_ph(U, A, R) \ 1066 ((__m512h)__builtin_ia32_getexpph512_mask((__v32hf)(__m512h)(A), \ 1067 (__v32hf)_mm512_setzero_ph(), \ 1068 (__mmask32)(U), (int)(R))) 1069 1070 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_scalef_ph(__m512h __A, 1071 __m512h __B) { 1072 return (__m512h)__builtin_ia32_scalefph512_mask( 1073 (__v32hf)__A, (__v32hf)__B, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, 1074 _MM_FROUND_CUR_DIRECTION); 1075 } 1076 1077 static __inline__ __m512h __DEFAULT_FN_ATTRS512 1078 _mm512_mask_scalef_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) { 1079 return (__m512h)__builtin_ia32_scalefph512_mask((__v32hf)__A, (__v32hf)__B, 1080 (__v32hf)__W, (__mmask32)__U, 1081 _MM_FROUND_CUR_DIRECTION); 1082 } 1083 1084 static __inline__ __m512h __DEFAULT_FN_ATTRS512 1085 _mm512_maskz_scalef_ph(__mmask32 __U, __m512h __A, __m512h __B) { 1086 return (__m512h)__builtin_ia32_scalefph512_mask( 1087 (__v32hf)__A, (__v32hf)__B, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U, 1088 _MM_FROUND_CUR_DIRECTION); 1089 } 1090 1091 #define _mm512_scalef_round_ph(A, B, R) \ 1092 ((__m512h)__builtin_ia32_scalefph512_mask( \ 1093 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), \ 1094 (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, (int)(R))) 1095 1096 #define _mm512_mask_scalef_round_ph(W, U, A, B, R) \ 1097 ((__m512h)__builtin_ia32_scalefph512_mask( \ 1098 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(W), \ 1099 (__mmask32)(U), (int)(R))) 1100 1101 #define _mm512_maskz_scalef_round_ph(U, A, B, R) \ 1102 ((__m512h)__builtin_ia32_scalefph512_mask( \ 1103 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), \ 1104 (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R))) 1105 1106 #define _mm512_roundscale_ph(A, B) \ 1107 ((__m512h)__builtin_ia32_rndscaleph_mask( \ 1108 (__v32hf)(__m512h)(A), (int)(B), (__v32hf)(__m512h)(A), (__mmask32)-1, \ 1109 _MM_FROUND_CUR_DIRECTION)) 1110 1111 #define _mm512_mask_roundscale_ph(A, B, C, imm) \ 1112 ((__m512h)__builtin_ia32_rndscaleph_mask( \ 1113 (__v32hf)(__m512h)(C), (int)(imm), (__v32hf)(__m512h)(A), \ 1114 (__mmask32)(B), _MM_FROUND_CUR_DIRECTION)) 1115 1116 #define _mm512_maskz_roundscale_ph(A, B, imm) \ 1117 ((__m512h)__builtin_ia32_rndscaleph_mask( \ 1118 (__v32hf)(__m512h)(B), (int)(imm), (__v32hf)_mm512_setzero_ph(), \ 1119 (__mmask32)(A), _MM_FROUND_CUR_DIRECTION)) 1120 1121 #define _mm512_mask_roundscale_round_ph(A, B, C, imm, R) \ 1122 ((__m512h)__builtin_ia32_rndscaleph_mask((__v32hf)(__m512h)(C), (int)(imm), \ 1123 (__v32hf)(__m512h)(A), \ 1124 (__mmask32)(B), (int)(R))) 1125 1126 #define _mm512_maskz_roundscale_round_ph(A, B, imm, R) \ 1127 ((__m512h)__builtin_ia32_rndscaleph_mask((__v32hf)(__m512h)(B), (int)(imm), \ 1128 (__v32hf)_mm512_setzero_ph(), \ 1129 (__mmask32)(A), (int)(R))) 1130 1131 #define _mm512_roundscale_round_ph(A, imm, R) \ 1132 ((__m512h)__builtin_ia32_rndscaleph_mask((__v32hf)(__m512h)(A), (int)(imm), \ 1133 (__v32hf)_mm512_undefined_ph(), \ 1134 (__mmask32)-1, (int)(R))) 1135 1136 #define _mm512_reduce_ph(A, imm) \ 1137 ((__m512h)__builtin_ia32_reduceph512_mask( \ 1138 (__v32hf)(__m512h)(A), (int)(imm), (__v32hf)_mm512_undefined_ph(), \ 1139 (__mmask32)-1, _MM_FROUND_CUR_DIRECTION)) 1140 1141 #define _mm512_mask_reduce_ph(W, U, A, imm) \ 1142 ((__m512h)__builtin_ia32_reduceph512_mask( \ 1143 (__v32hf)(__m512h)(A), (int)(imm), (__v32hf)(__m512h)(W), \ 1144 (__mmask32)(U), _MM_FROUND_CUR_DIRECTION)) 1145 1146 #define _mm512_maskz_reduce_ph(U, A, imm) \ 1147 ((__m512h)__builtin_ia32_reduceph512_mask( \ 1148 (__v32hf)(__m512h)(A), (int)(imm), (__v32hf)_mm512_setzero_ph(), \ 1149 (__mmask32)(U), _MM_FROUND_CUR_DIRECTION)) 1150 1151 #define _mm512_mask_reduce_round_ph(W, U, A, imm, R) \ 1152 ((__m512h)__builtin_ia32_reduceph512_mask((__v32hf)(__m512h)(A), (int)(imm), \ 1153 (__v32hf)(__m512h)(W), \ 1154 (__mmask32)(U), (int)(R))) 1155 1156 #define _mm512_maskz_reduce_round_ph(U, A, imm, R) \ 1157 ((__m512h)__builtin_ia32_reduceph512_mask((__v32hf)(__m512h)(A), (int)(imm), \ 1158 (__v32hf)_mm512_setzero_ph(), \ 1159 (__mmask32)(U), (int)(R))) 1160 1161 #define _mm512_reduce_round_ph(A, imm, R) \ 1162 ((__m512h)__builtin_ia32_reduceph512_mask((__v32hf)(__m512h)(A), (int)(imm), \ 1163 (__v32hf)_mm512_undefined_ph(), \ 1164 (__mmask32)-1, (int)(R))) 1165 1166 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_rcp_sh(__m128h __A, 1167 __m128h __B) { 1168 return (__m128h)__builtin_ia32_rcpsh_mask( 1169 (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1); 1170 } 1171 1172 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_rcp_sh(__m128h __W, 1173 __mmask8 __U, 1174 __m128h __A, 1175 __m128h __B) { 1176 return (__m128h)__builtin_ia32_rcpsh_mask((__v8hf)__A, (__v8hf)__B, 1177 (__v8hf)__W, (__mmask8)__U); 1178 } 1179 1180 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_rcp_sh(__mmask8 __U, 1181 __m128h __A, 1182 __m128h __B) { 1183 return (__m128h)__builtin_ia32_rcpsh_mask( 1184 (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); 1185 } 1186 1187 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_rsqrt_sh(__m128h __A, 1188 __m128h __B) { 1189 return (__m128h)__builtin_ia32_rsqrtsh_mask( 1190 (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1); 1191 } 1192 1193 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_rsqrt_sh(__m128h __W, 1194 __mmask8 __U, 1195 __m128h __A, 1196 __m128h __B) { 1197 return (__m128h)__builtin_ia32_rsqrtsh_mask((__v8hf)__A, (__v8hf)__B, 1198 (__v8hf)__W, (__mmask8)__U); 1199 } 1200 1201 static __inline__ __m128h __DEFAULT_FN_ATTRS128 1202 _mm_maskz_rsqrt_sh(__mmask8 __U, __m128h __A, __m128h __B) { 1203 return (__m128h)__builtin_ia32_rsqrtsh_mask( 1204 (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); 1205 } 1206 1207 #define _mm_getmant_round_sh(A, B, C, D, R) \ 1208 ((__m128h)__builtin_ia32_getmantsh_round_mask( \ 1209 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \ 1210 (__v8hf)_mm_setzero_ph(), (__mmask8)-1, (int)(R))) 1211 1212 #define _mm_getmant_sh(A, B, C, D) \ 1213 ((__m128h)__builtin_ia32_getmantsh_round_mask( \ 1214 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \ 1215 (__v8hf)_mm_setzero_ph(), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION)) 1216 1217 #define _mm_mask_getmant_sh(W, U, A, B, C, D) \ 1218 ((__m128h)__builtin_ia32_getmantsh_round_mask( \ 1219 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \ 1220 (__v8hf)(__m128h)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) 1221 1222 #define _mm_mask_getmant_round_sh(W, U, A, B, C, D, R) \ 1223 ((__m128h)__builtin_ia32_getmantsh_round_mask( \ 1224 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \ 1225 (__v8hf)(__m128h)(W), (__mmask8)(U), (int)(R))) 1226 1227 #define _mm_maskz_getmant_sh(U, A, B, C, D) \ 1228 ((__m128h)__builtin_ia32_getmantsh_round_mask( \ 1229 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \ 1230 (__v8hf)_mm_setzero_ph(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) 1231 1232 #define _mm_maskz_getmant_round_sh(U, A, B, C, D, R) \ 1233 ((__m128h)__builtin_ia32_getmantsh_round_mask( \ 1234 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \ 1235 (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R))) 1236 1237 #define _mm_getexp_round_sh(A, B, R) \ 1238 ((__m128h)__builtin_ia32_getexpsh128_round_mask( \ 1239 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ 1240 (__mmask8)-1, (int)(R))) 1241 1242 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_getexp_sh(__m128h __A, 1243 __m128h __B) { 1244 return (__m128h)__builtin_ia32_getexpsh128_round_mask( 1245 (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1, 1246 _MM_FROUND_CUR_DIRECTION); 1247 } 1248 1249 static __inline__ __m128h __DEFAULT_FN_ATTRS128 1250 _mm_mask_getexp_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { 1251 return (__m128h)__builtin_ia32_getexpsh128_round_mask( 1252 (__v8hf)__A, (__v8hf)__B, (__v8hf)__W, (__mmask8)__U, 1253 _MM_FROUND_CUR_DIRECTION); 1254 } 1255 1256 #define _mm_mask_getexp_round_sh(W, U, A, B, R) \ 1257 ((__m128h)__builtin_ia32_getexpsh128_round_mask( \ 1258 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \ 1259 (__mmask8)(U), (int)(R))) 1260 1261 static __inline__ __m128h __DEFAULT_FN_ATTRS128 1262 _mm_maskz_getexp_sh(__mmask8 __U, __m128h __A, __m128h __B) { 1263 return (__m128h)__builtin_ia32_getexpsh128_round_mask( 1264 (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U, 1265 _MM_FROUND_CUR_DIRECTION); 1266 } 1267 1268 #define _mm_maskz_getexp_round_sh(U, A, B, R) \ 1269 ((__m128h)__builtin_ia32_getexpsh128_round_mask( \ 1270 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ 1271 (__mmask8)(U), (int)(R))) 1272 1273 #define _mm_scalef_round_sh(A, B, R) \ 1274 ((__m128h)__builtin_ia32_scalefsh_round_mask( \ 1275 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ 1276 (__mmask8)-1, (int)(R))) 1277 1278 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_scalef_sh(__m128h __A, 1279 __m128h __B) { 1280 return (__m128h)__builtin_ia32_scalefsh_round_mask( 1281 (__v8hf)__A, (__v8hf)(__B), (__v8hf)_mm_setzero_ph(), (__mmask8)-1, 1282 _MM_FROUND_CUR_DIRECTION); 1283 } 1284 1285 static __inline__ __m128h __DEFAULT_FN_ATTRS128 1286 _mm_mask_scalef_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { 1287 return (__m128h)__builtin_ia32_scalefsh_round_mask((__v8hf)__A, (__v8hf)__B, 1288 (__v8hf)__W, (__mmask8)__U, 1289 _MM_FROUND_CUR_DIRECTION); 1290 } 1291 1292 #define _mm_mask_scalef_round_sh(W, U, A, B, R) \ 1293 ((__m128h)__builtin_ia32_scalefsh_round_mask( \ 1294 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \ 1295 (__mmask8)(U), (int)(R))) 1296 1297 static __inline__ __m128h __DEFAULT_FN_ATTRS128 1298 _mm_maskz_scalef_sh(__mmask8 __U, __m128h __A, __m128h __B) { 1299 return (__m128h)__builtin_ia32_scalefsh_round_mask( 1300 (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U, 1301 _MM_FROUND_CUR_DIRECTION); 1302 } 1303 1304 #define _mm_maskz_scalef_round_sh(U, A, B, R) \ 1305 ((__m128h)__builtin_ia32_scalefsh_round_mask( \ 1306 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ 1307 (__mmask8)(U), (int)(R))) 1308 1309 #define _mm_roundscale_round_sh(A, B, imm, R) \ 1310 ((__m128h)__builtin_ia32_rndscalesh_round_mask( \ 1311 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ 1312 (__mmask8)-1, (int)(imm), (int)(R))) 1313 1314 #define _mm_roundscale_sh(A, B, imm) \ 1315 ((__m128h)__builtin_ia32_rndscalesh_round_mask( \ 1316 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ 1317 (__mmask8)-1, (int)(imm), _MM_FROUND_CUR_DIRECTION)) 1318 1319 #define _mm_mask_roundscale_sh(W, U, A, B, I) \ 1320 ((__m128h)__builtin_ia32_rndscalesh_round_mask( \ 1321 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \ 1322 (__mmask8)(U), (int)(I), _MM_FROUND_CUR_DIRECTION)) 1323 1324 #define _mm_mask_roundscale_round_sh(W, U, A, B, I, R) \ 1325 ((__m128h)__builtin_ia32_rndscalesh_round_mask( \ 1326 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \ 1327 (__mmask8)(U), (int)(I), (int)(R))) 1328 1329 #define _mm_maskz_roundscale_sh(U, A, B, I) \ 1330 ((__m128h)__builtin_ia32_rndscalesh_round_mask( \ 1331 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ 1332 (__mmask8)(U), (int)(I), _MM_FROUND_CUR_DIRECTION)) 1333 1334 #define _mm_maskz_roundscale_round_sh(U, A, B, I, R) \ 1335 ((__m128h)__builtin_ia32_rndscalesh_round_mask( \ 1336 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ 1337 (__mmask8)(U), (int)(I), (int)(R))) 1338 1339 #define _mm_reduce_sh(A, B, C) \ 1340 ((__m128h)__builtin_ia32_reducesh_mask( \ 1341 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ 1342 (__mmask8)-1, (int)(C), _MM_FROUND_CUR_DIRECTION)) 1343 1344 #define _mm_mask_reduce_sh(W, U, A, B, C) \ 1345 ((__m128h)__builtin_ia32_reducesh_mask( \ 1346 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \ 1347 (__mmask8)(U), (int)(C), _MM_FROUND_CUR_DIRECTION)) 1348 1349 #define _mm_maskz_reduce_sh(U, A, B, C) \ 1350 ((__m128h)__builtin_ia32_reducesh_mask( \ 1351 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ 1352 (__mmask8)(U), (int)(C), _MM_FROUND_CUR_DIRECTION)) 1353 1354 #define _mm_reduce_round_sh(A, B, C, R) \ 1355 ((__m128h)__builtin_ia32_reducesh_mask( \ 1356 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ 1357 (__mmask8)-1, (int)(C), (int)(R))) 1358 1359 #define _mm_mask_reduce_round_sh(W, U, A, B, C, R) \ 1360 ((__m128h)__builtin_ia32_reducesh_mask( \ 1361 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \ 1362 (__mmask8)(U), (int)(C), (int)(R))) 1363 1364 #define _mm_maskz_reduce_round_sh(U, A, B, C, R) \ 1365 ((__m128h)__builtin_ia32_reducesh_mask( \ 1366 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ 1367 (__mmask8)(U), (int)(C), (int)(R))) 1368 1369 #define _mm512_sqrt_round_ph(A, R) \ 1370 ((__m512h)__builtin_ia32_sqrtph512((__v32hf)(__m512h)(A), (int)(R))) 1371 1372 #define _mm512_mask_sqrt_round_ph(W, U, A, R) \ 1373 ((__m512h)__builtin_ia32_selectph_512( \ 1374 (__mmask32)(U), (__v32hf)_mm512_sqrt_round_ph((A), (R)), \ 1375 (__v32hf)(__m512h)(W))) 1376 1377 #define _mm512_maskz_sqrt_round_ph(U, A, R) \ 1378 ((__m512h)__builtin_ia32_selectph_512( \ 1379 (__mmask32)(U), (__v32hf)_mm512_sqrt_round_ph((A), (R)), \ 1380 (__v32hf)_mm512_setzero_ph())) 1381 1382 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_sqrt_ph(__m512h __A) { 1383 return (__m512h)__builtin_ia32_sqrtph512((__v32hf)__A, 1384 _MM_FROUND_CUR_DIRECTION); 1385 } 1386 1387 static __inline__ __m512h __DEFAULT_FN_ATTRS512 1388 _mm512_mask_sqrt_ph(__m512h __W, __mmask32 __U, __m512h __A) { 1389 return (__m512h)__builtin_ia32_selectph_512( 1390 (__mmask32)(__U), 1391 (__v32hf)__builtin_ia32_sqrtph512((__A), (_MM_FROUND_CUR_DIRECTION)), 1392 (__v32hf)(__m512h)(__W)); 1393 } 1394 1395 static __inline__ __m512h __DEFAULT_FN_ATTRS512 1396 _mm512_maskz_sqrt_ph(__mmask32 __U, __m512h __A) { 1397 return (__m512h)__builtin_ia32_selectph_512( 1398 (__mmask32)(__U), 1399 (__v32hf)__builtin_ia32_sqrtph512((__A), (_MM_FROUND_CUR_DIRECTION)), 1400 (__v32hf)_mm512_setzero_ph()); 1401 } 1402 1403 #define _mm_sqrt_round_sh(A, B, R) \ 1404 ((__m128h)__builtin_ia32_sqrtsh_round_mask( \ 1405 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ 1406 (__mmask8)-1, (int)(R))) 1407 1408 #define _mm_mask_sqrt_round_sh(W, U, A, B, R) \ 1409 ((__m128h)__builtin_ia32_sqrtsh_round_mask( \ 1410 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \ 1411 (__mmask8)(U), (int)(R))) 1412 1413 #define _mm_maskz_sqrt_round_sh(U, A, B, R) \ 1414 ((__m128h)__builtin_ia32_sqrtsh_round_mask( \ 1415 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ 1416 (__mmask8)(U), (int)(R))) 1417 1418 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sqrt_sh(__m128h __A, 1419 __m128h __B) { 1420 return (__m128h)__builtin_ia32_sqrtsh_round_mask( 1421 (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)_mm_setzero_ph(), 1422 (__mmask8)-1, _MM_FROUND_CUR_DIRECTION); 1423 } 1424 1425 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sqrt_sh(__m128h __W, 1426 __mmask32 __U, 1427 __m128h __A, 1428 __m128h __B) { 1429 return (__m128h)__builtin_ia32_sqrtsh_round_mask( 1430 (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)(__m128h)(__W), 1431 (__mmask8)(__U), _MM_FROUND_CUR_DIRECTION); 1432 } 1433 1434 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sqrt_sh(__mmask32 __U, 1435 __m128h __A, 1436 __m128h __B) { 1437 return (__m128h)__builtin_ia32_sqrtsh_round_mask( 1438 (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)_mm_setzero_ph(), 1439 (__mmask8)(__U), _MM_FROUND_CUR_DIRECTION); 1440 } 1441 1442 #define _mm512_mask_fpclass_ph_mask(U, A, imm) \ 1443 ((__mmask32)__builtin_ia32_fpclassph512_mask((__v32hf)(__m512h)(A), \ 1444 (int)(imm), (__mmask32)(U))) 1445 1446 #define _mm512_fpclass_ph_mask(A, imm) \ 1447 ((__mmask32)__builtin_ia32_fpclassph512_mask((__v32hf)(__m512h)(A), \ 1448 (int)(imm), (__mmask32)-1)) 1449 1450 #define _mm_fpclass_sh_mask(A, imm) \ 1451 ((__mmask8)__builtin_ia32_fpclasssh_mask((__v8hf)(__m128h)(A), (int)(imm), \ 1452 (__mmask8)-1)) 1453 1454 #define _mm_mask_fpclass_sh_mask(U, A, imm) \ 1455 ((__mmask8)__builtin_ia32_fpclasssh_mask((__v8hf)(__m128h)(A), (int)(imm), \ 1456 (__mmask8)(U))) 1457 1458 #define _mm512_cvt_roundpd_ph(A, R) \ 1459 ((__m128h)__builtin_ia32_vcvtpd2ph512_mask( \ 1460 (__v8df)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R))) 1461 1462 #define _mm512_mask_cvt_roundpd_ph(W, U, A, R) \ 1463 ((__m128h)__builtin_ia32_vcvtpd2ph512_mask((__v8df)(A), (__v8hf)(W), \ 1464 (__mmask8)(U), (int)(R))) 1465 1466 #define _mm512_maskz_cvt_roundpd_ph(U, A, R) \ 1467 ((__m128h)__builtin_ia32_vcvtpd2ph512_mask( \ 1468 (__v8df)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R))) 1469 1470 static __inline__ __m128h __DEFAULT_FN_ATTRS512 _mm512_cvtpd_ph(__m512d __A) { 1471 return (__m128h)__builtin_ia32_vcvtpd2ph512_mask( 1472 (__v8df)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1, 1473 _MM_FROUND_CUR_DIRECTION); 1474 } 1475 1476 static __inline__ __m128h __DEFAULT_FN_ATTRS512 1477 _mm512_mask_cvtpd_ph(__m128h __W, __mmask8 __U, __m512d __A) { 1478 return (__m128h)__builtin_ia32_vcvtpd2ph512_mask( 1479 (__v8df)__A, (__v8hf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); 1480 } 1481 1482 static __inline__ __m128h __DEFAULT_FN_ATTRS512 1483 _mm512_maskz_cvtpd_ph(__mmask8 __U, __m512d __A) { 1484 return (__m128h)__builtin_ia32_vcvtpd2ph512_mask( 1485 (__v8df)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U, 1486 _MM_FROUND_CUR_DIRECTION); 1487 } 1488 1489 #define _mm512_cvt_roundph_pd(A, R) \ 1490 ((__m512d)__builtin_ia32_vcvtph2pd512_mask( \ 1491 (__v8hf)(A), (__v8df)_mm512_undefined_pd(), (__mmask8)(-1), (int)(R))) 1492 1493 #define _mm512_mask_cvt_roundph_pd(W, U, A, R) \ 1494 ((__m512d)__builtin_ia32_vcvtph2pd512_mask((__v8hf)(A), (__v8df)(W), \ 1495 (__mmask8)(U), (int)(R))) 1496 1497 #define _mm512_maskz_cvt_roundph_pd(U, A, R) \ 1498 ((__m512d)__builtin_ia32_vcvtph2pd512_mask( \ 1499 (__v8hf)(A), (__v8df)_mm512_setzero_pd(), (__mmask8)(U), (int)(R))) 1500 1501 static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_cvtph_pd(__m128h __A) { 1502 return (__m512d)__builtin_ia32_vcvtph2pd512_mask( 1503 (__v8hf)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)-1, 1504 _MM_FROUND_CUR_DIRECTION); 1505 } 1506 1507 static __inline__ __m512d __DEFAULT_FN_ATTRS512 1508 _mm512_mask_cvtph_pd(__m512d __W, __mmask8 __U, __m128h __A) { 1509 return (__m512d)__builtin_ia32_vcvtph2pd512_mask( 1510 (__v8hf)__A, (__v8df)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); 1511 } 1512 1513 static __inline__ __m512d __DEFAULT_FN_ATTRS512 1514 _mm512_maskz_cvtph_pd(__mmask8 __U, __m128h __A) { 1515 return (__m512d)__builtin_ia32_vcvtph2pd512_mask( 1516 (__v8hf)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U, 1517 _MM_FROUND_CUR_DIRECTION); 1518 } 1519 1520 #define _mm_cvt_roundsh_ss(A, B, R) \ 1521 ((__m128)__builtin_ia32_vcvtsh2ss_round_mask((__v4sf)(A), (__v8hf)(B), \ 1522 (__v4sf)_mm_undefined_ps(), \ 1523 (__mmask8)(-1), (int)(R))) 1524 1525 #define _mm_mask_cvt_roundsh_ss(W, U, A, B, R) \ 1526 ((__m128)__builtin_ia32_vcvtsh2ss_round_mask( \ 1527 (__v4sf)(A), (__v8hf)(B), (__v4sf)(W), (__mmask8)(U), (int)(R))) 1528 1529 #define _mm_maskz_cvt_roundsh_ss(U, A, B, R) \ 1530 ((__m128)__builtin_ia32_vcvtsh2ss_round_mask((__v4sf)(A), (__v8hf)(B), \ 1531 (__v4sf)_mm_setzero_ps(), \ 1532 (__mmask8)(U), (int)(R))) 1533 1534 static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtsh_ss(__m128 __A, 1535 __m128h __B) { 1536 return (__m128)__builtin_ia32_vcvtsh2ss_round_mask( 1537 (__v4sf)__A, (__v8hf)__B, (__v4sf)_mm_undefined_ps(), (__mmask8)-1, 1538 _MM_FROUND_CUR_DIRECTION); 1539 } 1540 1541 static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_cvtsh_ss(__m128 __W, 1542 __mmask8 __U, 1543 __m128 __A, 1544 __m128h __B) { 1545 return (__m128)__builtin_ia32_vcvtsh2ss_round_mask((__v4sf)__A, (__v8hf)__B, 1546 (__v4sf)__W, (__mmask8)__U, 1547 _MM_FROUND_CUR_DIRECTION); 1548 } 1549 1550 static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_cvtsh_ss(__mmask8 __U, 1551 __m128 __A, 1552 __m128h __B) { 1553 return (__m128)__builtin_ia32_vcvtsh2ss_round_mask( 1554 (__v4sf)__A, (__v8hf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U, 1555 _MM_FROUND_CUR_DIRECTION); 1556 } 1557 1558 #define _mm_cvt_roundss_sh(A, B, R) \ 1559 ((__m128h)__builtin_ia32_vcvtss2sh_round_mask((__v8hf)(A), (__v4sf)(B), \ 1560 (__v8hf)_mm_undefined_ph(), \ 1561 (__mmask8)(-1), (int)(R))) 1562 1563 #define _mm_mask_cvt_roundss_sh(W, U, A, B, R) \ 1564 ((__m128h)__builtin_ia32_vcvtss2sh_round_mask( \ 1565 (__v8hf)(A), (__v4sf)(B), (__v8hf)(W), (__mmask8)(U), (int)(R))) 1566 1567 #define _mm_maskz_cvt_roundss_sh(U, A, B, R) \ 1568 ((__m128h)__builtin_ia32_vcvtss2sh_round_mask((__v8hf)(A), (__v4sf)(B), \ 1569 (__v8hf)_mm_setzero_ph(), \ 1570 (__mmask8)(U), (int)(R))) 1571 1572 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtss_sh(__m128h __A, 1573 __m128 __B) { 1574 return (__m128h)__builtin_ia32_vcvtss2sh_round_mask( 1575 (__v8hf)__A, (__v4sf)__B, (__v8hf)_mm_undefined_ph(), (__mmask8)-1, 1576 _MM_FROUND_CUR_DIRECTION); 1577 } 1578 1579 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_cvtss_sh(__m128h __W, 1580 __mmask8 __U, 1581 __m128h __A, 1582 __m128 __B) { 1583 return (__m128h)__builtin_ia32_vcvtss2sh_round_mask( 1584 (__v8hf)__A, (__v4sf)__B, (__v8hf)__W, (__mmask8)__U, 1585 _MM_FROUND_CUR_DIRECTION); 1586 } 1587 1588 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_cvtss_sh(__mmask8 __U, 1589 __m128h __A, 1590 __m128 __B) { 1591 return (__m128h)__builtin_ia32_vcvtss2sh_round_mask( 1592 (__v8hf)__A, (__v4sf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U, 1593 _MM_FROUND_CUR_DIRECTION); 1594 } 1595 1596 #define _mm_cvt_roundsd_sh(A, B, R) \ 1597 ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask((__v8hf)(A), (__v2df)(B), \ 1598 (__v8hf)_mm_undefined_ph(), \ 1599 (__mmask8)(-1), (int)(R))) 1600 1601 #define _mm_mask_cvt_roundsd_sh(W, U, A, B, R) \ 1602 ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask( \ 1603 (__v8hf)(A), (__v2df)(B), (__v8hf)(W), (__mmask8)(U), (int)(R))) 1604 1605 #define _mm_maskz_cvt_roundsd_sh(U, A, B, R) \ 1606 ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask((__v8hf)(A), (__v2df)(B), \ 1607 (__v8hf)_mm_setzero_ph(), \ 1608 (__mmask8)(U), (int)(R))) 1609 1610 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtsd_sh(__m128h __A, 1611 __m128d __B) { 1612 return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask( 1613 (__v8hf)__A, (__v2df)__B, (__v8hf)_mm_undefined_ph(), (__mmask8)-1, 1614 _MM_FROUND_CUR_DIRECTION); 1615 } 1616 1617 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_cvtsd_sh(__m128h __W, 1618 __mmask8 __U, 1619 __m128h __A, 1620 __m128d __B) { 1621 return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask( 1622 (__v8hf)__A, (__v2df)__B, (__v8hf)__W, (__mmask8)__U, 1623 _MM_FROUND_CUR_DIRECTION); 1624 } 1625 1626 static __inline__ __m128h __DEFAULT_FN_ATTRS128 1627 _mm_maskz_cvtsd_sh(__mmask8 __U, __m128h __A, __m128d __B) { 1628 return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask( 1629 (__v8hf)__A, (__v2df)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U, 1630 _MM_FROUND_CUR_DIRECTION); 1631 } 1632 1633 #define _mm_cvt_roundsh_sd(A, B, R) \ 1634 ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask((__v2df)(A), (__v8hf)(B), \ 1635 (__v2df)_mm_undefined_pd(), \ 1636 (__mmask8)(-1), (int)(R))) 1637 1638 #define _mm_mask_cvt_roundsh_sd(W, U, A, B, R) \ 1639 ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask( \ 1640 (__v2df)(A), (__v8hf)(B), (__v2df)(W), (__mmask8)(U), (int)(R))) 1641 1642 #define _mm_maskz_cvt_roundsh_sd(U, A, B, R) \ 1643 ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask((__v2df)(A), (__v8hf)(B), \ 1644 (__v2df)_mm_setzero_pd(), \ 1645 (__mmask8)(U), (int)(R))) 1646 1647 static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_cvtsh_sd(__m128d __A, 1648 __m128h __B) { 1649 return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask( 1650 (__v2df)__A, (__v8hf)__B, (__v2df)_mm_undefined_pd(), (__mmask8)-1, 1651 _MM_FROUND_CUR_DIRECTION); 1652 } 1653 1654 static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_cvtsh_sd(__m128d __W, 1655 __mmask8 __U, 1656 __m128d __A, 1657 __m128h __B) { 1658 return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask( 1659 (__v2df)__A, (__v8hf)__B, (__v2df)__W, (__mmask8)__U, 1660 _MM_FROUND_CUR_DIRECTION); 1661 } 1662 1663 static __inline__ __m128d __DEFAULT_FN_ATTRS128 1664 _mm_maskz_cvtsh_sd(__mmask8 __U, __m128d __A, __m128h __B) { 1665 return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask( 1666 (__v2df)__A, (__v8hf)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U, 1667 _MM_FROUND_CUR_DIRECTION); 1668 } 1669 1670 #define _mm512_cvt_roundph_epi16(A, R) \ 1671 ((__m512i)__builtin_ia32_vcvtph2w512_mask((__v32hf)(A), \ 1672 (__v32hi)_mm512_undefined_epi32(), \ 1673 (__mmask32)(-1), (int)(R))) 1674 1675 #define _mm512_mask_cvt_roundph_epi16(W, U, A, R) \ 1676 ((__m512i)__builtin_ia32_vcvtph2w512_mask((__v32hf)(A), (__v32hi)(W), \ 1677 (__mmask32)(U), (int)(R))) 1678 1679 #define _mm512_maskz_cvt_roundph_epi16(U, A, R) \ 1680 ((__m512i)__builtin_ia32_vcvtph2w512_mask((__v32hf)(A), \ 1681 (__v32hi)_mm512_setzero_epi32(), \ 1682 (__mmask32)(U), (int)(R))) 1683 1684 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1685 _mm512_cvtph_epi16(__m512h __A) { 1686 return (__m512i)__builtin_ia32_vcvtph2w512_mask( 1687 (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)-1, 1688 _MM_FROUND_CUR_DIRECTION); 1689 } 1690 1691 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1692 _mm512_mask_cvtph_epi16(__m512i __W, __mmask32 __U, __m512h __A) { 1693 return (__m512i)__builtin_ia32_vcvtph2w512_mask( 1694 (__v32hf)__A, (__v32hi)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION); 1695 } 1696 1697 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1698 _mm512_maskz_cvtph_epi16(__mmask32 __U, __m512h __A) { 1699 return (__m512i)__builtin_ia32_vcvtph2w512_mask( 1700 (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)__U, 1701 _MM_FROUND_CUR_DIRECTION); 1702 } 1703 1704 #define _mm512_cvtt_roundph_epi16(A, R) \ 1705 ((__m512i)__builtin_ia32_vcvttph2w512_mask( \ 1706 (__v32hf)(A), (__v32hi)_mm512_undefined_epi32(), (__mmask32)(-1), \ 1707 (int)(R))) 1708 1709 #define _mm512_mask_cvtt_roundph_epi16(W, U, A, R) \ 1710 ((__m512i)__builtin_ia32_vcvttph2w512_mask((__v32hf)(A), (__v32hi)(W), \ 1711 (__mmask32)(U), (int)(R))) 1712 1713 #define _mm512_maskz_cvtt_roundph_epi16(U, A, R) \ 1714 ((__m512i)__builtin_ia32_vcvttph2w512_mask((__v32hf)(A), \ 1715 (__v32hi)_mm512_setzero_epi32(), \ 1716 (__mmask32)(U), (int)(R))) 1717 1718 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1719 _mm512_cvttph_epi16(__m512h __A) { 1720 return (__m512i)__builtin_ia32_vcvttph2w512_mask( 1721 (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)-1, 1722 _MM_FROUND_CUR_DIRECTION); 1723 } 1724 1725 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1726 _mm512_mask_cvttph_epi16(__m512i __W, __mmask32 __U, __m512h __A) { 1727 return (__m512i)__builtin_ia32_vcvttph2w512_mask( 1728 (__v32hf)__A, (__v32hi)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION); 1729 } 1730 1731 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1732 _mm512_maskz_cvttph_epi16(__mmask32 __U, __m512h __A) { 1733 return (__m512i)__builtin_ia32_vcvttph2w512_mask( 1734 (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)__U, 1735 _MM_FROUND_CUR_DIRECTION); 1736 } 1737 1738 #define _mm512_cvt_roundepi16_ph(A, R) \ 1739 ((__m512h)__builtin_ia32_vcvtw2ph512_mask((__v32hi)(A), \ 1740 (__v32hf)_mm512_undefined_ph(), \ 1741 (__mmask32)(-1), (int)(R))) 1742 1743 #define _mm512_mask_cvt_roundepi16_ph(W, U, A, R) \ 1744 ((__m512h)__builtin_ia32_vcvtw2ph512_mask((__v32hi)(A), (__v32hf)(W), \ 1745 (__mmask32)(U), (int)(R))) 1746 1747 #define _mm512_maskz_cvt_roundepi16_ph(U, A, R) \ 1748 ((__m512h)__builtin_ia32_vcvtw2ph512_mask( \ 1749 (__v32hi)(A), (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R))) 1750 1751 static __inline__ __m512h __DEFAULT_FN_ATTRS512 1752 _mm512_cvtepi16_ph(__m512i __A) { 1753 return (__m512h)__builtin_ia32_vcvtw2ph512_mask( 1754 (__v32hi)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)-1, 1755 _MM_FROUND_CUR_DIRECTION); 1756 } 1757 1758 static __inline__ __m512h __DEFAULT_FN_ATTRS512 1759 _mm512_mask_cvtepi16_ph(__m512h __W, __mmask32 __U, __m512i __A) { 1760 return (__m512h)__builtin_ia32_vcvtw2ph512_mask( 1761 (__v32hi)__A, (__v32hf)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION); 1762 } 1763 1764 static __inline__ __m512h __DEFAULT_FN_ATTRS512 1765 _mm512_maskz_cvtepi16_ph(__mmask32 __U, __m512i __A) { 1766 return (__m512h)__builtin_ia32_vcvtw2ph512_mask( 1767 (__v32hi)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U, 1768 _MM_FROUND_CUR_DIRECTION); 1769 } 1770 1771 #define _mm512_cvt_roundph_epu16(A, R) \ 1772 ((__m512i)__builtin_ia32_vcvtph2uw512_mask( \ 1773 (__v32hf)(A), (__v32hu)_mm512_undefined_epi32(), (__mmask32)(-1), \ 1774 (int)(R))) 1775 1776 #define _mm512_mask_cvt_roundph_epu16(W, U, A, R) \ 1777 ((__m512i)__builtin_ia32_vcvtph2uw512_mask((__v32hf)(A), (__v32hu)(W), \ 1778 (__mmask32)(U), (int)(R))) 1779 1780 #define _mm512_maskz_cvt_roundph_epu16(U, A, R) \ 1781 ((__m512i)__builtin_ia32_vcvtph2uw512_mask((__v32hf)(A), \ 1782 (__v32hu)_mm512_setzero_epi32(), \ 1783 (__mmask32)(U), (int)(R))) 1784 1785 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1786 _mm512_cvtph_epu16(__m512h __A) { 1787 return (__m512i)__builtin_ia32_vcvtph2uw512_mask( 1788 (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)-1, 1789 _MM_FROUND_CUR_DIRECTION); 1790 } 1791 1792 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1793 _mm512_mask_cvtph_epu16(__m512i __W, __mmask32 __U, __m512h __A) { 1794 return (__m512i)__builtin_ia32_vcvtph2uw512_mask( 1795 (__v32hf)__A, (__v32hu)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION); 1796 } 1797 1798 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1799 _mm512_maskz_cvtph_epu16(__mmask32 __U, __m512h __A) { 1800 return (__m512i)__builtin_ia32_vcvtph2uw512_mask( 1801 (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)__U, 1802 _MM_FROUND_CUR_DIRECTION); 1803 } 1804 1805 #define _mm512_cvtt_roundph_epu16(A, R) \ 1806 ((__m512i)__builtin_ia32_vcvttph2uw512_mask( \ 1807 (__v32hf)(A), (__v32hu)_mm512_undefined_epi32(), (__mmask32)(-1), \ 1808 (int)(R))) 1809 1810 #define _mm512_mask_cvtt_roundph_epu16(W, U, A, R) \ 1811 ((__m512i)__builtin_ia32_vcvttph2uw512_mask((__v32hf)(A), (__v32hu)(W), \ 1812 (__mmask32)(U), (int)(R))) 1813 1814 #define _mm512_maskz_cvtt_roundph_epu16(U, A, R) \ 1815 ((__m512i)__builtin_ia32_vcvttph2uw512_mask((__v32hf)(A), \ 1816 (__v32hu)_mm512_setzero_epi32(), \ 1817 (__mmask32)(U), (int)(R))) 1818 1819 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1820 _mm512_cvttph_epu16(__m512h __A) { 1821 return (__m512i)__builtin_ia32_vcvttph2uw512_mask( 1822 (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)-1, 1823 _MM_FROUND_CUR_DIRECTION); 1824 } 1825 1826 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1827 _mm512_mask_cvttph_epu16(__m512i __W, __mmask32 __U, __m512h __A) { 1828 return (__m512i)__builtin_ia32_vcvttph2uw512_mask( 1829 (__v32hf)__A, (__v32hu)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION); 1830 } 1831 1832 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1833 _mm512_maskz_cvttph_epu16(__mmask32 __U, __m512h __A) { 1834 return (__m512i)__builtin_ia32_vcvttph2uw512_mask( 1835 (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)__U, 1836 _MM_FROUND_CUR_DIRECTION); 1837 } 1838 1839 #define _mm512_cvt_roundepu16_ph(A, R) \ 1840 ((__m512h)__builtin_ia32_vcvtuw2ph512_mask((__v32hu)(A), \ 1841 (__v32hf)_mm512_undefined_ph(), \ 1842 (__mmask32)(-1), (int)(R))) 1843 1844 #define _mm512_mask_cvt_roundepu16_ph(W, U, A, R) \ 1845 ((__m512h)__builtin_ia32_vcvtuw2ph512_mask((__v32hu)(A), (__v32hf)(W), \ 1846 (__mmask32)(U), (int)(R))) 1847 1848 #define _mm512_maskz_cvt_roundepu16_ph(U, A, R) \ 1849 ((__m512h)__builtin_ia32_vcvtuw2ph512_mask( \ 1850 (__v32hu)(A), (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R))) 1851 1852 static __inline__ __m512h __DEFAULT_FN_ATTRS512 1853 _mm512_cvtepu16_ph(__m512i __A) { 1854 return (__m512h)__builtin_ia32_vcvtuw2ph512_mask( 1855 (__v32hu)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)-1, 1856 _MM_FROUND_CUR_DIRECTION); 1857 } 1858 1859 static __inline__ __m512h __DEFAULT_FN_ATTRS512 1860 _mm512_mask_cvtepu16_ph(__m512h __W, __mmask32 __U, __m512i __A) { 1861 return (__m512h)__builtin_ia32_vcvtuw2ph512_mask( 1862 (__v32hu)__A, (__v32hf)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION); 1863 } 1864 1865 static __inline__ __m512h __DEFAULT_FN_ATTRS512 1866 _mm512_maskz_cvtepu16_ph(__mmask32 __U, __m512i __A) { 1867 return (__m512h)__builtin_ia32_vcvtuw2ph512_mask( 1868 (__v32hu)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U, 1869 _MM_FROUND_CUR_DIRECTION); 1870 } 1871 1872 #define _mm512_cvt_roundph_epi32(A, R) \ 1873 ((__m512i)__builtin_ia32_vcvtph2dq512_mask( \ 1874 (__v16hf)(A), (__v16si)_mm512_undefined_epi32(), (__mmask16)(-1), \ 1875 (int)(R))) 1876 1877 #define _mm512_mask_cvt_roundph_epi32(W, U, A, R) \ 1878 ((__m512i)__builtin_ia32_vcvtph2dq512_mask((__v16hf)(A), (__v16si)(W), \ 1879 (__mmask16)(U), (int)(R))) 1880 1881 #define _mm512_maskz_cvt_roundph_epi32(U, A, R) \ 1882 ((__m512i)__builtin_ia32_vcvtph2dq512_mask((__v16hf)(A), \ 1883 (__v16si)_mm512_setzero_epi32(), \ 1884 (__mmask16)(U), (int)(R))) 1885 1886 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1887 _mm512_cvtph_epi32(__m256h __A) { 1888 return (__m512i)__builtin_ia32_vcvtph2dq512_mask( 1889 (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)-1, 1890 _MM_FROUND_CUR_DIRECTION); 1891 } 1892 1893 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1894 _mm512_mask_cvtph_epi32(__m512i __W, __mmask16 __U, __m256h __A) { 1895 return (__m512i)__builtin_ia32_vcvtph2dq512_mask( 1896 (__v16hf)__A, (__v16si)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION); 1897 } 1898 1899 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1900 _mm512_maskz_cvtph_epi32(__mmask16 __U, __m256h __A) { 1901 return (__m512i)__builtin_ia32_vcvtph2dq512_mask( 1902 (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)__U, 1903 _MM_FROUND_CUR_DIRECTION); 1904 } 1905 1906 #define _mm512_cvt_roundph_epu32(A, R) \ 1907 ((__m512i)__builtin_ia32_vcvtph2udq512_mask( \ 1908 (__v16hf)(A), (__v16su)_mm512_undefined_epi32(), (__mmask16)(-1), \ 1909 (int)(R))) 1910 1911 #define _mm512_mask_cvt_roundph_epu32(W, U, A, R) \ 1912 ((__m512i)__builtin_ia32_vcvtph2udq512_mask((__v16hf)(A), (__v16su)(W), \ 1913 (__mmask16)(U), (int)(R))) 1914 1915 #define _mm512_maskz_cvt_roundph_epu32(U, A, R) \ 1916 ((__m512i)__builtin_ia32_vcvtph2udq512_mask((__v16hf)(A), \ 1917 (__v16su)_mm512_setzero_epi32(), \ 1918 (__mmask16)(U), (int)(R))) 1919 1920 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1921 _mm512_cvtph_epu32(__m256h __A) { 1922 return (__m512i)__builtin_ia32_vcvtph2udq512_mask( 1923 (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)-1, 1924 _MM_FROUND_CUR_DIRECTION); 1925 } 1926 1927 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1928 _mm512_mask_cvtph_epu32(__m512i __W, __mmask16 __U, __m256h __A) { 1929 return (__m512i)__builtin_ia32_vcvtph2udq512_mask( 1930 (__v16hf)__A, (__v16su)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION); 1931 } 1932 1933 static __inline__ __m512i __DEFAULT_FN_ATTRS512 1934 _mm512_maskz_cvtph_epu32(__mmask16 __U, __m256h __A) { 1935 return (__m512i)__builtin_ia32_vcvtph2udq512_mask( 1936 (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)__U, 1937 _MM_FROUND_CUR_DIRECTION); 1938 } 1939 1940 #define _mm512_cvt_roundepi32_ph(A, R) \ 1941 ((__m256h)__builtin_ia32_vcvtdq2ph512_mask((__v16si)(A), \ 1942 (__v16hf)_mm256_undefined_ph(), \ 1943 (__mmask16)(-1), (int)(R))) 1944 1945 #define _mm512_mask_cvt_roundepi32_ph(W, U, A, R) \ 1946 ((__m256h)__builtin_ia32_vcvtdq2ph512_mask((__v16si)(A), (__v16hf)(W), \ 1947 (__mmask16)(U), (int)(R))) 1948 1949 #define _mm512_maskz_cvt_roundepi32_ph(U, A, R) \ 1950 ((__m256h)__builtin_ia32_vcvtdq2ph512_mask( \ 1951 (__v16si)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R))) 1952 1953 static __inline__ __m256h __DEFAULT_FN_ATTRS512 1954 _mm512_cvtepi32_ph(__m512i __A) { 1955 return (__m256h)__builtin_ia32_vcvtdq2ph512_mask( 1956 (__v16si)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1, 1957 _MM_FROUND_CUR_DIRECTION); 1958 } 1959 1960 static __inline__ __m256h __DEFAULT_FN_ATTRS512 1961 _mm512_mask_cvtepi32_ph(__m256h __W, __mmask16 __U, __m512i __A) { 1962 return (__m256h)__builtin_ia32_vcvtdq2ph512_mask( 1963 (__v16si)__A, (__v16hf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION); 1964 } 1965 1966 static __inline__ __m256h __DEFAULT_FN_ATTRS512 1967 _mm512_maskz_cvtepi32_ph(__mmask16 __U, __m512i __A) { 1968 return (__m256h)__builtin_ia32_vcvtdq2ph512_mask( 1969 (__v16si)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U, 1970 _MM_FROUND_CUR_DIRECTION); 1971 } 1972 1973 #define _mm512_cvt_roundepu32_ph(A, R) \ 1974 ((__m256h)__builtin_ia32_vcvtudq2ph512_mask((__v16su)(A), \ 1975 (__v16hf)_mm256_undefined_ph(), \ 1976 (__mmask16)(-1), (int)(R))) 1977 1978 #define _mm512_mask_cvt_roundepu32_ph(W, U, A, R) \ 1979 ((__m256h)__builtin_ia32_vcvtudq2ph512_mask((__v16su)(A), (__v16hf)(W), \ 1980 (__mmask16)(U), (int)(R))) 1981 1982 #define _mm512_maskz_cvt_roundepu32_ph(U, A, R) \ 1983 ((__m256h)__builtin_ia32_vcvtudq2ph512_mask( \ 1984 (__v16su)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R))) 1985 1986 static __inline__ __m256h __DEFAULT_FN_ATTRS512 1987 _mm512_cvtepu32_ph(__m512i __A) { 1988 return (__m256h)__builtin_ia32_vcvtudq2ph512_mask( 1989 (__v16su)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1, 1990 _MM_FROUND_CUR_DIRECTION); 1991 } 1992 1993 static __inline__ __m256h __DEFAULT_FN_ATTRS512 1994 _mm512_mask_cvtepu32_ph(__m256h __W, __mmask16 __U, __m512i __A) { 1995 return (__m256h)__builtin_ia32_vcvtudq2ph512_mask( 1996 (__v16su)__A, (__v16hf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION); 1997 } 1998 1999 static __inline__ __m256h __DEFAULT_FN_ATTRS512 2000 _mm512_maskz_cvtepu32_ph(__mmask16 __U, __m512i __A) { 2001 return (__m256h)__builtin_ia32_vcvtudq2ph512_mask( 2002 (__v16su)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U, 2003 _MM_FROUND_CUR_DIRECTION); 2004 } 2005 2006 #define _mm512_cvtt_roundph_epi32(A, R) \ 2007 ((__m512i)__builtin_ia32_vcvttph2dq512_mask( \ 2008 (__v16hf)(A), (__v16si)_mm512_undefined_epi32(), (__mmask16)(-1), \ 2009 (int)(R))) 2010 2011 #define _mm512_mask_cvtt_roundph_epi32(W, U, A, R) \ 2012 ((__m512i)__builtin_ia32_vcvttph2dq512_mask((__v16hf)(A), (__v16si)(W), \ 2013 (__mmask16)(U), (int)(R))) 2014 2015 #define _mm512_maskz_cvtt_roundph_epi32(U, A, R) \ 2016 ((__m512i)__builtin_ia32_vcvttph2dq512_mask((__v16hf)(A), \ 2017 (__v16si)_mm512_setzero_epi32(), \ 2018 (__mmask16)(U), (int)(R))) 2019 2020 static __inline__ __m512i __DEFAULT_FN_ATTRS512 2021 _mm512_cvttph_epi32(__m256h __A) { 2022 return (__m512i)__builtin_ia32_vcvttph2dq512_mask( 2023 (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)-1, 2024 _MM_FROUND_CUR_DIRECTION); 2025 } 2026 2027 static __inline__ __m512i __DEFAULT_FN_ATTRS512 2028 _mm512_mask_cvttph_epi32(__m512i __W, __mmask16 __U, __m256h __A) { 2029 return (__m512i)__builtin_ia32_vcvttph2dq512_mask( 2030 (__v16hf)__A, (__v16si)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION); 2031 } 2032 2033 static __inline__ __m512i __DEFAULT_FN_ATTRS512 2034 _mm512_maskz_cvttph_epi32(__mmask16 __U, __m256h __A) { 2035 return (__m512i)__builtin_ia32_vcvttph2dq512_mask( 2036 (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)__U, 2037 _MM_FROUND_CUR_DIRECTION); 2038 } 2039 2040 #define _mm512_cvtt_roundph_epu32(A, R) \ 2041 ((__m512i)__builtin_ia32_vcvttph2udq512_mask( \ 2042 (__v16hf)(A), (__v16su)_mm512_undefined_epi32(), (__mmask16)(-1), \ 2043 (int)(R))) 2044 2045 #define _mm512_mask_cvtt_roundph_epu32(W, U, A, R) \ 2046 ((__m512i)__builtin_ia32_vcvttph2udq512_mask((__v16hf)(A), (__v16su)(W), \ 2047 (__mmask16)(U), (int)(R))) 2048 2049 #define _mm512_maskz_cvtt_roundph_epu32(U, A, R) \ 2050 ((__m512i)__builtin_ia32_vcvttph2udq512_mask( \ 2051 (__v16hf)(A), (__v16su)_mm512_setzero_epi32(), (__mmask16)(U), \ 2052 (int)(R))) 2053 2054 static __inline__ __m512i __DEFAULT_FN_ATTRS512 2055 _mm512_cvttph_epu32(__m256h __A) { 2056 return (__m512i)__builtin_ia32_vcvttph2udq512_mask( 2057 (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)-1, 2058 _MM_FROUND_CUR_DIRECTION); 2059 } 2060 2061 static __inline__ __m512i __DEFAULT_FN_ATTRS512 2062 _mm512_mask_cvttph_epu32(__m512i __W, __mmask16 __U, __m256h __A) { 2063 return (__m512i)__builtin_ia32_vcvttph2udq512_mask( 2064 (__v16hf)__A, (__v16su)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION); 2065 } 2066 2067 static __inline__ __m512i __DEFAULT_FN_ATTRS512 2068 _mm512_maskz_cvttph_epu32(__mmask16 __U, __m256h __A) { 2069 return (__m512i)__builtin_ia32_vcvttph2udq512_mask( 2070 (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)__U, 2071 _MM_FROUND_CUR_DIRECTION); 2072 } 2073 2074 #define _mm512_cvt_roundepi64_ph(A, R) \ 2075 ((__m128h)__builtin_ia32_vcvtqq2ph512_mask( \ 2076 (__v8di)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R))) 2077 2078 #define _mm512_mask_cvt_roundepi64_ph(W, U, A, R) \ 2079 ((__m128h)__builtin_ia32_vcvtqq2ph512_mask((__v8di)(A), (__v8hf)(W), \ 2080 (__mmask8)(U), (int)(R))) 2081 2082 #define _mm512_maskz_cvt_roundepi64_ph(U, A, R) \ 2083 ((__m128h)__builtin_ia32_vcvtqq2ph512_mask( \ 2084 (__v8di)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R))) 2085 2086 static __inline__ __m128h __DEFAULT_FN_ATTRS512 2087 _mm512_cvtepi64_ph(__m512i __A) { 2088 return (__m128h)__builtin_ia32_vcvtqq2ph512_mask( 2089 (__v8di)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1, 2090 _MM_FROUND_CUR_DIRECTION); 2091 } 2092 2093 static __inline__ __m128h __DEFAULT_FN_ATTRS512 2094 _mm512_mask_cvtepi64_ph(__m128h __W, __mmask8 __U, __m512i __A) { 2095 return (__m128h)__builtin_ia32_vcvtqq2ph512_mask( 2096 (__v8di)__A, (__v8hf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); 2097 } 2098 2099 static __inline__ __m128h __DEFAULT_FN_ATTRS512 2100 _mm512_maskz_cvtepi64_ph(__mmask8 __U, __m512i __A) { 2101 return (__m128h)__builtin_ia32_vcvtqq2ph512_mask( 2102 (__v8di)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U, 2103 _MM_FROUND_CUR_DIRECTION); 2104 } 2105 2106 #define _mm512_cvt_roundph_epi64(A, R) \ 2107 ((__m512i)__builtin_ia32_vcvtph2qq512_mask((__v8hf)(A), \ 2108 (__v8di)_mm512_undefined_epi32(), \ 2109 (__mmask8)(-1), (int)(R))) 2110 2111 #define _mm512_mask_cvt_roundph_epi64(W, U, A, R) \ 2112 ((__m512i)__builtin_ia32_vcvtph2qq512_mask((__v8hf)(A), (__v8di)(W), \ 2113 (__mmask8)(U), (int)(R))) 2114 2115 #define _mm512_maskz_cvt_roundph_epi64(U, A, R) \ 2116 ((__m512i)__builtin_ia32_vcvtph2qq512_mask( \ 2117 (__v8hf)(A), (__v8di)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R))) 2118 2119 static __inline__ __m512i __DEFAULT_FN_ATTRS512 2120 _mm512_cvtph_epi64(__m128h __A) { 2121 return (__m512i)__builtin_ia32_vcvtph2qq512_mask( 2122 (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)-1, 2123 _MM_FROUND_CUR_DIRECTION); 2124 } 2125 2126 static __inline__ __m512i __DEFAULT_FN_ATTRS512 2127 _mm512_mask_cvtph_epi64(__m512i __W, __mmask8 __U, __m128h __A) { 2128 return (__m512i)__builtin_ia32_vcvtph2qq512_mask( 2129 (__v8hf)__A, (__v8di)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); 2130 } 2131 2132 static __inline__ __m512i __DEFAULT_FN_ATTRS512 2133 _mm512_maskz_cvtph_epi64(__mmask8 __U, __m128h __A) { 2134 return (__m512i)__builtin_ia32_vcvtph2qq512_mask( 2135 (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)__U, 2136 _MM_FROUND_CUR_DIRECTION); 2137 } 2138 2139 #define _mm512_cvt_roundepu64_ph(A, R) \ 2140 ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask( \ 2141 (__v8du)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R))) 2142 2143 #define _mm512_mask_cvt_roundepu64_ph(W, U, A, R) \ 2144 ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask((__v8du)(A), (__v8hf)(W), \ 2145 (__mmask8)(U), (int)(R))) 2146 2147 #define _mm512_maskz_cvt_roundepu64_ph(U, A, R) \ 2148 ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask( \ 2149 (__v8du)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R))) 2150 2151 static __inline__ __m128h __DEFAULT_FN_ATTRS512 2152 _mm512_cvtepu64_ph(__m512i __A) { 2153 return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask( 2154 (__v8du)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1, 2155 _MM_FROUND_CUR_DIRECTION); 2156 } 2157 2158 static __inline__ __m128h __DEFAULT_FN_ATTRS512 2159 _mm512_mask_cvtepu64_ph(__m128h __W, __mmask8 __U, __m512i __A) { 2160 return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask( 2161 (__v8du)__A, (__v8hf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); 2162 } 2163 2164 static __inline__ __m128h __DEFAULT_FN_ATTRS512 2165 _mm512_maskz_cvtepu64_ph(__mmask8 __U, __m512i __A) { 2166 return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask( 2167 (__v8du)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U, 2168 _MM_FROUND_CUR_DIRECTION); 2169 } 2170 2171 #define _mm512_cvt_roundph_epu64(A, R) \ 2172 ((__m512i)__builtin_ia32_vcvtph2uqq512_mask( \ 2173 (__v8hf)(A), (__v8du)_mm512_undefined_epi32(), (__mmask8)(-1), \ 2174 (int)(R))) 2175 2176 #define _mm512_mask_cvt_roundph_epu64(W, U, A, R) \ 2177 ((__m512i)__builtin_ia32_vcvtph2uqq512_mask((__v8hf)(A), (__v8du)(W), \ 2178 (__mmask8)(U), (int)(R))) 2179 2180 #define _mm512_maskz_cvt_roundph_epu64(U, A, R) \ 2181 ((__m512i)__builtin_ia32_vcvtph2uqq512_mask( \ 2182 (__v8hf)(A), (__v8du)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R))) 2183 2184 static __inline__ __m512i __DEFAULT_FN_ATTRS512 2185 _mm512_cvtph_epu64(__m128h __A) { 2186 return (__m512i)__builtin_ia32_vcvtph2uqq512_mask( 2187 (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)-1, 2188 _MM_FROUND_CUR_DIRECTION); 2189 } 2190 2191 static __inline__ __m512i __DEFAULT_FN_ATTRS512 2192 _mm512_mask_cvtph_epu64(__m512i __W, __mmask8 __U, __m128h __A) { 2193 return (__m512i)__builtin_ia32_vcvtph2uqq512_mask( 2194 (__v8hf)__A, (__v8du)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); 2195 } 2196 2197 static __inline__ __m512i __DEFAULT_FN_ATTRS512 2198 _mm512_maskz_cvtph_epu64(__mmask8 __U, __m128h __A) { 2199 return (__m512i)__builtin_ia32_vcvtph2uqq512_mask( 2200 (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)__U, 2201 _MM_FROUND_CUR_DIRECTION); 2202 } 2203 2204 #define _mm512_cvtt_roundph_epi64(A, R) \ 2205 ((__m512i)__builtin_ia32_vcvttph2qq512_mask( \ 2206 (__v8hf)(A), (__v8di)_mm512_undefined_epi32(), (__mmask8)(-1), \ 2207 (int)(R))) 2208 2209 #define _mm512_mask_cvtt_roundph_epi64(W, U, A, R) \ 2210 ((__m512i)__builtin_ia32_vcvttph2qq512_mask((__v8hf)(A), (__v8di)(W), \ 2211 (__mmask8)(U), (int)(R))) 2212 2213 #define _mm512_maskz_cvtt_roundph_epi64(U, A, R) \ 2214 ((__m512i)__builtin_ia32_vcvttph2qq512_mask( \ 2215 (__v8hf)(A), (__v8di)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R))) 2216 2217 static __inline__ __m512i __DEFAULT_FN_ATTRS512 2218 _mm512_cvttph_epi64(__m128h __A) { 2219 return (__m512i)__builtin_ia32_vcvttph2qq512_mask( 2220 (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)-1, 2221 _MM_FROUND_CUR_DIRECTION); 2222 } 2223 2224 static __inline__ __m512i __DEFAULT_FN_ATTRS512 2225 _mm512_mask_cvttph_epi64(__m512i __W, __mmask8 __U, __m128h __A) { 2226 return (__m512i)__builtin_ia32_vcvttph2qq512_mask( 2227 (__v8hf)__A, (__v8di)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); 2228 } 2229 2230 static __inline__ __m512i __DEFAULT_FN_ATTRS512 2231 _mm512_maskz_cvttph_epi64(__mmask8 __U, __m128h __A) { 2232 return (__m512i)__builtin_ia32_vcvttph2qq512_mask( 2233 (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)__U, 2234 _MM_FROUND_CUR_DIRECTION); 2235 } 2236 2237 #define _mm512_cvtt_roundph_epu64(A, R) \ 2238 ((__m512i)__builtin_ia32_vcvttph2uqq512_mask( \ 2239 (__v8hf)(A), (__v8du)_mm512_undefined_epi32(), (__mmask8)(-1), \ 2240 (int)(R))) 2241 2242 #define _mm512_mask_cvtt_roundph_epu64(W, U, A, R) \ 2243 ((__m512i)__builtin_ia32_vcvttph2uqq512_mask((__v8hf)(A), (__v8du)(W), \ 2244 (__mmask8)(U), (int)(R))) 2245 2246 #define _mm512_maskz_cvtt_roundph_epu64(U, A, R) \ 2247 ((__m512i)__builtin_ia32_vcvttph2uqq512_mask( \ 2248 (__v8hf)(A), (__v8du)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R))) 2249 2250 static __inline__ __m512i __DEFAULT_FN_ATTRS512 2251 _mm512_cvttph_epu64(__m128h __A) { 2252 return (__m512i)__builtin_ia32_vcvttph2uqq512_mask( 2253 (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)-1, 2254 _MM_FROUND_CUR_DIRECTION); 2255 } 2256 2257 static __inline__ __m512i __DEFAULT_FN_ATTRS512 2258 _mm512_mask_cvttph_epu64(__m512i __W, __mmask8 __U, __m128h __A) { 2259 return (__m512i)__builtin_ia32_vcvttph2uqq512_mask( 2260 (__v8hf)__A, (__v8du)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); 2261 } 2262 2263 static __inline__ __m512i __DEFAULT_FN_ATTRS512 2264 _mm512_maskz_cvttph_epu64(__mmask8 __U, __m128h __A) { 2265 return (__m512i)__builtin_ia32_vcvttph2uqq512_mask( 2266 (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)__U, 2267 _MM_FROUND_CUR_DIRECTION); 2268 } 2269 2270 #define _mm_cvt_roundsh_i32(A, R) \ 2271 ((int)__builtin_ia32_vcvtsh2si32((__v8hf)(A), (int)(R))) 2272 2273 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_cvtsh_i32(__m128h __A) { 2274 return (int)__builtin_ia32_vcvtsh2si32((__v8hf)__A, _MM_FROUND_CUR_DIRECTION); 2275 } 2276 2277 #define _mm_cvt_roundsh_u32(A, R) \ 2278 ((unsigned int)__builtin_ia32_vcvtsh2usi32((__v8hf)(A), (int)(R))) 2279 2280 static __inline__ unsigned int __DEFAULT_FN_ATTRS128 2281 _mm_cvtsh_u32(__m128h __A) { 2282 return (unsigned int)__builtin_ia32_vcvtsh2usi32((__v8hf)__A, 2283 _MM_FROUND_CUR_DIRECTION); 2284 } 2285 2286 #ifdef __x86_64__ 2287 #define _mm_cvt_roundsh_i64(A, R) \ 2288 ((long long)__builtin_ia32_vcvtsh2si64((__v8hf)(A), (int)(R))) 2289 2290 static __inline__ long long __DEFAULT_FN_ATTRS128 _mm_cvtsh_i64(__m128h __A) { 2291 return (long long)__builtin_ia32_vcvtsh2si64((__v8hf)__A, 2292 _MM_FROUND_CUR_DIRECTION); 2293 } 2294 2295 #define _mm_cvt_roundsh_u64(A, R) \ 2296 ((unsigned long long)__builtin_ia32_vcvtsh2usi64((__v8hf)(A), (int)(R))) 2297 2298 static __inline__ unsigned long long __DEFAULT_FN_ATTRS128 2299 _mm_cvtsh_u64(__m128h __A) { 2300 return (unsigned long long)__builtin_ia32_vcvtsh2usi64( 2301 (__v8hf)__A, _MM_FROUND_CUR_DIRECTION); 2302 } 2303 #endif // __x86_64__ 2304 2305 #define _mm_cvt_roundu32_sh(A, B, R) \ 2306 ((__m128h)__builtin_ia32_vcvtusi2sh((__v8hf)(A), (unsigned int)(B), (int)(R))) 2307 2308 static __inline__ __m128h __DEFAULT_FN_ATTRS128 2309 _mm_cvtu32_sh(__m128h __A, unsigned int __B) { 2310 __A[0] = __B; 2311 return __A; 2312 } 2313 2314 #ifdef __x86_64__ 2315 #define _mm_cvt_roundu64_sh(A, B, R) \ 2316 ((__m128h)__builtin_ia32_vcvtusi642sh((__v8hf)(A), (unsigned long long)(B), \ 2317 (int)(R))) 2318 2319 static __inline__ __m128h __DEFAULT_FN_ATTRS128 2320 _mm_cvtu64_sh(__m128h __A, unsigned long long __B) { 2321 __A[0] = __B; 2322 return __A; 2323 } 2324 #endif 2325 2326 #define _mm_cvt_roundi32_sh(A, B, R) \ 2327 ((__m128h)__builtin_ia32_vcvtsi2sh((__v8hf)(A), (int)(B), (int)(R))) 2328 2329 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvti32_sh(__m128h __A, 2330 int __B) { 2331 __A[0] = __B; 2332 return __A; 2333 } 2334 2335 #ifdef __x86_64__ 2336 #define _mm_cvt_roundi64_sh(A, B, R) \ 2337 ((__m128h)__builtin_ia32_vcvtsi642sh((__v8hf)(A), (long long)(B), (int)(R))) 2338 2339 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvti64_sh(__m128h __A, 2340 long long __B) { 2341 __A[0] = __B; 2342 return __A; 2343 } 2344 #endif 2345 2346 #define _mm_cvtt_roundsh_i32(A, R) \ 2347 ((int)__builtin_ia32_vcvttsh2si32((__v8hf)(A), (int)(R))) 2348 2349 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_cvttsh_i32(__m128h __A) { 2350 return (int)__builtin_ia32_vcvttsh2si32((__v8hf)__A, 2351 _MM_FROUND_CUR_DIRECTION); 2352 } 2353 2354 #ifdef __x86_64__ 2355 #define _mm_cvtt_roundsh_i64(A, R) \ 2356 ((long long)__builtin_ia32_vcvttsh2si64((__v8hf)(A), (int)(R))) 2357 2358 static __inline__ long long __DEFAULT_FN_ATTRS128 _mm_cvttsh_i64(__m128h __A) { 2359 return (long long)__builtin_ia32_vcvttsh2si64((__v8hf)__A, 2360 _MM_FROUND_CUR_DIRECTION); 2361 } 2362 #endif 2363 2364 #define _mm_cvtt_roundsh_u32(A, R) \ 2365 ((unsigned int)__builtin_ia32_vcvttsh2usi32((__v8hf)(A), (int)(R))) 2366 2367 static __inline__ unsigned int __DEFAULT_FN_ATTRS128 2368 _mm_cvttsh_u32(__m128h __A) { 2369 return (unsigned int)__builtin_ia32_vcvttsh2usi32((__v8hf)__A, 2370 _MM_FROUND_CUR_DIRECTION); 2371 } 2372 2373 #ifdef __x86_64__ 2374 #define _mm_cvtt_roundsh_u64(A, R) \ 2375 ((unsigned long long)__builtin_ia32_vcvttsh2usi64((__v8hf)(A), (int)(R))) 2376 2377 static __inline__ unsigned long long __DEFAULT_FN_ATTRS128 2378 _mm_cvttsh_u64(__m128h __A) { 2379 return (unsigned long long)__builtin_ia32_vcvttsh2usi64( 2380 (__v8hf)__A, _MM_FROUND_CUR_DIRECTION); 2381 } 2382 #endif 2383 2384 #define _mm512_cvtx_roundph_ps(A, R) \ 2385 ((__m512)__builtin_ia32_vcvtph2psx512_mask((__v16hf)(A), \ 2386 (__v16sf)_mm512_undefined_ps(), \ 2387 (__mmask16)(-1), (int)(R))) 2388 2389 #define _mm512_mask_cvtx_roundph_ps(W, U, A, R) \ 2390 ((__m512)__builtin_ia32_vcvtph2psx512_mask((__v16hf)(A), (__v16sf)(W), \ 2391 (__mmask16)(U), (int)(R))) 2392 2393 #define _mm512_maskz_cvtx_roundph_ps(U, A, R) \ 2394 ((__m512)__builtin_ia32_vcvtph2psx512_mask( \ 2395 (__v16hf)(A), (__v16sf)_mm512_setzero_ps(), (__mmask16)(U), (int)(R))) 2396 2397 static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_cvtxph_ps(__m256h __A) { 2398 return (__m512)__builtin_ia32_vcvtph2psx512_mask( 2399 (__v16hf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)-1, 2400 _MM_FROUND_CUR_DIRECTION); 2401 } 2402 2403 static __inline__ __m512 __DEFAULT_FN_ATTRS512 2404 _mm512_mask_cvtxph_ps(__m512 __W, __mmask16 __U, __m256h __A) { 2405 return (__m512)__builtin_ia32_vcvtph2psx512_mask( 2406 (__v16hf)__A, (__v16sf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION); 2407 } 2408 2409 static __inline__ __m512 __DEFAULT_FN_ATTRS512 2410 _mm512_maskz_cvtxph_ps(__mmask16 __U, __m256h __A) { 2411 return (__m512)__builtin_ia32_vcvtph2psx512_mask( 2412 (__v16hf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U, 2413 _MM_FROUND_CUR_DIRECTION); 2414 } 2415 2416 #define _mm512_cvtx_roundps_ph(A, R) \ 2417 ((__m256h)__builtin_ia32_vcvtps2phx512_mask((__v16sf)(A), \ 2418 (__v16hf)_mm256_undefined_ph(), \ 2419 (__mmask16)(-1), (int)(R))) 2420 2421 #define _mm512_mask_cvtx_roundps_ph(W, U, A, R) \ 2422 ((__m256h)__builtin_ia32_vcvtps2phx512_mask((__v16sf)(A), (__v16hf)(W), \ 2423 (__mmask16)(U), (int)(R))) 2424 2425 #define _mm512_maskz_cvtx_roundps_ph(U, A, R) \ 2426 ((__m256h)__builtin_ia32_vcvtps2phx512_mask( \ 2427 (__v16sf)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R))) 2428 2429 static __inline__ __m256h __DEFAULT_FN_ATTRS512 _mm512_cvtxps_ph(__m512 __A) { 2430 return (__m256h)__builtin_ia32_vcvtps2phx512_mask( 2431 (__v16sf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1, 2432 _MM_FROUND_CUR_DIRECTION); 2433 } 2434 2435 static __inline__ __m256h __DEFAULT_FN_ATTRS512 2436 _mm512_mask_cvtxps_ph(__m256h __W, __mmask16 __U, __m512 __A) { 2437 return (__m256h)__builtin_ia32_vcvtps2phx512_mask( 2438 (__v16sf)__A, (__v16hf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION); 2439 } 2440 2441 static __inline__ __m256h __DEFAULT_FN_ATTRS512 2442 _mm512_maskz_cvtxps_ph(__mmask16 __U, __m512 __A) { 2443 return (__m256h)__builtin_ia32_vcvtps2phx512_mask( 2444 (__v16sf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U, 2445 _MM_FROUND_CUR_DIRECTION); 2446 } 2447 2448 #define _mm512_fmadd_round_ph(A, B, C, R) \ 2449 ((__m512h)__builtin_ia32_vfmaddph512_mask( \ 2450 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \ 2451 (__mmask32)-1, (int)(R))) 2452 2453 #define _mm512_mask_fmadd_round_ph(A, U, B, C, R) \ 2454 ((__m512h)__builtin_ia32_vfmaddph512_mask( \ 2455 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \ 2456 (__mmask32)(U), (int)(R))) 2457 2458 #define _mm512_mask3_fmadd_round_ph(A, B, C, U, R) \ 2459 ((__m512h)__builtin_ia32_vfmaddph512_mask3( \ 2460 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \ 2461 (__mmask32)(U), (int)(R))) 2462 2463 #define _mm512_maskz_fmadd_round_ph(U, A, B, C, R) \ 2464 ((__m512h)__builtin_ia32_vfmaddph512_maskz( \ 2465 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \ 2466 (__mmask32)(U), (int)(R))) 2467 2468 #define _mm512_fmsub_round_ph(A, B, C, R) \ 2469 ((__m512h)__builtin_ia32_vfmaddph512_mask( \ 2470 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \ 2471 (__mmask32)-1, (int)(R))) 2472 2473 #define _mm512_mask_fmsub_round_ph(A, U, B, C, R) \ 2474 ((__m512h)__builtin_ia32_vfmaddph512_mask( \ 2475 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \ 2476 (__mmask32)(U), (int)(R))) 2477 2478 #define _mm512_maskz_fmsub_round_ph(U, A, B, C, R) \ 2479 ((__m512h)__builtin_ia32_vfmaddph512_maskz( \ 2480 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \ 2481 (__mmask32)(U), (int)(R))) 2482 2483 #define _mm512_fnmadd_round_ph(A, B, C, R) \ 2484 ((__m512h)__builtin_ia32_vfmaddph512_mask( \ 2485 (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \ 2486 (__mmask32)-1, (int)(R))) 2487 2488 #define _mm512_mask3_fnmadd_round_ph(A, B, C, U, R) \ 2489 ((__m512h)__builtin_ia32_vfmaddph512_mask3( \ 2490 -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \ 2491 (__mmask32)(U), (int)(R))) 2492 2493 #define _mm512_maskz_fnmadd_round_ph(U, A, B, C, R) \ 2494 ((__m512h)__builtin_ia32_vfmaddph512_maskz( \ 2495 -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \ 2496 (__mmask32)(U), (int)(R))) 2497 2498 #define _mm512_fnmsub_round_ph(A, B, C, R) \ 2499 ((__m512h)__builtin_ia32_vfmaddph512_mask( \ 2500 (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \ 2501 (__mmask32)-1, (int)(R))) 2502 2503 #define _mm512_maskz_fnmsub_round_ph(U, A, B, C, R) \ 2504 ((__m512h)__builtin_ia32_vfmaddph512_maskz( \ 2505 -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \ 2506 (__mmask32)(U), (int)(R))) 2507 2508 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmadd_ph(__m512h __A, 2509 __m512h __B, 2510 __m512h __C) { 2511 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B, 2512 (__v32hf)__C, (__mmask32)-1, 2513 _MM_FROUND_CUR_DIRECTION); 2514 } 2515 2516 static __inline__ __m512h __DEFAULT_FN_ATTRS512 2517 _mm512_mask_fmadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) { 2518 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B, 2519 (__v32hf)__C, (__mmask32)__U, 2520 _MM_FROUND_CUR_DIRECTION); 2521 } 2522 2523 static __inline__ __m512h __DEFAULT_FN_ATTRS512 2524 _mm512_mask3_fmadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) { 2525 return (__m512h)__builtin_ia32_vfmaddph512_mask3((__v32hf)__A, (__v32hf)__B, 2526 (__v32hf)__C, (__mmask32)__U, 2527 _MM_FROUND_CUR_DIRECTION); 2528 } 2529 2530 static __inline__ __m512h __DEFAULT_FN_ATTRS512 2531 _mm512_maskz_fmadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) { 2532 return (__m512h)__builtin_ia32_vfmaddph512_maskz((__v32hf)__A, (__v32hf)__B, 2533 (__v32hf)__C, (__mmask32)__U, 2534 _MM_FROUND_CUR_DIRECTION); 2535 } 2536 2537 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmsub_ph(__m512h __A, 2538 __m512h __B, 2539 __m512h __C) { 2540 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B, 2541 -(__v32hf)__C, (__mmask32)-1, 2542 _MM_FROUND_CUR_DIRECTION); 2543 } 2544 2545 static __inline__ __m512h __DEFAULT_FN_ATTRS512 2546 _mm512_mask_fmsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) { 2547 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B, 2548 -(__v32hf)__C, (__mmask32)__U, 2549 _MM_FROUND_CUR_DIRECTION); 2550 } 2551 2552 static __inline__ __m512h __DEFAULT_FN_ATTRS512 2553 _mm512_maskz_fmsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) { 2554 return (__m512h)__builtin_ia32_vfmaddph512_maskz( 2555 (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U, 2556 _MM_FROUND_CUR_DIRECTION); 2557 } 2558 2559 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fnmadd_ph(__m512h __A, 2560 __m512h __B, 2561 __m512h __C) { 2562 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B, 2563 (__v32hf)__C, (__mmask32)-1, 2564 _MM_FROUND_CUR_DIRECTION); 2565 } 2566 2567 static __inline__ __m512h __DEFAULT_FN_ATTRS512 2568 _mm512_mask3_fnmadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) { 2569 return (__m512h)__builtin_ia32_vfmaddph512_mask3(-(__v32hf)__A, (__v32hf)__B, 2570 (__v32hf)__C, (__mmask32)__U, 2571 _MM_FROUND_CUR_DIRECTION); 2572 } 2573 2574 static __inline__ __m512h __DEFAULT_FN_ATTRS512 2575 _mm512_maskz_fnmadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) { 2576 return (__m512h)__builtin_ia32_vfmaddph512_maskz(-(__v32hf)__A, (__v32hf)__B, 2577 (__v32hf)__C, (__mmask32)__U, 2578 _MM_FROUND_CUR_DIRECTION); 2579 } 2580 2581 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fnmsub_ph(__m512h __A, 2582 __m512h __B, 2583 __m512h __C) { 2584 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B, 2585 -(__v32hf)__C, (__mmask32)-1, 2586 _MM_FROUND_CUR_DIRECTION); 2587 } 2588 2589 static __inline__ __m512h __DEFAULT_FN_ATTRS512 2590 _mm512_maskz_fnmsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) { 2591 return (__m512h)__builtin_ia32_vfmaddph512_maskz( 2592 -(__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U, 2593 _MM_FROUND_CUR_DIRECTION); 2594 } 2595 2596 #define _mm512_fmaddsub_round_ph(A, B, C, R) \ 2597 ((__m512h)__builtin_ia32_vfmaddsubph512_mask( \ 2598 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \ 2599 (__mmask32)-1, (int)(R))) 2600 2601 #define _mm512_mask_fmaddsub_round_ph(A, U, B, C, R) \ 2602 ((__m512h)__builtin_ia32_vfmaddsubph512_mask( \ 2603 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \ 2604 (__mmask32)(U), (int)(R))) 2605 2606 #define _mm512_mask3_fmaddsub_round_ph(A, B, C, U, R) \ 2607 ((__m512h)__builtin_ia32_vfmaddsubph512_mask3( \ 2608 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \ 2609 (__mmask32)(U), (int)(R))) 2610 2611 #define _mm512_maskz_fmaddsub_round_ph(U, A, B, C, R) \ 2612 ((__m512h)__builtin_ia32_vfmaddsubph512_maskz( \ 2613 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \ 2614 (__mmask32)(U), (int)(R))) 2615 2616 #define _mm512_fmsubadd_round_ph(A, B, C, R) \ 2617 ((__m512h)__builtin_ia32_vfmaddsubph512_mask( \ 2618 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \ 2619 (__mmask32)-1, (int)(R))) 2620 2621 #define _mm512_mask_fmsubadd_round_ph(A, U, B, C, R) \ 2622 ((__m512h)__builtin_ia32_vfmaddsubph512_mask( \ 2623 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \ 2624 (__mmask32)(U), (int)(R))) 2625 2626 #define _mm512_maskz_fmsubadd_round_ph(U, A, B, C, R) \ 2627 ((__m512h)__builtin_ia32_vfmaddsubph512_maskz( \ 2628 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \ 2629 (__mmask32)(U), (int)(R))) 2630 2631 static __inline__ __m512h __DEFAULT_FN_ATTRS512 2632 _mm512_fmaddsub_ph(__m512h __A, __m512h __B, __m512h __C) { 2633 return (__m512h)__builtin_ia32_vfmaddsubph512_mask( 2634 (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)-1, 2635 _MM_FROUND_CUR_DIRECTION); 2636 } 2637 2638 static __inline__ __m512h __DEFAULT_FN_ATTRS512 2639 _mm512_mask_fmaddsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) { 2640 return (__m512h)__builtin_ia32_vfmaddsubph512_mask( 2641 (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U, 2642 _MM_FROUND_CUR_DIRECTION); 2643 } 2644 2645 static __inline__ __m512h __DEFAULT_FN_ATTRS512 2646 _mm512_mask3_fmaddsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) { 2647 return (__m512h)__builtin_ia32_vfmaddsubph512_mask3( 2648 (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U, 2649 _MM_FROUND_CUR_DIRECTION); 2650 } 2651 2652 static __inline__ __m512h __DEFAULT_FN_ATTRS512 2653 _mm512_maskz_fmaddsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) { 2654 return (__m512h)__builtin_ia32_vfmaddsubph512_maskz( 2655 (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U, 2656 _MM_FROUND_CUR_DIRECTION); 2657 } 2658 2659 static __inline__ __m512h __DEFAULT_FN_ATTRS512 2660 _mm512_fmsubadd_ph(__m512h __A, __m512h __B, __m512h __C) { 2661 return (__m512h)__builtin_ia32_vfmaddsubph512_mask( 2662 (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)-1, 2663 _MM_FROUND_CUR_DIRECTION); 2664 } 2665 2666 static __inline__ __m512h __DEFAULT_FN_ATTRS512 2667 _mm512_mask_fmsubadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) { 2668 return (__m512h)__builtin_ia32_vfmaddsubph512_mask( 2669 (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U, 2670 _MM_FROUND_CUR_DIRECTION); 2671 } 2672 2673 static __inline__ __m512h __DEFAULT_FN_ATTRS512 2674 _mm512_maskz_fmsubadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) { 2675 return (__m512h)__builtin_ia32_vfmaddsubph512_maskz( 2676 (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U, 2677 _MM_FROUND_CUR_DIRECTION); 2678 } 2679 2680 #define _mm512_mask3_fmsub_round_ph(A, B, C, U, R) \ 2681 ((__m512h)__builtin_ia32_vfmsubph512_mask3( \ 2682 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \ 2683 (__mmask32)(U), (int)(R))) 2684 2685 static __inline__ __m512h __DEFAULT_FN_ATTRS512 2686 _mm512_mask3_fmsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) { 2687 return (__m512h)__builtin_ia32_vfmsubph512_mask3((__v32hf)__A, (__v32hf)__B, 2688 (__v32hf)__C, (__mmask32)__U, 2689 _MM_FROUND_CUR_DIRECTION); 2690 } 2691 2692 #define _mm512_mask3_fmsubadd_round_ph(A, B, C, U, R) \ 2693 ((__m512h)__builtin_ia32_vfmsubaddph512_mask3( \ 2694 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \ 2695 (__mmask32)(U), (int)(R))) 2696 2697 static __inline__ __m512h __DEFAULT_FN_ATTRS512 2698 _mm512_mask3_fmsubadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) { 2699 return (__m512h)__builtin_ia32_vfmsubaddph512_mask3( 2700 (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U, 2701 _MM_FROUND_CUR_DIRECTION); 2702 } 2703 2704 #define _mm512_mask_fnmadd_round_ph(A, U, B, C, R) \ 2705 ((__m512h)__builtin_ia32_vfmaddph512_mask( \ 2706 (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \ 2707 (__mmask32)(U), (int)(R))) 2708 2709 static __inline__ __m512h __DEFAULT_FN_ATTRS512 2710 _mm512_mask_fnmadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) { 2711 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B, 2712 (__v32hf)__C, (__mmask32)__U, 2713 _MM_FROUND_CUR_DIRECTION); 2714 } 2715 2716 #define _mm512_mask_fnmsub_round_ph(A, U, B, C, R) \ 2717 ((__m512h)__builtin_ia32_vfmaddph512_mask( \ 2718 (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \ 2719 (__mmask32)(U), (int)(R))) 2720 2721 #define _mm512_mask3_fnmsub_round_ph(A, B, C, U, R) \ 2722 ((__m512h)__builtin_ia32_vfmsubph512_mask3( \ 2723 -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \ 2724 (__mmask32)(U), (int)(R))) 2725 2726 static __inline__ __m512h __DEFAULT_FN_ATTRS512 2727 _mm512_mask_fnmsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) { 2728 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B, 2729 -(__v32hf)__C, (__mmask32)__U, 2730 _MM_FROUND_CUR_DIRECTION); 2731 } 2732 2733 static __inline__ __m512h __DEFAULT_FN_ATTRS512 2734 _mm512_mask3_fnmsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) { 2735 return (__m512h)__builtin_ia32_vfmsubph512_mask3(-(__v32hf)__A, (__v32hf)__B, 2736 (__v32hf)__C, (__mmask32)__U, 2737 _MM_FROUND_CUR_DIRECTION); 2738 } 2739 2740 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_sh(__m128h __W, 2741 __m128h __A, 2742 __m128h __B) { 2743 return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A, (__v8hf)__B, 2744 (__mmask8)-1, _MM_FROUND_CUR_DIRECTION); 2745 } 2746 2747 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmadd_sh(__m128h __W, 2748 __mmask8 __U, 2749 __m128h __A, 2750 __m128h __B) { 2751 return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A, (__v8hf)__B, 2752 (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); 2753 } 2754 2755 #define _mm_fmadd_round_sh(A, B, C, R) \ 2756 ((__m128h)__builtin_ia32_vfmaddsh3_mask( \ 2757 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C), \ 2758 (__mmask8)-1, (int)(R))) 2759 2760 #define _mm_mask_fmadd_round_sh(W, U, A, B, R) \ 2761 ((__m128h)__builtin_ia32_vfmaddsh3_mask( \ 2762 (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), \ 2763 (__mmask8)(U), (int)(R))) 2764 2765 static __inline__ __m128h __DEFAULT_FN_ATTRS128 2766 _mm_maskz_fmadd_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { 2767 return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, (__v8hf)__B, (__v8hf)__C, 2768 (__mmask8)__U, 2769 _MM_FROUND_CUR_DIRECTION); 2770 } 2771 2772 #define _mm_maskz_fmadd_round_sh(U, A, B, C, R) \ 2773 ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \ 2774 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C), \ 2775 (__mmask8)(U), (int)(R))) 2776 2777 static __inline__ __m128h __DEFAULT_FN_ATTRS128 2778 _mm_mask3_fmadd_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) { 2779 return __builtin_ia32_vfmaddsh3_mask3((__v8hf)__W, (__v8hf)__X, (__v8hf)__Y, 2780 (__mmask8)__U, 2781 _MM_FROUND_CUR_DIRECTION); 2782 } 2783 2784 #define _mm_mask3_fmadd_round_sh(W, X, Y, U, R) \ 2785 ((__m128h)__builtin_ia32_vfmaddsh3_mask3( \ 2786 (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), \ 2787 (__mmask8)(U), (int)(R))) 2788 2789 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmsub_sh(__m128h __W, 2790 __m128h __A, 2791 __m128h __B) { 2792 return (__m128h)__builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A, 2793 -(__v8hf)__B, (__mmask8)-1, 2794 _MM_FROUND_CUR_DIRECTION); 2795 } 2796 2797 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmsub_sh(__m128h __W, 2798 __mmask8 __U, 2799 __m128h __A, 2800 __m128h __B) { 2801 return (__m128h)__builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A, 2802 -(__v8hf)__B, (__mmask8)__U, 2803 _MM_FROUND_CUR_DIRECTION); 2804 } 2805 2806 #define _mm_fmsub_round_sh(A, B, C, R) \ 2807 ((__m128h)__builtin_ia32_vfmaddsh3_mask( \ 2808 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C), \ 2809 (__mmask8)-1, (int)(R))) 2810 2811 #define _mm_mask_fmsub_round_sh(W, U, A, B, R) \ 2812 ((__m128h)__builtin_ia32_vfmaddsh3_mask( \ 2813 (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), \ 2814 (__mmask8)(U), (int)(R))) 2815 2816 static __inline__ __m128h __DEFAULT_FN_ATTRS128 2817 _mm_maskz_fmsub_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { 2818 return (__m128h)__builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, (__v8hf)__B, 2819 -(__v8hf)__C, (__mmask8)__U, 2820 _MM_FROUND_CUR_DIRECTION); 2821 } 2822 2823 #define _mm_maskz_fmsub_round_sh(U, A, B, C, R) \ 2824 ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \ 2825 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C), \ 2826 (__mmask8)(U), (int)R)) 2827 2828 static __inline__ __m128h __DEFAULT_FN_ATTRS128 2829 _mm_mask3_fmsub_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) { 2830 return __builtin_ia32_vfmsubsh3_mask3((__v8hf)__W, (__v8hf)__X, (__v8hf)__Y, 2831 (__mmask8)__U, 2832 _MM_FROUND_CUR_DIRECTION); 2833 } 2834 2835 #define _mm_mask3_fmsub_round_sh(W, X, Y, U, R) \ 2836 ((__m128h)__builtin_ia32_vfmsubsh3_mask3( \ 2837 (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), \ 2838 (__mmask8)(U), (int)(R))) 2839 2840 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmadd_sh(__m128h __W, 2841 __m128h __A, 2842 __m128h __B) { 2843 return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, (__v8hf)__B, 2844 (__mmask8)-1, _MM_FROUND_CUR_DIRECTION); 2845 } 2846 2847 static __inline__ __m128h __DEFAULT_FN_ATTRS128 2848 _mm_mask_fnmadd_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { 2849 return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, (__v8hf)__B, 2850 (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); 2851 } 2852 2853 #define _mm_fnmadd_round_sh(A, B, C, R) \ 2854 ((__m128h)__builtin_ia32_vfmaddsh3_mask( \ 2855 (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C), \ 2856 (__mmask8)-1, (int)(R))) 2857 2858 #define _mm_mask_fnmadd_round_sh(W, U, A, B, R) \ 2859 ((__m128h)__builtin_ia32_vfmaddsh3_mask( \ 2860 (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), \ 2861 (__mmask8)(U), (int)(R))) 2862 2863 static __inline__ __m128h __DEFAULT_FN_ATTRS128 2864 _mm_maskz_fnmadd_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { 2865 return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, -(__v8hf)__B, (__v8hf)__C, 2866 (__mmask8)__U, 2867 _MM_FROUND_CUR_DIRECTION); 2868 } 2869 2870 #define _mm_maskz_fnmadd_round_sh(U, A, B, C, R) \ 2871 ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \ 2872 (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C), \ 2873 (__mmask8)(U), (int)(R))) 2874 2875 static __inline__ __m128h __DEFAULT_FN_ATTRS128 2876 _mm_mask3_fnmadd_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) { 2877 return __builtin_ia32_vfmaddsh3_mask3((__v8hf)__W, -(__v8hf)__X, (__v8hf)__Y, 2878 (__mmask8)__U, 2879 _MM_FROUND_CUR_DIRECTION); 2880 } 2881 2882 #define _mm_mask3_fnmadd_round_sh(W, X, Y, U, R) \ 2883 ((__m128h)__builtin_ia32_vfmaddsh3_mask3( \ 2884 (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), \ 2885 (__mmask8)(U), (int)(R))) 2886 2887 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmsub_sh(__m128h __W, 2888 __m128h __A, 2889 __m128h __B) { 2890 return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, -(__v8hf)__B, 2891 (__mmask8)-1, _MM_FROUND_CUR_DIRECTION); 2892 } 2893 2894 static __inline__ __m128h __DEFAULT_FN_ATTRS128 2895 _mm_mask_fnmsub_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { 2896 return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, -(__v8hf)__B, 2897 (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); 2898 } 2899 2900 #define _mm_fnmsub_round_sh(A, B, C, R) \ 2901 ((__m128h)__builtin_ia32_vfmaddsh3_mask( \ 2902 (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C), \ 2903 (__mmask8)-1, (int)(R))) 2904 2905 #define _mm_mask_fnmsub_round_sh(W, U, A, B, R) \ 2906 ((__m128h)__builtin_ia32_vfmaddsh3_mask( \ 2907 (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), \ 2908 (__mmask8)(U), (int)(R))) 2909 2910 static __inline__ __m128h __DEFAULT_FN_ATTRS128 2911 _mm_maskz_fnmsub_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { 2912 return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, -(__v8hf)__B, -(__v8hf)__C, 2913 (__mmask8)__U, 2914 _MM_FROUND_CUR_DIRECTION); 2915 } 2916 2917 #define _mm_maskz_fnmsub_round_sh(U, A, B, C, R) \ 2918 ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \ 2919 (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C), \ 2920 (__mmask8)(U), (int)(R))) 2921 2922 static __inline__ __m128h __DEFAULT_FN_ATTRS128 2923 _mm_mask3_fnmsub_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) { 2924 return __builtin_ia32_vfmsubsh3_mask3((__v8hf)__W, -(__v8hf)__X, (__v8hf)__Y, 2925 (__mmask8)__U, 2926 _MM_FROUND_CUR_DIRECTION); 2927 } 2928 2929 #define _mm_mask3_fnmsub_round_sh(W, X, Y, U, R) \ 2930 ((__m128h)__builtin_ia32_vfmsubsh3_mask3( \ 2931 (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), \ 2932 (__mmask8)(U), (int)(R))) 2933 2934 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmadd_sch(__m128h __A, 2935 __m128h __B, 2936 __m128h __C) { 2937 return (__m128h)__builtin_ia32_vfcmaddcsh_mask((__v4sf)__A, (__v4sf)__B, 2938 (__v4sf)__C, (__mmask8)-1, 2939 _MM_FROUND_CUR_DIRECTION); 2940 } 2941 2942 static __inline__ __m128h __DEFAULT_FN_ATTRS128 2943 _mm_mask_fcmadd_sch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) { 2944 return (__m128h)__builtin_ia32_vfcmaddcsh_round_mask( 2945 (__v4sf)__A, (__v4sf)(__B), (__v4sf)(__C), __U, _MM_FROUND_CUR_DIRECTION); 2946 } 2947 2948 static __inline__ __m128h __DEFAULT_FN_ATTRS128 2949 _mm_maskz_fcmadd_sch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { 2950 return (__m128h)__builtin_ia32_vfcmaddcsh_maskz((__v4sf)__A, (__v4sf)__B, 2951 (__v4sf)__C, (__mmask8)__U, 2952 _MM_FROUND_CUR_DIRECTION); 2953 } 2954 2955 static __inline__ __m128h __DEFAULT_FN_ATTRS128 2956 _mm_mask3_fcmadd_sch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) { 2957 return (__m128h)__builtin_ia32_vfcmaddcsh_round_mask3( 2958 (__v4sf)__A, (__v4sf)__B, (__v4sf)__C, __U, _MM_FROUND_CUR_DIRECTION); 2959 } 2960 2961 #define _mm_fcmadd_round_sch(A, B, C, R) \ 2962 ((__m128h)__builtin_ia32_vfcmaddcsh_mask( \ 2963 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \ 2964 (__mmask8)-1, (int)(R))) 2965 2966 #define _mm_mask_fcmadd_round_sch(A, U, B, C, R) \ 2967 ((__m128h)__builtin_ia32_vfcmaddcsh_round_mask( \ 2968 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \ 2969 (__mmask8)(U), (int)(R))) 2970 2971 #define _mm_maskz_fcmadd_round_sch(U, A, B, C, R) \ 2972 ((__m128h)__builtin_ia32_vfcmaddcsh_maskz( \ 2973 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \ 2974 (__mmask8)(U), (int)(R))) 2975 2976 #define _mm_mask3_fcmadd_round_sch(A, B, C, U, R) \ 2977 ((__m128h)__builtin_ia32_vfcmaddcsh_round_mask3( \ 2978 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \ 2979 (__mmask8)(U), (int)(R))) 2980 2981 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_sch(__m128h __A, 2982 __m128h __B, 2983 __m128h __C) { 2984 return (__m128h)__builtin_ia32_vfmaddcsh_mask((__v4sf)__A, (__v4sf)__B, 2985 (__v4sf)__C, (__mmask8)-1, 2986 _MM_FROUND_CUR_DIRECTION); 2987 } 2988 2989 static __inline__ __m128h __DEFAULT_FN_ATTRS128 2990 _mm_mask_fmadd_sch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) { 2991 return (__m128h)__builtin_ia32_vfmaddcsh_round_mask( 2992 (__v4sf)__A, (__v4sf)(__B), (__v4sf)(__C), __U, _MM_FROUND_CUR_DIRECTION); 2993 } 2994 2995 static __inline__ __m128h __DEFAULT_FN_ATTRS128 2996 _mm_maskz_fmadd_sch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { 2997 return (__m128h)__builtin_ia32_vfmaddcsh_maskz((__v4sf)__A, (__v4sf)__B, 2998 (__v4sf)__C, (__mmask8)__U, 2999 _MM_FROUND_CUR_DIRECTION); 3000 } 3001 3002 static __inline__ __m128h __DEFAULT_FN_ATTRS128 3003 _mm_mask3_fmadd_sch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) { 3004 return (__m128h)__builtin_ia32_vfmaddcsh_round_mask3( 3005 (__v4sf)__A, (__v4sf)__B, (__v4sf)__C, __U, _MM_FROUND_CUR_DIRECTION); 3006 } 3007 3008 #define _mm_fmadd_round_sch(A, B, C, R) \ 3009 ((__m128h)__builtin_ia32_vfmaddcsh_mask( \ 3010 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \ 3011 (__mmask8)-1, (int)(R))) 3012 3013 #define _mm_mask_fmadd_round_sch(A, U, B, C, R) \ 3014 ((__m128h)__builtin_ia32_vfmaddcsh_round_mask( \ 3015 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \ 3016 (__mmask8)(U), (int)(R))) 3017 3018 #define _mm_maskz_fmadd_round_sch(U, A, B, C, R) \ 3019 ((__m128h)__builtin_ia32_vfmaddcsh_maskz( \ 3020 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \ 3021 (__mmask8)(U), (int)(R))) 3022 3023 #define _mm_mask3_fmadd_round_sch(A, B, C, U, R) \ 3024 ((__m128h)__builtin_ia32_vfmaddcsh_round_mask3( \ 3025 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \ 3026 (__mmask8)(U), (int)(R))) 3027 3028 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmul_sch(__m128h __A, 3029 __m128h __B) { 3030 return (__m128h)__builtin_ia32_vfcmulcsh_mask( 3031 (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1, 3032 _MM_FROUND_CUR_DIRECTION); 3033 } 3034 3035 static __inline__ __m128h __DEFAULT_FN_ATTRS128 3036 _mm_mask_fcmul_sch(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { 3037 return (__m128h)__builtin_ia32_vfcmulcsh_mask((__v4sf)__A, (__v4sf)__B, 3038 (__v4sf)__W, (__mmask8)__U, 3039 _MM_FROUND_CUR_DIRECTION); 3040 } 3041 3042 static __inline__ __m128h __DEFAULT_FN_ATTRS128 3043 _mm_maskz_fcmul_sch(__mmask8 __U, __m128h __A, __m128h __B) { 3044 return (__m128h)__builtin_ia32_vfcmulcsh_mask( 3045 (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U, 3046 _MM_FROUND_CUR_DIRECTION); 3047 } 3048 3049 #define _mm_fcmul_round_sch(A, B, R) \ 3050 ((__m128h)__builtin_ia32_vfcmulcsh_mask( \ 3051 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \ 3052 (__v4sf)(__m128h)_mm_undefined_ph(), (__mmask8)-1, (int)(R))) 3053 3054 #define _mm_mask_fcmul_round_sch(W, U, A, B, R) \ 3055 ((__m128h)__builtin_ia32_vfcmulcsh_mask( \ 3056 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(W), \ 3057 (__mmask8)(U), (int)(R))) 3058 3059 #define _mm_maskz_fcmul_round_sch(U, A, B, R) \ 3060 ((__m128h)__builtin_ia32_vfcmulcsh_mask( \ 3061 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \ 3062 (__v4sf)(__m128h)_mm_setzero_ph(), (__mmask8)(U), (int)(R))) 3063 3064 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmul_sch(__m128h __A, 3065 __m128h __B) { 3066 return (__m128h)__builtin_ia32_vfmulcsh_mask( 3067 (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1, 3068 _MM_FROUND_CUR_DIRECTION); 3069 } 3070 3071 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmul_sch(__m128h __W, 3072 __mmask8 __U, 3073 __m128h __A, 3074 __m128h __B) { 3075 return (__m128h)__builtin_ia32_vfmulcsh_mask((__v4sf)__A, (__v4sf)__B, 3076 (__v4sf)__W, (__mmask8)__U, 3077 _MM_FROUND_CUR_DIRECTION); 3078 } 3079 3080 static __inline__ __m128h __DEFAULT_FN_ATTRS128 3081 _mm_maskz_fmul_sch(__mmask8 __U, __m128h __A, __m128h __B) { 3082 return (__m128h)__builtin_ia32_vfmulcsh_mask( 3083 (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U, 3084 _MM_FROUND_CUR_DIRECTION); 3085 } 3086 3087 #define _mm_fmul_round_sch(A, B, R) \ 3088 ((__m128h)__builtin_ia32_vfmulcsh_mask( \ 3089 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \ 3090 (__v4sf)(__m128h)_mm_undefined_ph(), (__mmask8)-1, (int)(R))) 3091 3092 #define _mm_mask_fmul_round_sch(W, U, A, B, R) \ 3093 ((__m128h)__builtin_ia32_vfmulcsh_mask( \ 3094 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(W), \ 3095 (__mmask8)(U), (int)(R))) 3096 3097 #define _mm_maskz_fmul_round_sch(U, A, B, R) \ 3098 ((__m128h)__builtin_ia32_vfmulcsh_mask( \ 3099 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \ 3100 (__v4sf)(__m128h)_mm_setzero_ph(), (__mmask8)(U), (int)(R))) 3101 3102 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fcmul_pch(__m512h __A, 3103 __m512h __B) { 3104 return (__m512h)__builtin_ia32_vfcmulcph512_mask( 3105 (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_undefined_ph(), (__mmask16)-1, 3106 _MM_FROUND_CUR_DIRECTION); 3107 } 3108 3109 static __inline__ __m512h __DEFAULT_FN_ATTRS512 3110 _mm512_mask_fcmul_pch(__m512h __W, __mmask16 __U, __m512h __A, __m512h __B) { 3111 return (__m512h)__builtin_ia32_vfcmulcph512_mask((__v16sf)__A, (__v16sf)__B, 3112 (__v16sf)__W, (__mmask16)__U, 3113 _MM_FROUND_CUR_DIRECTION); 3114 } 3115 3116 static __inline__ __m512h __DEFAULT_FN_ATTRS512 3117 _mm512_maskz_fcmul_pch(__mmask16 __U, __m512h __A, __m512h __B) { 3118 return (__m512h)__builtin_ia32_vfcmulcph512_mask( 3119 (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ph(), (__mmask16)__U, 3120 _MM_FROUND_CUR_DIRECTION); 3121 } 3122 3123 #define _mm512_fcmul_round_pch(A, B, R) \ 3124 ((__m512h)__builtin_ia32_vfcmulcph512_mask( \ 3125 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \ 3126 (__v16sf)(__m512h)_mm512_undefined_ph(), (__mmask16)-1, (int)(R))) 3127 3128 #define _mm512_mask_fcmul_round_pch(W, U, A, B, R) \ 3129 ((__m512h)__builtin_ia32_vfcmulcph512_mask( \ 3130 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(W), \ 3131 (__mmask16)(U), (int)(R))) 3132 3133 #define _mm512_maskz_fcmul_round_pch(U, A, B, R) \ 3134 ((__m512h)__builtin_ia32_vfcmulcph512_mask( \ 3135 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \ 3136 (__v16sf)(__m512h)_mm512_setzero_ph(), (__mmask16)(U), (int)(R))) 3137 3138 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmul_pch(__m512h __A, 3139 __m512h __B) { 3140 return (__m512h)__builtin_ia32_vfmulcph512_mask( 3141 (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_undefined_ph(), (__mmask16)-1, 3142 _MM_FROUND_CUR_DIRECTION); 3143 } 3144 3145 static __inline__ __m512h __DEFAULT_FN_ATTRS512 3146 _mm512_mask_fmul_pch(__m512h __W, __mmask16 __U, __m512h __A, __m512h __B) { 3147 return (__m512h)__builtin_ia32_vfmulcph512_mask((__v16sf)__A, (__v16sf)__B, 3148 (__v16sf)__W, (__mmask16)__U, 3149 _MM_FROUND_CUR_DIRECTION); 3150 } 3151 3152 static __inline__ __m512h __DEFAULT_FN_ATTRS512 3153 _mm512_maskz_fmul_pch(__mmask16 __U, __m512h __A, __m512h __B) { 3154 return (__m512h)__builtin_ia32_vfmulcph512_mask( 3155 (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ph(), (__mmask16)__U, 3156 _MM_FROUND_CUR_DIRECTION); 3157 } 3158 3159 #define _mm512_fmul_round_pch(A, B, R) \ 3160 ((__m512h)__builtin_ia32_vfmulcph512_mask( \ 3161 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \ 3162 (__v16sf)(__m512h)_mm512_undefined_ph(), (__mmask16)-1, (int)(R))) 3163 3164 #define _mm512_mask_fmul_round_pch(W, U, A, B, R) \ 3165 ((__m512h)__builtin_ia32_vfmulcph512_mask( \ 3166 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(W), \ 3167 (__mmask16)(U), (int)(R))) 3168 3169 #define _mm512_maskz_fmul_round_pch(U, A, B, R) \ 3170 ((__m512h)__builtin_ia32_vfmulcph512_mask( \ 3171 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \ 3172 (__v16sf)(__m512h)_mm512_setzero_ph(), (__mmask16)(U), (int)(R))) 3173 3174 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fcmadd_pch(__m512h __A, 3175 __m512h __B, 3176 __m512h __C) { 3177 return (__m512h)__builtin_ia32_vfcmaddcph512_mask3( 3178 (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)-1, 3179 _MM_FROUND_CUR_DIRECTION); 3180 } 3181 3182 static __inline__ __m512h __DEFAULT_FN_ATTRS512 3183 _mm512_mask_fcmadd_pch(__m512h __A, __mmask16 __U, __m512h __B, __m512h __C) { 3184 return (__m512h)__builtin_ia32_vfcmaddcph512_mask( 3185 (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, 3186 _MM_FROUND_CUR_DIRECTION); 3187 } 3188 3189 static __inline__ __m512h __DEFAULT_FN_ATTRS512 3190 _mm512_mask3_fcmadd_pch(__m512h __A, __m512h __B, __m512h __C, __mmask16 __U) { 3191 return (__m512h)__builtin_ia32_vfcmaddcph512_mask3( 3192 (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, 3193 _MM_FROUND_CUR_DIRECTION); 3194 } 3195 3196 static __inline__ __m512h __DEFAULT_FN_ATTRS512 3197 _mm512_maskz_fcmadd_pch(__mmask16 __U, __m512h __A, __m512h __B, __m512h __C) { 3198 return (__m512h)__builtin_ia32_vfcmaddcph512_maskz( 3199 (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, 3200 _MM_FROUND_CUR_DIRECTION); 3201 } 3202 3203 #define _mm512_fcmadd_round_pch(A, B, C, R) \ 3204 ((__m512h)__builtin_ia32_vfcmaddcph512_mask3( \ 3205 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \ 3206 (__mmask16)-1, (int)(R))) 3207 3208 #define _mm512_mask_fcmadd_round_pch(A, U, B, C, R) \ 3209 ((__m512h)__builtin_ia32_vfcmaddcph512_mask( \ 3210 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \ 3211 (__mmask16)(U), (int)(R))) 3212 3213 #define _mm512_mask3_fcmadd_round_pch(A, B, C, U, R) \ 3214 ((__m512h)__builtin_ia32_vfcmaddcph512_mask3( \ 3215 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \ 3216 (__mmask16)(U), (int)(R))) 3217 3218 #define _mm512_maskz_fcmadd_round_pch(U, A, B, C, R) \ 3219 ((__m512h)__builtin_ia32_vfcmaddcph512_maskz( \ 3220 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \ 3221 (__mmask16)(U), (int)(R))) 3222 3223 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmadd_pch(__m512h __A, 3224 __m512h __B, 3225 __m512h __C) { 3226 return (__m512h)__builtin_ia32_vfmaddcph512_mask3((__v16sf)__A, (__v16sf)__B, 3227 (__v16sf)__C, (__mmask16)-1, 3228 _MM_FROUND_CUR_DIRECTION); 3229 } 3230 3231 static __inline__ __m512h __DEFAULT_FN_ATTRS512 3232 _mm512_mask_fmadd_pch(__m512h __A, __mmask16 __U, __m512h __B, __m512h __C) { 3233 return (__m512h)__builtin_ia32_vfmaddcph512_mask((__v16sf)__A, (__v16sf)__B, 3234 (__v16sf)__C, (__mmask16)__U, 3235 _MM_FROUND_CUR_DIRECTION); 3236 } 3237 3238 static __inline__ __m512h __DEFAULT_FN_ATTRS512 3239 _mm512_mask3_fmadd_pch(__m512h __A, __m512h __B, __m512h __C, __mmask16 __U) { 3240 return (__m512h)__builtin_ia32_vfmaddcph512_mask3( 3241 (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, 3242 _MM_FROUND_CUR_DIRECTION); 3243 } 3244 3245 static __inline__ __m512h __DEFAULT_FN_ATTRS512 3246 _mm512_maskz_fmadd_pch(__mmask16 __U, __m512h __A, __m512h __B, __m512h __C) { 3247 return (__m512h)__builtin_ia32_vfmaddcph512_maskz( 3248 (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, 3249 _MM_FROUND_CUR_DIRECTION); 3250 } 3251 3252 #define _mm512_fmadd_round_pch(A, B, C, R) \ 3253 ((__m512h)__builtin_ia32_vfmaddcph512_mask3( \ 3254 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \ 3255 (__mmask16)-1, (int)(R))) 3256 3257 #define _mm512_mask_fmadd_round_pch(A, U, B, C, R) \ 3258 ((__m512h)__builtin_ia32_vfmaddcph512_mask( \ 3259 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \ 3260 (__mmask16)(U), (int)(R))) 3261 3262 #define _mm512_mask3_fmadd_round_pch(A, B, C, U, R) \ 3263 ((__m512h)__builtin_ia32_vfmaddcph512_mask3( \ 3264 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \ 3265 (__mmask16)(U), (int)(R))) 3266 3267 #define _mm512_maskz_fmadd_round_pch(U, A, B, C, R) \ 3268 ((__m512h)__builtin_ia32_vfmaddcph512_maskz( \ 3269 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \ 3270 (__mmask16)(U), (int)(R))) 3271 3272 static __inline__ _Float16 __DEFAULT_FN_ATTRS512 3273 _mm512_reduce_add_ph(__m512h __W) { 3274 return __builtin_ia32_reduce_fadd_ph512(-0.0f16, __W); 3275 } 3276 3277 static __inline__ _Float16 __DEFAULT_FN_ATTRS512 3278 _mm512_reduce_mul_ph(__m512h __W) { 3279 return __builtin_ia32_reduce_fmul_ph512(1.0f16, __W); 3280 } 3281 3282 static __inline__ _Float16 __DEFAULT_FN_ATTRS512 3283 _mm512_reduce_max_ph(__m512h __V) { 3284 return __builtin_ia32_reduce_fmax_ph512(__V); 3285 } 3286 3287 static __inline__ _Float16 __DEFAULT_FN_ATTRS512 3288 _mm512_reduce_min_ph(__m512h __V) { 3289 return __builtin_ia32_reduce_fmin_ph512(__V); 3290 } 3291 3292 static __inline__ __m512h __DEFAULT_FN_ATTRS512 3293 _mm512_mask_blend_ph(__mmask32 __U, __m512h __A, __m512h __W) { 3294 return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U, (__v32hf)__W, 3295 (__v32hf)__A); 3296 } 3297 3298 static __inline__ __m512h __DEFAULT_FN_ATTRS512 3299 _mm512_permutex2var_ph(__m512h __A, __m512i __I, __m512h __B) { 3300 return (__m512h)__builtin_ia32_vpermi2varhi512((__v32hi)__A, (__v32hi)__I, 3301 (__v32hi)__B); 3302 } 3303 3304 static __inline__ __m512h __DEFAULT_FN_ATTRS512 3305 _mm512_permutexvar_ph(__m512i __A, __m512h __B) { 3306 return (__m512h)__builtin_ia32_permvarhi512((__v32hi)__B, (__v32hi)__A); 3307 } 3308 3309 // intrinsics below are alias for f*mul_*ch 3310 #define _mm512_mul_pch(A, B) _mm512_fmul_pch(A, B) 3311 #define _mm512_mask_mul_pch(W, U, A, B) _mm512_mask_fmul_pch(W, U, A, B) 3312 #define _mm512_maskz_mul_pch(U, A, B) _mm512_maskz_fmul_pch(U, A, B) 3313 #define _mm512_mul_round_pch(A, B, R) _mm512_fmul_round_pch(A, B, R) 3314 #define _mm512_mask_mul_round_pch(W, U, A, B, R) \ 3315 _mm512_mask_fmul_round_pch(W, U, A, B, R) 3316 #define _mm512_maskz_mul_round_pch(U, A, B, R) \ 3317 _mm512_maskz_fmul_round_pch(U, A, B, R) 3318 3319 #define _mm512_cmul_pch(A, B) _mm512_fcmul_pch(A, B) 3320 #define _mm512_mask_cmul_pch(W, U, A, B) _mm512_mask_fcmul_pch(W, U, A, B) 3321 #define _mm512_maskz_cmul_pch(U, A, B) _mm512_maskz_fcmul_pch(U, A, B) 3322 #define _mm512_cmul_round_pch(A, B, R) _mm512_fcmul_round_pch(A, B, R) 3323 #define _mm512_mask_cmul_round_pch(W, U, A, B, R) \ 3324 _mm512_mask_fcmul_round_pch(W, U, A, B, R) 3325 #define _mm512_maskz_cmul_round_pch(U, A, B, R) \ 3326 _mm512_maskz_fcmul_round_pch(U, A, B, R) 3327 3328 #define _mm_mul_sch(A, B) _mm_fmul_sch(A, B) 3329 #define _mm_mask_mul_sch(W, U, A, B) _mm_mask_fmul_sch(W, U, A, B) 3330 #define _mm_maskz_mul_sch(U, A, B) _mm_maskz_fmul_sch(U, A, B) 3331 #define _mm_mul_round_sch(A, B, R) _mm_fmul_round_sch(A, B, R) 3332 #define _mm_mask_mul_round_sch(W, U, A, B, R) \ 3333 _mm_mask_fmul_round_sch(W, U, A, B, R) 3334 #define _mm_maskz_mul_round_sch(U, A, B, R) _mm_maskz_fmul_round_sch(U, A, B, R) 3335 3336 #define _mm_cmul_sch(A, B) _mm_fcmul_sch(A, B) 3337 #define _mm_mask_cmul_sch(W, U, A, B) _mm_mask_fcmul_sch(W, U, A, B) 3338 #define _mm_maskz_cmul_sch(U, A, B) _mm_maskz_fcmul_sch(U, A, B) 3339 #define _mm_cmul_round_sch(A, B, R) _mm_fcmul_round_sch(A, B, R) 3340 #define _mm_mask_cmul_round_sch(W, U, A, B, R) \ 3341 _mm_mask_fcmul_round_sch(W, U, A, B, R) 3342 #define _mm_maskz_cmul_round_sch(U, A, B, R) \ 3343 _mm_maskz_fcmul_round_sch(U, A, B, R) 3344 3345 #undef __DEFAULT_FN_ATTRS128 3346 #undef __DEFAULT_FN_ATTRS256 3347 #undef __DEFAULT_FN_ATTRS512 3348 3349 #endif 3350 #endif 3351