xref: /freebsd/contrib/llvm-project/clang/lib/Headers/avx512fp16intrin.h (revision 5ca8e32633c4ffbbcd6762e5888b6a4ba0708c6c)
1 /*===----------- avx512fp16intrin.h - AVX512-FP16 intrinsics ---------------===
2  *
3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4  * See https://llvm.org/LICENSE.txt for license information.
5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6  *
7  *===-----------------------------------------------------------------------===
8  */
9 #ifndef __IMMINTRIN_H
10 #error "Never use <avx512fp16intrin.h> directly; include <immintrin.h> instead."
11 #endif
12 
13 #ifdef __SSE2__
14 
15 #ifndef __AVX512FP16INTRIN_H
16 #define __AVX512FP16INTRIN_H
17 
18 /* Define the default attributes for the functions in this file. */
19 typedef _Float16 __v32hf __attribute__((__vector_size__(64), __aligned__(64)));
20 typedef _Float16 __m512h __attribute__((__vector_size__(64), __aligned__(64)));
21 typedef _Float16 __m512h_u __attribute__((__vector_size__(64), __aligned__(1)));
22 
/* Default attributes for the functions in this file. Each macro marks a
 * function always-inline, nodebug and targeting "avx512fp16"; the three
 * variants differ only in the __min_vector_width__ value (512, 256, 128). */
#define __DEFAULT_FN_ATTRS512                                                  \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("avx512fp16"), __min_vector_width__(512)))
#define __DEFAULT_FN_ATTRS256                                                  \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("avx512fp16"), __min_vector_width__(256)))
#define __DEFAULT_FN_ATTRS128                                                  \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("avx512fp16"), __min_vector_width__(128)))
33 
34 static __inline__ _Float16 __DEFAULT_FN_ATTRS512 _mm512_cvtsh_h(__m512h __a) {
35   return __a[0];
36 }
37 
38 static __inline __m128h __DEFAULT_FN_ATTRS128 _mm_setzero_ph(void) {
39   return (__m128h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
40 }
41 
42 static __inline __m256h __DEFAULT_FN_ATTRS256 _mm256_setzero_ph(void) {
43   return (__m256h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
44                    0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
45 }
46 
47 static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_undefined_ph(void) {
48   return (__m256h)__builtin_ia32_undef256();
49 }
50 
51 static __inline __m512h __DEFAULT_FN_ATTRS512 _mm512_setzero_ph(void) {
52   return (__m512h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
53                    0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
54                    0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
55 }
56 
57 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_undefined_ph(void) {
58   return (__m128h)__builtin_ia32_undef128();
59 }
60 
61 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_undefined_ph(void) {
62   return (__m512h)__builtin_ia32_undef512();
63 }
64 
65 static __inline __m512h __DEFAULT_FN_ATTRS512 _mm512_set1_ph(_Float16 __h) {
66   return (__m512h)(__v32hf){__h, __h, __h, __h, __h, __h, __h, __h,
67                             __h, __h, __h, __h, __h, __h, __h, __h,
68                             __h, __h, __h, __h, __h, __h, __h, __h,
69                             __h, __h, __h, __h, __h, __h, __h, __h};
70 }
71 
72 static __inline __m512h __DEFAULT_FN_ATTRS512
73 _mm512_set_ph(_Float16 __h1, _Float16 __h2, _Float16 __h3, _Float16 __h4,
74               _Float16 __h5, _Float16 __h6, _Float16 __h7, _Float16 __h8,
75               _Float16 __h9, _Float16 __h10, _Float16 __h11, _Float16 __h12,
76               _Float16 __h13, _Float16 __h14, _Float16 __h15, _Float16 __h16,
77               _Float16 __h17, _Float16 __h18, _Float16 __h19, _Float16 __h20,
78               _Float16 __h21, _Float16 __h22, _Float16 __h23, _Float16 __h24,
79               _Float16 __h25, _Float16 __h26, _Float16 __h27, _Float16 __h28,
80               _Float16 __h29, _Float16 __h30, _Float16 __h31, _Float16 __h32) {
81   return (__m512h)(__v32hf){__h32, __h31, __h30, __h29, __h28, __h27, __h26,
82                             __h25, __h24, __h23, __h22, __h21, __h20, __h19,
83                             __h18, __h17, __h16, __h15, __h14, __h13, __h12,
84                             __h11, __h10, __h9,  __h8,  __h7,  __h6,  __h5,
85                             __h4,  __h3,  __h2,  __h1};
86 }
87 
/* Reverse-order counterpart of _mm512_set_ph: the 32 arguments are forwarded
 * to _mm512_set_ph back to front, so h1 lands in element 0 and h32 in
 * element 31. */
#define _mm512_setr_ph(h1, h2, h3, h4, h5, h6, h7, h8, h9, h10, h11, h12, h13, \
                       h14, h15, h16, h17, h18, h19, h20, h21, h22, h23, h24,  \
                       h25, h26, h27, h28, h29, h30, h31, h32)                 \
  _mm512_set_ph((h32), (h31), (h30), (h29), (h28), (h27), (h26), (h25),        \
                (h24), (h23), (h22), (h21), (h20), (h19), (h18), (h17),        \
                (h16), (h15), (h14), (h13), (h12), (h11), (h10), (h9),         \
                (h8), (h7), (h6), (h5), (h4), (h3), (h2), (h1))
95 
96 static __inline __m512h __DEFAULT_FN_ATTRS512
97 _mm512_set1_pch(_Float16 _Complex h) {
98   return (__m512h)_mm512_set1_ps(__builtin_bit_cast(float, h));
99 }
100 
101 static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_castph_ps(__m128h __a) {
102   return (__m128)__a;
103 }
104 
105 static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_castph_ps(__m256h __a) {
106   return (__m256)__a;
107 }
108 
109 static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_castph_ps(__m512h __a) {
110   return (__m512)__a;
111 }
112 
113 static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_castph_pd(__m128h __a) {
114   return (__m128d)__a;
115 }
116 
117 static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_castph_pd(__m256h __a) {
118   return (__m256d)__a;
119 }
120 
121 static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_castph_pd(__m512h __a) {
122   return (__m512d)__a;
123 }
124 
125 static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_castph_si128(__m128h __a) {
126   return (__m128i)__a;
127 }
128 
129 static __inline__ __m256i __DEFAULT_FN_ATTRS256
130 _mm256_castph_si256(__m256h __a) {
131   return (__m256i)__a;
132 }
133 
134 static __inline__ __m512i __DEFAULT_FN_ATTRS512
135 _mm512_castph_si512(__m512h __a) {
136   return (__m512i)__a;
137 }
138 
139 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_castps_ph(__m128 __a) {
140   return (__m128h)__a;
141 }
142 
143 static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_castps_ph(__m256 __a) {
144   return (__m256h)__a;
145 }
146 
147 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_castps_ph(__m512 __a) {
148   return (__m512h)__a;
149 }
150 
151 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_castpd_ph(__m128d __a) {
152   return (__m128h)__a;
153 }
154 
155 static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_castpd_ph(__m256d __a) {
156   return (__m256h)__a;
157 }
158 
159 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_castpd_ph(__m512d __a) {
160   return (__m512h)__a;
161 }
162 
163 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_castsi128_ph(__m128i __a) {
164   return (__m128h)__a;
165 }
166 
167 static __inline__ __m256h __DEFAULT_FN_ATTRS256
168 _mm256_castsi256_ph(__m256i __a) {
169   return (__m256h)__a;
170 }
171 
172 static __inline__ __m512h __DEFAULT_FN_ATTRS512
173 _mm512_castsi512_ph(__m512i __a) {
174   return (__m512h)__a;
175 }
176 
177 static __inline__ __m128h __DEFAULT_FN_ATTRS256
178 _mm256_castph256_ph128(__m256h __a) {
179   return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7);
180 }
181 
182 static __inline__ __m128h __DEFAULT_FN_ATTRS512
183 _mm512_castph512_ph128(__m512h __a) {
184   return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7);
185 }
186 
187 static __inline__ __m256h __DEFAULT_FN_ATTRS512
188 _mm512_castph512_ph256(__m512h __a) {
189   return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
190                                  12, 13, 14, 15);
191 }
192 
193 static __inline__ __m256h __DEFAULT_FN_ATTRS256
194 _mm256_castph128_ph256(__m128h __a) {
195   return __builtin_shufflevector(__a, __builtin_nondeterministic_value(__a),
196                                   0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
197 }
198 
199 static __inline__ __m512h __DEFAULT_FN_ATTRS512
200 _mm512_castph128_ph512(__m128h __a) {
201   __m256h __b = __builtin_nondeterministic_value(__b);
202   return __builtin_shufflevector(
203       __builtin_shufflevector(__a, __builtin_nondeterministic_value(__a),
204                               0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15),
205       __b, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
206       20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
207 }
208 
209 static __inline__ __m512h __DEFAULT_FN_ATTRS512
210 _mm512_castph256_ph512(__m256h __a) {
211   return __builtin_shufflevector(__a, __builtin_nondeterministic_value(__a), 0,
212                                  1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
213                                  15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
214                                  27, 28, 29, 30, 31);
215 }
216 
217 /// Constructs a 256-bit floating-point vector of [16 x half] from a
218 ///    128-bit floating-point vector of [8 x half]. The lower 128 bits
219 ///    contain the value of the source vector. The upper 384 bits are set
220 ///    to zero.
221 ///
222 /// \headerfile <x86intrin.h>
223 ///
224 /// This intrinsic has no corresponding instruction.
225 ///
226 /// \param __a
227 ///    A 128-bit vector of [8 x half].
228 /// \returns A 512-bit floating-point vector of [16 x half]. The lower 128 bits
229 ///    contain the value of the parameter. The upper 384 bits are set to zero.
230 static __inline__ __m256h __DEFAULT_FN_ATTRS256
231 _mm256_zextph128_ph256(__m128h __a) {
232   return __builtin_shufflevector(__a, (__v8hf)_mm_setzero_ph(), 0, 1, 2, 3, 4,
233                                  5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
234 }
235 
236 /// Constructs a 512-bit floating-point vector of [32 x half] from a
237 ///    128-bit floating-point vector of [8 x half]. The lower 128 bits
238 ///    contain the value of the source vector. The upper 384 bits are set
239 ///    to zero.
240 ///
241 /// \headerfile <x86intrin.h>
242 ///
243 /// This intrinsic has no corresponding instruction.
244 ///
245 /// \param __a
246 ///    A 128-bit vector of [8 x half].
247 /// \returns A 512-bit floating-point vector of [32 x half]. The lower 128 bits
248 ///    contain the value of the parameter. The upper 384 bits are set to zero.
249 static __inline__ __m512h __DEFAULT_FN_ATTRS512
250 _mm512_zextph128_ph512(__m128h __a) {
251   return __builtin_shufflevector(
252       __a, (__v8hf)_mm_setzero_ph(), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
253       13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15);
254 }
255 
256 /// Constructs a 512-bit floating-point vector of [32 x half] from a
257 ///    256-bit floating-point vector of [16 x half]. The lower 256 bits
258 ///    contain the value of the source vector. The upper 256 bits are set
259 ///    to zero.
260 ///
261 /// \headerfile <x86intrin.h>
262 ///
263 /// This intrinsic has no corresponding instruction.
264 ///
265 /// \param __a
266 ///    A 256-bit vector of [16 x half].
267 /// \returns A 512-bit floating-point vector of [32 x half]. The lower 256 bits
268 ///    contain the value of the parameter. The upper 256 bits are set to zero.
269 static __inline__ __m512h __DEFAULT_FN_ATTRS512
270 _mm512_zextph256_ph512(__m256h __a) {
271   return __builtin_shufflevector(__a, (__v16hf)_mm256_setzero_ph(), 0, 1, 2, 3,
272                                  4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
273                                  17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
274                                  29, 30, 31);
275 }
276 
/* Scalar half-precision compare: forwards A and B to __builtin_ia32_vcomish
 * with predicate P and rounding/exception argument R, yielding the builtin's
 * int result.
 *
 * A and B are parenthesized and converted through __m128h before the __v8hf
 * cast so that an expression argument is cast as a whole rather than only its
 * first token; the expansion itself is parenthesized so it composes safely
 * inside larger expressions. */
#define _mm_comi_round_sh(A, B, P, R)                                          \
  (__builtin_ia32_vcomish((__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B),          \
                          (int)(P), (int)(R)))
279 
/* _mm_comi_round_sh with the rounding argument fixed to
 * _MM_FROUND_CUR_DIRECTION; P is the comparison predicate. */
#define _mm_comi_sh(A, B, P)                                                   \
  _mm_comi_round_sh((A), (B), (P), _MM_FROUND_CUR_DIRECTION)
282 
283 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comieq_sh(__m128h A,
284                                                           __m128h B) {
285   return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_EQ_OS,
286                                 _MM_FROUND_CUR_DIRECTION);
287 }
288 
289 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comilt_sh(__m128h A,
290                                                           __m128h B) {
291   return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_LT_OS,
292                                 _MM_FROUND_CUR_DIRECTION);
293 }
294 
295 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comile_sh(__m128h A,
296                                                           __m128h B) {
297   return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_LE_OS,
298                                 _MM_FROUND_CUR_DIRECTION);
299 }
300 
301 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comigt_sh(__m128h A,
302                                                           __m128h B) {
303   return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_GT_OS,
304                                 _MM_FROUND_CUR_DIRECTION);
305 }
306 
307 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comige_sh(__m128h A,
308                                                           __m128h B) {
309   return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_GE_OS,
310                                 _MM_FROUND_CUR_DIRECTION);
311 }
312 
313 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comineq_sh(__m128h A,
314                                                            __m128h B) {
315   return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_NEQ_US,
316                                 _MM_FROUND_CUR_DIRECTION);
317 }
318 
319 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomieq_sh(__m128h A,
320                                                            __m128h B) {
321   return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_EQ_OQ,
322                                 _MM_FROUND_CUR_DIRECTION);
323 }
324 
325 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomilt_sh(__m128h A,
326                                                            __m128h B) {
327   return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_LT_OQ,
328                                 _MM_FROUND_CUR_DIRECTION);
329 }
330 
331 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomile_sh(__m128h A,
332                                                            __m128h B) {
333   return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_LE_OQ,
334                                 _MM_FROUND_CUR_DIRECTION);
335 }
336 
337 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomigt_sh(__m128h A,
338                                                            __m128h B) {
339   return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_GT_OQ,
340                                 _MM_FROUND_CUR_DIRECTION);
341 }
342 
343 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomige_sh(__m128h A,
344                                                            __m128h B) {
345   return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_GE_OQ,
346                                 _MM_FROUND_CUR_DIRECTION);
347 }
348 
349 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomineq_sh(__m128h A,
350                                                             __m128h B) {
351   return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_NEQ_UQ,
352                                 _MM_FROUND_CUR_DIRECTION);
353 }
354 
355 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_add_ph(__m512h __A,
356                                                               __m512h __B) {
357   return (__m512h)((__v32hf)__A + (__v32hf)__B);
358 }
359 
360 static __inline__ __m512h __DEFAULT_FN_ATTRS512
361 _mm512_mask_add_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
362   return (__m512h)__builtin_ia32_selectph_512(
363       (__mmask32)__U, (__v32hf)_mm512_add_ph(__A, __B), (__v32hf)__W);
364 }
365 
366 static __inline__ __m512h __DEFAULT_FN_ATTRS512
367 _mm512_maskz_add_ph(__mmask32 __U, __m512h __A, __m512h __B) {
368   return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
369                                               (__v32hf)_mm512_add_ph(__A, __B),
370                                               (__v32hf)_mm512_setzero_ph());
371 }
372 
/* Packed add with an explicit rounding argument: R is passed as the int
 * rounding operand of __builtin_ia32_addph512. */
#define _mm512_add_round_ph(A, B, R)                                           \
  ((__m512h)__builtin_ia32_addph512(                                           \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (int)(R)))

/* Masked form: __builtin_ia32_selectph_512 chooses under mask U between the
 * rounded sum and the pass-through vector W. */
#define _mm512_mask_add_round_ph(W, U, A, B, R)                                \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U),                                                          \
      (__v32hf)_mm512_add_round_ph((A), (B), (R)),                             \
      (__v32hf)(__m512h)(W)))

/* Zero-masked form: as above, but the alternative is an all-zero vector. */
#define _mm512_maskz_add_round_ph(U, A, B, R)                                  \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U),                                                          \
      (__v32hf)_mm512_add_round_ph((A), (B), (R)),                             \
      (__v32hf)_mm512_setzero_ph()))
386 
387 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_sub_ph(__m512h __A,
388                                                               __m512h __B) {
389   return (__m512h)((__v32hf)__A - (__v32hf)__B);
390 }
391 
392 static __inline__ __m512h __DEFAULT_FN_ATTRS512
393 _mm512_mask_sub_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
394   return (__m512h)__builtin_ia32_selectph_512(
395       (__mmask32)__U, (__v32hf)_mm512_sub_ph(__A, __B), (__v32hf)__W);
396 }
397 
398 static __inline__ __m512h __DEFAULT_FN_ATTRS512
399 _mm512_maskz_sub_ph(__mmask32 __U, __m512h __A, __m512h __B) {
400   return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
401                                               (__v32hf)_mm512_sub_ph(__A, __B),
402                                               (__v32hf)_mm512_setzero_ph());
403 }
404 
/* Packed subtract with an explicit rounding argument: R is passed as the int
 * rounding operand of __builtin_ia32_subph512. */
#define _mm512_sub_round_ph(A, B, R)                                           \
  ((__m512h)__builtin_ia32_subph512(                                           \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (int)(R)))

/* Masked form: __builtin_ia32_selectph_512 chooses under mask U between the
 * rounded difference and the pass-through vector W. */
#define _mm512_mask_sub_round_ph(W, U, A, B, R)                                \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U),                                                          \
      (__v32hf)_mm512_sub_round_ph((A), (B), (R)),                             \
      (__v32hf)(__m512h)(W)))

/* Zero-masked form: as above, but the alternative is an all-zero vector. */
#define _mm512_maskz_sub_round_ph(U, A, B, R)                                  \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U),                                                          \
      (__v32hf)_mm512_sub_round_ph((A), (B), (R)),                             \
      (__v32hf)_mm512_setzero_ph()))
418 
419 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_mul_ph(__m512h __A,
420                                                               __m512h __B) {
421   return (__m512h)((__v32hf)__A * (__v32hf)__B);
422 }
423 
424 static __inline__ __m512h __DEFAULT_FN_ATTRS512
425 _mm512_mask_mul_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
426   return (__m512h)__builtin_ia32_selectph_512(
427       (__mmask32)__U, (__v32hf)_mm512_mul_ph(__A, __B), (__v32hf)__W);
428 }
429 
430 static __inline__ __m512h __DEFAULT_FN_ATTRS512
431 _mm512_maskz_mul_ph(__mmask32 __U, __m512h __A, __m512h __B) {
432   return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
433                                               (__v32hf)_mm512_mul_ph(__A, __B),
434                                               (__v32hf)_mm512_setzero_ph());
435 }
436 
/* Packed multiply with an explicit rounding argument: R is passed as the int
 * rounding operand of __builtin_ia32_mulph512. */
#define _mm512_mul_round_ph(A, B, R)                                           \
  ((__m512h)__builtin_ia32_mulph512(                                           \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (int)(R)))

/* Masked form: __builtin_ia32_selectph_512 chooses under mask U between the
 * rounded product and the pass-through vector W. */
#define _mm512_mask_mul_round_ph(W, U, A, B, R)                                \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U),                                                          \
      (__v32hf)_mm512_mul_round_ph((A), (B), (R)),                             \
      (__v32hf)(__m512h)(W)))

/* Zero-masked form: as above, but the alternative is an all-zero vector. */
#define _mm512_maskz_mul_round_ph(U, A, B, R)                                  \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U),                                                          \
      (__v32hf)_mm512_mul_round_ph((A), (B), (R)),                             \
      (__v32hf)_mm512_setzero_ph()))
450 
451 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_div_ph(__m512h __A,
452                                                               __m512h __B) {
453   return (__m512h)((__v32hf)__A / (__v32hf)__B);
454 }
455 
456 static __inline__ __m512h __DEFAULT_FN_ATTRS512
457 _mm512_mask_div_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
458   return (__m512h)__builtin_ia32_selectph_512(
459       (__mmask32)__U, (__v32hf)_mm512_div_ph(__A, __B), (__v32hf)__W);
460 }
461 
462 static __inline__ __m512h __DEFAULT_FN_ATTRS512
463 _mm512_maskz_div_ph(__mmask32 __U, __m512h __A, __m512h __B) {
464   return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
465                                               (__v32hf)_mm512_div_ph(__A, __B),
466                                               (__v32hf)_mm512_setzero_ph());
467 }
468 
/* Packed divide with an explicit rounding argument: R is passed as the int
 * rounding operand of __builtin_ia32_divph512. */
#define _mm512_div_round_ph(A, B, R)                                           \
  ((__m512h)__builtin_ia32_divph512(                                           \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (int)(R)))

/* Masked form: __builtin_ia32_selectph_512 chooses under mask U between the
 * rounded quotient and the pass-through vector W. */
#define _mm512_mask_div_round_ph(W, U, A, B, R)                                \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U),                                                          \
      (__v32hf)_mm512_div_round_ph((A), (B), (R)),                             \
      (__v32hf)(__m512h)(W)))

/* Zero-masked form: as above, but the alternative is an all-zero vector. */
#define _mm512_maskz_div_round_ph(U, A, B, R)                                  \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U),                                                          \
      (__v32hf)_mm512_div_round_ph((A), (B), (R)),                             \
      (__v32hf)_mm512_setzero_ph()))
482 
483 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_min_ph(__m512h __A,
484                                                               __m512h __B) {
485   return (__m512h)__builtin_ia32_minph512((__v32hf)__A, (__v32hf)__B,
486                                           _MM_FROUND_CUR_DIRECTION);
487 }
488 
489 static __inline__ __m512h __DEFAULT_FN_ATTRS512
490 _mm512_mask_min_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
491   return (__m512h)__builtin_ia32_selectph_512(
492       (__mmask32)__U, (__v32hf)_mm512_min_ph(__A, __B), (__v32hf)__W);
493 }
494 
495 static __inline__ __m512h __DEFAULT_FN_ATTRS512
496 _mm512_maskz_min_ph(__mmask32 __U, __m512h __A, __m512h __B) {
497   return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
498                                               (__v32hf)_mm512_min_ph(__A, __B),
499                                               (__v32hf)_mm512_setzero_ph());
500 }
501 
/* Packed minimum with an explicit third argument: R is passed as the int
 * operand of __builtin_ia32_minph512. */
#define _mm512_min_round_ph(A, B, R)                                           \
  ((__m512h)__builtin_ia32_minph512(                                           \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (int)(R)))

/* Masked form: __builtin_ia32_selectph_512 chooses under mask U between the
 * minimum and the pass-through vector W. */
#define _mm512_mask_min_round_ph(W, U, A, B, R)                                \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U),                                                          \
      (__v32hf)_mm512_min_round_ph((A), (B), (R)),                             \
      (__v32hf)(__m512h)(W)))

/* Zero-masked form: as above, but the alternative is an all-zero vector. */
#define _mm512_maskz_min_round_ph(U, A, B, R)                                  \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U),                                                          \
      (__v32hf)_mm512_min_round_ph((A), (B), (R)),                             \
      (__v32hf)_mm512_setzero_ph()))
515 
516 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_max_ph(__m512h __A,
517                                                               __m512h __B) {
518   return (__m512h)__builtin_ia32_maxph512((__v32hf)__A, (__v32hf)__B,
519                                           _MM_FROUND_CUR_DIRECTION);
520 }
521 
522 static __inline__ __m512h __DEFAULT_FN_ATTRS512
523 _mm512_mask_max_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
524   return (__m512h)__builtin_ia32_selectph_512(
525       (__mmask32)__U, (__v32hf)_mm512_max_ph(__A, __B), (__v32hf)__W);
526 }
527 
528 static __inline__ __m512h __DEFAULT_FN_ATTRS512
529 _mm512_maskz_max_ph(__mmask32 __U, __m512h __A, __m512h __B) {
530   return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
531                                               (__v32hf)_mm512_max_ph(__A, __B),
532                                               (__v32hf)_mm512_setzero_ph());
533 }
534 
/* Packed maximum with an explicit third argument: R is passed as the int
 * operand of __builtin_ia32_maxph512. */
#define _mm512_max_round_ph(A, B, R)                                           \
  ((__m512h)__builtin_ia32_maxph512(                                           \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (int)(R)))

/* Masked form: __builtin_ia32_selectph_512 chooses under mask U between the
 * maximum and the pass-through vector W. */
#define _mm512_mask_max_round_ph(W, U, A, B, R)                                \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U),                                                          \
      (__v32hf)_mm512_max_round_ph((A), (B), (R)),                             \
      (__v32hf)(__m512h)(W)))

/* Zero-masked form: as above, but the alternative is an all-zero vector. */
#define _mm512_maskz_max_round_ph(U, A, B, R)                                  \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U),                                                          \
      (__v32hf)_mm512_max_round_ph((A), (B), (R)),                             \
      (__v32hf)_mm512_setzero_ph()))
548 
549 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_abs_ph(__m512h __A) {
550   return (__m512h)_mm512_and_epi32(_mm512_set1_epi32(0x7FFF7FFF), (__m512i)__A);
551 }
552 
553 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_conj_pch(__m512h __A) {
554   return (__m512h)_mm512_xor_ps((__m512)__A, _mm512_set1_ps(-0.0f));
555 }
556 
557 static __inline__ __m512h __DEFAULT_FN_ATTRS512
558 _mm512_mask_conj_pch(__m512h __W, __mmask16 __U, __m512h __A) {
559   return (__m512h)__builtin_ia32_selectps_512(
560       (__mmask16)__U, (__v16sf)_mm512_conj_pch(__A), (__v16sf)__W);
561 }
562 
563 static __inline__ __m512h __DEFAULT_FN_ATTRS512
564 _mm512_maskz_conj_pch(__mmask16 __U, __m512h __A) {
565   return (__m512h)__builtin_ia32_selectps_512((__mmask16)__U,
566                                               (__v16sf)_mm512_conj_pch(__A),
567                                               (__v16sf)_mm512_setzero_ps());
568 }
569 
570 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_add_sh(__m128h __A,
571                                                            __m128h __B) {
572   __A[0] += __B[0];
573   return __A;
574 }
575 
576 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_add_sh(__m128h __W,
577                                                                 __mmask8 __U,
578                                                                 __m128h __A,
579                                                                 __m128h __B) {
580   __A = _mm_add_sh(__A, __B);
581   return __builtin_ia32_selectsh_128(__U, __A, __W);
582 }
583 
584 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_add_sh(__mmask8 __U,
585                                                                  __m128h __A,
586                                                                  __m128h __B) {
587   __A = _mm_add_sh(__A, __B);
588   return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
589 }
590 
/* Scalar add with an explicit rounding argument R, implemented with
 * __builtin_ia32_addsh_round_mask(A, B, source, mask, R).
 * Unmasked form: zero source vector and an all-ones mask. */
#define _mm_add_round_sh(A, B, R)                                              \
  ((__m128h)__builtin_ia32_addsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B),                              \
      (__v8hf)_mm_setzero_ph(), (__mmask8)-1, (int)(R)))

/* Masked form: W is the source vector and U the mask. */
#define _mm_mask_add_round_sh(W, U, A, B, R)                                   \
  ((__m128h)__builtin_ia32_addsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B),                              \
      (__v8hf)(__m128h)(W), (__mmask8)(U), (int)(R)))

/* Zero-masked form: zero source vector with mask U. */
#define _mm_maskz_add_round_sh(U, A, B, R)                                     \
  ((__m128h)__builtin_ia32_addsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B),                              \
      (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
605 
606 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sub_sh(__m128h __A,
607                                                            __m128h __B) {
608   __A[0] -= __B[0];
609   return __A;
610 }
611 
612 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sub_sh(__m128h __W,
613                                                                 __mmask8 __U,
614                                                                 __m128h __A,
615                                                                 __m128h __B) {
616   __A = _mm_sub_sh(__A, __B);
617   return __builtin_ia32_selectsh_128(__U, __A, __W);
618 }
619 
620 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sub_sh(__mmask8 __U,
621                                                                  __m128h __A,
622                                                                  __m128h __B) {
623   __A = _mm_sub_sh(__A, __B);
624   return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
625 }
626 
/* Scalar subtract with an explicit rounding argument R, implemented with
 * __builtin_ia32_subsh_round_mask(A, B, source, mask, R).
 * Unmasked form: zero source vector and an all-ones mask. */
#define _mm_sub_round_sh(A, B, R)                                              \
  ((__m128h)__builtin_ia32_subsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B),                              \
      (__v8hf)_mm_setzero_ph(), (__mmask8)-1, (int)(R)))

/* Masked form: W is the source vector and U the mask. */
#define _mm_mask_sub_round_sh(W, U, A, B, R)                                   \
  ((__m128h)__builtin_ia32_subsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B),                              \
      (__v8hf)(__m128h)(W), (__mmask8)(U), (int)(R)))

/* Zero-masked form: zero source vector with mask U. */
#define _mm_maskz_sub_round_sh(U, A, B, R)                                     \
  ((__m128h)__builtin_ia32_subsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B),                              \
      (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
641 
642 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mul_sh(__m128h __A,
643                                                            __m128h __B) {
644   __A[0] *= __B[0];
645   return __A;
646 }
647 
648 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_mul_sh(__m128h __W,
649                                                                 __mmask8 __U,
650                                                                 __m128h __A,
651                                                                 __m128h __B) {
652   __A = _mm_mul_sh(__A, __B);
653   return __builtin_ia32_selectsh_128(__U, __A, __W);
654 }
655 
656 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_mul_sh(__mmask8 __U,
657                                                                  __m128h __A,
658                                                                  __m128h __B) {
659   __A = _mm_mul_sh(__A, __B);
660   return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
661 }
662 
/* Scalar multiply with an explicit rounding argument R, implemented with
 * __builtin_ia32_mulsh_round_mask(A, B, source, mask, R).
 * Unmasked form: zero source vector and an all-ones mask. */
#define _mm_mul_round_sh(A, B, R)                                              \
  ((__m128h)__builtin_ia32_mulsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B),                              \
      (__v8hf)_mm_setzero_ph(), (__mmask8)-1, (int)(R)))

/* Masked form: W is the source vector and U the mask. */
#define _mm_mask_mul_round_sh(W, U, A, B, R)                                   \
  ((__m128h)__builtin_ia32_mulsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B),                              \
      (__v8hf)(__m128h)(W), (__mmask8)(U), (int)(R)))

/* Zero-masked form: zero source vector with mask U. */
#define _mm_maskz_mul_round_sh(U, A, B, R)                                     \
  ((__m128h)__builtin_ia32_mulsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B),                              \
      (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
677 
678 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_div_sh(__m128h __A,
679                                                            __m128h __B) {
680   __A[0] /= __B[0];
681   return __A;
682 }
683 
684 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_div_sh(__m128h __W,
685                                                                 __mmask8 __U,
686                                                                 __m128h __A,
687                                                                 __m128h __B) {
688   __A = _mm_div_sh(__A, __B);
689   return __builtin_ia32_selectsh_128(__U, __A, __W);
690 }
691 
692 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_div_sh(__mmask8 __U,
693                                                                  __m128h __A,
694                                                                  __m128h __B) {
695   __A = _mm_div_sh(__A, __B);
696   return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
697 }
698 
// Scalar half-precision divide with an explicit rounding argument R.
// Unmasked form: all-ones mask, zero vector as the third vector operand of
// __builtin_ia32_divsh_round_mask.
#define _mm_div_round_sh(A, B, R) \
  ((__m128h)__builtin_ia32_divsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, (int)(R)))

// Masked form: W is the third vector operand, U the mask.
#define _mm_mask_div_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_divsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)(__m128h)(W), \
      (__mmask8)(U), (int)(R)))

// Zero-masked form: zero vector as the third vector operand, U the mask.
#define _mm_maskz_div_round_sh(U, A, B, R) \
  ((__m128h)__builtin_ia32_divsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), (int)(R)))
713 
714 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_min_sh(__m128h __A,
715                                                            __m128h __B) {
716   return (__m128h)__builtin_ia32_minsh_round_mask(
717       (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
718       _MM_FROUND_CUR_DIRECTION);
719 }
720 
721 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_min_sh(__m128h __W,
722                                                                 __mmask8 __U,
723                                                                 __m128h __A,
724                                                                 __m128h __B) {
725   return (__m128h)__builtin_ia32_minsh_round_mask((__v8hf)__A, (__v8hf)__B,
726                                                   (__v8hf)__W, (__mmask8)__U,
727                                                   _MM_FROUND_CUR_DIRECTION);
728 }
729 
730 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_min_sh(__mmask8 __U,
731                                                                  __m128h __A,
732                                                                  __m128h __B) {
733   return (__m128h)__builtin_ia32_minsh_round_mask(
734       (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
735       _MM_FROUND_CUR_DIRECTION);
736 }
737 
// Scalar half-precision minimum with an explicit R argument.
// Unmasked form: all-ones mask, zero vector as the third vector operand of
// __builtin_ia32_minsh_round_mask.
#define _mm_min_round_sh(A, B, R) \
  ((__m128h)__builtin_ia32_minsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, (int)(R)))

// Masked form: W is the third vector operand, U the mask.
#define _mm_mask_min_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_minsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)(__m128h)(W), \
      (__mmask8)(U), (int)(R)))

// Zero-masked form: zero vector as the third vector operand, U the mask.
#define _mm_maskz_min_round_sh(U, A, B, R) \
  ((__m128h)__builtin_ia32_minsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), (int)(R)))
752 
753 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_max_sh(__m128h __A,
754                                                            __m128h __B) {
755   return (__m128h)__builtin_ia32_maxsh_round_mask(
756       (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
757       _MM_FROUND_CUR_DIRECTION);
758 }
759 
760 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_max_sh(__m128h __W,
761                                                                 __mmask8 __U,
762                                                                 __m128h __A,
763                                                                 __m128h __B) {
764   return (__m128h)__builtin_ia32_maxsh_round_mask((__v8hf)__A, (__v8hf)__B,
765                                                   (__v8hf)__W, (__mmask8)__U,
766                                                   _MM_FROUND_CUR_DIRECTION);
767 }
768 
769 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_max_sh(__mmask8 __U,
770                                                                  __m128h __A,
771                                                                  __m128h __B) {
772   return (__m128h)__builtin_ia32_maxsh_round_mask(
773       (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
774       _MM_FROUND_CUR_DIRECTION);
775 }
776 
// Scalar half-precision maximum with an explicit R argument.
// Unmasked form: all-ones mask, zero vector as the third vector operand of
// __builtin_ia32_maxsh_round_mask.
#define _mm_max_round_sh(A, B, R) \
  ((__m128h)__builtin_ia32_maxsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, (int)(R)))

// Masked form: W is the third vector operand, U the mask.
#define _mm_mask_max_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_maxsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)(__m128h)(W), \
      (__mmask8)(U), (int)(R)))

// Zero-masked form: zero vector as the third vector operand, U the mask.
#define _mm_maskz_max_round_sh(U, A, B, R) \
  ((__m128h)__builtin_ia32_maxsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), (int)(R)))
791 
// 512-bit half-precision compare producing a 32-bit mask. P is the
// comparison predicate immediate and R the rounding/SAE immediate passed to
// __builtin_ia32_cmpph512_mask; the unmasked form uses an all-ones mask.
#define _mm512_cmp_round_ph_mask(A, B, P, R) \
  ((__mmask32)__builtin_ia32_cmpph512_mask( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      (int)(P), (__mmask32)-1, (int)(R)))

// Masked form: U is forwarded as the builtin's mask operand.
#define _mm512_mask_cmp_round_ph_mask(U, A, B, P, R) \
  ((__mmask32)__builtin_ia32_cmpph512_mask( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      (int)(P), (__mmask32)(U), (int)(R)))

// Convenience forms that fix R to _MM_FROUND_CUR_DIRECTION.
#define _mm512_cmp_ph_mask(A, B, P) \
  _mm512_cmp_round_ph_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION)

#define _mm512_mask_cmp_ph_mask(U, A, B, P) \
  _mm512_mask_cmp_round_ph_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION)
807 
// Scalar half-precision compare producing an 8-bit mask. P is the
// comparison predicate immediate and R the rounding/SAE immediate passed to
// __builtin_ia32_cmpsh_mask; the unmasked forms use an all-ones mask.
#define _mm_cmp_round_sh_mask(X, Y, P, R) \
  ((__mmask8)__builtin_ia32_cmpsh_mask( \
      (__v8hf)(__m128h)(X), \
      (__v8hf)(__m128h)(Y), \
      (int)(P), (__mmask8)-1, (int)(R)))

// Masked form: M is forwarded as the builtin's mask operand.
#define _mm_mask_cmp_round_sh_mask(M, X, Y, P, R) \
  ((__mmask8)__builtin_ia32_cmpsh_mask( \
      (__v8hf)(__m128h)(X), \
      (__v8hf)(__m128h)(Y), \
      (int)(P), (__mmask8)(M), (int)(R)))

// Same calls with the last operand fixed to _MM_FROUND_CUR_DIRECTION.
#define _mm_cmp_sh_mask(X, Y, P) \
  ((__mmask8)__builtin_ia32_cmpsh_mask( \
      (__v8hf)(__m128h)(X), \
      (__v8hf)(__m128h)(Y), \
      (int)(P), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_cmp_sh_mask(M, X, Y, P) \
  ((__mmask8)__builtin_ia32_cmpsh_mask( \
      (__v8hf)(__m128h)(X), \
      (__v8hf)(__m128h)(Y), \
      (int)(P), (__mmask8)(M), _MM_FROUND_CUR_DIRECTION))
827 // loads with vmovsh:
828 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_load_sh(void const *__dp) {
829   struct __mm_load_sh_struct {
830     _Float16 __u;
831   } __attribute__((__packed__, __may_alias__));
832   _Float16 __u = ((const struct __mm_load_sh_struct *)__dp)->__u;
833   return (__m128h){__u, 0, 0, 0, 0, 0, 0, 0};
834 }
835 
836 static __inline__ __m128h __DEFAULT_FN_ATTRS128
837 _mm_mask_load_sh(__m128h __W, __mmask8 __U, const void *__A) {
838   __m128h src = (__v8hf)__builtin_shufflevector(
839       (__v8hf)__W, (__v8hf)_mm_setzero_ph(), 0, 8, 8, 8, 8, 8, 8, 8);
840 
841   return (__m128h)__builtin_ia32_loadsh128_mask((const __v8hf *)__A, src, __U & 1);
842 }
843 
844 static __inline__ __m128h __DEFAULT_FN_ATTRS128
845 _mm_maskz_load_sh(__mmask8 __U, const void *__A) {
846   return (__m128h)__builtin_ia32_loadsh128_mask(
847       (const __v8hf *)__A, (__v8hf)_mm_setzero_ph(), __U & 1);
848 }
849 
850 static __inline__ __m512h __DEFAULT_FN_ATTRS512
851 _mm512_load_ph(void const *__p) {
852   return *(const __m512h *)__p;
853 }
854 
855 static __inline__ __m256h __DEFAULT_FN_ATTRS256
856 _mm256_load_ph(void const *__p) {
857   return *(const __m256h *)__p;
858 }
859 
860 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_load_ph(void const *__p) {
861   return *(const __m128h *)__p;
862 }
863 
864 static __inline__ __m512h __DEFAULT_FN_ATTRS512
865 _mm512_loadu_ph(void const *__p) {
866   struct __loadu_ph {
867     __m512h_u __v;
868   } __attribute__((__packed__, __may_alias__));
869   return ((const struct __loadu_ph *)__p)->__v;
870 }
871 
872 static __inline__ __m256h __DEFAULT_FN_ATTRS256
873 _mm256_loadu_ph(void const *__p) {
874   struct __loadu_ph {
875     __m256h_u __v;
876   } __attribute__((__packed__, __may_alias__));
877   return ((const struct __loadu_ph *)__p)->__v;
878 }
879 
880 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_loadu_ph(void const *__p) {
881   struct __loadu_ph {
882     __m128h_u __v;
883   } __attribute__((__packed__, __may_alias__));
884   return ((const struct __loadu_ph *)__p)->__v;
885 }
886 
887 // stores with vmovsh:
888 static __inline__ void __DEFAULT_FN_ATTRS128 _mm_store_sh(void *__dp,
889                                                           __m128h __a) {
890   struct __mm_store_sh_struct {
891     _Float16 __u;
892   } __attribute__((__packed__, __may_alias__));
893   ((struct __mm_store_sh_struct *)__dp)->__u = __a[0];
894 }
895 
896 static __inline__ void __DEFAULT_FN_ATTRS128 _mm_mask_store_sh(void *__W,
897                                                                __mmask8 __U,
898                                                                __m128h __A) {
899   __builtin_ia32_storesh128_mask((__v8hf *)__W, __A, __U & 1);
900 }
901 
902 static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_store_ph(void *__P,
903                                                              __m512h __A) {
904   *(__m512h *)__P = __A;
905 }
906 
907 static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_store_ph(void *__P,
908                                                              __m256h __A) {
909   *(__m256h *)__P = __A;
910 }
911 
912 static __inline__ void __DEFAULT_FN_ATTRS128 _mm_store_ph(void *__P,
913                                                           __m128h __A) {
914   *(__m128h *)__P = __A;
915 }
916 
917 static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_storeu_ph(void *__P,
918                                                               __m512h __A) {
919   struct __storeu_ph {
920     __m512h_u __v;
921   } __attribute__((__packed__, __may_alias__));
922   ((struct __storeu_ph *)__P)->__v = __A;
923 }
924 
925 static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_storeu_ph(void *__P,
926                                                               __m256h __A) {
927   struct __storeu_ph {
928     __m256h_u __v;
929   } __attribute__((__packed__, __may_alias__));
930   ((struct __storeu_ph *)__P)->__v = __A;
931 }
932 
933 static __inline__ void __DEFAULT_FN_ATTRS128 _mm_storeu_ph(void *__P,
934                                                            __m128h __A) {
935   struct __storeu_ph {
936     __m128h_u __v;
937   } __attribute__((__packed__, __may_alias__));
938   ((struct __storeu_ph *)__P)->__v = __A;
939 }
940 
941 // moves with vmovsh:
942 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_move_sh(__m128h __a,
943                                                             __m128h __b) {
944   __a[0] = __b[0];
945   return __a;
946 }
947 
948 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_move_sh(__m128h __W,
949                                                                  __mmask8 __U,
950                                                                  __m128h __A,
951                                                                  __m128h __B) {
952   return __builtin_ia32_selectsh_128(__U, _mm_move_sh(__A, __B), __W);
953 }
954 
955 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_move_sh(__mmask8 __U,
956                                                                   __m128h __A,
957                                                                   __m128h __B) {
958   return __builtin_ia32_selectsh_128(__U, _mm_move_sh(__A, __B),
959                                      _mm_setzero_ph());
960 }
961 
962 // vmovw:
963 static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtsi16_si128(short __a) {
964   return (__m128i)(__v8hi){__a, 0, 0, 0, 0, 0, 0, 0};
965 }
966 
967 static __inline__ short __DEFAULT_FN_ATTRS128 _mm_cvtsi128_si16(__m128i __a) {
968   __v8hi __b = (__v8hi)__a;
969   return __b[0];
970 }
971 
972 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_rcp_ph(__m512h __A) {
973   return (__m512h)__builtin_ia32_rcpph512_mask(
974       (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1);
975 }
976 
977 static __inline__ __m512h __DEFAULT_FN_ATTRS512
978 _mm512_mask_rcp_ph(__m512h __W, __mmask32 __U, __m512h __A) {
979   return (__m512h)__builtin_ia32_rcpph512_mask((__v32hf)__A, (__v32hf)__W,
980                                                (__mmask32)__U);
981 }
982 
983 static __inline__ __m512h __DEFAULT_FN_ATTRS512
984 _mm512_maskz_rcp_ph(__mmask32 __U, __m512h __A) {
985   return (__m512h)__builtin_ia32_rcpph512_mask(
986       (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U);
987 }
988 
989 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_rsqrt_ph(__m512h __A) {
990   return (__m512h)__builtin_ia32_rsqrtph512_mask(
991       (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1);
992 }
993 
994 static __inline__ __m512h __DEFAULT_FN_ATTRS512
995 _mm512_mask_rsqrt_ph(__m512h __W, __mmask32 __U, __m512h __A) {
996   return (__m512h)__builtin_ia32_rsqrtph512_mask((__v32hf)__A, (__v32hf)__W,
997                                                  (__mmask32)__U);
998 }
999 
1000 static __inline__ __m512h __DEFAULT_FN_ATTRS512
1001 _mm512_maskz_rsqrt_ph(__mmask32 __U, __m512h __A) {
1002   return (__m512h)__builtin_ia32_rsqrtph512_mask(
1003       (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U);
1004 }
1005 
// 512-bit half-precision getmant family, all built on
// __builtin_ia32_getmantph512_mask. B and C are packed into the builtin's
// single immediate as ((C) << 2) | (B). The non-"round" forms pass
// _MM_FROUND_CUR_DIRECTION as the last operand; the "round" forms pass R.
#define _mm512_getmant_ph(A, B, C) \
  ((__m512h)__builtin_ia32_getmantph512_mask( \
      (__v32hf)(__m512h)(A), \
      (int)(((C) << 2) | (B)), \
      (__v32hf)_mm512_undefined_ph(), \
      (__mmask32)-1, _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_getmant_ph(W, U, A, B, C) \
  ((__m512h)__builtin_ia32_getmantph512_mask( \
      (__v32hf)(__m512h)(A), \
      (int)(((C) << 2) | (B)), \
      (__v32hf)(__m512h)(W), \
      (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_getmant_ph(U, A, B, C) \
  ((__m512h)__builtin_ia32_getmantph512_mask( \
      (__v32hf)(__m512h)(A), \
      (int)(((C) << 2) | (B)), \
      (__v32hf)_mm512_setzero_ph(), \
      (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm512_getmant_round_ph(A, B, C, R) \
  ((__m512h)__builtin_ia32_getmantph512_mask( \
      (__v32hf)(__m512h)(A), \
      (int)(((C) << 2) | (B)), \
      (__v32hf)_mm512_undefined_ph(), \
      (__mmask32)-1, (int)(R)))

#define _mm512_mask_getmant_round_ph(W, U, A, B, C, R) \
  ((__m512h)__builtin_ia32_getmantph512_mask( \
      (__v32hf)(__m512h)(A), \
      (int)(((C) << 2) | (B)), \
      (__v32hf)(__m512h)(W), \
      (__mmask32)(U), (int)(R)))

#define _mm512_maskz_getmant_round_ph(U, A, B, C, R) \
  ((__m512h)__builtin_ia32_getmantph512_mask( \
      (__v32hf)(__m512h)(A), \
      (int)(((C) << 2) | (B)), \
      (__v32hf)_mm512_setzero_ph(), \
      (__mmask32)(U), (int)(R)))
1036 
1037 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_getexp_ph(__m512h __A) {
1038   return (__m512h)__builtin_ia32_getexpph512_mask(
1039       (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1,
1040       _MM_FROUND_CUR_DIRECTION);
1041 }
1042 
1043 static __inline__ __m512h __DEFAULT_FN_ATTRS512
1044 _mm512_mask_getexp_ph(__m512h __W, __mmask32 __U, __m512h __A) {
1045   return (__m512h)__builtin_ia32_getexpph512_mask(
1046       (__v32hf)__A, (__v32hf)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
1047 }
1048 
1049 static __inline__ __m512h __DEFAULT_FN_ATTRS512
1050 _mm512_maskz_getexp_ph(__mmask32 __U, __m512h __A) {
1051   return (__m512h)__builtin_ia32_getexpph512_mask(
1052       (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
1053       _MM_FROUND_CUR_DIRECTION);
1054 }
1055 
// 512-bit half-precision getexp with an explicit R argument, built on
// __builtin_ia32_getexpph512_mask.
// Unmasked form: all-ones mask, _mm512_undefined_ph() as the second operand.
#define _mm512_getexp_round_ph(A, R) \
  ((__m512h)__builtin_ia32_getexpph512_mask( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)_mm512_undefined_ph(), \
      (__mmask32)-1, (int)(R)))

// Masked form: W is the second operand, U the mask.
#define _mm512_mask_getexp_round_ph(W, U, A, R) \
  ((__m512h)__builtin_ia32_getexpph512_mask( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(W), \
      (__mmask32)(U), (int)(R)))

// Zero-masked form: zero vector as the second operand, U the mask.
#define _mm512_maskz_getexp_round_ph(U, A, R) \
  ((__m512h)__builtin_ia32_getexpph512_mask( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)_mm512_setzero_ph(), \
      (__mmask32)(U), (int)(R)))
1069 
1070 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_scalef_ph(__m512h __A,
1071                                                                  __m512h __B) {
1072   return (__m512h)__builtin_ia32_scalefph512_mask(
1073       (__v32hf)__A, (__v32hf)__B, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1,
1074       _MM_FROUND_CUR_DIRECTION);
1075 }
1076 
1077 static __inline__ __m512h __DEFAULT_FN_ATTRS512
1078 _mm512_mask_scalef_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
1079   return (__m512h)__builtin_ia32_scalefph512_mask((__v32hf)__A, (__v32hf)__B,
1080                                                   (__v32hf)__W, (__mmask32)__U,
1081                                                   _MM_FROUND_CUR_DIRECTION);
1082 }
1083 
1084 static __inline__ __m512h __DEFAULT_FN_ATTRS512
1085 _mm512_maskz_scalef_ph(__mmask32 __U, __m512h __A, __m512h __B) {
1086   return (__m512h)__builtin_ia32_scalefph512_mask(
1087       (__v32hf)__A, (__v32hf)__B, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
1088       _MM_FROUND_CUR_DIRECTION);
1089 }
1090 
// 512-bit half-precision scalef with an explicit R argument, built on
// __builtin_ia32_scalefph512_mask.
// Unmasked form: all-ones mask, _mm512_undefined_ph() as the third vector
// operand.
#define _mm512_scalef_round_ph(A, B, R) \
  ((__m512h)__builtin_ia32_scalefph512_mask( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      (__v32hf)_mm512_undefined_ph(), \
      (__mmask32)-1, (int)(R)))

// Masked form: W is the third vector operand, U the mask.
#define _mm512_mask_scalef_round_ph(W, U, A, B, R) \
  ((__m512h)__builtin_ia32_scalefph512_mask( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      (__v32hf)(__m512h)(W), \
      (__mmask32)(U), (int)(R)))

// Zero-masked form: zero vector as the third vector operand, U the mask.
#define _mm512_maskz_scalef_round_ph(U, A, B, R) \
  ((__m512h)__builtin_ia32_scalefph512_mask( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      (__v32hf)_mm512_setzero_ph(), \
      (__mmask32)(U), (int)(R)))
1105 
// Unmasked 512-bit half-precision roundscale: forwards A and the immediate B
// to __builtin_ia32_rndscaleph_mask with an all-ones mask and
// _MM_FROUND_CUR_DIRECTION.
//
// The third operand is _mm512_undefined_ph() (as in
// _mm512_roundscale_round_ph) rather than a second expansion of A, so the
// macro argument A is evaluated exactly once even when it has side effects.
// With an all-ones mask that operand is not expected to contribute to the
// result.
#define _mm512_roundscale_ph(A, B)                                             \
  ((__m512h)__builtin_ia32_rndscaleph_mask(                                    \
      (__v32hf)(__m512h)(A), (int)(B), (__v32hf)_mm512_undefined_ph(),         \
      (__mmask32)-1, _MM_FROUND_CUR_DIRECTION))
1110 
// Masked / explicit-rounding 512-bit half-precision roundscale forms, built
// on __builtin_ia32_rndscaleph_mask.
//
// Note the parameter naming in the masked forms: for the "mask" macros A is
// the third (vector) operand, B the mask and C the input vector; for the
// "maskz" macros A is the mask and B the input vector.
#define _mm512_mask_roundscale_ph(A, B, C, imm) \
  ((__m512h)__builtin_ia32_rndscaleph_mask( \
      (__v32hf)(__m512h)(C), (int)(imm), \
      (__v32hf)(__m512h)(A), \
      (__mmask32)(B), _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_roundscale_ph(A, B, imm) \
  ((__m512h)__builtin_ia32_rndscaleph_mask( \
      (__v32hf)(__m512h)(B), (int)(imm), \
      (__v32hf)_mm512_setzero_ph(), \
      (__mmask32)(A), _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_roundscale_round_ph(A, B, C, imm, R) \
  ((__m512h)__builtin_ia32_rndscaleph_mask( \
      (__v32hf)(__m512h)(C), (int)(imm), \
      (__v32hf)(__m512h)(A), \
      (__mmask32)(B), (int)(R)))

#define _mm512_maskz_roundscale_round_ph(A, B, imm, R) \
  ((__m512h)__builtin_ia32_rndscaleph_mask( \
      (__v32hf)(__m512h)(B), (int)(imm), \
      (__v32hf)_mm512_setzero_ph(), \
      (__mmask32)(A), (int)(R)))

// Unmasked form with explicit R: all-ones mask, _mm512_undefined_ph() as the
// third operand.
#define _mm512_roundscale_round_ph(A, imm, R) \
  ((__m512h)__builtin_ia32_rndscaleph_mask( \
      (__v32hf)(__m512h)(A), (int)(imm), \
      (__v32hf)_mm512_undefined_ph(), \
      (__mmask32)-1, (int)(R)))
1135 
// 512-bit half-precision reduce family, built on
// __builtin_ia32_reduceph512_mask(input, imm, third operand, mask, rounding).
// The non-"round" forms pass _MM_FROUND_CUR_DIRECTION; the "round" forms
// pass R. Unmasked forms use an all-ones mask with _mm512_undefined_ph(),
// "mask" forms use W and U, "maskz" forms use a zero vector and U.
#define _mm512_reduce_ph(A, imm) \
  ((__m512h)__builtin_ia32_reduceph512_mask( \
      (__v32hf)(__m512h)(A), (int)(imm), \
      (__v32hf)_mm512_undefined_ph(), \
      (__mmask32)-1, _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_reduce_ph(W, U, A, imm) \
  ((__m512h)__builtin_ia32_reduceph512_mask( \
      (__v32hf)(__m512h)(A), (int)(imm), \
      (__v32hf)(__m512h)(W), \
      (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_reduce_ph(U, A, imm) \
  ((__m512h)__builtin_ia32_reduceph512_mask( \
      (__v32hf)(__m512h)(A), (int)(imm), \
      (__v32hf)_mm512_setzero_ph(), \
      (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_reduce_round_ph(W, U, A, imm, R) \
  ((__m512h)__builtin_ia32_reduceph512_mask( \
      (__v32hf)(__m512h)(A), (int)(imm), \
      (__v32hf)(__m512h)(W), \
      (__mmask32)(U), (int)(R)))

#define _mm512_maskz_reduce_round_ph(U, A, imm, R) \
  ((__m512h)__builtin_ia32_reduceph512_mask( \
      (__v32hf)(__m512h)(A), (int)(imm), \
      (__v32hf)_mm512_setzero_ph(), \
      (__mmask32)(U), (int)(R)))

#define _mm512_reduce_round_ph(A, imm, R) \
  ((__m512h)__builtin_ia32_reduceph512_mask( \
      (__v32hf)(__m512h)(A), (int)(imm), \
      (__v32hf)_mm512_undefined_ph(), \
      (__mmask32)-1, (int)(R)))
1165 
1166 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_rcp_sh(__m128h __A,
1167                                                            __m128h __B) {
1168   return (__m128h)__builtin_ia32_rcpsh_mask(
1169       (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1);
1170 }
1171 
1172 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_rcp_sh(__m128h __W,
1173                                                                 __mmask8 __U,
1174                                                                 __m128h __A,
1175                                                                 __m128h __B) {
1176   return (__m128h)__builtin_ia32_rcpsh_mask((__v8hf)__A, (__v8hf)__B,
1177                                             (__v8hf)__W, (__mmask8)__U);
1178 }
1179 
1180 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_rcp_sh(__mmask8 __U,
1181                                                                  __m128h __A,
1182                                                                  __m128h __B) {
1183   return (__m128h)__builtin_ia32_rcpsh_mask(
1184       (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
1185 }
1186 
1187 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_rsqrt_sh(__m128h __A,
1188                                                              __m128h __B) {
1189   return (__m128h)__builtin_ia32_rsqrtsh_mask(
1190       (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1);
1191 }
1192 
1193 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_rsqrt_sh(__m128h __W,
1194                                                                   __mmask8 __U,
1195                                                                   __m128h __A,
1196                                                                   __m128h __B) {
1197   return (__m128h)__builtin_ia32_rsqrtsh_mask((__v8hf)__A, (__v8hf)__B,
1198                                               (__v8hf)__W, (__mmask8)__U);
1199 }
1200 
1201 static __inline__ __m128h __DEFAULT_FN_ATTRS128
1202 _mm_maskz_rsqrt_sh(__mmask8 __U, __m128h __A, __m128h __B) {
1203   return (__m128h)__builtin_ia32_rsqrtsh_mask(
1204       (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
1205 }
1206 
// Scalar half-precision getmant family, built on
// __builtin_ia32_getmantsh_round_mask. C and D are packed into the builtin's
// single immediate as ((D) << 2) | (C). The non-"round" forms pass
// _MM_FROUND_CUR_DIRECTION as the last operand; the "round" forms pass R.
// Unmasked and "maskz" forms use a zero vector as the fourth operand;
// "mask" forms use W.
#define _mm_getmant_round_sh(A, B, C, D, R) \
  ((__m128h)__builtin_ia32_getmantsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (int)(((D) << 2) | (C)), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, (int)(R)))

#define _mm_getmant_sh(A, B, C, D) \
  ((__m128h)__builtin_ia32_getmantsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (int)(((D) << 2) | (C)), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_getmant_sh(W, U, A, B, C, D) \
  ((__m128h)__builtin_ia32_getmantsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (int)(((D) << 2) | (C)), \
      (__v8hf)(__m128h)(W), \
      (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_getmant_round_sh(W, U, A, B, C, D, R) \
  ((__m128h)__builtin_ia32_getmantsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (int)(((D) << 2) | (C)), \
      (__v8hf)(__m128h)(W), \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_getmant_sh(U, A, B, C, D) \
  ((__m128h)__builtin_ia32_getmantsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (int)(((D) << 2) | (C)), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm_maskz_getmant_round_sh(U, A, B, C, D, R) \
  ((__m128h)__builtin_ia32_getmantsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (int)(((D) << 2) | (C)), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), (int)(R)))
1236 
1237 #define _mm_getexp_round_sh(A, B, R)                                           \
1238   ((__m128h)__builtin_ia32_getexpsh128_round_mask(                             \
1239       (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
1240       (__mmask8)-1, (int)(R)))
1241 
1242 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_getexp_sh(__m128h __A,
1243                                                               __m128h __B) {
1244   return (__m128h)__builtin_ia32_getexpsh128_round_mask(
1245       (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
1246       _MM_FROUND_CUR_DIRECTION);
1247 }
1248 
1249 static __inline__ __m128h __DEFAULT_FN_ATTRS128
1250 _mm_mask_getexp_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
1251   return (__m128h)__builtin_ia32_getexpsh128_round_mask(
1252       (__v8hf)__A, (__v8hf)__B, (__v8hf)__W, (__mmask8)__U,
1253       _MM_FROUND_CUR_DIRECTION);
1254 }
1255 
1256 #define _mm_mask_getexp_round_sh(W, U, A, B, R)                                \
1257   ((__m128h)__builtin_ia32_getexpsh128_round_mask(                             \
1258       (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
1259       (__mmask8)(U), (int)(R)))
1260 
1261 static __inline__ __m128h __DEFAULT_FN_ATTRS128
1262 _mm_maskz_getexp_sh(__mmask8 __U, __m128h __A, __m128h __B) {
1263   return (__m128h)__builtin_ia32_getexpsh128_round_mask(
1264       (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
1265       _MM_FROUND_CUR_DIRECTION);
1266 }
1267 
/* Zero-masked scalar (sh) getexp with a caller-supplied rounding argument R.
 * A zero vector is forwarded as the builtin's third operand and U as its
 * mask. */
#define _mm_maskz_getexp_round_sh(U, A, B, R) \
  ((__m128h)__builtin_ia32_getexpsh128_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), \
      (int)(R)))
1272 
/* Unmasked scalar (sh) scalef with a caller-supplied rounding argument R.
 * The builtin receives a zero vector as its third operand and an all-ones
 * mask. */
#define _mm_scalef_round_sh(A, B, R) \
  ((__m128h)__builtin_ia32_scalefsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, \
      (int)(R)))
1277 
1278 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_scalef_sh(__m128h __A,
1279                                                               __m128h __B) {
1280   return (__m128h)__builtin_ia32_scalefsh_round_mask(
1281       (__v8hf)__A, (__v8hf)(__B), (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
1282       _MM_FROUND_CUR_DIRECTION);
1283 }
1284 
1285 static __inline__ __m128h __DEFAULT_FN_ATTRS128
1286 _mm_mask_scalef_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
1287   return (__m128h)__builtin_ia32_scalefsh_round_mask((__v8hf)__A, (__v8hf)__B,
1288                                                      (__v8hf)__W, (__mmask8)__U,
1289                                                      _MM_FROUND_CUR_DIRECTION);
1290 }
1291 
/* Masked scalar (sh) scalef with a caller-supplied rounding argument R.
 * W is forwarded as the builtin's third vector operand and U as its mask. */
#define _mm_mask_scalef_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_scalefsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)(__m128h)(W), \
      (__mmask8)(U), \
      (int)(R)))
1296 
1297 static __inline__ __m128h __DEFAULT_FN_ATTRS128
1298 _mm_maskz_scalef_sh(__mmask8 __U, __m128h __A, __m128h __B) {
1299   return (__m128h)__builtin_ia32_scalefsh_round_mask(
1300       (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
1301       _MM_FROUND_CUR_DIRECTION);
1302 }
1303 
/* Zero-masked scalar (sh) scalef with a caller-supplied rounding argument R.
 * A zero vector is forwarded as the builtin's third operand and U as its
 * mask. */
#define _mm_maskz_scalef_round_sh(U, A, B, R) \
  ((__m128h)__builtin_ia32_scalefsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), \
      (int)(R)))
1308 
/* Unmasked scalar (sh) roundscale with immediate imm and a caller-supplied
 * rounding argument R.  The builtin receives a zero vector as its third
 * operand and an all-ones mask. */
#define _mm_roundscale_round_sh(A, B, imm, R) \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, \
      (int)(imm), \
      (int)(R)))
1313 
/* Unmasked scalar (sh) roundscale with immediate imm, using
 * _MM_FROUND_CUR_DIRECTION.  The builtin receives a zero vector as its third
 * operand and an all-ones mask. */
#define _mm_roundscale_sh(A, B, imm) \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, \
      (int)(imm), \
      _MM_FROUND_CUR_DIRECTION))
1318 
/* Masked scalar (sh) roundscale with immediate I, using
 * _MM_FROUND_CUR_DIRECTION.  W is forwarded as the builtin's third vector
 * operand and U as its mask. */
#define _mm_mask_roundscale_sh(W, U, A, B, I) \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)(__m128h)(W), \
      (__mmask8)(U), \
      (int)(I), \
      _MM_FROUND_CUR_DIRECTION))
1323 
/* Masked scalar (sh) roundscale with immediate I and a caller-supplied
 * rounding argument R.  W is forwarded as the builtin's third vector operand
 * and U as its mask. */
#define _mm_mask_roundscale_round_sh(W, U, A, B, I, R) \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)(__m128h)(W), \
      (__mmask8)(U), \
      (int)(I), \
      (int)(R)))
1328 
/* Zero-masked scalar (sh) roundscale with immediate I, using
 * _MM_FROUND_CUR_DIRECTION.  A zero vector is forwarded as the builtin's
 * third operand and U as its mask. */
#define _mm_maskz_roundscale_sh(U, A, B, I) \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), \
      (int)(I), \
      _MM_FROUND_CUR_DIRECTION))
1333 
/* Zero-masked scalar (sh) roundscale with immediate I and a caller-supplied
 * rounding argument R.  A zero vector is forwarded as the builtin's third
 * operand and U as its mask. */
#define _mm_maskz_roundscale_round_sh(U, A, B, I, R) \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), \
      (int)(I), \
      (int)(R)))
1338 
/* Unmasked scalar (sh) reduce with immediate C, using
 * _MM_FROUND_CUR_DIRECTION.  The builtin receives a zero vector as its third
 * operand and an all-ones mask. */
#define _mm_reduce_sh(A, B, C) \
  ((__m128h)__builtin_ia32_reducesh_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, \
      (int)(C), \
      _MM_FROUND_CUR_DIRECTION))
1343 
/* Masked scalar (sh) reduce with immediate C, using
 * _MM_FROUND_CUR_DIRECTION.  W is forwarded as the builtin's third vector
 * operand and U as its mask. */
#define _mm_mask_reduce_sh(W, U, A, B, C) \
  ((__m128h)__builtin_ia32_reducesh_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)(__m128h)(W), \
      (__mmask8)(U), \
      (int)(C), \
      _MM_FROUND_CUR_DIRECTION))
1348 
/* Zero-masked scalar (sh) reduce with immediate C, using
 * _MM_FROUND_CUR_DIRECTION.  A zero vector is forwarded as the builtin's
 * third operand and U as its mask. */
#define _mm_maskz_reduce_sh(U, A, B, C) \
  ((__m128h)__builtin_ia32_reducesh_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), \
      (int)(C), \
      _MM_FROUND_CUR_DIRECTION))
1353 
/* Unmasked scalar (sh) reduce with immediate C and a caller-supplied final
 * argument R.  The builtin receives a zero vector as its third operand and
 * an all-ones mask. */
#define _mm_reduce_round_sh(A, B, C, R) \
  ((__m128h)__builtin_ia32_reducesh_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, \
      (int)(C), \
      (int)(R)))
1358 
/* Masked scalar (sh) reduce with immediate C and a caller-supplied final
 * argument R.  W is forwarded as the builtin's third vector operand and U as
 * its mask. */
#define _mm_mask_reduce_round_sh(W, U, A, B, C, R) \
  ((__m128h)__builtin_ia32_reducesh_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)(__m128h)(W), \
      (__mmask8)(U), \
      (int)(C), \
      (int)(R)))
1363 
/* Zero-masked scalar (sh) reduce with immediate C and a caller-supplied final
 * argument R.  A zero vector is forwarded as the builtin's third operand and
 * U as its mask. */
#define _mm_maskz_reduce_round_sh(U, A, B, C, R) \
  ((__m128h)__builtin_ia32_reducesh_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), \
      (int)(C), \
      (int)(R)))
1368 
/* 512-bit half-precision square root: forwards A and the rounding argument R
 * to __builtin_ia32_sqrtph512. */
#define _mm512_sqrt_round_ph(A, R) \
  ((__m512h)__builtin_ia32_sqrtph512( \
      (__v32hf)(__m512h)(A), \
      (int)(R)))
1371 
/* Masked 512-bit square root with rounding argument R: the result of
 * _mm512_sqrt_round_ph(A, R) and W are handed to the selectph_512 builtin
 * together with mask U. */
#define _mm512_mask_sqrt_round_ph(W, U, A, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), \
      (__v32hf)_mm512_sqrt_round_ph((A), (R)), \
      (__v32hf)(__m512h)(W)))
1376 
/* Zero-masked 512-bit square root with rounding argument R: the result of
 * _mm512_sqrt_round_ph(A, R) and a zero vector are handed to the selectph_512
 * builtin together with mask U. */
#define _mm512_maskz_sqrt_round_ph(U, A, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), \
      (__v32hf)_mm512_sqrt_round_ph((A), (R)), \
      (__v32hf)_mm512_setzero_ph()))
1381 
1382 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_sqrt_ph(__m512h __A) {
1383   return (__m512h)__builtin_ia32_sqrtph512((__v32hf)__A,
1384                                            _MM_FROUND_CUR_DIRECTION);
1385 }
1386 
1387 static __inline__ __m512h __DEFAULT_FN_ATTRS512
1388 _mm512_mask_sqrt_ph(__m512h __W, __mmask32 __U, __m512h __A) {
1389   return (__m512h)__builtin_ia32_selectph_512(
1390       (__mmask32)(__U),
1391       (__v32hf)__builtin_ia32_sqrtph512((__A), (_MM_FROUND_CUR_DIRECTION)),
1392       (__v32hf)(__m512h)(__W));
1393 }
1394 
1395 static __inline__ __m512h __DEFAULT_FN_ATTRS512
1396 _mm512_maskz_sqrt_ph(__mmask32 __U, __m512h __A) {
1397   return (__m512h)__builtin_ia32_selectph_512(
1398       (__mmask32)(__U),
1399       (__v32hf)__builtin_ia32_sqrtph512((__A), (_MM_FROUND_CUR_DIRECTION)),
1400       (__v32hf)_mm512_setzero_ph());
1401 }
1402 
/* Unmasked scalar (sh) square root with a caller-supplied rounding argument
 * R.  The builtin receives a zero vector as its third operand and an
 * all-ones mask. */
#define _mm_sqrt_round_sh(A, B, R) \
  ((__m128h)__builtin_ia32_sqrtsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, \
      (int)(R)))
1407 
/* Masked scalar (sh) square root with a caller-supplied rounding argument R.
 * W is forwarded as the builtin's third vector operand and U as its mask. */
#define _mm_mask_sqrt_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_sqrtsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)(__m128h)(W), \
      (__mmask8)(U), \
      (int)(R)))
1412 
/* Zero-masked scalar (sh) square root with a caller-supplied rounding
 * argument R.  A zero vector is forwarded as the builtin's third operand and
 * U as its mask. */
#define _mm_maskz_sqrt_round_sh(U, A, B, R) \
  ((__m128h)__builtin_ia32_sqrtsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), \
      (int)(R)))
1417 
1418 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sqrt_sh(__m128h __A,
1419                                                             __m128h __B) {
1420   return (__m128h)__builtin_ia32_sqrtsh_round_mask(
1421       (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)_mm_setzero_ph(),
1422       (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
1423 }
1424 
1425 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sqrt_sh(__m128h __W,
1426                                                                  __mmask32 __U,
1427                                                                  __m128h __A,
1428                                                                  __m128h __B) {
1429   return (__m128h)__builtin_ia32_sqrtsh_round_mask(
1430       (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)(__m128h)(__W),
1431       (__mmask8)(__U), _MM_FROUND_CUR_DIRECTION);
1432 }
1433 
1434 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sqrt_sh(__mmask32 __U,
1435                                                                   __m128h __A,
1436                                                                   __m128h __B) {
1437   return (__m128h)__builtin_ia32_sqrtsh_round_mask(
1438       (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)_mm_setzero_ph(),
1439       (__mmask8)(__U), _MM_FROUND_CUR_DIRECTION);
1440 }
1441 
/* Masked 512-bit fpclass: forwards A, the immediate imm and mask U to
 * __builtin_ia32_fpclassph512_mask and yields a __mmask32. */
#define _mm512_mask_fpclass_ph_mask(U, A, imm) \
  ((__mmask32)__builtin_ia32_fpclassph512_mask( \
      (__v32hf)(__m512h)(A), \
      (int)(imm), \
      (__mmask32)(U)))
1445 
/* Unmasked 512-bit fpclass: forwards A and the immediate imm to
 * __builtin_ia32_fpclassph512_mask with an all-ones mask and yields a
 * __mmask32. */
#define _mm512_fpclass_ph_mask(A, imm) \
  ((__mmask32)__builtin_ia32_fpclassph512_mask( \
      (__v32hf)(__m512h)(A), \
      (int)(imm), \
      (__mmask32)-1))
1449 
/* Unmasked scalar (sh) fpclass: forwards A and the immediate imm to
 * __builtin_ia32_fpclasssh_mask with an all-ones mask and yields a
 * __mmask8. */
#define _mm_fpclass_sh_mask(A, imm) \
  ((__mmask8)__builtin_ia32_fpclasssh_mask( \
      (__v8hf)(__m128h)(A), \
      (int)(imm), \
      (__mmask8)-1))
1453 
/* Masked scalar (sh) fpclass: forwards A, the immediate imm and mask U to
 * __builtin_ia32_fpclasssh_mask and yields a __mmask8. */
#define _mm_mask_fpclass_sh_mask(U, A, imm) \
  ((__mmask8)__builtin_ia32_fpclasssh_mask( \
      (__v8hf)(__m128h)(A), \
      (int)(imm), \
      (__mmask8)(U)))
1457 
/* Unmasked pd -> ph conversion (__v8df in, __m128h out) with rounding
 * argument R.  The second builtin operand is _mm_undefined_ph() and the mask
 * is all ones. */
#define _mm512_cvt_roundpd_ph(A, R) \
  ((__m128h)__builtin_ia32_vcvtpd2ph512_mask( \
      (__v8df)(A), \
      (__v8hf)_mm_undefined_ph(), \
      (__mmask8)(-1), \
      (int)(R)))
1461 
/* Masked pd -> ph conversion (__v8df in, __m128h out) with rounding argument
 * R.  W is forwarded as the builtin's second operand and U as its mask. */
#define _mm512_mask_cvt_roundpd_ph(W, U, A, R) \
  ((__m128h)__builtin_ia32_vcvtpd2ph512_mask( \
      (__v8df)(A), \
      (__v8hf)(W), \
      (__mmask8)(U), \
      (int)(R)))
1465 
/* Zero-masked pd -> ph conversion (__v8df in, __m128h out) with rounding
 * argument R.  A zero vector is forwarded as the builtin's second operand and
 * U as its mask. */
#define _mm512_maskz_cvt_roundpd_ph(U, A, R) \
  ((__m128h)__builtin_ia32_vcvtpd2ph512_mask( \
      (__v8df)(A), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), \
      (int)(R)))
1469 
1470 static __inline__ __m128h __DEFAULT_FN_ATTRS512 _mm512_cvtpd_ph(__m512d __A) {
1471   return (__m128h)__builtin_ia32_vcvtpd2ph512_mask(
1472       (__v8df)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
1473       _MM_FROUND_CUR_DIRECTION);
1474 }
1475 
1476 static __inline__ __m128h __DEFAULT_FN_ATTRS512
1477 _mm512_mask_cvtpd_ph(__m128h __W, __mmask8 __U, __m512d __A) {
1478   return (__m128h)__builtin_ia32_vcvtpd2ph512_mask(
1479       (__v8df)__A, (__v8hf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
1480 }
1481 
1482 static __inline__ __m128h __DEFAULT_FN_ATTRS512
1483 _mm512_maskz_cvtpd_ph(__mmask8 __U, __m512d __A) {
1484   return (__m128h)__builtin_ia32_vcvtpd2ph512_mask(
1485       (__v8df)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
1486       _MM_FROUND_CUR_DIRECTION);
1487 }
1488 
/* Unmasked ph -> pd conversion (__v8hf in, __m512d out) with rounding
 * argument R.  The second builtin operand is _mm512_undefined_pd() and the
 * mask is all ones. */
#define _mm512_cvt_roundph_pd(A, R) \
  ((__m512d)__builtin_ia32_vcvtph2pd512_mask( \
      (__v8hf)(A), \
      (__v8df)_mm512_undefined_pd(), \
      (__mmask8)(-1), \
      (int)(R)))
1492 
/* Masked ph -> pd conversion (__v8hf in, __m512d out) with rounding argument
 * R.  W is forwarded as the builtin's second operand and U as its mask. */
#define _mm512_mask_cvt_roundph_pd(W, U, A, R) \
  ((__m512d)__builtin_ia32_vcvtph2pd512_mask( \
      (__v8hf)(A), \
      (__v8df)(W), \
      (__mmask8)(U), \
      (int)(R)))
1496 
/* Zero-masked ph -> pd conversion (__v8hf in, __m512d out) with rounding
 * argument R.  A zero vector is forwarded as the builtin's second operand and
 * U as its mask. */
#define _mm512_maskz_cvt_roundph_pd(U, A, R) \
  ((__m512d)__builtin_ia32_vcvtph2pd512_mask( \
      (__v8hf)(A), \
      (__v8df)_mm512_setzero_pd(), \
      (__mmask8)(U), \
      (int)(R)))
1500 
1501 static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_cvtph_pd(__m128h __A) {
1502   return (__m512d)__builtin_ia32_vcvtph2pd512_mask(
1503       (__v8hf)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)-1,
1504       _MM_FROUND_CUR_DIRECTION);
1505 }
1506 
1507 static __inline__ __m512d __DEFAULT_FN_ATTRS512
1508 _mm512_mask_cvtph_pd(__m512d __W, __mmask8 __U, __m128h __A) {
1509   return (__m512d)__builtin_ia32_vcvtph2pd512_mask(
1510       (__v8hf)__A, (__v8df)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
1511 }
1512 
1513 static __inline__ __m512d __DEFAULT_FN_ATTRS512
1514 _mm512_maskz_cvtph_pd(__mmask8 __U, __m128h __A) {
1515   return (__m512d)__builtin_ia32_vcvtph2pd512_mask(
1516       (__v8hf)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U,
1517       _MM_FROUND_CUR_DIRECTION);
1518 }
1519 
/* Unmasked scalar sh -> ss conversion with rounding argument R.  A (__v4sf)
 * and B (__v8hf) are the sources; the third operand is _mm_undefined_ps() and
 * the mask is all ones. */
#define _mm_cvt_roundsh_ss(A, B, R) \
  ((__m128)__builtin_ia32_vcvtsh2ss_round_mask( \
      (__v4sf)(A), \
      (__v8hf)(B), \
      (__v4sf)_mm_undefined_ps(), \
      (__mmask8)(-1), \
      (int)(R)))
1524 
/* Masked scalar sh -> ss conversion with rounding argument R.  W is forwarded
 * as the builtin's third vector operand and U as its mask. */
#define _mm_mask_cvt_roundsh_ss(W, U, A, B, R) \
  ((__m128)__builtin_ia32_vcvtsh2ss_round_mask( \
      (__v4sf)(A), \
      (__v8hf)(B), \
      (__v4sf)(W), \
      (__mmask8)(U), \
      (int)(R)))
1528 
/* Zero-masked scalar sh -> ss conversion with rounding argument R.  A zero
 * vector is forwarded as the builtin's third operand and U as its mask. */
#define _mm_maskz_cvt_roundsh_ss(U, A, B, R) \
  ((__m128)__builtin_ia32_vcvtsh2ss_round_mask( \
      (__v4sf)(A), \
      (__v8hf)(B), \
      (__v4sf)_mm_setzero_ps(), \
      (__mmask8)(U), \
      (int)(R)))
1533 
1534 static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtsh_ss(__m128 __A,
1535                                                             __m128h __B) {
1536   return (__m128)__builtin_ia32_vcvtsh2ss_round_mask(
1537       (__v4sf)__A, (__v8hf)__B, (__v4sf)_mm_undefined_ps(), (__mmask8)-1,
1538       _MM_FROUND_CUR_DIRECTION);
1539 }
1540 
1541 static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_cvtsh_ss(__m128 __W,
1542                                                                  __mmask8 __U,
1543                                                                  __m128 __A,
1544                                                                  __m128h __B) {
1545   return (__m128)__builtin_ia32_vcvtsh2ss_round_mask((__v4sf)__A, (__v8hf)__B,
1546                                                      (__v4sf)__W, (__mmask8)__U,
1547                                                      _MM_FROUND_CUR_DIRECTION);
1548 }
1549 
1550 static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_cvtsh_ss(__mmask8 __U,
1551                                                                   __m128 __A,
1552                                                                   __m128h __B) {
1553   return (__m128)__builtin_ia32_vcvtsh2ss_round_mask(
1554       (__v4sf)__A, (__v8hf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U,
1555       _MM_FROUND_CUR_DIRECTION);
1556 }
1557 
/* Unmasked scalar ss -> sh conversion with rounding argument R.  A (__v8hf)
 * and B (__v4sf) are the sources; the third operand is _mm_undefined_ph() and
 * the mask is all ones. */
#define _mm_cvt_roundss_sh(A, B, R) \
  ((__m128h)__builtin_ia32_vcvtss2sh_round_mask( \
      (__v8hf)(A), \
      (__v4sf)(B), \
      (__v8hf)_mm_undefined_ph(), \
      (__mmask8)(-1), \
      (int)(R)))
1562 
/* Masked scalar ss -> sh conversion with rounding argument R.  W is forwarded
 * as the builtin's third vector operand and U as its mask. */
#define _mm_mask_cvt_roundss_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_vcvtss2sh_round_mask( \
      (__v8hf)(A), \
      (__v4sf)(B), \
      (__v8hf)(W), \
      (__mmask8)(U), \
      (int)(R)))
1566 
/* Zero-masked scalar ss -> sh conversion with rounding argument R.  A zero
 * vector is forwarded as the builtin's third operand and U as its mask. */
#define _mm_maskz_cvt_roundss_sh(U, A, B, R) \
  ((__m128h)__builtin_ia32_vcvtss2sh_round_mask( \
      (__v8hf)(A), \
      (__v4sf)(B), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), \
      (int)(R)))
1571 
1572 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtss_sh(__m128h __A,
1573                                                              __m128 __B) {
1574   return (__m128h)__builtin_ia32_vcvtss2sh_round_mask(
1575       (__v8hf)__A, (__v4sf)__B, (__v8hf)_mm_undefined_ph(), (__mmask8)-1,
1576       _MM_FROUND_CUR_DIRECTION);
1577 }
1578 
1579 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_cvtss_sh(__m128h __W,
1580                                                                   __mmask8 __U,
1581                                                                   __m128h __A,
1582                                                                   __m128 __B) {
1583   return (__m128h)__builtin_ia32_vcvtss2sh_round_mask(
1584       (__v8hf)__A, (__v4sf)__B, (__v8hf)__W, (__mmask8)__U,
1585       _MM_FROUND_CUR_DIRECTION);
1586 }
1587 
1588 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_cvtss_sh(__mmask8 __U,
1589                                                                    __m128h __A,
1590                                                                    __m128 __B) {
1591   return (__m128h)__builtin_ia32_vcvtss2sh_round_mask(
1592       (__v8hf)__A, (__v4sf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
1593       _MM_FROUND_CUR_DIRECTION);
1594 }
1595 
/* Unmasked scalar sd -> sh conversion with rounding argument R.  A (__v8hf)
 * and B (__v2df) are the sources; the third operand is _mm_undefined_ph() and
 * the mask is all ones. */
#define _mm_cvt_roundsd_sh(A, B, R) \
  ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask( \
      (__v8hf)(A), \
      (__v2df)(B), \
      (__v8hf)_mm_undefined_ph(), \
      (__mmask8)(-1), \
      (int)(R)))
1600 
/* Masked scalar sd -> sh conversion with rounding argument R.  W is forwarded
 * as the builtin's third vector operand and U as its mask. */
#define _mm_mask_cvt_roundsd_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask( \
      (__v8hf)(A), \
      (__v2df)(B), \
      (__v8hf)(W), \
      (__mmask8)(U), \
      (int)(R)))
1604 
/* Zero-masked scalar sd -> sh conversion with rounding argument R.  A zero
 * vector is forwarded as the builtin's third operand and U as its mask. */
#define _mm_maskz_cvt_roundsd_sh(U, A, B, R) \
  ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask( \
      (__v8hf)(A), \
      (__v2df)(B), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), \
      (int)(R)))
1609 
1610 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtsd_sh(__m128h __A,
1611                                                              __m128d __B) {
1612   return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask(
1613       (__v8hf)__A, (__v2df)__B, (__v8hf)_mm_undefined_ph(), (__mmask8)-1,
1614       _MM_FROUND_CUR_DIRECTION);
1615 }
1616 
1617 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_cvtsd_sh(__m128h __W,
1618                                                                   __mmask8 __U,
1619                                                                   __m128h __A,
1620                                                                   __m128d __B) {
1621   return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask(
1622       (__v8hf)__A, (__v2df)__B, (__v8hf)__W, (__mmask8)__U,
1623       _MM_FROUND_CUR_DIRECTION);
1624 }
1625 
1626 static __inline__ __m128h __DEFAULT_FN_ATTRS128
1627 _mm_maskz_cvtsd_sh(__mmask8 __U, __m128h __A, __m128d __B) {
1628   return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask(
1629       (__v8hf)__A, (__v2df)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
1630       _MM_FROUND_CUR_DIRECTION);
1631 }
1632 
/* Unmasked scalar sh -> sd conversion with rounding argument R.  A (__v2df)
 * and B (__v8hf) are the sources; the third operand is _mm_undefined_pd() and
 * the mask is all ones. */
#define _mm_cvt_roundsh_sd(A, B, R) \
  ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask( \
      (__v2df)(A), \
      (__v8hf)(B), \
      (__v2df)_mm_undefined_pd(), \
      (__mmask8)(-1), \
      (int)(R)))
1637 
/* Masked scalar sh -> sd conversion with rounding argument R.  W is forwarded
 * as the builtin's third vector operand and U as its mask. */
#define _mm_mask_cvt_roundsh_sd(W, U, A, B, R) \
  ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask( \
      (__v2df)(A), \
      (__v8hf)(B), \
      (__v2df)(W), \
      (__mmask8)(U), \
      (int)(R)))
1641 
/* Zero-masked scalar sh -> sd conversion with rounding argument R.  A zero
 * vector is forwarded as the builtin's third operand and U as its mask. */
#define _mm_maskz_cvt_roundsh_sd(U, A, B, R) \
  ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask( \
      (__v2df)(A), \
      (__v8hf)(B), \
      (__v2df)_mm_setzero_pd(), \
      (__mmask8)(U), \
      (int)(R)))
1646 
1647 static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_cvtsh_sd(__m128d __A,
1648                                                              __m128h __B) {
1649   return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask(
1650       (__v2df)__A, (__v8hf)__B, (__v2df)_mm_undefined_pd(), (__mmask8)-1,
1651       _MM_FROUND_CUR_DIRECTION);
1652 }
1653 
1654 static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_cvtsh_sd(__m128d __W,
1655                                                                   __mmask8 __U,
1656                                                                   __m128d __A,
1657                                                                   __m128h __B) {
1658   return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask(
1659       (__v2df)__A, (__v8hf)__B, (__v2df)__W, (__mmask8)__U,
1660       _MM_FROUND_CUR_DIRECTION);
1661 }
1662 
1663 static __inline__ __m128d __DEFAULT_FN_ATTRS128
1664 _mm_maskz_cvtsh_sd(__mmask8 __U, __m128d __A, __m128h __B) {
1665   return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask(
1666       (__v2df)__A, (__v8hf)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U,
1667       _MM_FROUND_CUR_DIRECTION);
1668 }
1669 
/* Unmasked ph -> epi16 conversion (__v32hf in, __m512i out) with rounding
 * argument R.  The second builtin operand is _mm512_undefined_epi32() viewed
 * as __v32hi and the mask is all ones. */
#define _mm512_cvt_roundph_epi16(A, R) \
  ((__m512i)__builtin_ia32_vcvtph2w512_mask( \
      (__v32hf)(A), \
      (__v32hi)_mm512_undefined_epi32(), \
      (__mmask32)(-1), \
      (int)(R)))
1674 
/* Masked ph -> epi16 conversion (__v32hf in, __m512i out) with rounding
 * argument R.  W is forwarded as the builtin's second operand and U as its
 * mask. */
#define _mm512_mask_cvt_roundph_epi16(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2w512_mask( \
      (__v32hf)(A), \
      (__v32hi)(W), \
      (__mmask32)(U), \
      (int)(R)))
1678 
/* Zero-masked ph -> epi16 conversion (__v32hf in, __m512i out) with rounding
 * argument R.  A zero vector is forwarded as the builtin's second operand and
 * U as its mask. */
#define _mm512_maskz_cvt_roundph_epi16(U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2w512_mask( \
      (__v32hf)(A), \
      (__v32hi)_mm512_setzero_epi32(), \
      (__mmask32)(U), \
      (int)(R)))
1683 
1684 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1685 _mm512_cvtph_epi16(__m512h __A) {
1686   return (__m512i)__builtin_ia32_vcvtph2w512_mask(
1687       (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)-1,
1688       _MM_FROUND_CUR_DIRECTION);
1689 }
1690 
1691 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1692 _mm512_mask_cvtph_epi16(__m512i __W, __mmask32 __U, __m512h __A) {
1693   return (__m512i)__builtin_ia32_vcvtph2w512_mask(
1694       (__v32hf)__A, (__v32hi)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
1695 }
1696 
1697 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1698 _mm512_maskz_cvtph_epi16(__mmask32 __U, __m512h __A) {
1699   return (__m512i)__builtin_ia32_vcvtph2w512_mask(
1700       (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)__U,
1701       _MM_FROUND_CUR_DIRECTION);
1702 }
1703 
/* Unmasked ph -> epi16 conversion through the vcvttph2w512 builtin with
 * final argument R.  The second operand is _mm512_undefined_epi32() viewed as
 * __v32hi and the mask is all ones. */
#define _mm512_cvtt_roundph_epi16(A, R) \
  ((__m512i)__builtin_ia32_vcvttph2w512_mask( \
      (__v32hf)(A), \
      (__v32hi)_mm512_undefined_epi32(), \
      (__mmask32)(-1), \
      (int)(R)))
1708 
/* Masked ph -> epi16 conversion through the vcvttph2w512 builtin with final
 * argument R.  W is forwarded as the second operand and U as the mask. */
#define _mm512_mask_cvtt_roundph_epi16(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvttph2w512_mask( \
      (__v32hf)(A), \
      (__v32hi)(W), \
      (__mmask32)(U), \
      (int)(R)))
1712 
/* Zero-masked ph -> epi16 conversion through the vcvttph2w512 builtin with
 * final argument R.  A zero vector is forwarded as the second operand and U
 * as the mask. */
#define _mm512_maskz_cvtt_roundph_epi16(U, A, R) \
  ((__m512i)__builtin_ia32_vcvttph2w512_mask( \
      (__v32hf)(A), \
      (__v32hi)_mm512_setzero_epi32(), \
      (__mmask32)(U), \
      (int)(R)))
1717 
1718 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1719 _mm512_cvttph_epi16(__m512h __A) {
1720   return (__m512i)__builtin_ia32_vcvttph2w512_mask(
1721       (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)-1,
1722       _MM_FROUND_CUR_DIRECTION);
1723 }
1724 
1725 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1726 _mm512_mask_cvttph_epi16(__m512i __W, __mmask32 __U, __m512h __A) {
1727   return (__m512i)__builtin_ia32_vcvttph2w512_mask(
1728       (__v32hf)__A, (__v32hi)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
1729 }
1730 
1731 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1732 _mm512_maskz_cvttph_epi16(__mmask32 __U, __m512h __A) {
1733   return (__m512i)__builtin_ia32_vcvttph2w512_mask(
1734       (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)__U,
1735       _MM_FROUND_CUR_DIRECTION);
1736 }
1737 
/* Unmasked epi16 -> ph conversion (__v32hi in, __m512h out) with rounding
 * argument R.  The second builtin operand is _mm512_undefined_ph() and the
 * mask is all ones. */
#define _mm512_cvt_roundepi16_ph(A, R) \
  ((__m512h)__builtin_ia32_vcvtw2ph512_mask( \
      (__v32hi)(A), \
      (__v32hf)_mm512_undefined_ph(), \
      (__mmask32)(-1), \
      (int)(R)))
1742 
/* Masked epi16 -> ph conversion (__v32hi in, __m512h out) with rounding
 * argument R.  W is forwarded as the builtin's second operand and U as its
 * mask. */
#define _mm512_mask_cvt_roundepi16_ph(W, U, A, R) \
  ((__m512h)__builtin_ia32_vcvtw2ph512_mask( \
      (__v32hi)(A), \
      (__v32hf)(W), \
      (__mmask32)(U), \
      (int)(R)))
1746 
/* Zero-masked epi16 -> ph conversion (__v32hi in, __m512h out) with rounding
 * argument R.  A zero vector is forwarded as the builtin's second operand and
 * U as its mask. */
#define _mm512_maskz_cvt_roundepi16_ph(U, A, R) \
  ((__m512h)__builtin_ia32_vcvtw2ph512_mask( \
      (__v32hi)(A), \
      (__v32hf)_mm512_setzero_ph(), \
      (__mmask32)(U), \
      (int)(R)))
1750 
1751 static __inline__ __m512h __DEFAULT_FN_ATTRS512
1752 _mm512_cvtepi16_ph(__m512i __A) {
1753   return (__m512h)__builtin_ia32_vcvtw2ph512_mask(
1754       (__v32hi)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)-1,
1755       _MM_FROUND_CUR_DIRECTION);
1756 }
1757 
1758 static __inline__ __m512h __DEFAULT_FN_ATTRS512
1759 _mm512_mask_cvtepi16_ph(__m512h __W, __mmask32 __U, __m512i __A) {
1760   return (__m512h)__builtin_ia32_vcvtw2ph512_mask(
1761       (__v32hi)__A, (__v32hf)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
1762 }
1763 
1764 static __inline__ __m512h __DEFAULT_FN_ATTRS512
1765 _mm512_maskz_cvtepi16_ph(__mmask32 __U, __m512i __A) {
1766   return (__m512h)__builtin_ia32_vcvtw2ph512_mask(
1767       (__v32hi)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
1768       _MM_FROUND_CUR_DIRECTION);
1769 }
1770 
/* Unmasked ph -> epu16 conversion (__v32hf in, __m512i out) with rounding
 * argument R.  The second builtin operand is _mm512_undefined_epi32() viewed
 * as __v32hu and the mask is all ones. */
#define _mm512_cvt_roundph_epu16(A, R) \
  ((__m512i)__builtin_ia32_vcvtph2uw512_mask( \
      (__v32hf)(A), \
      (__v32hu)_mm512_undefined_epi32(), \
      (__mmask32)(-1), \
      (int)(R)))
1775 
/* Masked ph -> epu16 conversion (__v32hf in, __m512i out) with rounding
 * argument R.  W is forwarded as the builtin's second operand and U as its
 * mask. */
#define _mm512_mask_cvt_roundph_epu16(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2uw512_mask( \
      (__v32hf)(A), \
      (__v32hu)(W), \
      (__mmask32)(U), \
      (int)(R)))
1779 
/* Zero-masked ph -> epu16 conversion (__v32hf in, __m512i out) with rounding
 * argument R.  A zero vector is forwarded as the builtin's second operand and
 * U as its mask. */
#define _mm512_maskz_cvt_roundph_epu16(U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2uw512_mask( \
      (__v32hf)(A), \
      (__v32hu)_mm512_setzero_epi32(), \
      (__mmask32)(U), \
      (int)(R)))
1784 
1785 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1786 _mm512_cvtph_epu16(__m512h __A) {
1787   return (__m512i)__builtin_ia32_vcvtph2uw512_mask(
1788       (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)-1,
1789       _MM_FROUND_CUR_DIRECTION);
1790 }
1791 
1792 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1793 _mm512_mask_cvtph_epu16(__m512i __W, __mmask32 __U, __m512h __A) {
1794   return (__m512i)__builtin_ia32_vcvtph2uw512_mask(
1795       (__v32hf)__A, (__v32hu)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
1796 }
1797 
1798 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1799 _mm512_maskz_cvtph_epu16(__mmask32 __U, __m512h __A) {
1800   return (__m512i)__builtin_ia32_vcvtph2uw512_mask(
1801       (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)__U,
1802       _MM_FROUND_CUR_DIRECTION);
1803 }
1804 
/* Unmasked ph -> epu16 conversion through the vcvttph2uw512 builtin with
 * final argument R.  The second operand is _mm512_undefined_epi32() viewed as
 * __v32hu and the mask is all ones. */
#define _mm512_cvtt_roundph_epu16(A, R) \
  ((__m512i)__builtin_ia32_vcvttph2uw512_mask( \
      (__v32hf)(A), \
      (__v32hu)_mm512_undefined_epi32(), \
      (__mmask32)(-1), \
      (int)(R)))
1809 
/* Masked ph -> epu16 conversion through the vcvttph2uw512 builtin with final
 * argument R.  W is forwarded as the second operand and U as the mask. */
#define _mm512_mask_cvtt_roundph_epu16(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvttph2uw512_mask( \
      (__v32hf)(A), \
      (__v32hu)(W), \
      (__mmask32)(U), \
      (int)(R)))
1813 
/* Zero-masked ph -> epu16 conversion through the vcvttph2uw512 builtin with
 * final argument R.  A zero vector is forwarded as the second operand and U
 * as the mask. */
#define _mm512_maskz_cvtt_roundph_epu16(U, A, R) \
  ((__m512i)__builtin_ia32_vcvttph2uw512_mask( \
      (__v32hf)(A), \
      (__v32hu)_mm512_setzero_epi32(), \
      (__mmask32)(U), \
      (int)(R)))
1818 
1819 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1820 _mm512_cvttph_epu16(__m512h __A) {
1821   return (__m512i)__builtin_ia32_vcvttph2uw512_mask(
1822       (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)-1,
1823       _MM_FROUND_CUR_DIRECTION);
1824 }
1825 
1826 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1827 _mm512_mask_cvttph_epu16(__m512i __W, __mmask32 __U, __m512h __A) {
1828   return (__m512i)__builtin_ia32_vcvttph2uw512_mask(
1829       (__v32hf)__A, (__v32hu)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
1830 }
1831 
1832 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1833 _mm512_maskz_cvttph_epu16(__mmask32 __U, __m512h __A) {
1834   return (__m512i)__builtin_ia32_vcvttph2uw512_mask(
1835       (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)__U,
1836       _MM_FROUND_CUR_DIRECTION);
1837 }
1838 
/* Unmasked epu16 -> ph conversion (__v32hu in, __m512h out) with rounding
 * argument R.  The second builtin operand is _mm512_undefined_ph() and the
 * mask is all ones. */
#define _mm512_cvt_roundepu16_ph(A, R) \
  ((__m512h)__builtin_ia32_vcvtuw2ph512_mask( \
      (__v32hu)(A), \
      (__v32hf)_mm512_undefined_ph(), \
      (__mmask32)(-1), \
      (int)(R)))
1843 
/* Masked epu16 -> ph conversion (__v32hu in, __m512h out) with rounding
 * argument R.  W is forwarded as the builtin's second operand and U as its
 * mask. */
#define _mm512_mask_cvt_roundepu16_ph(W, U, A, R) \
  ((__m512h)__builtin_ia32_vcvtuw2ph512_mask( \
      (__v32hu)(A), \
      (__v32hf)(W), \
      (__mmask32)(U), \
      (int)(R)))
1847 
1848 #define _mm512_maskz_cvt_roundepu16_ph(U, A, R)                                \
1849   ((__m512h)__builtin_ia32_vcvtuw2ph512_mask(                                  \
1850       (__v32hu)(A), (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))
1851 
1852 static __inline__ __m512h __DEFAULT_FN_ATTRS512
1853 _mm512_cvtepu16_ph(__m512i __A) {
1854   return (__m512h)__builtin_ia32_vcvtuw2ph512_mask(
1855       (__v32hu)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)-1,
1856       _MM_FROUND_CUR_DIRECTION);
1857 }
1858 
1859 static __inline__ __m512h __DEFAULT_FN_ATTRS512
1860 _mm512_mask_cvtepu16_ph(__m512h __W, __mmask32 __U, __m512i __A) {
1861   return (__m512h)__builtin_ia32_vcvtuw2ph512_mask(
1862       (__v32hu)__A, (__v32hf)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
1863 }
1864 
1865 static __inline__ __m512h __DEFAULT_FN_ATTRS512
1866 _mm512_maskz_cvtepu16_ph(__mmask32 __U, __m512i __A) {
1867   return (__m512h)__builtin_ia32_vcvtuw2ph512_mask(
1868       (__v32hu)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
1869       _MM_FROUND_CUR_DIRECTION);
1870 }
1871 
/* Macros built on __builtin_ia32_vcvtph2dq512_mask with an explicit R operand.
 * A is cast to __v16hf. The plain form passes _mm512_undefined_epi32() and an
 * all-ones mask, the "mask" form passes W and U, and the "maskz" form passes
 * _mm512_setzero_epi32() and U. */
#define _mm512_cvt_roundph_epi32(A, R) \
  ((__m512i)__builtin_ia32_vcvtph2dq512_mask( \
      (__v16hf)(A), \
      (__v16si)_mm512_undefined_epi32(), \
      (__mmask16)(-1), \
      (int)(R)))

#define _mm512_mask_cvt_roundph_epi32(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2dq512_mask( \
      (__v16hf)(A), \
      (__v16si)(W), \
      (__mmask16)(U), \
      (int)(R)))

#define _mm512_maskz_cvt_roundph_epi32(U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2dq512_mask( \
      (__v16hf)(A), \
      (__v16si)_mm512_setzero_epi32(), \
      (__mmask16)(U), \
      (int)(R)))
1885 
1886 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1887 _mm512_cvtph_epi32(__m256h __A) {
1888   return (__m512i)__builtin_ia32_vcvtph2dq512_mask(
1889       (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)-1,
1890       _MM_FROUND_CUR_DIRECTION);
1891 }
1892 
1893 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1894 _mm512_mask_cvtph_epi32(__m512i __W, __mmask16 __U, __m256h __A) {
1895   return (__m512i)__builtin_ia32_vcvtph2dq512_mask(
1896       (__v16hf)__A, (__v16si)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
1897 }
1898 
1899 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1900 _mm512_maskz_cvtph_epi32(__mmask16 __U, __m256h __A) {
1901   return (__m512i)__builtin_ia32_vcvtph2dq512_mask(
1902       (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)__U,
1903       _MM_FROUND_CUR_DIRECTION);
1904 }
1905 
/* Macros built on __builtin_ia32_vcvtph2udq512_mask with an explicit R
 * operand. A is cast to __v16hf. The plain form passes
 * _mm512_undefined_epi32() and an all-ones mask, the "mask" form passes W and
 * U, and the "maskz" form passes _mm512_setzero_epi32() and U. */
#define _mm512_cvt_roundph_epu32(A, R) \
  ((__m512i)__builtin_ia32_vcvtph2udq512_mask( \
      (__v16hf)(A), \
      (__v16su)_mm512_undefined_epi32(), \
      (__mmask16)(-1), \
      (int)(R)))

#define _mm512_mask_cvt_roundph_epu32(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2udq512_mask( \
      (__v16hf)(A), \
      (__v16su)(W), \
      (__mmask16)(U), \
      (int)(R)))

#define _mm512_maskz_cvt_roundph_epu32(U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2udq512_mask( \
      (__v16hf)(A), \
      (__v16su)_mm512_setzero_epi32(), \
      (__mmask16)(U), \
      (int)(R)))
1919 
1920 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1921 _mm512_cvtph_epu32(__m256h __A) {
1922   return (__m512i)__builtin_ia32_vcvtph2udq512_mask(
1923       (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)-1,
1924       _MM_FROUND_CUR_DIRECTION);
1925 }
1926 
1927 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1928 _mm512_mask_cvtph_epu32(__m512i __W, __mmask16 __U, __m256h __A) {
1929   return (__m512i)__builtin_ia32_vcvtph2udq512_mask(
1930       (__v16hf)__A, (__v16su)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
1931 }
1932 
1933 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1934 _mm512_maskz_cvtph_epu32(__mmask16 __U, __m256h __A) {
1935   return (__m512i)__builtin_ia32_vcvtph2udq512_mask(
1936       (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)__U,
1937       _MM_FROUND_CUR_DIRECTION);
1938 }
1939 
/* Macros built on __builtin_ia32_vcvtdq2ph512_mask with an explicit R operand.
 * A is cast to __v16si and the result to __m256h. The plain form passes
 * _mm256_undefined_ph() and an all-ones mask, the "mask" form passes W and U,
 * and the "maskz" form passes _mm256_setzero_ph() and U. */
#define _mm512_cvt_roundepi32_ph(A, R) \
  ((__m256h)__builtin_ia32_vcvtdq2ph512_mask( \
      (__v16si)(A), \
      (__v16hf)_mm256_undefined_ph(), \
      (__mmask16)(-1), \
      (int)(R)))

#define _mm512_mask_cvt_roundepi32_ph(W, U, A, R) \
  ((__m256h)__builtin_ia32_vcvtdq2ph512_mask( \
      (__v16si)(A), \
      (__v16hf)(W), \
      (__mmask16)(U), \
      (int)(R)))

#define _mm512_maskz_cvt_roundepi32_ph(U, A, R) \
  ((__m256h)__builtin_ia32_vcvtdq2ph512_mask( \
      (__v16si)(A), \
      (__v16hf)_mm256_setzero_ph(), \
      (__mmask16)(U), \
      (int)(R)))
1952 
1953 static __inline__ __m256h __DEFAULT_FN_ATTRS512
1954 _mm512_cvtepi32_ph(__m512i __A) {
1955   return (__m256h)__builtin_ia32_vcvtdq2ph512_mask(
1956       (__v16si)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1,
1957       _MM_FROUND_CUR_DIRECTION);
1958 }
1959 
1960 static __inline__ __m256h __DEFAULT_FN_ATTRS512
1961 _mm512_mask_cvtepi32_ph(__m256h __W, __mmask16 __U, __m512i __A) {
1962   return (__m256h)__builtin_ia32_vcvtdq2ph512_mask(
1963       (__v16si)__A, (__v16hf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
1964 }
1965 
1966 static __inline__ __m256h __DEFAULT_FN_ATTRS512
1967 _mm512_maskz_cvtepi32_ph(__mmask16 __U, __m512i __A) {
1968   return (__m256h)__builtin_ia32_vcvtdq2ph512_mask(
1969       (__v16si)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U,
1970       _MM_FROUND_CUR_DIRECTION);
1971 }
1972 
/* Macros built on __builtin_ia32_vcvtudq2ph512_mask with an explicit R
 * operand. A is cast to __v16su and the result to __m256h. The plain form
 * passes _mm256_undefined_ph() and an all-ones mask, the "mask" form passes W
 * and U, and the "maskz" form passes _mm256_setzero_ph() and U. */
#define _mm512_cvt_roundepu32_ph(A, R) \
  ((__m256h)__builtin_ia32_vcvtudq2ph512_mask( \
      (__v16su)(A), \
      (__v16hf)_mm256_undefined_ph(), \
      (__mmask16)(-1), \
      (int)(R)))

#define _mm512_mask_cvt_roundepu32_ph(W, U, A, R) \
  ((__m256h)__builtin_ia32_vcvtudq2ph512_mask( \
      (__v16su)(A), \
      (__v16hf)(W), \
      (__mmask16)(U), \
      (int)(R)))

#define _mm512_maskz_cvt_roundepu32_ph(U, A, R) \
  ((__m256h)__builtin_ia32_vcvtudq2ph512_mask( \
      (__v16su)(A), \
      (__v16hf)_mm256_setzero_ph(), \
      (__mmask16)(U), \
      (int)(R)))
1985 
1986 static __inline__ __m256h __DEFAULT_FN_ATTRS512
1987 _mm512_cvtepu32_ph(__m512i __A) {
1988   return (__m256h)__builtin_ia32_vcvtudq2ph512_mask(
1989       (__v16su)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1,
1990       _MM_FROUND_CUR_DIRECTION);
1991 }
1992 
1993 static __inline__ __m256h __DEFAULT_FN_ATTRS512
1994 _mm512_mask_cvtepu32_ph(__m256h __W, __mmask16 __U, __m512i __A) {
1995   return (__m256h)__builtin_ia32_vcvtudq2ph512_mask(
1996       (__v16su)__A, (__v16hf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
1997 }
1998 
1999 static __inline__ __m256h __DEFAULT_FN_ATTRS512
2000 _mm512_maskz_cvtepu32_ph(__mmask16 __U, __m512i __A) {
2001   return (__m256h)__builtin_ia32_vcvtudq2ph512_mask(
2002       (__v16su)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U,
2003       _MM_FROUND_CUR_DIRECTION);
2004 }
2005 
/* Macros built on __builtin_ia32_vcvttph2dq512_mask with an explicit R
 * operand. A is cast to __v16hf. The plain form passes
 * _mm512_undefined_epi32() and an all-ones mask, the "mask" form passes W and
 * U, and the "maskz" form passes _mm512_setzero_epi32() and U. */
#define _mm512_cvtt_roundph_epi32(A, R) \
  ((__m512i)__builtin_ia32_vcvttph2dq512_mask( \
      (__v16hf)(A), \
      (__v16si)_mm512_undefined_epi32(), \
      (__mmask16)(-1), \
      (int)(R)))

#define _mm512_mask_cvtt_roundph_epi32(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvttph2dq512_mask( \
      (__v16hf)(A), \
      (__v16si)(W), \
      (__mmask16)(U), \
      (int)(R)))

#define _mm512_maskz_cvtt_roundph_epi32(U, A, R) \
  ((__m512i)__builtin_ia32_vcvttph2dq512_mask( \
      (__v16hf)(A), \
      (__v16si)_mm512_setzero_epi32(), \
      (__mmask16)(U), \
      (int)(R)))
2019 
2020 static __inline__ __m512i __DEFAULT_FN_ATTRS512
2021 _mm512_cvttph_epi32(__m256h __A) {
2022   return (__m512i)__builtin_ia32_vcvttph2dq512_mask(
2023       (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)-1,
2024       _MM_FROUND_CUR_DIRECTION);
2025 }
2026 
2027 static __inline__ __m512i __DEFAULT_FN_ATTRS512
2028 _mm512_mask_cvttph_epi32(__m512i __W, __mmask16 __U, __m256h __A) {
2029   return (__m512i)__builtin_ia32_vcvttph2dq512_mask(
2030       (__v16hf)__A, (__v16si)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
2031 }
2032 
2033 static __inline__ __m512i __DEFAULT_FN_ATTRS512
2034 _mm512_maskz_cvttph_epi32(__mmask16 __U, __m256h __A) {
2035   return (__m512i)__builtin_ia32_vcvttph2dq512_mask(
2036       (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)__U,
2037       _MM_FROUND_CUR_DIRECTION);
2038 }
2039 
/* Macros built on __builtin_ia32_vcvttph2udq512_mask with an explicit R
 * operand. A is cast to __v16hf. The plain form passes
 * _mm512_undefined_epi32() and an all-ones mask, the "mask" form passes W and
 * U, and the "maskz" form passes _mm512_setzero_epi32() and U. */
#define _mm512_cvtt_roundph_epu32(A, R) \
  ((__m512i)__builtin_ia32_vcvttph2udq512_mask( \
      (__v16hf)(A), \
      (__v16su)_mm512_undefined_epi32(), \
      (__mmask16)(-1), \
      (int)(R)))

#define _mm512_mask_cvtt_roundph_epu32(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvttph2udq512_mask( \
      (__v16hf)(A), \
      (__v16su)(W), \
      (__mmask16)(U), \
      (int)(R)))

#define _mm512_maskz_cvtt_roundph_epu32(U, A, R) \
  ((__m512i)__builtin_ia32_vcvttph2udq512_mask( \
      (__v16hf)(A), \
      (__v16su)_mm512_setzero_epi32(), \
      (__mmask16)(U), \
      (int)(R)))
2053 
2054 static __inline__ __m512i __DEFAULT_FN_ATTRS512
2055 _mm512_cvttph_epu32(__m256h __A) {
2056   return (__m512i)__builtin_ia32_vcvttph2udq512_mask(
2057       (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)-1,
2058       _MM_FROUND_CUR_DIRECTION);
2059 }
2060 
2061 static __inline__ __m512i __DEFAULT_FN_ATTRS512
2062 _mm512_mask_cvttph_epu32(__m512i __W, __mmask16 __U, __m256h __A) {
2063   return (__m512i)__builtin_ia32_vcvttph2udq512_mask(
2064       (__v16hf)__A, (__v16su)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
2065 }
2066 
2067 static __inline__ __m512i __DEFAULT_FN_ATTRS512
2068 _mm512_maskz_cvttph_epu32(__mmask16 __U, __m256h __A) {
2069   return (__m512i)__builtin_ia32_vcvttph2udq512_mask(
2070       (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)__U,
2071       _MM_FROUND_CUR_DIRECTION);
2072 }
2073 
/* Macros built on __builtin_ia32_vcvtqq2ph512_mask with an explicit R operand.
 * A is cast to __v8di and the result to __m128h. The plain form passes
 * _mm_undefined_ph() and an all-ones mask, the "mask" form passes W and U, and
 * the "maskz" form passes _mm_setzero_ph() and U. */
#define _mm512_cvt_roundepi64_ph(A, R) \
  ((__m128h)__builtin_ia32_vcvtqq2ph512_mask( \
      (__v8di)(A), \
      (__v8hf)_mm_undefined_ph(), \
      (__mmask8)(-1), \
      (int)(R)))

#define _mm512_mask_cvt_roundepi64_ph(W, U, A, R) \
  ((__m128h)__builtin_ia32_vcvtqq2ph512_mask( \
      (__v8di)(A), \
      (__v8hf)(W), \
      (__mmask8)(U), \
      (int)(R)))

#define _mm512_maskz_cvt_roundepi64_ph(U, A, R) \
  ((__m128h)__builtin_ia32_vcvtqq2ph512_mask( \
      (__v8di)(A), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), \
      (int)(R)))
2085 
2086 static __inline__ __m128h __DEFAULT_FN_ATTRS512
2087 _mm512_cvtepi64_ph(__m512i __A) {
2088   return (__m128h)__builtin_ia32_vcvtqq2ph512_mask(
2089       (__v8di)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
2090       _MM_FROUND_CUR_DIRECTION);
2091 }
2092 
2093 static __inline__ __m128h __DEFAULT_FN_ATTRS512
2094 _mm512_mask_cvtepi64_ph(__m128h __W, __mmask8 __U, __m512i __A) {
2095   return (__m128h)__builtin_ia32_vcvtqq2ph512_mask(
2096       (__v8di)__A, (__v8hf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2097 }
2098 
2099 static __inline__ __m128h __DEFAULT_FN_ATTRS512
2100 _mm512_maskz_cvtepi64_ph(__mmask8 __U, __m512i __A) {
2101   return (__m128h)__builtin_ia32_vcvtqq2ph512_mask(
2102       (__v8di)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
2103       _MM_FROUND_CUR_DIRECTION);
2104 }
2105 
/* Macros built on __builtin_ia32_vcvtph2qq512_mask with an explicit R operand.
 * A is cast to __v8hf. The plain form passes _mm512_undefined_epi32() and an
 * all-ones mask, the "mask" form passes W and U, and the "maskz" form passes
 * _mm512_setzero_epi32() and U. */
#define _mm512_cvt_roundph_epi64(A, R) \
  ((__m512i)__builtin_ia32_vcvtph2qq512_mask( \
      (__v8hf)(A), \
      (__v8di)_mm512_undefined_epi32(), \
      (__mmask8)(-1), \
      (int)(R)))

#define _mm512_mask_cvt_roundph_epi64(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2qq512_mask( \
      (__v8hf)(A), \
      (__v8di)(W), \
      (__mmask8)(U), \
      (int)(R)))

#define _mm512_maskz_cvt_roundph_epi64(U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2qq512_mask( \
      (__v8hf)(A), \
      (__v8di)_mm512_setzero_epi32(), \
      (__mmask8)(U), \
      (int)(R)))
2118 
2119 static __inline__ __m512i __DEFAULT_FN_ATTRS512
2120 _mm512_cvtph_epi64(__m128h __A) {
2121   return (__m512i)__builtin_ia32_vcvtph2qq512_mask(
2122       (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)-1,
2123       _MM_FROUND_CUR_DIRECTION);
2124 }
2125 
2126 static __inline__ __m512i __DEFAULT_FN_ATTRS512
2127 _mm512_mask_cvtph_epi64(__m512i __W, __mmask8 __U, __m128h __A) {
2128   return (__m512i)__builtin_ia32_vcvtph2qq512_mask(
2129       (__v8hf)__A, (__v8di)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2130 }
2131 
2132 static __inline__ __m512i __DEFAULT_FN_ATTRS512
2133 _mm512_maskz_cvtph_epi64(__mmask8 __U, __m128h __A) {
2134   return (__m512i)__builtin_ia32_vcvtph2qq512_mask(
2135       (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)__U,
2136       _MM_FROUND_CUR_DIRECTION);
2137 }
2138 
/* Macros built on __builtin_ia32_vcvtuqq2ph512_mask with an explicit R
 * operand. A is cast to __v8du and the result to __m128h. The plain form
 * passes _mm_undefined_ph() and an all-ones mask, the "mask" form passes W and
 * U, and the "maskz" form passes _mm_setzero_ph() and U. */
#define _mm512_cvt_roundepu64_ph(A, R) \
  ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask( \
      (__v8du)(A), \
      (__v8hf)_mm_undefined_ph(), \
      (__mmask8)(-1), \
      (int)(R)))

#define _mm512_mask_cvt_roundepu64_ph(W, U, A, R) \
  ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask( \
      (__v8du)(A), \
      (__v8hf)(W), \
      (__mmask8)(U), \
      (int)(R)))

#define _mm512_maskz_cvt_roundepu64_ph(U, A, R) \
  ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask( \
      (__v8du)(A), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), \
      (int)(R)))
2150 
2151 static __inline__ __m128h __DEFAULT_FN_ATTRS512
2152 _mm512_cvtepu64_ph(__m512i __A) {
2153   return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask(
2154       (__v8du)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
2155       _MM_FROUND_CUR_DIRECTION);
2156 }
2157 
2158 static __inline__ __m128h __DEFAULT_FN_ATTRS512
2159 _mm512_mask_cvtepu64_ph(__m128h __W, __mmask8 __U, __m512i __A) {
2160   return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask(
2161       (__v8du)__A, (__v8hf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2162 }
2163 
2164 static __inline__ __m128h __DEFAULT_FN_ATTRS512
2165 _mm512_maskz_cvtepu64_ph(__mmask8 __U, __m512i __A) {
2166   return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask(
2167       (__v8du)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
2168       _MM_FROUND_CUR_DIRECTION);
2169 }
2170 
/* Macros built on __builtin_ia32_vcvtph2uqq512_mask with an explicit R
 * operand. A is cast to __v8hf. The plain form passes
 * _mm512_undefined_epi32() and an all-ones mask, the "mask" form passes W and
 * U, and the "maskz" form passes _mm512_setzero_epi32() and U. */
#define _mm512_cvt_roundph_epu64(A, R) \
  ((__m512i)__builtin_ia32_vcvtph2uqq512_mask( \
      (__v8hf)(A), \
      (__v8du)_mm512_undefined_epi32(), \
      (__mmask8)(-1), \
      (int)(R)))

#define _mm512_mask_cvt_roundph_epu64(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2uqq512_mask( \
      (__v8hf)(A), \
      (__v8du)(W), \
      (__mmask8)(U), \
      (int)(R)))

#define _mm512_maskz_cvt_roundph_epu64(U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2uqq512_mask( \
      (__v8hf)(A), \
      (__v8du)_mm512_setzero_epi32(), \
      (__mmask8)(U), \
      (int)(R)))
2183 
2184 static __inline__ __m512i __DEFAULT_FN_ATTRS512
2185 _mm512_cvtph_epu64(__m128h __A) {
2186   return (__m512i)__builtin_ia32_vcvtph2uqq512_mask(
2187       (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)-1,
2188       _MM_FROUND_CUR_DIRECTION);
2189 }
2190 
2191 static __inline__ __m512i __DEFAULT_FN_ATTRS512
2192 _mm512_mask_cvtph_epu64(__m512i __W, __mmask8 __U, __m128h __A) {
2193   return (__m512i)__builtin_ia32_vcvtph2uqq512_mask(
2194       (__v8hf)__A, (__v8du)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2195 }
2196 
2197 static __inline__ __m512i __DEFAULT_FN_ATTRS512
2198 _mm512_maskz_cvtph_epu64(__mmask8 __U, __m128h __A) {
2199   return (__m512i)__builtin_ia32_vcvtph2uqq512_mask(
2200       (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)__U,
2201       _MM_FROUND_CUR_DIRECTION);
2202 }
2203 
/* Macros built on __builtin_ia32_vcvttph2qq512_mask with an explicit R
 * operand. A is cast to __v8hf. The plain form passes
 * _mm512_undefined_epi32() and an all-ones mask, the "mask" form passes W and
 * U, and the "maskz" form passes _mm512_setzero_epi32() and U. */
#define _mm512_cvtt_roundph_epi64(A, R) \
  ((__m512i)__builtin_ia32_vcvttph2qq512_mask( \
      (__v8hf)(A), \
      (__v8di)_mm512_undefined_epi32(), \
      (__mmask8)(-1), \
      (int)(R)))

#define _mm512_mask_cvtt_roundph_epi64(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvttph2qq512_mask( \
      (__v8hf)(A), \
      (__v8di)(W), \
      (__mmask8)(U), \
      (int)(R)))

#define _mm512_maskz_cvtt_roundph_epi64(U, A, R) \
  ((__m512i)__builtin_ia32_vcvttph2qq512_mask( \
      (__v8hf)(A), \
      (__v8di)_mm512_setzero_epi32(), \
      (__mmask8)(U), \
      (int)(R)))
2216 
2217 static __inline__ __m512i __DEFAULT_FN_ATTRS512
2218 _mm512_cvttph_epi64(__m128h __A) {
2219   return (__m512i)__builtin_ia32_vcvttph2qq512_mask(
2220       (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)-1,
2221       _MM_FROUND_CUR_DIRECTION);
2222 }
2223 
2224 static __inline__ __m512i __DEFAULT_FN_ATTRS512
2225 _mm512_mask_cvttph_epi64(__m512i __W, __mmask8 __U, __m128h __A) {
2226   return (__m512i)__builtin_ia32_vcvttph2qq512_mask(
2227       (__v8hf)__A, (__v8di)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2228 }
2229 
2230 static __inline__ __m512i __DEFAULT_FN_ATTRS512
2231 _mm512_maskz_cvttph_epi64(__mmask8 __U, __m128h __A) {
2232   return (__m512i)__builtin_ia32_vcvttph2qq512_mask(
2233       (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)__U,
2234       _MM_FROUND_CUR_DIRECTION);
2235 }
2236 
/* Macros built on __builtin_ia32_vcvttph2uqq512_mask with an explicit R
 * operand. A is cast to __v8hf. The plain form passes
 * _mm512_undefined_epi32() and an all-ones mask, the "mask" form passes W and
 * U, and the "maskz" form passes _mm512_setzero_epi32() and U. */
#define _mm512_cvtt_roundph_epu64(A, R) \
  ((__m512i)__builtin_ia32_vcvttph2uqq512_mask( \
      (__v8hf)(A), \
      (__v8du)_mm512_undefined_epi32(), \
      (__mmask8)(-1), \
      (int)(R)))

#define _mm512_mask_cvtt_roundph_epu64(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvttph2uqq512_mask( \
      (__v8hf)(A), \
      (__v8du)(W), \
      (__mmask8)(U), \
      (int)(R)))

#define _mm512_maskz_cvtt_roundph_epu64(U, A, R) \
  ((__m512i)__builtin_ia32_vcvttph2uqq512_mask( \
      (__v8hf)(A), \
      (__v8du)_mm512_setzero_epi32(), \
      (__mmask8)(U), \
      (int)(R)))
2249 
2250 static __inline__ __m512i __DEFAULT_FN_ATTRS512
2251 _mm512_cvttph_epu64(__m128h __A) {
2252   return (__m512i)__builtin_ia32_vcvttph2uqq512_mask(
2253       (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)-1,
2254       _MM_FROUND_CUR_DIRECTION);
2255 }
2256 
2257 static __inline__ __m512i __DEFAULT_FN_ATTRS512
2258 _mm512_mask_cvttph_epu64(__m512i __W, __mmask8 __U, __m128h __A) {
2259   return (__m512i)__builtin_ia32_vcvttph2uqq512_mask(
2260       (__v8hf)__A, (__v8du)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2261 }
2262 
2263 static __inline__ __m512i __DEFAULT_FN_ATTRS512
2264 _mm512_maskz_cvttph_epu64(__mmask8 __U, __m128h __A) {
2265   return (__m512i)__builtin_ia32_vcvttph2uqq512_mask(
2266       (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)__U,
2267       _MM_FROUND_CUR_DIRECTION);
2268 }
2269 
/* Expands to __builtin_ia32_vcvtsh2si32 applied to A (cast to __v8hf) with R
 * as the int second operand; the result is cast to int. */
#define _mm_cvt_roundsh_i32(A, R) \
  ((int)__builtin_ia32_vcvtsh2si32( \
      (__v8hf)(A), \
      (int)(R)))
2272 
2273 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_cvtsh_i32(__m128h __A) {
2274   return (int)__builtin_ia32_vcvtsh2si32((__v8hf)__A, _MM_FROUND_CUR_DIRECTION);
2275 }
2276 
/* Expands to __builtin_ia32_vcvtsh2usi32 applied to A (cast to __v8hf) with R
 * as the int second operand; the result is cast to unsigned int. */
#define _mm_cvt_roundsh_u32(A, R) \
  ((unsigned int)__builtin_ia32_vcvtsh2usi32( \
      (__v8hf)(A), \
      (int)(R)))
2279 
2280 static __inline__ unsigned int __DEFAULT_FN_ATTRS128
2281 _mm_cvtsh_u32(__m128h __A) {
2282   return (unsigned int)__builtin_ia32_vcvtsh2usi32((__v8hf)__A,
2283                                                    _MM_FROUND_CUR_DIRECTION);
2284 }
2285 
2286 #ifdef __x86_64__
2287 #define _mm_cvt_roundsh_i64(A, R)                                              \
2288   ((long long)__builtin_ia32_vcvtsh2si64((__v8hf)(A), (int)(R)))
2289 
2290 static __inline__ long long __DEFAULT_FN_ATTRS128 _mm_cvtsh_i64(__m128h __A) {
2291   return (long long)__builtin_ia32_vcvtsh2si64((__v8hf)__A,
2292                                                _MM_FROUND_CUR_DIRECTION);
2293 }
2294 
2295 #define _mm_cvt_roundsh_u64(A, R)                                              \
2296   ((unsigned long long)__builtin_ia32_vcvtsh2usi64((__v8hf)(A), (int)(R)))
2297 
2298 static __inline__ unsigned long long __DEFAULT_FN_ATTRS128
2299 _mm_cvtsh_u64(__m128h __A) {
2300   return (unsigned long long)__builtin_ia32_vcvtsh2usi64(
2301       (__v8hf)__A, _MM_FROUND_CUR_DIRECTION);
2302 }
2303 #endif // __x86_64__
2304 
/* Expands to __builtin_ia32_vcvtusi2sh with A cast to __v8hf, B cast to
 * unsigned int and R cast to int; the result is cast to __m128h. */
#define _mm_cvt_roundu32_sh(A, B, R) \
  ((__m128h)__builtin_ia32_vcvtusi2sh( \
      (__v8hf)(A), \
      (unsigned int)(B), \
      (int)(R)))
2307 
2308 static __inline__ __m128h __DEFAULT_FN_ATTRS128
2309 _mm_cvtu32_sh(__m128h __A, unsigned int __B) {
2310   __A[0] = __B;
2311   return __A;
2312 }
2313 
2314 #ifdef __x86_64__
2315 #define _mm_cvt_roundu64_sh(A, B, R)                                           \
2316   ((__m128h)__builtin_ia32_vcvtusi642sh((__v8hf)(A), (unsigned long long)(B),  \
2317                                         (int)(R)))
2318 
2319 static __inline__ __m128h __DEFAULT_FN_ATTRS128
2320 _mm_cvtu64_sh(__m128h __A, unsigned long long __B) {
2321   __A[0] = __B;
2322   return __A;
2323 }
2324 #endif
2325 
/* Expands to __builtin_ia32_vcvtsi2sh with A cast to __v8hf and both B and R
 * cast to int; the result is cast to __m128h. */
#define _mm_cvt_roundi32_sh(A, B, R) \
  ((__m128h)__builtin_ia32_vcvtsi2sh( \
      (__v8hf)(A), \
      (int)(B), \
      (int)(R)))
2328 
2329 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvti32_sh(__m128h __A,
2330                                                               int __B) {
2331   __A[0] = __B;
2332   return __A;
2333 }
2334 
2335 #ifdef __x86_64__
2336 #define _mm_cvt_roundi64_sh(A, B, R)                                           \
2337   ((__m128h)__builtin_ia32_vcvtsi642sh((__v8hf)(A), (long long)(B), (int)(R)))
2338 
2339 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvti64_sh(__m128h __A,
2340                                                               long long __B) {
2341   __A[0] = __B;
2342   return __A;
2343 }
2344 #endif
2345 
/* Expands to __builtin_ia32_vcvttsh2si32 applied to A (cast to __v8hf) with R
 * as the int second operand; the result is cast to int. */
#define _mm_cvtt_roundsh_i32(A, R) \
  ((int)__builtin_ia32_vcvttsh2si32( \
      (__v8hf)(A), \
      (int)(R)))
2348 
2349 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_cvttsh_i32(__m128h __A) {
2350   return (int)__builtin_ia32_vcvttsh2si32((__v8hf)__A,
2351                                           _MM_FROUND_CUR_DIRECTION);
2352 }
2353 
2354 #ifdef __x86_64__
2355 #define _mm_cvtt_roundsh_i64(A, R)                                             \
2356   ((long long)__builtin_ia32_vcvttsh2si64((__v8hf)(A), (int)(R)))
2357 
2358 static __inline__ long long __DEFAULT_FN_ATTRS128 _mm_cvttsh_i64(__m128h __A) {
2359   return (long long)__builtin_ia32_vcvttsh2si64((__v8hf)__A,
2360                                                 _MM_FROUND_CUR_DIRECTION);
2361 }
2362 #endif
2363 
/* Expands to __builtin_ia32_vcvttsh2usi32 applied to A (cast to __v8hf) with R
 * as the int second operand; the result is cast to unsigned int. */
#define _mm_cvtt_roundsh_u32(A, R) \
  ((unsigned int)__builtin_ia32_vcvttsh2usi32( \
      (__v8hf)(A), \
      (int)(R)))
2366 
2367 static __inline__ unsigned int __DEFAULT_FN_ATTRS128
2368 _mm_cvttsh_u32(__m128h __A) {
2369   return (unsigned int)__builtin_ia32_vcvttsh2usi32((__v8hf)__A,
2370                                                     _MM_FROUND_CUR_DIRECTION);
2371 }
2372 
2373 #ifdef __x86_64__
2374 #define _mm_cvtt_roundsh_u64(A, R)                                             \
2375   ((unsigned long long)__builtin_ia32_vcvttsh2usi64((__v8hf)(A), (int)(R)))
2376 
2377 static __inline__ unsigned long long __DEFAULT_FN_ATTRS128
2378 _mm_cvttsh_u64(__m128h __A) {
2379   return (unsigned long long)__builtin_ia32_vcvttsh2usi64(
2380       (__v8hf)__A, _MM_FROUND_CUR_DIRECTION);
2381 }
2382 #endif
2383 
/* Macros built on __builtin_ia32_vcvtph2psx512_mask with an explicit R
 * operand. A is cast to __v16hf and the result to __m512. The plain form
 * passes _mm512_undefined_ps() and an all-ones mask, the "mask" form passes W
 * and U, and the "maskz" form passes _mm512_setzero_ps() and U. */
#define _mm512_cvtx_roundph_ps(A, R) \
  ((__m512)__builtin_ia32_vcvtph2psx512_mask( \
      (__v16hf)(A), \
      (__v16sf)_mm512_undefined_ps(), \
      (__mmask16)(-1), \
      (int)(R)))

#define _mm512_mask_cvtx_roundph_ps(W, U, A, R) \
  ((__m512)__builtin_ia32_vcvtph2psx512_mask( \
      (__v16hf)(A), \
      (__v16sf)(W), \
      (__mmask16)(U), \
      (int)(R)))

#define _mm512_maskz_cvtx_roundph_ps(U, A, R) \
  ((__m512)__builtin_ia32_vcvtph2psx512_mask( \
      (__v16hf)(A), \
      (__v16sf)_mm512_setzero_ps(), \
      (__mmask16)(U), \
      (int)(R)))
2396 
2397 static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_cvtxph_ps(__m256h __A) {
2398   return (__m512)__builtin_ia32_vcvtph2psx512_mask(
2399       (__v16hf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)-1,
2400       _MM_FROUND_CUR_DIRECTION);
2401 }
2402 
2403 static __inline__ __m512 __DEFAULT_FN_ATTRS512
2404 _mm512_mask_cvtxph_ps(__m512 __W, __mmask16 __U, __m256h __A) {
2405   return (__m512)__builtin_ia32_vcvtph2psx512_mask(
2406       (__v16hf)__A, (__v16sf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
2407 }
2408 
2409 static __inline__ __m512 __DEFAULT_FN_ATTRS512
2410 _mm512_maskz_cvtxph_ps(__mmask16 __U, __m256h __A) {
2411   return (__m512)__builtin_ia32_vcvtph2psx512_mask(
2412       (__v16hf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U,
2413       _MM_FROUND_CUR_DIRECTION);
2414 }
2415 
/* Macros built on __builtin_ia32_vcvtps2phx512_mask with an explicit R
 * operand. A is cast to __v16sf and the result to __m256h. The plain form
 * passes _mm256_undefined_ph() and an all-ones mask, the "mask" form passes W
 * and U, and the "maskz" form passes _mm256_setzero_ph() and U. */
#define _mm512_cvtx_roundps_ph(A, R) \
  ((__m256h)__builtin_ia32_vcvtps2phx512_mask( \
      (__v16sf)(A), \
      (__v16hf)_mm256_undefined_ph(), \
      (__mmask16)(-1), \
      (int)(R)))

#define _mm512_mask_cvtx_roundps_ph(W, U, A, R) \
  ((__m256h)__builtin_ia32_vcvtps2phx512_mask( \
      (__v16sf)(A), \
      (__v16hf)(W), \
      (__mmask16)(U), \
      (int)(R)))

#define _mm512_maskz_cvtx_roundps_ph(U, A, R) \
  ((__m256h)__builtin_ia32_vcvtps2phx512_mask( \
      (__v16sf)(A), \
      (__v16hf)_mm256_setzero_ph(), \
      (__mmask16)(U), \
      (int)(R)))
2428 
2429 static __inline__ __m256h __DEFAULT_FN_ATTRS512 _mm512_cvtxps_ph(__m512 __A) {
2430   return (__m256h)__builtin_ia32_vcvtps2phx512_mask(
2431       (__v16sf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1,
2432       _MM_FROUND_CUR_DIRECTION);
2433 }
2434 
2435 static __inline__ __m256h __DEFAULT_FN_ATTRS512
2436 _mm512_mask_cvtxps_ph(__m256h __W, __mmask16 __U, __m512 __A) {
2437   return (__m256h)__builtin_ia32_vcvtps2phx512_mask(
2438       (__v16sf)__A, (__v16hf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
2439 }
2440 
2441 static __inline__ __m256h __DEFAULT_FN_ATTRS512
2442 _mm512_maskz_cvtxps_ph(__mmask16 __U, __m512 __A) {
2443   return (__m256h)__builtin_ia32_vcvtps2phx512_mask(
2444       (__v16sf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U,
2445       _MM_FROUND_CUR_DIRECTION);
2446 }
2447 
/* fmadd with an explicit R operand. A, B and C are passed through unchanged
 * (cast to __v32hf). The plain and "mask" forms use
 * __builtin_ia32_vfmaddph512_mask (all-ones mask vs. U), "mask3" uses the
 * _mask3 builtin and "maskz" uses the _maskz builtin. */
#define _mm512_fmadd_round_ph(A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_mask( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      (__v32hf)(__m512h)(C), \
      (__mmask32)-1, \
      (int)(R)))

#define _mm512_mask_fmadd_round_ph(A, U, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_mask( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      (__v32hf)(__m512h)(C), \
      (__mmask32)(U), \
      (int)(R)))

#define _mm512_mask3_fmadd_round_ph(A, B, C, U, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_mask3( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      (__v32hf)(__m512h)(C), \
      (__mmask32)(U), \
      (int)(R)))

#define _mm512_maskz_fmadd_round_ph(U, A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_maskz( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      (__v32hf)(__m512h)(C), \
      (__mmask32)(U), \
      (int)(R)))
2467 
/* fmsub with an explicit R operand, expressed through the fmadd builtins by
 * negating the C operand. The plain and "mask" forms use
 * __builtin_ia32_vfmaddph512_mask (all-ones mask vs. U); "maskz" uses the
 * _maskz builtin. */
#define _mm512_fmsub_round_ph(A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_mask( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      -(__v32hf)(__m512h)(C), \
      (__mmask32)-1, \
      (int)(R)))

#define _mm512_mask_fmsub_round_ph(A, U, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_mask( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      -(__v32hf)(__m512h)(C), \
      (__mmask32)(U), \
      (int)(R)))

#define _mm512_maskz_fmsub_round_ph(U, A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_maskz( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      -(__v32hf)(__m512h)(C), \
      (__mmask32)(U), \
      (int)(R)))
2482 
2483 #define _mm512_fnmadd_round_ph(A, B, C, R)                                     \
2484   ((__m512h)__builtin_ia32_vfmaddph512_mask(                                   \
2485       (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),    \
2486       (__mmask32)-1, (int)(R)))
2487 
/* mask3 fnmadd with rounding argument R: calls
 * __builtin_ia32_vfmaddph512_mask3 with A negated and mask U. */
#define _mm512_mask3_fnmadd_round_ph(A, B, C, U, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_mask3( \
      -(__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      (__v32hf)(__m512h)(C), \
      (__mmask32)(U), \
      (int)(R)))
2492 
/* maskz fnmadd with rounding argument R: calls
 * __builtin_ia32_vfmaddph512_maskz with A negated and mask U. */
#define _mm512_maskz_fnmadd_round_ph(U, A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_maskz( \
      -(__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      (__v32hf)(__m512h)(C), \
      (__mmask32)(U), \
      (int)(R)))
2497 
/* Unmasked fnmsub with rounding argument R: calls
 * __builtin_ia32_vfmaddph512_mask with B and C negated, all-ones mask. */
#define _mm512_fnmsub_round_ph(A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_mask( \
      (__v32hf)(__m512h)(A), \
      -(__v32hf)(__m512h)(B), \
      -(__v32hf)(__m512h)(C), \
      (__mmask32)-1, \
      (int)(R)))
2502 
/* maskz fnmsub with rounding argument R: calls
 * __builtin_ia32_vfmaddph512_maskz with A and C negated and mask U. */
#define _mm512_maskz_fnmsub_round_ph(U, A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_maskz( \
      -(__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      -(__v32hf)(__m512h)(C), \
      (__mmask32)(U), \
      (int)(R)))
2507 
2508 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmadd_ph(__m512h __A,
2509                                                                 __m512h __B,
2510                                                                 __m512h __C) {
2511   return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
2512                                                   (__v32hf)__C, (__mmask32)-1,
2513                                                   _MM_FROUND_CUR_DIRECTION);
2514 }
2515 
2516 static __inline__ __m512h __DEFAULT_FN_ATTRS512
2517 _mm512_mask_fmadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
2518   return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
2519                                                   (__v32hf)__C, (__mmask32)__U,
2520                                                   _MM_FROUND_CUR_DIRECTION);
2521 }
2522 
2523 static __inline__ __m512h __DEFAULT_FN_ATTRS512
2524 _mm512_mask3_fmadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
2525   return (__m512h)__builtin_ia32_vfmaddph512_mask3((__v32hf)__A, (__v32hf)__B,
2526                                                    (__v32hf)__C, (__mmask32)__U,
2527                                                    _MM_FROUND_CUR_DIRECTION);
2528 }
2529 
2530 static __inline__ __m512h __DEFAULT_FN_ATTRS512
2531 _mm512_maskz_fmadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2532   return (__m512h)__builtin_ia32_vfmaddph512_maskz((__v32hf)__A, (__v32hf)__B,
2533                                                    (__v32hf)__C, (__mmask32)__U,
2534                                                    _MM_FROUND_CUR_DIRECTION);
2535 }
2536 
2537 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmsub_ph(__m512h __A,
2538                                                                 __m512h __B,
2539                                                                 __m512h __C) {
2540   return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
2541                                                   -(__v32hf)__C, (__mmask32)-1,
2542                                                   _MM_FROUND_CUR_DIRECTION);
2543 }
2544 
2545 static __inline__ __m512h __DEFAULT_FN_ATTRS512
2546 _mm512_mask_fmsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
2547   return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
2548                                                   -(__v32hf)__C, (__mmask32)__U,
2549                                                   _MM_FROUND_CUR_DIRECTION);
2550 }
2551 
2552 static __inline__ __m512h __DEFAULT_FN_ATTRS512
2553 _mm512_maskz_fmsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2554   return (__m512h)__builtin_ia32_vfmaddph512_maskz(
2555       (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
2556       _MM_FROUND_CUR_DIRECTION);
2557 }
2558 
2559 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fnmadd_ph(__m512h __A,
2560                                                                  __m512h __B,
2561                                                                  __m512h __C) {
2562   return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
2563                                                   (__v32hf)__C, (__mmask32)-1,
2564                                                   _MM_FROUND_CUR_DIRECTION);
2565 }
2566 
2567 static __inline__ __m512h __DEFAULT_FN_ATTRS512
2568 _mm512_mask3_fnmadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
2569   return (__m512h)__builtin_ia32_vfmaddph512_mask3(-(__v32hf)__A, (__v32hf)__B,
2570                                                    (__v32hf)__C, (__mmask32)__U,
2571                                                    _MM_FROUND_CUR_DIRECTION);
2572 }
2573 
2574 static __inline__ __m512h __DEFAULT_FN_ATTRS512
2575 _mm512_maskz_fnmadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2576   return (__m512h)__builtin_ia32_vfmaddph512_maskz(-(__v32hf)__A, (__v32hf)__B,
2577                                                    (__v32hf)__C, (__mmask32)__U,
2578                                                    _MM_FROUND_CUR_DIRECTION);
2579 }
2580 
2581 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fnmsub_ph(__m512h __A,
2582                                                                  __m512h __B,
2583                                                                  __m512h __C) {
2584   return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
2585                                                   -(__v32hf)__C, (__mmask32)-1,
2586                                                   _MM_FROUND_CUR_DIRECTION);
2587 }
2588 
2589 static __inline__ __m512h __DEFAULT_FN_ATTRS512
2590 _mm512_maskz_fnmsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2591   return (__m512h)__builtin_ia32_vfmaddph512_maskz(
2592       -(__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
2593       _MM_FROUND_CUR_DIRECTION);
2594 }
2595 
/* Unmasked fmaddsub with rounding argument R: calls
 * __builtin_ia32_vfmaddsubph512_mask with an all-ones mask. */
#define _mm512_fmaddsub_round_ph(A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddsubph512_mask( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      (__v32hf)(__m512h)(C), \
      (__mmask32)-1, \
      (int)(R)))
2600 
/* Masked fmaddsub with rounding argument R: calls
 * __builtin_ia32_vfmaddsubph512_mask with mask U. */
#define _mm512_mask_fmaddsub_round_ph(A, U, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddsubph512_mask( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      (__v32hf)(__m512h)(C), \
      (__mmask32)(U), \
      (int)(R)))
2605 
/* mask3 fmaddsub with rounding argument R: calls
 * __builtin_ia32_vfmaddsubph512_mask3 with mask U. */
#define _mm512_mask3_fmaddsub_round_ph(A, B, C, U, R) \
  ((__m512h)__builtin_ia32_vfmaddsubph512_mask3( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      (__v32hf)(__m512h)(C), \
      (__mmask32)(U), \
      (int)(R)))
2610 
/* maskz fmaddsub with rounding argument R: calls
 * __builtin_ia32_vfmaddsubph512_maskz with mask U. */
#define _mm512_maskz_fmaddsub_round_ph(U, A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddsubph512_maskz( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      (__v32hf)(__m512h)(C), \
      (__mmask32)(U), \
      (int)(R)))
2615 
/* Unmasked fmsubadd with rounding argument R: calls
 * __builtin_ia32_vfmaddsubph512_mask with C negated, all-ones mask. */
#define _mm512_fmsubadd_round_ph(A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddsubph512_mask( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      -(__v32hf)(__m512h)(C), \
      (__mmask32)-1, \
      (int)(R)))
2620 
/* Masked fmsubadd with rounding argument R: calls
 * __builtin_ia32_vfmaddsubph512_mask with C negated and mask U. */
#define _mm512_mask_fmsubadd_round_ph(A, U, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddsubph512_mask( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      -(__v32hf)(__m512h)(C), \
      (__mmask32)(U), \
      (int)(R)))
2625 
/* maskz fmsubadd with rounding argument R: calls
 * __builtin_ia32_vfmaddsubph512_maskz with C negated and mask U. */
#define _mm512_maskz_fmsubadd_round_ph(U, A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddsubph512_maskz( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      -(__v32hf)(__m512h)(C), \
      (__mmask32)(U), \
      (int)(R)))
2630 
2631 static __inline__ __m512h __DEFAULT_FN_ATTRS512
2632 _mm512_fmaddsub_ph(__m512h __A, __m512h __B, __m512h __C) {
2633   return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
2634       (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)-1,
2635       _MM_FROUND_CUR_DIRECTION);
2636 }
2637 
2638 static __inline__ __m512h __DEFAULT_FN_ATTRS512
2639 _mm512_mask_fmaddsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
2640   return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
2641       (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
2642       _MM_FROUND_CUR_DIRECTION);
2643 }
2644 
2645 static __inline__ __m512h __DEFAULT_FN_ATTRS512
2646 _mm512_mask3_fmaddsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
2647   return (__m512h)__builtin_ia32_vfmaddsubph512_mask3(
2648       (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
2649       _MM_FROUND_CUR_DIRECTION);
2650 }
2651 
2652 static __inline__ __m512h __DEFAULT_FN_ATTRS512
2653 _mm512_maskz_fmaddsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2654   return (__m512h)__builtin_ia32_vfmaddsubph512_maskz(
2655       (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
2656       _MM_FROUND_CUR_DIRECTION);
2657 }
2658 
2659 static __inline__ __m512h __DEFAULT_FN_ATTRS512
2660 _mm512_fmsubadd_ph(__m512h __A, __m512h __B, __m512h __C) {
2661   return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
2662       (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)-1,
2663       _MM_FROUND_CUR_DIRECTION);
2664 }
2665 
2666 static __inline__ __m512h __DEFAULT_FN_ATTRS512
2667 _mm512_mask_fmsubadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
2668   return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
2669       (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
2670       _MM_FROUND_CUR_DIRECTION);
2671 }
2672 
2673 static __inline__ __m512h __DEFAULT_FN_ATTRS512
2674 _mm512_maskz_fmsubadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2675   return (__m512h)__builtin_ia32_vfmaddsubph512_maskz(
2676       (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
2677       _MM_FROUND_CUR_DIRECTION);
2678 }
2679 
/* mask3 fmsub with rounding argument R: calls the dedicated
 * __builtin_ia32_vfmsubph512_mask3 builtin with no operand negated. */
#define _mm512_mask3_fmsub_round_ph(A, B, C, U, R) \
  ((__m512h)__builtin_ia32_vfmsubph512_mask3( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      (__v32hf)(__m512h)(C), \
      (__mmask32)(U), \
      (int)(R)))
2684 
2685 static __inline__ __m512h __DEFAULT_FN_ATTRS512
2686 _mm512_mask3_fmsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
2687   return (__m512h)__builtin_ia32_vfmsubph512_mask3((__v32hf)__A, (__v32hf)__B,
2688                                                    (__v32hf)__C, (__mmask32)__U,
2689                                                    _MM_FROUND_CUR_DIRECTION);
2690 }
2691 
/* mask3 fmsubadd with rounding argument R: calls the dedicated
 * __builtin_ia32_vfmsubaddph512_mask3 builtin with no operand negated. */
#define _mm512_mask3_fmsubadd_round_ph(A, B, C, U, R) \
  ((__m512h)__builtin_ia32_vfmsubaddph512_mask3( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      (__v32hf)(__m512h)(C), \
      (__mmask32)(U), \
      (int)(R)))
2696 
2697 static __inline__ __m512h __DEFAULT_FN_ATTRS512
2698 _mm512_mask3_fmsubadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
2699   return (__m512h)__builtin_ia32_vfmsubaddph512_mask3(
2700       (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
2701       _MM_FROUND_CUR_DIRECTION);
2702 }
2703 
/* Masked fnmadd with rounding argument R: calls
 * __builtin_ia32_vfmaddph512_mask with B negated and mask U. */
#define _mm512_mask_fnmadd_round_ph(A, U, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_mask( \
      (__v32hf)(__m512h)(A), \
      -(__v32hf)(__m512h)(B), \
      (__v32hf)(__m512h)(C), \
      (__mmask32)(U), \
      (int)(R)))
2708 
2709 static __inline__ __m512h __DEFAULT_FN_ATTRS512
2710 _mm512_mask_fnmadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
2711   return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
2712                                                   (__v32hf)__C, (__mmask32)__U,
2713                                                   _MM_FROUND_CUR_DIRECTION);
2714 }
2715 
/* Masked fnmsub with rounding argument R: calls
 * __builtin_ia32_vfmaddph512_mask with B and C negated and mask U. */
#define _mm512_mask_fnmsub_round_ph(A, U, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_mask( \
      (__v32hf)(__m512h)(A), \
      -(__v32hf)(__m512h)(B), \
      -(__v32hf)(__m512h)(C), \
      (__mmask32)(U), \
      (int)(R)))
2720 
/* mask3 fnmsub with rounding argument R: calls
 * __builtin_ia32_vfmsubph512_mask3 with A negated and mask U. */
#define _mm512_mask3_fnmsub_round_ph(A, B, C, U, R) \
  ((__m512h)__builtin_ia32_vfmsubph512_mask3( \
      -(__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      (__v32hf)(__m512h)(C), \
      (__mmask32)(U), \
      (int)(R)))
2725 
2726 static __inline__ __m512h __DEFAULT_FN_ATTRS512
2727 _mm512_mask_fnmsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
2728   return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
2729                                                   -(__v32hf)__C, (__mmask32)__U,
2730                                                   _MM_FROUND_CUR_DIRECTION);
2731 }
2732 
2733 static __inline__ __m512h __DEFAULT_FN_ATTRS512
2734 _mm512_mask3_fnmsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
2735   return (__m512h)__builtin_ia32_vfmsubph512_mask3(-(__v32hf)__A, (__v32hf)__B,
2736                                                    (__v32hf)__C, (__mmask32)__U,
2737                                                    _MM_FROUND_CUR_DIRECTION);
2738 }
2739 
2740 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_sh(__m128h __W,
2741                                                              __m128h __A,
2742                                                              __m128h __B) {
2743   return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A, (__v8hf)__B,
2744                                        (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
2745 }
2746 
2747 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmadd_sh(__m128h __W,
2748                                                                   __mmask8 __U,
2749                                                                   __m128h __A,
2750                                                                   __m128h __B) {
2751   return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A, (__v8hf)__B,
2752                                        (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2753 }
2754 
/* Scalar-half fmadd with rounding argument R: calls
 * __builtin_ia32_vfmaddsh3_mask with an all-ones mask. */
#define _mm_fmadd_round_sh(A, B, C, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)(__m128h)(C), \
      (__mmask8)-1, \
      (int)(R)))
2759 
/* Masked scalar-half fmadd with rounding argument R: calls
 * __builtin_ia32_vfmaddsh3_mask with operands W, A, B and mask U. */
#define _mm_mask_fmadd_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
      (__v8hf)(__m128h)(W), \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__mmask8)(U), \
      (int)(R)))
2764 
2765 static __inline__ __m128h __DEFAULT_FN_ATTRS128
2766 _mm_maskz_fmadd_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2767   return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, (__v8hf)__B, (__v8hf)__C,
2768                                         (__mmask8)__U,
2769                                         _MM_FROUND_CUR_DIRECTION);
2770 }
2771 
/* maskz scalar-half fmadd with rounding argument R: calls
 * __builtin_ia32_vfmaddsh3_maskz with mask U. */
#define _mm_maskz_fmadd_round_sh(U, A, B, C, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)(__m128h)(C), \
      (__mmask8)(U), \
      (int)(R)))
2776 
2777 static __inline__ __m128h __DEFAULT_FN_ATTRS128
2778 _mm_mask3_fmadd_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
2779   return __builtin_ia32_vfmaddsh3_mask3((__v8hf)__W, (__v8hf)__X, (__v8hf)__Y,
2780                                         (__mmask8)__U,
2781                                         _MM_FROUND_CUR_DIRECTION);
2782 }
2783 
/* mask3 scalar-half fmadd with rounding argument R: calls
 * __builtin_ia32_vfmaddsh3_mask3 with mask U. */
#define _mm_mask3_fmadd_round_sh(W, X, Y, U, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask3( \
      (__v8hf)(__m128h)(W), \
      (__v8hf)(__m128h)(X), \
      (__v8hf)(__m128h)(Y), \
      (__mmask8)(U), \
      (int)(R)))
2788 
2789 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmsub_sh(__m128h __W,
2790                                                              __m128h __A,
2791                                                              __m128h __B) {
2792   return (__m128h)__builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A,
2793                                                 -(__v8hf)__B, (__mmask8)-1,
2794                                                 _MM_FROUND_CUR_DIRECTION);
2795 }
2796 
2797 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmsub_sh(__m128h __W,
2798                                                                   __mmask8 __U,
2799                                                                   __m128h __A,
2800                                                                   __m128h __B) {
2801   return (__m128h)__builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A,
2802                                                 -(__v8hf)__B, (__mmask8)__U,
2803                                                 _MM_FROUND_CUR_DIRECTION);
2804 }
2805 
/* Scalar-half fmsub with rounding argument R: calls
 * __builtin_ia32_vfmaddsh3_mask with C negated and an all-ones mask. */
#define _mm_fmsub_round_sh(A, B, C, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      -(__v8hf)(__m128h)(C), \
      (__mmask8)-1, \
      (int)(R)))
2810 
/* Masked scalar-half fmsub with rounding argument R: calls
 * __builtin_ia32_vfmaddsh3_mask with B negated and mask U. */
#define _mm_mask_fmsub_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
      (__v8hf)(__m128h)(W), \
      (__v8hf)(__m128h)(A), \
      -(__v8hf)(__m128h)(B), \
      (__mmask8)(U), \
      (int)(R)))
2815 
2816 static __inline__ __m128h __DEFAULT_FN_ATTRS128
2817 _mm_maskz_fmsub_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2818   return (__m128h)__builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, (__v8hf)__B,
2819                                                  -(__v8hf)__C, (__mmask8)__U,
2820                                                  _MM_FROUND_CUR_DIRECTION);
2821 }
2822 
/* maskz scalar-half fmsub with rounding argument R: calls
 * __builtin_ia32_vfmaddsh3_maskz with C negated and mask U.
 *
 * R is parenthesized before the (int) cast, as in every sibling macro, so
 * that an expression argument (e.g. `a | b`) is cast as a whole rather than
 * having the cast bind only to its first operand. */
#define _mm_maskz_fmsub_round_sh(U, A, B, C, R)                                \
  ((__m128h)__builtin_ia32_vfmaddsh3_maskz(                                    \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C),       \
      (__mmask8)(U), (int)(R)))
2827 
2828 static __inline__ __m128h __DEFAULT_FN_ATTRS128
2829 _mm_mask3_fmsub_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
2830   return __builtin_ia32_vfmsubsh3_mask3((__v8hf)__W, (__v8hf)__X, (__v8hf)__Y,
2831                                         (__mmask8)__U,
2832                                         _MM_FROUND_CUR_DIRECTION);
2833 }
2834 
/* mask3 scalar-half fmsub with rounding argument R: calls
 * __builtin_ia32_vfmsubsh3_mask3 with mask U. */
#define _mm_mask3_fmsub_round_sh(W, X, Y, U, R) \
  ((__m128h)__builtin_ia32_vfmsubsh3_mask3( \
      (__v8hf)(__m128h)(W), \
      (__v8hf)(__m128h)(X), \
      (__v8hf)(__m128h)(Y), \
      (__mmask8)(U), \
      (int)(R)))
2839 
2840 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmadd_sh(__m128h __W,
2841                                                               __m128h __A,
2842                                                               __m128h __B) {
2843   return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, (__v8hf)__B,
2844                                        (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
2845 }
2846 
2847 static __inline__ __m128h __DEFAULT_FN_ATTRS128
2848 _mm_mask_fnmadd_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
2849   return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, (__v8hf)__B,
2850                                        (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2851 }
2852 
/* Scalar-half fnmadd with rounding argument R: calls
 * __builtin_ia32_vfmaddsh3_mask with B negated and an all-ones mask. */
#define _mm_fnmadd_round_sh(A, B, C, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
      (__v8hf)(__m128h)(A), \
      -(__v8hf)(__m128h)(B), \
      (__v8hf)(__m128h)(C), \
      (__mmask8)-1, \
      (int)(R)))
2857 
/* Masked scalar-half fnmadd with rounding argument R: calls
 * __builtin_ia32_vfmaddsh3_mask with A negated and mask U. */
#define _mm_mask_fnmadd_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
      (__v8hf)(__m128h)(W), \
      -(__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__mmask8)(U), \
      (int)(R)))
2862 
2863 static __inline__ __m128h __DEFAULT_FN_ATTRS128
2864 _mm_maskz_fnmadd_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2865   return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, -(__v8hf)__B, (__v8hf)__C,
2866                                         (__mmask8)__U,
2867                                         _MM_FROUND_CUR_DIRECTION);
2868 }
2869 
/* maskz scalar-half fnmadd with rounding argument R: calls
 * __builtin_ia32_vfmaddsh3_maskz with B negated and mask U. */
#define _mm_maskz_fnmadd_round_sh(U, A, B, C, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \
      (__v8hf)(__m128h)(A), \
      -(__v8hf)(__m128h)(B), \
      (__v8hf)(__m128h)(C), \
      (__mmask8)(U), \
      (int)(R)))
2874 
2875 static __inline__ __m128h __DEFAULT_FN_ATTRS128
2876 _mm_mask3_fnmadd_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
2877   return __builtin_ia32_vfmaddsh3_mask3((__v8hf)__W, -(__v8hf)__X, (__v8hf)__Y,
2878                                         (__mmask8)__U,
2879                                         _MM_FROUND_CUR_DIRECTION);
2880 }
2881 
/* mask3 scalar-half fnmadd with rounding argument R: calls
 * __builtin_ia32_vfmaddsh3_mask3 with X negated and mask U. */
#define _mm_mask3_fnmadd_round_sh(W, X, Y, U, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask3( \
      (__v8hf)(__m128h)(W), \
      -(__v8hf)(__m128h)(X), \
      (__v8hf)(__m128h)(Y), \
      (__mmask8)(U), \
      (int)(R)))
2886 
2887 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmsub_sh(__m128h __W,
2888                                                               __m128h __A,
2889                                                               __m128h __B) {
2890   return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, -(__v8hf)__B,
2891                                        (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
2892 }
2893 
2894 static __inline__ __m128h __DEFAULT_FN_ATTRS128
2895 _mm_mask_fnmsub_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
2896   return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, -(__v8hf)__B,
2897                                        (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2898 }
2899 
/* Scalar-half fnmsub with rounding argument R: calls
 * __builtin_ia32_vfmaddsh3_mask with B and C negated, all-ones mask. */
#define _mm_fnmsub_round_sh(A, B, C, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
      (__v8hf)(__m128h)(A), \
      -(__v8hf)(__m128h)(B), \
      -(__v8hf)(__m128h)(C), \
      (__mmask8)-1, \
      (int)(R)))
2904 
/* Masked scalar-half fnmsub with rounding argument R: calls
 * __builtin_ia32_vfmaddsh3_mask with A and B negated and mask U. */
#define _mm_mask_fnmsub_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
      (__v8hf)(__m128h)(W), \
      -(__v8hf)(__m128h)(A), \
      -(__v8hf)(__m128h)(B), \
      (__mmask8)(U), \
      (int)(R)))
2909 
2910 static __inline__ __m128h __DEFAULT_FN_ATTRS128
2911 _mm_maskz_fnmsub_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2912   return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, -(__v8hf)__B, -(__v8hf)__C,
2913                                         (__mmask8)__U,
2914                                         _MM_FROUND_CUR_DIRECTION);
2915 }
2916 
/* maskz scalar-half fnmsub with rounding argument R: calls
 * __builtin_ia32_vfmaddsh3_maskz with B and C negated and mask U. */
#define _mm_maskz_fnmsub_round_sh(U, A, B, C, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \
      (__v8hf)(__m128h)(A), \
      -(__v8hf)(__m128h)(B), \
      -(__v8hf)(__m128h)(C), \
      (__mmask8)(U), \
      (int)(R)))
2921 
2922 static __inline__ __m128h __DEFAULT_FN_ATTRS128
2923 _mm_mask3_fnmsub_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
2924   return __builtin_ia32_vfmsubsh3_mask3((__v8hf)__W, -(__v8hf)__X, (__v8hf)__Y,
2925                                         (__mmask8)__U,
2926                                         _MM_FROUND_CUR_DIRECTION);
2927 }
2928 
/* mask3 scalar-half fnmsub with rounding argument R: calls
 * __builtin_ia32_vfmsubsh3_mask3 with X negated and mask U. */
#define _mm_mask3_fnmsub_round_sh(W, X, Y, U, R) \
  ((__m128h)__builtin_ia32_vfmsubsh3_mask3( \
      (__v8hf)(__m128h)(W), \
      -(__v8hf)(__m128h)(X), \
      (__v8hf)(__m128h)(Y), \
      (__mmask8)(U), \
      (int)(R)))
2933 
2934 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmadd_sch(__m128h __A,
2935                                                                __m128h __B,
2936                                                                __m128h __C) {
2937   return (__m128h)__builtin_ia32_vfcmaddcsh_mask((__v4sf)__A, (__v4sf)__B,
2938                                                  (__v4sf)__C, (__mmask8)-1,
2939                                                  _MM_FROUND_CUR_DIRECTION);
2940 }
2941 
2942 static __inline__ __m128h __DEFAULT_FN_ATTRS128
2943 _mm_mask_fcmadd_sch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
2944   return (__m128h)__builtin_ia32_vfcmaddcsh_round_mask(
2945       (__v4sf)__A, (__v4sf)(__B), (__v4sf)(__C), __U, _MM_FROUND_CUR_DIRECTION);
2946 }
2947 
2948 static __inline__ __m128h __DEFAULT_FN_ATTRS128
2949 _mm_maskz_fcmadd_sch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2950   return (__m128h)__builtin_ia32_vfcmaddcsh_maskz((__v4sf)__A, (__v4sf)__B,
2951                                                   (__v4sf)__C, (__mmask8)__U,
2952                                                   _MM_FROUND_CUR_DIRECTION);
2953 }
2954 
2955 static __inline__ __m128h __DEFAULT_FN_ATTRS128
2956 _mm_mask3_fcmadd_sch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
2957   return (__m128h)__builtin_ia32_vfcmaddcsh_round_mask3(
2958       (__v4sf)__A, (__v4sf)__B, (__v4sf)__C, __U, _MM_FROUND_CUR_DIRECTION);
2959 }
2960 
/* Unmasked fcmadd_sch with rounding argument R: calls
 * __builtin_ia32_vfcmaddcsh_mask with an all-ones mask. */
#define _mm_fcmadd_round_sch(A, B, C, R) \
  ((__m128h)__builtin_ia32_vfcmaddcsh_mask( \
      (__v4sf)(__m128h)(A), \
      (__v4sf)(__m128h)(B), \
      (__v4sf)(__m128h)(C), \
      (__mmask8)-1, \
      (int)(R)))
2965 
/* Masked fcmadd_sch with rounding argument R: calls
 * __builtin_ia32_vfcmaddcsh_round_mask with mask U. */
#define _mm_mask_fcmadd_round_sch(A, U, B, C, R) \
  ((__m128h)__builtin_ia32_vfcmaddcsh_round_mask( \
      (__v4sf)(__m128h)(A), \
      (__v4sf)(__m128h)(B), \
      (__v4sf)(__m128h)(C), \
      (__mmask8)(U), \
      (int)(R)))
2970 
/* maskz fcmadd_sch with rounding argument R: calls
 * __builtin_ia32_vfcmaddcsh_maskz with mask U. */
#define _mm_maskz_fcmadd_round_sch(U, A, B, C, R) \
  ((__m128h)__builtin_ia32_vfcmaddcsh_maskz( \
      (__v4sf)(__m128h)(A), \
      (__v4sf)(__m128h)(B), \
      (__v4sf)(__m128h)(C), \
      (__mmask8)(U), \
      (int)(R)))
2975 
/* mask3 fcmadd_sch with rounding argument R: calls
 * __builtin_ia32_vfcmaddcsh_round_mask3 with mask U. */
#define _mm_mask3_fcmadd_round_sch(A, B, C, U, R) \
  ((__m128h)__builtin_ia32_vfcmaddcsh_round_mask3( \
      (__v4sf)(__m128h)(A), \
      (__v4sf)(__m128h)(B), \
      (__v4sf)(__m128h)(C), \
      (__mmask8)(U), \
      (int)(R)))
2980 
2981 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_sch(__m128h __A,
2982                                                               __m128h __B,
2983                                                               __m128h __C) {
2984   return (__m128h)__builtin_ia32_vfmaddcsh_mask((__v4sf)__A, (__v4sf)__B,
2985                                                 (__v4sf)__C, (__mmask8)-1,
2986                                                 _MM_FROUND_CUR_DIRECTION);
2987 }
2988 
2989 static __inline__ __m128h __DEFAULT_FN_ATTRS128
2990 _mm_mask_fmadd_sch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
2991   return (__m128h)__builtin_ia32_vfmaddcsh_round_mask(
2992       (__v4sf)__A, (__v4sf)(__B), (__v4sf)(__C), __U, _MM_FROUND_CUR_DIRECTION);
2993 }
2994 
2995 static __inline__ __m128h __DEFAULT_FN_ATTRS128
2996 _mm_maskz_fmadd_sch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2997   return (__m128h)__builtin_ia32_vfmaddcsh_maskz((__v4sf)__A, (__v4sf)__B,
2998                                                  (__v4sf)__C, (__mmask8)__U,
2999                                                  _MM_FROUND_CUR_DIRECTION);
3000 }
3001 
3002 static __inline__ __m128h __DEFAULT_FN_ATTRS128
3003 _mm_mask3_fmadd_sch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
3004   return (__m128h)__builtin_ia32_vfmaddcsh_round_mask3(
3005       (__v4sf)__A, (__v4sf)__B, (__v4sf)__C, __U, _MM_FROUND_CUR_DIRECTION);
3006 }
3007 
/* Unmasked fmadd_sch with rounding argument R: calls
 * __builtin_ia32_vfmaddcsh_mask with an all-ones mask. */
#define _mm_fmadd_round_sch(A, B, C, R) \
  ((__m128h)__builtin_ia32_vfmaddcsh_mask( \
      (__v4sf)(__m128h)(A), \
      (__v4sf)(__m128h)(B), \
      (__v4sf)(__m128h)(C), \
      (__mmask8)-1, \
      (int)(R)))
3012 
/* Masked fmadd_sch with rounding argument R: calls
 * __builtin_ia32_vfmaddcsh_round_mask with mask U. */
#define _mm_mask_fmadd_round_sch(A, U, B, C, R) \
  ((__m128h)__builtin_ia32_vfmaddcsh_round_mask( \
      (__v4sf)(__m128h)(A), \
      (__v4sf)(__m128h)(B), \
      (__v4sf)(__m128h)(C), \
      (__mmask8)(U), \
      (int)(R)))
3017 
/* maskz fmadd_sch with rounding argument R: calls
 * __builtin_ia32_vfmaddcsh_maskz with mask U. */
#define _mm_maskz_fmadd_round_sch(U, A, B, C, R) \
  ((__m128h)__builtin_ia32_vfmaddcsh_maskz( \
      (__v4sf)(__m128h)(A), \
      (__v4sf)(__m128h)(B), \
      (__v4sf)(__m128h)(C), \
      (__mmask8)(U), \
      (int)(R)))
3022 
/* mask3 fmadd_sch with rounding argument R: calls
 * __builtin_ia32_vfmaddcsh_round_mask3 with mask U. */
#define _mm_mask3_fmadd_round_sch(A, B, C, U, R) \
  ((__m128h)__builtin_ia32_vfmaddcsh_round_mask3( \
      (__v4sf)(__m128h)(A), \
      (__v4sf)(__m128h)(B), \
      (__v4sf)(__m128h)(C), \
      (__mmask8)(U), \
      (int)(R)))
3027 
3028 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmul_sch(__m128h __A,
3029                                                               __m128h __B) {
3030   return (__m128h)__builtin_ia32_vfcmulcsh_mask(
3031       (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1,
3032       _MM_FROUND_CUR_DIRECTION);
3033 }
3034 
3035 static __inline__ __m128h __DEFAULT_FN_ATTRS128
3036 _mm_mask_fcmul_sch(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
3037   return (__m128h)__builtin_ia32_vfcmulcsh_mask((__v4sf)__A, (__v4sf)__B,
3038                                                 (__v4sf)__W, (__mmask8)__U,
3039                                                 _MM_FROUND_CUR_DIRECTION);
3040 }
3041 
3042 static __inline__ __m128h __DEFAULT_FN_ATTRS128
3043 _mm_maskz_fcmul_sch(__mmask8 __U, __m128h __A, __m128h __B) {
3044   return (__m128h)__builtin_ia32_vfcmulcsh_mask(
3045       (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U,
3046       _MM_FROUND_CUR_DIRECTION);
3047 }
3048 
/* Unmasked fcmul_sch with rounding argument R: calls
 * __builtin_ia32_vfcmulcsh_mask with _mm_undefined_ph() as the third vector
 * operand and an all-ones mask. */
#define _mm_fcmul_round_sch(A, B, R) \
  ((__m128h)__builtin_ia32_vfcmulcsh_mask( \
      (__v4sf)(__m128h)(A), \
      (__v4sf)(__m128h)(B), \
      (__v4sf)(__m128h)_mm_undefined_ph(), \
      (__mmask8)-1, \
      (int)(R)))
3053 
/* Masked fcmul_sch with rounding argument R: calls
 * __builtin_ia32_vfcmulcsh_mask with W as the third vector operand and
 * mask U. */
#define _mm_mask_fcmul_round_sch(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_vfcmulcsh_mask( \
      (__v4sf)(__m128h)(A), \
      (__v4sf)(__m128h)(B), \
      (__v4sf)(__m128h)(W), \
      (__mmask8)(U), \
      (int)(R)))
3058 
/* Expands to __builtin_ia32_vfcmulcsh_mask on A and B with a zero vector from
 * _mm_setzero_ph() as the third operand, U as the mask and R as the rounding
 * argument. */
#define _mm_maskz_fcmul_round_sch(U, A, B, R)                                  \
  ((__m128h)__builtin_ia32_vfcmulcsh_mask(                                     \
      (__v4sf)(__m128h)(A),                                                    \
      (__v4sf)(__m128h)(B),                                                    \
      (__v4sf)(__m128h)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
3063 
3064 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmul_sch(__m128h __A,
3065                                                              __m128h __B) {
3066   return (__m128h)__builtin_ia32_vfmulcsh_mask(
3067       (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1,
3068       _MM_FROUND_CUR_DIRECTION);
3069 }
3070 
3071 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmul_sch(__m128h __W,
3072                                                                   __mmask8 __U,
3073                                                                   __m128h __A,
3074                                                                   __m128h __B) {
3075   return (__m128h)__builtin_ia32_vfmulcsh_mask((__v4sf)__A, (__v4sf)__B,
3076                                                (__v4sf)__W, (__mmask8)__U,
3077                                                _MM_FROUND_CUR_DIRECTION);
3078 }
3079 
3080 static __inline__ __m128h __DEFAULT_FN_ATTRS128
3081 _mm_maskz_fmul_sch(__mmask8 __U, __m128h __A, __m128h __B) {
3082   return (__m128h)__builtin_ia32_vfmulcsh_mask(
3083       (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U,
3084       _MM_FROUND_CUR_DIRECTION);
3085 }
3086 
/* Expands to __builtin_ia32_vfmulcsh_mask on A and B with an undefined third
 * operand, an all-ones mask and the caller-supplied rounding argument R. */
#define _mm_fmul_round_sch(A, B, R)                                            \
  ((__m128h)__builtin_ia32_vfmulcsh_mask(                                      \
      (__v4sf)(__m128h)(A),                                                    \
      (__v4sf)(__m128h)(B),                                                    \
      (__v4sf)(__m128h)_mm_undefined_ph(), (__mmask8)-1, (int)(R)))
3091 
/* Expands to __builtin_ia32_vfmulcsh_mask on A and B with W as the third
 * operand, U as the mask and R as the rounding argument. */
#define _mm_mask_fmul_round_sch(W, U, A, B, R)                                 \
  ((__m128h)__builtin_ia32_vfmulcsh_mask(                                      \
      (__v4sf)(__m128h)(A),                                                    \
      (__v4sf)(__m128h)(B),                                                    \
      (__v4sf)(__m128h)(W),                                                    \
      (__mmask8)(U), (int)(R)))
3096 
/* Expands to __builtin_ia32_vfmulcsh_mask on A and B with a zero vector from
 * _mm_setzero_ph() as the third operand, U as the mask and R as the rounding
 * argument. */
#define _mm_maskz_fmul_round_sch(U, A, B, R)                                   \
  ((__m128h)__builtin_ia32_vfmulcsh_mask(                                      \
      (__v4sf)(__m128h)(A),                                                    \
      (__v4sf)(__m128h)(B),                                                    \
      (__v4sf)(__m128h)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
3101 
3102 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fcmul_pch(__m512h __A,
3103                                                                  __m512h __B) {
3104   return (__m512h)__builtin_ia32_vfcmulcph512_mask(
3105       (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_undefined_ph(), (__mmask16)-1,
3106       _MM_FROUND_CUR_DIRECTION);
3107 }
3108 
3109 static __inline__ __m512h __DEFAULT_FN_ATTRS512
3110 _mm512_mask_fcmul_pch(__m512h __W, __mmask16 __U, __m512h __A, __m512h __B) {
3111   return (__m512h)__builtin_ia32_vfcmulcph512_mask((__v16sf)__A, (__v16sf)__B,
3112                                                    (__v16sf)__W, (__mmask16)__U,
3113                                                    _MM_FROUND_CUR_DIRECTION);
3114 }
3115 
3116 static __inline__ __m512h __DEFAULT_FN_ATTRS512
3117 _mm512_maskz_fcmul_pch(__mmask16 __U, __m512h __A, __m512h __B) {
3118   return (__m512h)__builtin_ia32_vfcmulcph512_mask(
3119       (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ph(), (__mmask16)__U,
3120       _MM_FROUND_CUR_DIRECTION);
3121 }
3122 
/* Expands to __builtin_ia32_vfcmulcph512_mask on A and B with an undefined
 * third operand, an all-ones mask and the caller-supplied rounding argument
 * R. */
#define _mm512_fcmul_round_pch(A, B, R)                                        \
  ((__m512h)__builtin_ia32_vfcmulcph512_mask(                                  \
      (__v16sf)(__m512h)(A),                                                   \
      (__v16sf)(__m512h)(B),                                                   \
      (__v16sf)(__m512h)_mm512_undefined_ph(), (__mmask16)-1, (int)(R)))
3127 
/* Expands to __builtin_ia32_vfcmulcph512_mask on A and B with W as the third
 * operand, U as the mask and R as the rounding argument. */
#define _mm512_mask_fcmul_round_pch(W, U, A, B, R)                             \
  ((__m512h)__builtin_ia32_vfcmulcph512_mask(                                  \
      (__v16sf)(__m512h)(A),                                                   \
      (__v16sf)(__m512h)(B),                                                   \
      (__v16sf)(__m512h)(W),                                                   \
      (__mmask16)(U), (int)(R)))
3132 
/* Expands to __builtin_ia32_vfcmulcph512_mask on A and B with a zero vector
 * from _mm512_setzero_ph() as the third operand, U as the mask and R as the
 * rounding argument. */
#define _mm512_maskz_fcmul_round_pch(U, A, B, R)                               \
  ((__m512h)__builtin_ia32_vfcmulcph512_mask(                                  \
      (__v16sf)(__m512h)(A),                                                   \
      (__v16sf)(__m512h)(B),                                                   \
      (__v16sf)(__m512h)_mm512_setzero_ph(), (__mmask16)(U), (int)(R)))
3137 
3138 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmul_pch(__m512h __A,
3139                                                                 __m512h __B) {
3140   return (__m512h)__builtin_ia32_vfmulcph512_mask(
3141       (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_undefined_ph(), (__mmask16)-1,
3142       _MM_FROUND_CUR_DIRECTION);
3143 }
3144 
3145 static __inline__ __m512h __DEFAULT_FN_ATTRS512
3146 _mm512_mask_fmul_pch(__m512h __W, __mmask16 __U, __m512h __A, __m512h __B) {
3147   return (__m512h)__builtin_ia32_vfmulcph512_mask((__v16sf)__A, (__v16sf)__B,
3148                                                   (__v16sf)__W, (__mmask16)__U,
3149                                                   _MM_FROUND_CUR_DIRECTION);
3150 }
3151 
3152 static __inline__ __m512h __DEFAULT_FN_ATTRS512
3153 _mm512_maskz_fmul_pch(__mmask16 __U, __m512h __A, __m512h __B) {
3154   return (__m512h)__builtin_ia32_vfmulcph512_mask(
3155       (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ph(), (__mmask16)__U,
3156       _MM_FROUND_CUR_DIRECTION);
3157 }
3158 
/* Expands to __builtin_ia32_vfmulcph512_mask on A and B with an undefined
 * third operand, an all-ones mask and the caller-supplied rounding argument
 * R. */
#define _mm512_fmul_round_pch(A, B, R)                                         \
  ((__m512h)__builtin_ia32_vfmulcph512_mask(                                   \
      (__v16sf)(__m512h)(A),                                                   \
      (__v16sf)(__m512h)(B),                                                   \
      (__v16sf)(__m512h)_mm512_undefined_ph(), (__mmask16)-1, (int)(R)))
3163 
/* Expands to __builtin_ia32_vfmulcph512_mask on A and B with W as the third
 * operand, U as the mask and R as the rounding argument. */
#define _mm512_mask_fmul_round_pch(W, U, A, B, R)                              \
  ((__m512h)__builtin_ia32_vfmulcph512_mask(                                   \
      (__v16sf)(__m512h)(A),                                                   \
      (__v16sf)(__m512h)(B),                                                   \
      (__v16sf)(__m512h)(W),                                                   \
      (__mmask16)(U), (int)(R)))
3168 
/* Expands to __builtin_ia32_vfmulcph512_mask on A and B with a zero vector
 * from _mm512_setzero_ph() as the third operand, U as the mask and R as the
 * rounding argument. */
#define _mm512_maskz_fmul_round_pch(U, A, B, R)                                \
  ((__m512h)__builtin_ia32_vfmulcph512_mask(                                   \
      (__v16sf)(__m512h)(A),                                                   \
      (__v16sf)(__m512h)(B),                                                   \
      (__v16sf)(__m512h)_mm512_setzero_ph(), (__mmask16)(U), (int)(R)))
3173 
3174 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fcmadd_pch(__m512h __A,
3175                                                                   __m512h __B,
3176                                                                   __m512h __C) {
3177   return (__m512h)__builtin_ia32_vfcmaddcph512_mask3(
3178       (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)-1,
3179       _MM_FROUND_CUR_DIRECTION);
3180 }
3181 
3182 static __inline__ __m512h __DEFAULT_FN_ATTRS512
3183 _mm512_mask_fcmadd_pch(__m512h __A, __mmask16 __U, __m512h __B, __m512h __C) {
3184   return (__m512h)__builtin_ia32_vfcmaddcph512_mask(
3185       (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
3186       _MM_FROUND_CUR_DIRECTION);
3187 }
3188 
3189 static __inline__ __m512h __DEFAULT_FN_ATTRS512
3190 _mm512_mask3_fcmadd_pch(__m512h __A, __m512h __B, __m512h __C, __mmask16 __U) {
3191   return (__m512h)__builtin_ia32_vfcmaddcph512_mask3(
3192       (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
3193       _MM_FROUND_CUR_DIRECTION);
3194 }
3195 
3196 static __inline__ __m512h __DEFAULT_FN_ATTRS512
3197 _mm512_maskz_fcmadd_pch(__mmask16 __U, __m512h __A, __m512h __B, __m512h __C) {
3198   return (__m512h)__builtin_ia32_vfcmaddcph512_maskz(
3199       (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
3200       _MM_FROUND_CUR_DIRECTION);
3201 }
3202 
/* Expands to __builtin_ia32_vfcmaddcph512_mask3 on A, B and C with an
 * all-ones mask and the caller-supplied rounding argument R. */
#define _mm512_fcmadd_round_pch(A, B, C, R)                                    \
  ((__m512h)__builtin_ia32_vfcmaddcph512_mask3(                                \
      (__v16sf)(__m512h)(A),                                                   \
      (__v16sf)(__m512h)(B),                                                   \
      (__v16sf)(__m512h)(C),                                                   \
      (__mmask16)-1, (int)(R)))
3207 
/* Expands to __builtin_ia32_vfcmaddcph512_mask on A, B and C with mask U and
 * rounding argument R. */
#define _mm512_mask_fcmadd_round_pch(A, U, B, C, R)                            \
  ((__m512h)__builtin_ia32_vfcmaddcph512_mask(                                 \
      (__v16sf)(__m512h)(A),                                                   \
      (__v16sf)(__m512h)(B),                                                   \
      (__v16sf)(__m512h)(C),                                                   \
      (__mmask16)(U), (int)(R)))
3212 
/* Expands to __builtin_ia32_vfcmaddcph512_mask3 on A, B and C with mask U and
 * rounding argument R. */
#define _mm512_mask3_fcmadd_round_pch(A, B, C, U, R)                           \
  ((__m512h)__builtin_ia32_vfcmaddcph512_mask3(                                \
      (__v16sf)(__m512h)(A),                                                   \
      (__v16sf)(__m512h)(B),                                                   \
      (__v16sf)(__m512h)(C),                                                   \
      (__mmask16)(U), (int)(R)))
3217 
/* Expands to __builtin_ia32_vfcmaddcph512_maskz on A, B and C with mask U and
 * rounding argument R. */
#define _mm512_maskz_fcmadd_round_pch(U, A, B, C, R)                           \
  ((__m512h)__builtin_ia32_vfcmaddcph512_maskz(                                \
      (__v16sf)(__m512h)(A),                                                   \
      (__v16sf)(__m512h)(B),                                                   \
      (__v16sf)(__m512h)(C),                                                   \
      (__mmask16)(U), (int)(R)))
3222 
3223 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmadd_pch(__m512h __A,
3224                                                                  __m512h __B,
3225                                                                  __m512h __C) {
3226   return (__m512h)__builtin_ia32_vfmaddcph512_mask3((__v16sf)__A, (__v16sf)__B,
3227                                                     (__v16sf)__C, (__mmask16)-1,
3228                                                     _MM_FROUND_CUR_DIRECTION);
3229 }
3230 
3231 static __inline__ __m512h __DEFAULT_FN_ATTRS512
3232 _mm512_mask_fmadd_pch(__m512h __A, __mmask16 __U, __m512h __B, __m512h __C) {
3233   return (__m512h)__builtin_ia32_vfmaddcph512_mask((__v16sf)__A, (__v16sf)__B,
3234                                                    (__v16sf)__C, (__mmask16)__U,
3235                                                    _MM_FROUND_CUR_DIRECTION);
3236 }
3237 
3238 static __inline__ __m512h __DEFAULT_FN_ATTRS512
3239 _mm512_mask3_fmadd_pch(__m512h __A, __m512h __B, __m512h __C, __mmask16 __U) {
3240   return (__m512h)__builtin_ia32_vfmaddcph512_mask3(
3241       (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
3242       _MM_FROUND_CUR_DIRECTION);
3243 }
3244 
3245 static __inline__ __m512h __DEFAULT_FN_ATTRS512
3246 _mm512_maskz_fmadd_pch(__mmask16 __U, __m512h __A, __m512h __B, __m512h __C) {
3247   return (__m512h)__builtin_ia32_vfmaddcph512_maskz(
3248       (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
3249       _MM_FROUND_CUR_DIRECTION);
3250 }
3251 
/* Expands to __builtin_ia32_vfmaddcph512_mask3 on A, B and C with an all-ones
 * mask and the caller-supplied rounding argument R. */
#define _mm512_fmadd_round_pch(A, B, C, R)                                     \
  ((__m512h)__builtin_ia32_vfmaddcph512_mask3(                                 \
      (__v16sf)(__m512h)(A),                                                   \
      (__v16sf)(__m512h)(B),                                                   \
      (__v16sf)(__m512h)(C),                                                   \
      (__mmask16)-1, (int)(R)))
3256 
/* Expands to __builtin_ia32_vfmaddcph512_mask on A, B and C with mask U and
 * rounding argument R. */
#define _mm512_mask_fmadd_round_pch(A, U, B, C, R)                             \
  ((__m512h)__builtin_ia32_vfmaddcph512_mask(                                  \
      (__v16sf)(__m512h)(A),                                                   \
      (__v16sf)(__m512h)(B),                                                   \
      (__v16sf)(__m512h)(C),                                                   \
      (__mmask16)(U), (int)(R)))
3261 
/* Expands to __builtin_ia32_vfmaddcph512_mask3 on A, B and C with mask U and
 * rounding argument R. */
#define _mm512_mask3_fmadd_round_pch(A, B, C, U, R)                            \
  ((__m512h)__builtin_ia32_vfmaddcph512_mask3(                                 \
      (__v16sf)(__m512h)(A),                                                   \
      (__v16sf)(__m512h)(B),                                                   \
      (__v16sf)(__m512h)(C),                                                   \
      (__mmask16)(U), (int)(R)))
3266 
/* Expands to __builtin_ia32_vfmaddcph512_maskz on A, B and C with mask U and
 * rounding argument R. */
#define _mm512_maskz_fmadd_round_pch(U, A, B, C, R)                            \
  ((__m512h)__builtin_ia32_vfmaddcph512_maskz(                                 \
      (__v16sf)(__m512h)(A),                                                   \
      (__v16sf)(__m512h)(B),                                                   \
      (__v16sf)(__m512h)(C),                                                   \
      (__mmask16)(U), (int)(R)))
3271 
3272 static __inline__ _Float16 __DEFAULT_FN_ATTRS512
3273 _mm512_reduce_add_ph(__m512h __W) {
3274   return __builtin_ia32_reduce_fadd_ph512(-0.0f16, __W);
3275 }
3276 
3277 static __inline__ _Float16 __DEFAULT_FN_ATTRS512
3278 _mm512_reduce_mul_ph(__m512h __W) {
3279   return __builtin_ia32_reduce_fmul_ph512(1.0f16, __W);
3280 }
3281 
3282 static __inline__ _Float16 __DEFAULT_FN_ATTRS512
3283 _mm512_reduce_max_ph(__m512h __V) {
3284   return __builtin_ia32_reduce_fmax_ph512(__V);
3285 }
3286 
3287 static __inline__ _Float16 __DEFAULT_FN_ATTRS512
3288 _mm512_reduce_min_ph(__m512h __V) {
3289   return __builtin_ia32_reduce_fmin_ph512(__V);
3290 }
3291 
3292 static __inline__ __m512h __DEFAULT_FN_ATTRS512
3293 _mm512_mask_blend_ph(__mmask32 __U, __m512h __A, __m512h __W) {
3294   return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U, (__v32hf)__W,
3295                                               (__v32hf)__A);
3296 }
3297 
3298 static __inline__ __m512h __DEFAULT_FN_ATTRS512
3299 _mm512_permutex2var_ph(__m512h __A, __m512i __I, __m512h __B) {
3300   return (__m512h)__builtin_ia32_vpermi2varhi512((__v32hi)__A, (__v32hi)__I,
3301                                                  (__v32hi)__B);
3302 }
3303 
3304 static __inline__ __m512h __DEFAULT_FN_ATTRS512
3305 _mm512_permutexvar_ph(__m512i __A, __m512h __B) {
3306   return (__m512h)__builtin_ia32_permvarhi512((__v32hi)__B, (__v32hi)__A);
3307 }
3308 
// The intrinsics below are aliases for the f*mul_*ch intrinsics.
// 512-bit mul_pch names forward unchanged to the matching fmul_pch forms.
#define _mm512_mul_pch(A, B) _mm512_fmul_pch(A, B)
#define _mm512_mask_mul_pch(W, U, A, B) _mm512_mask_fmul_pch(W, U, A, B)
#define _mm512_maskz_mul_pch(U, A, B) _mm512_maskz_fmul_pch(U, A, B)
#define _mm512_mul_round_pch(A, B, R) _mm512_fmul_round_pch(A, B, R)
#define _mm512_mask_mul_round_pch(W, U, A, B, R)                               \
  _mm512_mask_fmul_round_pch(W, U, A, B, R)
#define _mm512_maskz_mul_round_pch(U, A, B, R)                                 \
  _mm512_maskz_fmul_round_pch(U, A, B, R)
3318 
// 512-bit cmul_pch names forward unchanged to the matching fcmul_pch forms.
#define _mm512_cmul_pch(A, B) _mm512_fcmul_pch(A, B)
#define _mm512_mask_cmul_pch(W, U, A, B) _mm512_mask_fcmul_pch(W, U, A, B)
#define _mm512_maskz_cmul_pch(U, A, B) _mm512_maskz_fcmul_pch(U, A, B)
#define _mm512_cmul_round_pch(A, B, R) _mm512_fcmul_round_pch(A, B, R)
#define _mm512_mask_cmul_round_pch(W, U, A, B, R)                              \
  _mm512_mask_fcmul_round_pch(W, U, A, B, R)
#define _mm512_maskz_cmul_round_pch(U, A, B, R)                                \
  _mm512_maskz_fcmul_round_pch(U, A, B, R)
3327 
// Scalar mul_sch names forward unchanged to the matching fmul_sch forms.
#define _mm_mul_sch(A, B) _mm_fmul_sch(A, B)
#define _mm_mask_mul_sch(W, U, A, B) _mm_mask_fmul_sch(W, U, A, B)
#define _mm_maskz_mul_sch(U, A, B) _mm_maskz_fmul_sch(U, A, B)
#define _mm_mul_round_sch(A, B, R) _mm_fmul_round_sch(A, B, R)
#define _mm_mask_mul_round_sch(W, U, A, B, R)                                  \
  _mm_mask_fmul_round_sch(W, U, A, B, R)
#define _mm_maskz_mul_round_sch(U, A, B, R) _mm_maskz_fmul_round_sch(U, A, B, R)
3335 
// Scalar cmul_sch names forward unchanged to the matching fcmul_sch forms.
#define _mm_cmul_sch(A, B) _mm_fcmul_sch(A, B)
#define _mm_mask_cmul_sch(W, U, A, B) _mm_mask_fcmul_sch(W, U, A, B)
#define _mm_maskz_cmul_sch(U, A, B) _mm_maskz_fcmul_sch(U, A, B)
#define _mm_cmul_round_sch(A, B, R) _mm_fcmul_round_sch(A, B, R)
#define _mm_mask_cmul_round_sch(W, U, A, B, R)                                 \
  _mm_mask_fcmul_round_sch(W, U, A, B, R)
#define _mm_maskz_cmul_round_sch(U, A, B, R)                                   \
  _mm_maskz_fcmul_round_sch(U, A, B, R)
3344 
3345 #undef __DEFAULT_FN_ATTRS128
3346 #undef __DEFAULT_FN_ATTRS256
3347 #undef __DEFAULT_FN_ATTRS512
3348 
3349 #endif
3350 #endif
3351