xref: /freebsd/contrib/llvm-project/clang/lib/Headers/avx512fp16intrin.h (revision 19fae0f66023a97a9b464b3beeeabb2081f575b3)
1 /*===----------- avx512fp16intrin.h - AVX512-FP16 intrinsics ---------------===
2  *
3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4  * See https://llvm.org/LICENSE.txt for license information.
5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6  *
7  *===-----------------------------------------------------------------------===
8  */
9 #ifndef __IMMINTRIN_H
10 #error "Never use <avx512fp16intrin.h> directly; include <immintrin.h> instead."
11 #endif
12 
13 #ifdef __SSE2__
14 
15 #ifndef __AVX512FP16INTRIN_H
16 #define __AVX512FP16INTRIN_H
17 
/* Define the 512-bit half-precision vector types used in this file. */
19 typedef _Float16 __v32hf __attribute__((__vector_size__(64), __aligned__(64)));
20 typedef _Float16 __m512h __attribute__((__vector_size__(64), __aligned__(64)));
21 typedef _Float16 __m512h_u __attribute__((__vector_size__(64), __aligned__(1)));
22 
/* Default attributes for the functions in this file: always inlined, no debug
 * info, and compiled for the "avx512fp16" target feature. The three variants
 * differ only in the __min_vector_width__ they request (512, 256 or 128). */
#define __DEFAULT_FN_ATTRS512                                                  \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("avx512fp16"), __min_vector_width__(512)))
#define __DEFAULT_FN_ATTRS256                                                  \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("avx512fp16"), __min_vector_width__(256)))
#define __DEFAULT_FN_ATTRS128                                                  \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("avx512fp16"), __min_vector_width__(128)))
33 
34 static __inline__ _Float16 __DEFAULT_FN_ATTRS512 _mm512_cvtsh_h(__m512h __a) {
35   return __a[0];
36 }
37 
38 static __inline __m128h __DEFAULT_FN_ATTRS128 _mm_setzero_ph(void) {
39   return (__m128h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
40 }
41 
42 static __inline __m256h __DEFAULT_FN_ATTRS256 _mm256_setzero_ph(void) {
43   return (__m256h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
44                    0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
45 }
46 
47 static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_undefined_ph(void) {
48   return (__m256h)__builtin_ia32_undef256();
49 }
50 
51 static __inline __m512h __DEFAULT_FN_ATTRS512 _mm512_setzero_ph(void) {
52   return (__m512h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
53                    0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
54                    0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
55 }
56 
57 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_undefined_ph(void) {
58   return (__m128h)__builtin_ia32_undef128();
59 }
60 
61 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_undefined_ph(void) {
62   return (__m512h)__builtin_ia32_undef512();
63 }
64 
65 static __inline __m512h __DEFAULT_FN_ATTRS512 _mm512_set1_ph(_Float16 __h) {
66   return (__m512h)(__v32hf){__h, __h, __h, __h, __h, __h, __h, __h,
67                             __h, __h, __h, __h, __h, __h, __h, __h,
68                             __h, __h, __h, __h, __h, __h, __h, __h,
69                             __h, __h, __h, __h, __h, __h, __h, __h};
70 }
71 
72 static __inline __m512h __DEFAULT_FN_ATTRS512
73 _mm512_set_ph(_Float16 __h1, _Float16 __h2, _Float16 __h3, _Float16 __h4,
74               _Float16 __h5, _Float16 __h6, _Float16 __h7, _Float16 __h8,
75               _Float16 __h9, _Float16 __h10, _Float16 __h11, _Float16 __h12,
76               _Float16 __h13, _Float16 __h14, _Float16 __h15, _Float16 __h16,
77               _Float16 __h17, _Float16 __h18, _Float16 __h19, _Float16 __h20,
78               _Float16 __h21, _Float16 __h22, _Float16 __h23, _Float16 __h24,
79               _Float16 __h25, _Float16 __h26, _Float16 __h27, _Float16 __h28,
80               _Float16 __h29, _Float16 __h30, _Float16 __h31, _Float16 __h32) {
81   return (__m512h)(__v32hf){__h32, __h31, __h30, __h29, __h28, __h27, __h26,
82                             __h25, __h24, __h23, __h22, __h21, __h20, __h19,
83                             __h18, __h17, __h16, __h15, __h14, __h13, __h12,
84                             __h11, __h10, __h9,  __h8,  __h7,  __h6,  __h5,
85                             __h4,  __h3,  __h2,  __h1};
86 }
87 
/// Reversed-order counterpart of _mm512_set_ph: the arguments are forwarded to
/// _mm512_set_ph in the opposite order, so \a h1 lands in element 0 and
/// \a h32 in element 31.
#define _mm512_setr_ph(h1, h2, h3, h4, h5, h6, h7, h8,                         \
                       h9, h10, h11, h12, h13, h14, h15, h16,                  \
                       h17, h18, h19, h20, h21, h22, h23, h24,                 \
                       h25, h26, h27, h28, h29, h30, h31, h32)                 \
  _mm512_set_ph((h32), (h31), (h30), (h29), (h28), (h27), (h26), (h25),        \
                (h24), (h23), (h22), (h21), (h20), (h19), (h18), (h17),        \
                (h16), (h15), (h14), (h13), (h12), (h11), (h10), (h9),         \
                (h8), (h7), (h6), (h5), (h4), (h3), (h2), (h1))
95 
96 static __inline __m512h __DEFAULT_FN_ATTRS512
97 _mm512_set1_pch(_Float16 _Complex h) {
98   return (__m512h)_mm512_set1_ps(__builtin_bit_cast(float, h));
99 }
100 
101 static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_castph_ps(__m128h __a) {
102   return (__m128)__a;
103 }
104 
105 static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_castph_ps(__m256h __a) {
106   return (__m256)__a;
107 }
108 
109 static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_castph_ps(__m512h __a) {
110   return (__m512)__a;
111 }
112 
113 static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_castph_pd(__m128h __a) {
114   return (__m128d)__a;
115 }
116 
117 static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_castph_pd(__m256h __a) {
118   return (__m256d)__a;
119 }
120 
121 static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_castph_pd(__m512h __a) {
122   return (__m512d)__a;
123 }
124 
125 static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_castph_si128(__m128h __a) {
126   return (__m128i)__a;
127 }
128 
129 static __inline__ __m256i __DEFAULT_FN_ATTRS256
130 _mm256_castph_si256(__m256h __a) {
131   return (__m256i)__a;
132 }
133 
134 static __inline__ __m512i __DEFAULT_FN_ATTRS512
135 _mm512_castph_si512(__m512h __a) {
136   return (__m512i)__a;
137 }
138 
139 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_castps_ph(__m128 __a) {
140   return (__m128h)__a;
141 }
142 
143 static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_castps_ph(__m256 __a) {
144   return (__m256h)__a;
145 }
146 
147 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_castps_ph(__m512 __a) {
148   return (__m512h)__a;
149 }
150 
151 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_castpd_ph(__m128d __a) {
152   return (__m128h)__a;
153 }
154 
155 static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_castpd_ph(__m256d __a) {
156   return (__m256h)__a;
157 }
158 
159 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_castpd_ph(__m512d __a) {
160   return (__m512h)__a;
161 }
162 
163 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_castsi128_ph(__m128i __a) {
164   return (__m128h)__a;
165 }
166 
167 static __inline__ __m256h __DEFAULT_FN_ATTRS256
168 _mm256_castsi256_ph(__m256i __a) {
169   return (__m256h)__a;
170 }
171 
172 static __inline__ __m512h __DEFAULT_FN_ATTRS512
173 _mm512_castsi512_ph(__m512i __a) {
174   return (__m512h)__a;
175 }
176 
177 static __inline__ __m128h __DEFAULT_FN_ATTRS256
178 _mm256_castph256_ph128(__m256h __a) {
179   return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7);
180 }
181 
182 static __inline__ __m128h __DEFAULT_FN_ATTRS512
183 _mm512_castph512_ph128(__m512h __a) {
184   return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7);
185 }
186 
187 static __inline__ __m256h __DEFAULT_FN_ATTRS512
188 _mm512_castph512_ph256(__m512h __a) {
189   return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
190                                  12, 13, 14, 15);
191 }
192 
193 static __inline__ __m256h __DEFAULT_FN_ATTRS256
194 _mm256_castph128_ph256(__m128h __a) {
195   return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, -1, -1, -1,
196                                  -1, -1, -1, -1, -1);
197 }
198 
199 static __inline__ __m512h __DEFAULT_FN_ATTRS512
200 _mm512_castph128_ph512(__m128h __a) {
201   return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, -1, -1, -1,
202                                  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
203                                  -1, -1, -1, -1, -1, -1, -1, -1, -1);
204 }
205 
206 static __inline__ __m512h __DEFAULT_FN_ATTRS512
207 _mm512_castph256_ph512(__m256h __a) {
208   return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
209                                  12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1,
210                                  -1, -1, -1, -1, -1, -1, -1, -1);
211 }
212 
213 /// Constructs a 256-bit floating-point vector of [16 x half] from a
214 ///    128-bit floating-point vector of [8 x half]. The lower 128 bits
215 ///    contain the value of the source vector. The upper 384 bits are set
216 ///    to zero.
217 ///
218 /// \headerfile <x86intrin.h>
219 ///
220 /// This intrinsic has no corresponding instruction.
221 ///
222 /// \param __a
223 ///    A 128-bit vector of [8 x half].
224 /// \returns A 512-bit floating-point vector of [16 x half]. The lower 128 bits
225 ///    contain the value of the parameter. The upper 384 bits are set to zero.
226 static __inline__ __m256h __DEFAULT_FN_ATTRS256
227 _mm256_zextph128_ph256(__m128h __a) {
228   return __builtin_shufflevector(__a, (__v8hf)_mm_setzero_ph(), 0, 1, 2, 3, 4,
229                                  5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
230 }
231 
232 /// Constructs a 512-bit floating-point vector of [32 x half] from a
233 ///    128-bit floating-point vector of [8 x half]. The lower 128 bits
234 ///    contain the value of the source vector. The upper 384 bits are set
235 ///    to zero.
236 ///
237 /// \headerfile <x86intrin.h>
238 ///
239 /// This intrinsic has no corresponding instruction.
240 ///
241 /// \param __a
242 ///    A 128-bit vector of [8 x half].
243 /// \returns A 512-bit floating-point vector of [32 x half]. The lower 128 bits
244 ///    contain the value of the parameter. The upper 384 bits are set to zero.
245 static __inline__ __m512h __DEFAULT_FN_ATTRS512
246 _mm512_zextph128_ph512(__m128h __a) {
247   return __builtin_shufflevector(
248       __a, (__v8hf)_mm_setzero_ph(), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
249       13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15);
250 }
251 
252 /// Constructs a 512-bit floating-point vector of [32 x half] from a
253 ///    256-bit floating-point vector of [16 x half]. The lower 256 bits
254 ///    contain the value of the source vector. The upper 256 bits are set
255 ///    to zero.
256 ///
257 /// \headerfile <x86intrin.h>
258 ///
259 /// This intrinsic has no corresponding instruction.
260 ///
261 /// \param __a
262 ///    A 256-bit vector of [16 x half].
263 /// \returns A 512-bit floating-point vector of [32 x half]. The lower 256 bits
264 ///    contain the value of the parameter. The upper 256 bits are set to zero.
265 static __inline__ __m512h __DEFAULT_FN_ATTRS512
266 _mm512_zextph256_ph512(__m256h __a) {
267   return __builtin_shufflevector(__a, (__v16hf)_mm256_setzero_ph(), 0, 1, 2, 3,
268                                  4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
269                                  17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
270                                  29, 30, 31);
271 }
272 
/// Compares the lowest half-precision elements of \a A and \a B with
/// comparison predicate \a P, forwarding \a R as the final operand of
/// __builtin_ia32_vcomish, and yields the builtin's int result.
///
/// Every argument is parenthesized before it is cast and the whole expansion
/// is parenthesized, so expression arguments (e.g. \c x+y) and surrounding
/// operators bind as the caller expects.
#define _mm_comi_round_sh(A, B, P, R)                                          \
  (__builtin_ia32_vcomish((__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B),          \
                          (int)(P), (int)(R)))

/// Same comparison as _mm_comi_round_sh with _MM_FROUND_CUR_DIRECTION passed
/// as the last operand.
#define _mm_comi_sh(A, B, pred)                                                \
  _mm_comi_round_sh((A), (B), (pred), _MM_FROUND_CUR_DIRECTION)
278 
279 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comieq_sh(__m128h A,
280                                                           __m128h B) {
281   return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_EQ_OS,
282                                 _MM_FROUND_CUR_DIRECTION);
283 }
284 
285 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comilt_sh(__m128h A,
286                                                           __m128h B) {
287   return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_LT_OS,
288                                 _MM_FROUND_CUR_DIRECTION);
289 }
290 
291 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comile_sh(__m128h A,
292                                                           __m128h B) {
293   return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_LE_OS,
294                                 _MM_FROUND_CUR_DIRECTION);
295 }
296 
297 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comigt_sh(__m128h A,
298                                                           __m128h B) {
299   return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_GT_OS,
300                                 _MM_FROUND_CUR_DIRECTION);
301 }
302 
303 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comige_sh(__m128h A,
304                                                           __m128h B) {
305   return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_GE_OS,
306                                 _MM_FROUND_CUR_DIRECTION);
307 }
308 
309 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comineq_sh(__m128h A,
310                                                            __m128h B) {
311   return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_NEQ_US,
312                                 _MM_FROUND_CUR_DIRECTION);
313 }
314 
315 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomieq_sh(__m128h A,
316                                                            __m128h B) {
317   return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_EQ_OQ,
318                                 _MM_FROUND_CUR_DIRECTION);
319 }
320 
321 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomilt_sh(__m128h A,
322                                                            __m128h B) {
323   return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_LT_OQ,
324                                 _MM_FROUND_CUR_DIRECTION);
325 }
326 
327 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomile_sh(__m128h A,
328                                                            __m128h B) {
329   return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_LE_OQ,
330                                 _MM_FROUND_CUR_DIRECTION);
331 }
332 
333 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomigt_sh(__m128h A,
334                                                            __m128h B) {
335   return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_GT_OQ,
336                                 _MM_FROUND_CUR_DIRECTION);
337 }
338 
339 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomige_sh(__m128h A,
340                                                            __m128h B) {
341   return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_GE_OQ,
342                                 _MM_FROUND_CUR_DIRECTION);
343 }
344 
345 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomineq_sh(__m128h A,
346                                                             __m128h B) {
347   return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_NEQ_UQ,
348                                 _MM_FROUND_CUR_DIRECTION);
349 }
350 
351 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_add_ph(__m512h __A,
352                                                               __m512h __B) {
353   return (__m512h)((__v32hf)__A + (__v32hf)__B);
354 }
355 
356 static __inline__ __m512h __DEFAULT_FN_ATTRS512
357 _mm512_mask_add_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
358   return (__m512h)__builtin_ia32_selectph_512(
359       (__mmask32)__U, (__v32hf)_mm512_add_ph(__A, __B), (__v32hf)__W);
360 }
361 
362 static __inline__ __m512h __DEFAULT_FN_ATTRS512
363 _mm512_maskz_add_ph(__mmask32 __U, __m512h __A, __m512h __B) {
364   return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
365                                               (__v32hf)_mm512_add_ph(__A, __B),
366                                               (__v32hf)_mm512_setzero_ph());
367 }
368 
/// Packed half-precision add through __builtin_ia32_addph512; \a R is
/// forwarded as the builtin's rounding operand.
#define _mm512_add_round_ph(A, B, R)                                           \
  ((__m512h)__builtin_ia32_addph512(                                           \
      (__v32hf)(__m512h)(A),                                                   \
      (__v32hf)(__m512h)(B),                                                   \
      (int)(R)))

/// Merge-masked variant: selects between the rounded sum and \a W under
/// mask \a U.
#define _mm512_mask_add_round_ph(W, U, A, B, R)                                \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U),                                                          \
      (__v32hf)_mm512_add_round_ph((A), (B), (R)),                             \
      (__v32hf)(__m512h)(W)))

/// Zero-masked variant: selects between the rounded sum and an all-zero
/// vector under mask \a U.
#define _mm512_maskz_add_round_ph(U, A, B, R)                                  \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U),                                                          \
      (__v32hf)_mm512_add_round_ph((A), (B), (R)),                             \
      (__v32hf)_mm512_setzero_ph()))
382 
383 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_sub_ph(__m512h __A,
384                                                               __m512h __B) {
385   return (__m512h)((__v32hf)__A - (__v32hf)__B);
386 }
387 
388 static __inline__ __m512h __DEFAULT_FN_ATTRS512
389 _mm512_mask_sub_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
390   return (__m512h)__builtin_ia32_selectph_512(
391       (__mmask32)__U, (__v32hf)_mm512_sub_ph(__A, __B), (__v32hf)__W);
392 }
393 
394 static __inline__ __m512h __DEFAULT_FN_ATTRS512
395 _mm512_maskz_sub_ph(__mmask32 __U, __m512h __A, __m512h __B) {
396   return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
397                                               (__v32hf)_mm512_sub_ph(__A, __B),
398                                               (__v32hf)_mm512_setzero_ph());
399 }
400 
/// Packed half-precision subtract through __builtin_ia32_subph512; \a R is
/// forwarded as the builtin's rounding operand.
#define _mm512_sub_round_ph(A, B, R)                                           \
  ((__m512h)__builtin_ia32_subph512(                                           \
      (__v32hf)(__m512h)(A),                                                   \
      (__v32hf)(__m512h)(B),                                                   \
      (int)(R)))

/// Merge-masked variant: selects between the rounded difference and \a W
/// under mask \a U.
#define _mm512_mask_sub_round_ph(W, U, A, B, R)                                \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U),                                                          \
      (__v32hf)_mm512_sub_round_ph((A), (B), (R)),                             \
      (__v32hf)(__m512h)(W)))

/// Zero-masked variant: selects between the rounded difference and an
/// all-zero vector under mask \a U.
#define _mm512_maskz_sub_round_ph(U, A, B, R)                                  \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U),                                                          \
      (__v32hf)_mm512_sub_round_ph((A), (B), (R)),                             \
      (__v32hf)_mm512_setzero_ph()))
414 
415 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_mul_ph(__m512h __A,
416                                                               __m512h __B) {
417   return (__m512h)((__v32hf)__A * (__v32hf)__B);
418 }
419 
420 static __inline__ __m512h __DEFAULT_FN_ATTRS512
421 _mm512_mask_mul_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
422   return (__m512h)__builtin_ia32_selectph_512(
423       (__mmask32)__U, (__v32hf)_mm512_mul_ph(__A, __B), (__v32hf)__W);
424 }
425 
426 static __inline__ __m512h __DEFAULT_FN_ATTRS512
427 _mm512_maskz_mul_ph(__mmask32 __U, __m512h __A, __m512h __B) {
428   return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
429                                               (__v32hf)_mm512_mul_ph(__A, __B),
430                                               (__v32hf)_mm512_setzero_ph());
431 }
432 
/// Packed half-precision multiply through __builtin_ia32_mulph512; \a R is
/// forwarded as the builtin's rounding operand.
#define _mm512_mul_round_ph(A, B, R)                                           \
  ((__m512h)__builtin_ia32_mulph512(                                           \
      (__v32hf)(__m512h)(A),                                                   \
      (__v32hf)(__m512h)(B),                                                   \
      (int)(R)))

/// Merge-masked variant: selects between the rounded product and \a W under
/// mask \a U.
#define _mm512_mask_mul_round_ph(W, U, A, B, R)                                \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U),                                                          \
      (__v32hf)_mm512_mul_round_ph((A), (B), (R)),                             \
      (__v32hf)(__m512h)(W)))

/// Zero-masked variant: selects between the rounded product and an all-zero
/// vector under mask \a U.
#define _mm512_maskz_mul_round_ph(U, A, B, R)                                  \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U),                                                          \
      (__v32hf)_mm512_mul_round_ph((A), (B), (R)),                             \
      (__v32hf)_mm512_setzero_ph()))
446 
447 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_div_ph(__m512h __A,
448                                                               __m512h __B) {
449   return (__m512h)((__v32hf)__A / (__v32hf)__B);
450 }
451 
452 static __inline__ __m512h __DEFAULT_FN_ATTRS512
453 _mm512_mask_div_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
454   return (__m512h)__builtin_ia32_selectph_512(
455       (__mmask32)__U, (__v32hf)_mm512_div_ph(__A, __B), (__v32hf)__W);
456 }
457 
458 static __inline__ __m512h __DEFAULT_FN_ATTRS512
459 _mm512_maskz_div_ph(__mmask32 __U, __m512h __A, __m512h __B) {
460   return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
461                                               (__v32hf)_mm512_div_ph(__A, __B),
462                                               (__v32hf)_mm512_setzero_ph());
463 }
464 
/// Packed half-precision divide through __builtin_ia32_divph512; \a R is
/// forwarded as the builtin's rounding operand.
#define _mm512_div_round_ph(A, B, R)                                           \
  ((__m512h)__builtin_ia32_divph512(                                           \
      (__v32hf)(__m512h)(A),                                                   \
      (__v32hf)(__m512h)(B),                                                   \
      (int)(R)))

/// Merge-masked variant: selects between the rounded quotient and \a W under
/// mask \a U.
#define _mm512_mask_div_round_ph(W, U, A, B, R)                                \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U),                                                          \
      (__v32hf)_mm512_div_round_ph((A), (B), (R)),                             \
      (__v32hf)(__m512h)(W)))

/// Zero-masked variant: selects between the rounded quotient and an all-zero
/// vector under mask \a U.
#define _mm512_maskz_div_round_ph(U, A, B, R)                                  \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U),                                                          \
      (__v32hf)_mm512_div_round_ph((A), (B), (R)),                             \
      (__v32hf)_mm512_setzero_ph()))
478 
479 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_min_ph(__m512h __A,
480                                                               __m512h __B) {
481   return (__m512h)__builtin_ia32_minph512((__v32hf)__A, (__v32hf)__B,
482                                           _MM_FROUND_CUR_DIRECTION);
483 }
484 
485 static __inline__ __m512h __DEFAULT_FN_ATTRS512
486 _mm512_mask_min_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
487   return (__m512h)__builtin_ia32_selectph_512(
488       (__mmask32)__U, (__v32hf)_mm512_min_ph(__A, __B), (__v32hf)__W);
489 }
490 
491 static __inline__ __m512h __DEFAULT_FN_ATTRS512
492 _mm512_maskz_min_ph(__mmask32 __U, __m512h __A, __m512h __B) {
493   return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
494                                               (__v32hf)_mm512_min_ph(__A, __B),
495                                               (__v32hf)_mm512_setzero_ph());
496 }
497 
/// Packed half-precision minimum through __builtin_ia32_minph512; \a R is
/// forwarded as the builtin's last operand.
#define _mm512_min_round_ph(A, B, R)                                           \
  ((__m512h)__builtin_ia32_minph512(                                           \
      (__v32hf)(__m512h)(A),                                                   \
      (__v32hf)(__m512h)(B),                                                   \
      (int)(R)))

/// Merge-masked variant: selects between the minimum and \a W under mask
/// \a U.
#define _mm512_mask_min_round_ph(W, U, A, B, R)                                \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U),                                                          \
      (__v32hf)_mm512_min_round_ph((A), (B), (R)),                             \
      (__v32hf)(__m512h)(W)))

/// Zero-masked variant: selects between the minimum and an all-zero vector
/// under mask \a U.
#define _mm512_maskz_min_round_ph(U, A, B, R)                                  \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U),                                                          \
      (__v32hf)_mm512_min_round_ph((A), (B), (R)),                             \
      (__v32hf)_mm512_setzero_ph()))
511 
512 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_max_ph(__m512h __A,
513                                                               __m512h __B) {
514   return (__m512h)__builtin_ia32_maxph512((__v32hf)__A, (__v32hf)__B,
515                                           _MM_FROUND_CUR_DIRECTION);
516 }
517 
518 static __inline__ __m512h __DEFAULT_FN_ATTRS512
519 _mm512_mask_max_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
520   return (__m512h)__builtin_ia32_selectph_512(
521       (__mmask32)__U, (__v32hf)_mm512_max_ph(__A, __B), (__v32hf)__W);
522 }
523 
524 static __inline__ __m512h __DEFAULT_FN_ATTRS512
525 _mm512_maskz_max_ph(__mmask32 __U, __m512h __A, __m512h __B) {
526   return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
527                                               (__v32hf)_mm512_max_ph(__A, __B),
528                                               (__v32hf)_mm512_setzero_ph());
529 }
530 
/// Packed half-precision maximum through __builtin_ia32_maxph512; \a R is
/// forwarded as the builtin's last operand.
#define _mm512_max_round_ph(A, B, R)                                           \
  ((__m512h)__builtin_ia32_maxph512(                                           \
      (__v32hf)(__m512h)(A),                                                   \
      (__v32hf)(__m512h)(B),                                                   \
      (int)(R)))

/// Merge-masked variant: selects between the maximum and \a W under mask
/// \a U.
#define _mm512_mask_max_round_ph(W, U, A, B, R)                                \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U),                                                          \
      (__v32hf)_mm512_max_round_ph((A), (B), (R)),                             \
      (__v32hf)(__m512h)(W)))

/// Zero-masked variant: selects between the maximum and an all-zero vector
/// under mask \a U.
#define _mm512_maskz_max_round_ph(U, A, B, R)                                  \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U),                                                          \
      (__v32hf)_mm512_max_round_ph((A), (B), (R)),                             \
      (__v32hf)_mm512_setzero_ph()))
544 
545 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_abs_ph(__m512h __A) {
546   return (__m512h)_mm512_and_epi32(_mm512_set1_epi32(0x7FFF7FFF), (__m512i)__A);
547 }
548 
549 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_conj_pch(__m512h __A) {
550   return (__m512h)_mm512_xor_ps((__m512)__A, _mm512_set1_ps(-0.0f));
551 }
552 
553 static __inline__ __m512h __DEFAULT_FN_ATTRS512
554 _mm512_mask_conj_pch(__m512h __W, __mmask16 __U, __m512h __A) {
555   return (__m512h)__builtin_ia32_selectps_512(
556       (__mmask16)__U, (__v16sf)_mm512_conj_pch(__A), (__v16sf)__W);
557 }
558 
559 static __inline__ __m512h __DEFAULT_FN_ATTRS512
560 _mm512_maskz_conj_pch(__mmask16 __U, __m512h __A) {
561   return (__m512h)__builtin_ia32_selectps_512((__mmask16)__U,
562                                               (__v16sf)_mm512_conj_pch(__A),
563                                               (__v16sf)_mm512_setzero_ps());
564 }
565 
566 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_add_sh(__m128h __A,
567                                                            __m128h __B) {
568   __A[0] += __B[0];
569   return __A;
570 }
571 
572 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_add_sh(__m128h __W,
573                                                                 __mmask8 __U,
574                                                                 __m128h __A,
575                                                                 __m128h __B) {
576   __A = _mm_add_sh(__A, __B);
577   return __builtin_ia32_selectsh_128(__U, __A, __W);
578 }
579 
580 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_add_sh(__mmask8 __U,
581                                                                  __m128h __A,
582                                                                  __m128h __B) {
583   __A = _mm_add_sh(__A, __B);
584   return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
585 }
586 
/// Scalar half-precision add through __builtin_ia32_addsh_round_mask with an
/// all-ones mask and a zero pass-through vector; \a R is forwarded as the
/// rounding operand.
#define _mm_add_round_sh(A, B, R)                                              \
  ((__m128h)__builtin_ia32_addsh_round_mask(                                   \
      (__v8hf)(__m128h)(A),                                                    \
      (__v8hf)(__m128h)(B),                                                    \
      (__v8hf)_mm_setzero_ph(),                                                \
      (__mmask8)-1,                                                            \
      (int)(R)))

/// Masked variant: \a W is the pass-through vector and \a U the mask.
#define _mm_mask_add_round_sh(W, U, A, B, R)                                   \
  ((__m128h)__builtin_ia32_addsh_round_mask(                                   \
      (__v8hf)(__m128h)(A),                                                    \
      (__v8hf)(__m128h)(B),                                                    \
      (__v8hf)(__m128h)(W),                                                    \
      (__mmask8)(U),                                                           \
      (int)(R)))

/// Zero-masked variant: an all-zero pass-through vector with mask \a U.
#define _mm_maskz_add_round_sh(U, A, B, R)                                     \
  ((__m128h)__builtin_ia32_addsh_round_mask(                                   \
      (__v8hf)(__m128h)(A),                                                    \
      (__v8hf)(__m128h)(B),                                                    \
      (__v8hf)_mm_setzero_ph(),                                                \
      (__mmask8)(U),                                                           \
      (int)(R)))
601 
602 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sub_sh(__m128h __A,
603                                                            __m128h __B) {
604   __A[0] -= __B[0];
605   return __A;
606 }
607 
608 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sub_sh(__m128h __W,
609                                                                 __mmask8 __U,
610                                                                 __m128h __A,
611                                                                 __m128h __B) {
612   __A = _mm_sub_sh(__A, __B);
613   return __builtin_ia32_selectsh_128(__U, __A, __W);
614 }
615 
616 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sub_sh(__mmask8 __U,
617                                                                  __m128h __A,
618                                                                  __m128h __B) {
619   __A = _mm_sub_sh(__A, __B);
620   return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
621 }
622 
/// Scalar half-precision subtract through __builtin_ia32_subsh_round_mask
/// with an all-ones mask and a zero pass-through vector; \a R is forwarded as
/// the rounding operand.
#define _mm_sub_round_sh(A, B, R)                                              \
  ((__m128h)__builtin_ia32_subsh_round_mask(                                   \
      (__v8hf)(__m128h)(A),                                                    \
      (__v8hf)(__m128h)(B),                                                    \
      (__v8hf)_mm_setzero_ph(),                                                \
      (__mmask8)-1,                                                            \
      (int)(R)))

/// Masked variant: \a W is the pass-through vector and \a U the mask.
#define _mm_mask_sub_round_sh(W, U, A, B, R)                                   \
  ((__m128h)__builtin_ia32_subsh_round_mask(                                   \
      (__v8hf)(__m128h)(A),                                                    \
      (__v8hf)(__m128h)(B),                                                    \
      (__v8hf)(__m128h)(W),                                                    \
      (__mmask8)(U),                                                           \
      (int)(R)))

/// Zero-masked variant: an all-zero pass-through vector with mask \a U.
#define _mm_maskz_sub_round_sh(U, A, B, R)                                     \
  ((__m128h)__builtin_ia32_subsh_round_mask(                                   \
      (__v8hf)(__m128h)(A),                                                    \
      (__v8hf)(__m128h)(B),                                                    \
      (__v8hf)_mm_setzero_ph(),                                                \
      (__mmask8)(U),                                                           \
      (int)(R)))
637 
638 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mul_sh(__m128h __A,
639                                                            __m128h __B) {
640   __A[0] *= __B[0];
641   return __A;
642 }
643 
644 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_mul_sh(__m128h __W,
645                                                                 __mmask8 __U,
646                                                                 __m128h __A,
647                                                                 __m128h __B) {
648   __A = _mm_mul_sh(__A, __B);
649   return __builtin_ia32_selectsh_128(__U, __A, __W);
650 }
651 
652 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_mul_sh(__mmask8 __U,
653                                                                  __m128h __A,
654                                                                  __m128h __B) {
655   __A = _mm_mul_sh(__A, __B);
656   return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
657 }
658 
/// Scalar half-precision multiply through __builtin_ia32_mulsh_round_mask
/// with an all-ones mask and a zero pass-through vector; \a R is forwarded as
/// the rounding operand.
#define _mm_mul_round_sh(A, B, R)                                              \
  ((__m128h)__builtin_ia32_mulsh_round_mask(                                   \
      (__v8hf)(__m128h)(A),                                                    \
      (__v8hf)(__m128h)(B),                                                    \
      (__v8hf)_mm_setzero_ph(),                                                \
      (__mmask8)-1,                                                            \
      (int)(R)))

/// Masked variant: \a W is the pass-through vector and \a U the mask.
#define _mm_mask_mul_round_sh(W, U, A, B, R)                                   \
  ((__m128h)__builtin_ia32_mulsh_round_mask(                                   \
      (__v8hf)(__m128h)(A),                                                    \
      (__v8hf)(__m128h)(B),                                                    \
      (__v8hf)(__m128h)(W),                                                    \
      (__mmask8)(U),                                                           \
      (int)(R)))

/// Zero-masked variant: an all-zero pass-through vector with mask \a U.
#define _mm_maskz_mul_round_sh(U, A, B, R)                                     \
  ((__m128h)__builtin_ia32_mulsh_round_mask(                                   \
      (__v8hf)(__m128h)(A),                                                    \
      (__v8hf)(__m128h)(B),                                                    \
      (__v8hf)_mm_setzero_ph(),                                                \
      (__mmask8)(U),                                                           \
      (int)(R)))
673 
674 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_div_sh(__m128h __A,
675                                                            __m128h __B) {
676   __A[0] /= __B[0];
677   return __A;
678 }
679 
680 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_div_sh(__m128h __W,
681                                                                 __mmask8 __U,
682                                                                 __m128h __A,
683                                                                 __m128h __B) {
684   __A = _mm_div_sh(__A, __B);
685   return __builtin_ia32_selectsh_128(__U, __A, __W);
686 }
687 
688 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_div_sh(__mmask8 __U,
689                                                                  __m128h __A,
690                                                                  __m128h __B) {
691   __A = _mm_div_sh(__A, __B);
692   return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
693 }
694 
/// Expands to the divsh_round_mask builtin on \a A and \a B with a zero
/// pass-through vector, an all-ones mask and rounding-control immediate \a R.
#define _mm_div_round_sh(A, B, R) \
  ((__m128h)__builtin_ia32_divsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, (int)(R)))

/// Expands to the divsh_round_mask builtin on \a A and \a B with pass-through
/// vector \a W, mask \a U and rounding-control immediate \a R.
#define _mm_mask_div_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_divsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)(__m128h)(W), \
      (__mmask8)(U), (int)(R)))

/// Expands to the divsh_round_mask builtin on \a A and \a B with a zero
/// pass-through vector, mask \a U and rounding-control immediate \a R.
#define _mm_maskz_div_round_sh(U, A, B, R) \
  ((__m128h)__builtin_ia32_divsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), (int)(R)))
709 
710 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_min_sh(__m128h __A,
711                                                            __m128h __B) {
712   return (__m128h)__builtin_ia32_minsh_round_mask(
713       (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
714       _MM_FROUND_CUR_DIRECTION);
715 }
716 
717 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_min_sh(__m128h __W,
718                                                                 __mmask8 __U,
719                                                                 __m128h __A,
720                                                                 __m128h __B) {
721   return (__m128h)__builtin_ia32_minsh_round_mask((__v8hf)__A, (__v8hf)__B,
722                                                   (__v8hf)__W, (__mmask8)__U,
723                                                   _MM_FROUND_CUR_DIRECTION);
724 }
725 
726 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_min_sh(__mmask8 __U,
727                                                                  __m128h __A,
728                                                                  __m128h __B) {
729   return (__m128h)__builtin_ia32_minsh_round_mask(
730       (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
731       _MM_FROUND_CUR_DIRECTION);
732 }
733 
/// Expands to the minsh_round_mask builtin on \a A and \a B with a zero
/// pass-through vector, an all-ones mask and immediate \a R as last operand.
#define _mm_min_round_sh(A, B, R) \
  ((__m128h)__builtin_ia32_minsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, (int)(R)))

/// Expands to the minsh_round_mask builtin on \a A and \a B with pass-through
/// vector \a W, mask \a U and immediate \a R as last operand.
#define _mm_mask_min_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_minsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)(__m128h)(W), \
      (__mmask8)(U), (int)(R)))

/// Expands to the minsh_round_mask builtin on \a A and \a B with a zero
/// pass-through vector, mask \a U and immediate \a R as last operand.
#define _mm_maskz_min_round_sh(U, A, B, R) \
  ((__m128h)__builtin_ia32_minsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), (int)(R)))
748 
749 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_max_sh(__m128h __A,
750                                                            __m128h __B) {
751   return (__m128h)__builtin_ia32_maxsh_round_mask(
752       (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
753       _MM_FROUND_CUR_DIRECTION);
754 }
755 
756 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_max_sh(__m128h __W,
757                                                                 __mmask8 __U,
758                                                                 __m128h __A,
759                                                                 __m128h __B) {
760   return (__m128h)__builtin_ia32_maxsh_round_mask((__v8hf)__A, (__v8hf)__B,
761                                                   (__v8hf)__W, (__mmask8)__U,
762                                                   _MM_FROUND_CUR_DIRECTION);
763 }
764 
765 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_max_sh(__mmask8 __U,
766                                                                  __m128h __A,
767                                                                  __m128h __B) {
768   return (__m128h)__builtin_ia32_maxsh_round_mask(
769       (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
770       _MM_FROUND_CUR_DIRECTION);
771 }
772 
/// Expands to the maxsh_round_mask builtin on \a A and \a B with a zero
/// pass-through vector, an all-ones mask and immediate \a R as last operand.
#define _mm_max_round_sh(A, B, R) \
  ((__m128h)__builtin_ia32_maxsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, (int)(R)))

/// Expands to the maxsh_round_mask builtin on \a A and \a B with pass-through
/// vector \a W, mask \a U and immediate \a R as last operand.
#define _mm_mask_max_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_maxsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)(__m128h)(W), \
      (__mmask8)(U), (int)(R)))

/// Expands to the maxsh_round_mask builtin on \a A and \a B with a zero
/// pass-through vector, mask \a U and immediate \a R as last operand.
#define _mm_maskz_max_round_sh(U, A, B, R) \
  ((__m128h)__builtin_ia32_maxsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), (int)(R)))
787 
/// Expands to the cmpph512_mask builtin on \a A and \a B with comparison
/// immediate \a P, an all-ones mask and immediate \a R as last operand; the
/// result is cast to __mmask32.
#define _mm512_cmp_round_ph_mask(A, B, P, R) \
  ((__mmask32)__builtin_ia32_cmpph512_mask( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      (int)(P), (__mmask32)-1, (int)(R)))

/// Same builtin call as _mm512_cmp_round_ph_mask, with \a U as the mask
/// operand instead of all-ones.
#define _mm512_mask_cmp_round_ph_mask(U, A, B, P, R) \
  ((__mmask32)__builtin_ia32_cmpph512_mask( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      (int)(P), (__mmask32)(U), (int)(R)))

/// cmpph512_mask builtin with an all-ones mask and _MM_FROUND_CUR_DIRECTION
/// as the last operand.
#define _mm512_cmp_ph_mask(A, B, P) \
  ((__mmask32)__builtin_ia32_cmpph512_mask( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      (int)(P), (__mmask32)-1, (int)(_MM_FROUND_CUR_DIRECTION)))

/// cmpph512_mask builtin with mask \a U and _MM_FROUND_CUR_DIRECTION as the
/// last operand.
#define _mm512_mask_cmp_ph_mask(U, A, B, P) \
  ((__mmask32)__builtin_ia32_cmpph512_mask( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      (int)(P), (__mmask32)(U), (int)(_MM_FROUND_CUR_DIRECTION)))
803 
/// Expands to the cmpsh_mask builtin on \a X and \a Y with comparison
/// immediate \a P, an all-ones mask and immediate \a R as last operand; the
/// result is cast to __mmask8.
#define _mm_cmp_round_sh_mask(X, Y, P, R) \
  ((__mmask8)__builtin_ia32_cmpsh_mask( \
      (__v8hf)(__m128h)(X), \
      (__v8hf)(__m128h)(Y), \
      (int)(P), (__mmask8)-1, (int)(R)))

/// Same builtin call as _mm_cmp_round_sh_mask, with \a M as the mask operand
/// instead of all-ones.
#define _mm_mask_cmp_round_sh_mask(M, X, Y, P, R) \
  ((__mmask8)__builtin_ia32_cmpsh_mask( \
      (__v8hf)(__m128h)(X), \
      (__v8hf)(__m128h)(Y), \
      (int)(P), (__mmask8)(M), (int)(R)))

/// _mm_cmp_round_sh_mask with _MM_FROUND_CUR_DIRECTION as the last operand.
#define _mm_cmp_sh_mask(X, Y, P) \
  _mm_cmp_round_sh_mask((X), (Y), (P), _MM_FROUND_CUR_DIRECTION)

/// _mm_mask_cmp_round_sh_mask with _MM_FROUND_CUR_DIRECTION as the last
/// operand.
#define _mm_mask_cmp_sh_mask(M, X, Y, P) \
  _mm_mask_cmp_round_sh_mask((M), (X), (Y), (P), _MM_FROUND_CUR_DIRECTION)
823 // loads with vmovsh:
824 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_load_sh(void const *__dp) {
825   struct __mm_load_sh_struct {
826     _Float16 __u;
827   } __attribute__((__packed__, __may_alias__));
828   _Float16 __u = ((const struct __mm_load_sh_struct *)__dp)->__u;
829   return (__m128h){__u, 0, 0, 0, 0, 0, 0, 0};
830 }
831 
832 static __inline__ __m128h __DEFAULT_FN_ATTRS128
833 _mm_mask_load_sh(__m128h __W, __mmask8 __U, const void *__A) {
834   __m128h src = (__v8hf)__builtin_shufflevector(
835       (__v8hf)__W, (__v8hf)_mm_setzero_ph(), 0, 8, 8, 8, 8, 8, 8, 8);
836 
837   return (__m128h)__builtin_ia32_loadsh128_mask((const __v8hf *)__A, src, __U & 1);
838 }
839 
840 static __inline__ __m128h __DEFAULT_FN_ATTRS128
841 _mm_maskz_load_sh(__mmask8 __U, const void *__A) {
842   return (__m128h)__builtin_ia32_loadsh128_mask(
843       (const __v8hf *)__A, (__v8hf)_mm_setzero_ph(), __U & 1);
844 }
845 
846 static __inline__ __m512h __DEFAULT_FN_ATTRS512
847 _mm512_load_ph(void const *__p) {
848   return *(const __m512h *)__p;
849 }
850 
851 static __inline__ __m256h __DEFAULT_FN_ATTRS256
852 _mm256_load_ph(void const *__p) {
853   return *(const __m256h *)__p;
854 }
855 
856 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_load_ph(void const *__p) {
857   return *(const __m128h *)__p;
858 }
859 
860 static __inline__ __m512h __DEFAULT_FN_ATTRS512
861 _mm512_loadu_ph(void const *__p) {
862   struct __loadu_ph {
863     __m512h_u __v;
864   } __attribute__((__packed__, __may_alias__));
865   return ((const struct __loadu_ph *)__p)->__v;
866 }
867 
868 static __inline__ __m256h __DEFAULT_FN_ATTRS256
869 _mm256_loadu_ph(void const *__p) {
870   struct __loadu_ph {
871     __m256h_u __v;
872   } __attribute__((__packed__, __may_alias__));
873   return ((const struct __loadu_ph *)__p)->__v;
874 }
875 
876 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_loadu_ph(void const *__p) {
877   struct __loadu_ph {
878     __m128h_u __v;
879   } __attribute__((__packed__, __may_alias__));
880   return ((const struct __loadu_ph *)__p)->__v;
881 }
882 
883 // stores with vmovsh:
884 static __inline__ void __DEFAULT_FN_ATTRS128 _mm_store_sh(void *__dp,
885                                                           __m128h __a) {
886   struct __mm_store_sh_struct {
887     _Float16 __u;
888   } __attribute__((__packed__, __may_alias__));
889   ((struct __mm_store_sh_struct *)__dp)->__u = __a[0];
890 }
891 
892 static __inline__ void __DEFAULT_FN_ATTRS128 _mm_mask_store_sh(void *__W,
893                                                                __mmask8 __U,
894                                                                __m128h __A) {
895   __builtin_ia32_storesh128_mask((__v8hf *)__W, __A, __U & 1);
896 }
897 
898 static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_store_ph(void *__P,
899                                                              __m512h __A) {
900   *(__m512h *)__P = __A;
901 }
902 
903 static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_store_ph(void *__P,
904                                                              __m256h __A) {
905   *(__m256h *)__P = __A;
906 }
907 
908 static __inline__ void __DEFAULT_FN_ATTRS128 _mm_store_ph(void *__P,
909                                                           __m128h __A) {
910   *(__m128h *)__P = __A;
911 }
912 
913 static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_storeu_ph(void *__P,
914                                                               __m512h __A) {
915   struct __storeu_ph {
916     __m512h_u __v;
917   } __attribute__((__packed__, __may_alias__));
918   ((struct __storeu_ph *)__P)->__v = __A;
919 }
920 
921 static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_storeu_ph(void *__P,
922                                                               __m256h __A) {
923   struct __storeu_ph {
924     __m256h_u __v;
925   } __attribute__((__packed__, __may_alias__));
926   ((struct __storeu_ph *)__P)->__v = __A;
927 }
928 
929 static __inline__ void __DEFAULT_FN_ATTRS128 _mm_storeu_ph(void *__P,
930                                                            __m128h __A) {
931   struct __storeu_ph {
932     __m128h_u __v;
933   } __attribute__((__packed__, __may_alias__));
934   ((struct __storeu_ph *)__P)->__v = __A;
935 }
936 
937 // moves with vmovsh:
938 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_move_sh(__m128h __a,
939                                                             __m128h __b) {
940   __a[0] = __b[0];
941   return __a;
942 }
943 
944 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_move_sh(__m128h __W,
945                                                                  __mmask8 __U,
946                                                                  __m128h __A,
947                                                                  __m128h __B) {
948   return __builtin_ia32_selectsh_128(__U, _mm_move_sh(__A, __B), __W);
949 }
950 
951 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_move_sh(__mmask8 __U,
952                                                                   __m128h __A,
953                                                                   __m128h __B) {
954   return __builtin_ia32_selectsh_128(__U, _mm_move_sh(__A, __B),
955                                      _mm_setzero_ph());
956 }
957 
958 // vmovw:
959 static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtsi16_si128(short __a) {
960   return (__m128i)(__v8hi){__a, 0, 0, 0, 0, 0, 0, 0};
961 }
962 
963 static __inline__ short __DEFAULT_FN_ATTRS128 _mm_cvtsi128_si16(__m128i __a) {
964   __v8hi __b = (__v8hi)__a;
965   return __b[0];
966 }
967 
968 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_rcp_ph(__m512h __A) {
969   return (__m512h)__builtin_ia32_rcpph512_mask(
970       (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1);
971 }
972 
973 static __inline__ __m512h __DEFAULT_FN_ATTRS512
974 _mm512_mask_rcp_ph(__m512h __W, __mmask32 __U, __m512h __A) {
975   return (__m512h)__builtin_ia32_rcpph512_mask((__v32hf)__A, (__v32hf)__W,
976                                                (__mmask32)__U);
977 }
978 
979 static __inline__ __m512h __DEFAULT_FN_ATTRS512
980 _mm512_maskz_rcp_ph(__mmask32 __U, __m512h __A) {
981   return (__m512h)__builtin_ia32_rcpph512_mask(
982       (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U);
983 }
984 
985 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_rsqrt_ph(__m512h __A) {
986   return (__m512h)__builtin_ia32_rsqrtph512_mask(
987       (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1);
988 }
989 
990 static __inline__ __m512h __DEFAULT_FN_ATTRS512
991 _mm512_mask_rsqrt_ph(__m512h __W, __mmask32 __U, __m512h __A) {
992   return (__m512h)__builtin_ia32_rsqrtph512_mask((__v32hf)__A, (__v32hf)__W,
993                                                  (__mmask32)__U);
994 }
995 
996 static __inline__ __m512h __DEFAULT_FN_ATTRS512
997 _mm512_maskz_rsqrt_ph(__mmask32 __U, __m512h __A) {
998   return (__m512h)__builtin_ia32_rsqrtph512_mask(
999       (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U);
1000 }
1001 
/// getmantph512_mask builtin on \a A. The immediate operand is built as
/// ((C) << 2) | (B). Uses an undefined pass-through vector, an all-ones mask
/// and _MM_FROUND_CUR_DIRECTION.
#define _mm512_getmant_ph(A, B, C) \
  ((__m512h)__builtin_ia32_getmantph512_mask( \
      (__v32hf)(__m512h)(A), \
      (int)(((C) << 2) | (B)), \
      (__v32hf)_mm512_undefined_ph(), \
      (__mmask32)-1, _MM_FROUND_CUR_DIRECTION))

/// As _mm512_getmant_ph, with pass-through vector \a W and mask \a U.
#define _mm512_mask_getmant_ph(W, U, A, B, C) \
  ((__m512h)__builtin_ia32_getmantph512_mask( \
      (__v32hf)(__m512h)(A), \
      (int)(((C) << 2) | (B)), \
      (__v32hf)(__m512h)(W), \
      (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))

/// As _mm512_getmant_ph, with a zero pass-through vector and mask \a U.
#define _mm512_maskz_getmant_ph(U, A, B, C) \
  ((__m512h)__builtin_ia32_getmantph512_mask( \
      (__v32hf)(__m512h)(A), \
      (int)(((C) << 2) | (B)), \
      (__v32hf)_mm512_setzero_ph(), \
      (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))

/// As _mm512_getmant_ph, with immediate \a R as the last operand instead of
/// _MM_FROUND_CUR_DIRECTION.
#define _mm512_getmant_round_ph(A, B, C, R) \
  ((__m512h)__builtin_ia32_getmantph512_mask( \
      (__v32hf)(__m512h)(A), \
      (int)(((C) << 2) | (B)), \
      (__v32hf)_mm512_undefined_ph(), \
      (__mmask32)-1, (int)(R)))

/// As _mm512_getmant_round_ph, with pass-through vector \a W and mask \a U.
#define _mm512_mask_getmant_round_ph(W, U, A, B, C, R) \
  ((__m512h)__builtin_ia32_getmantph512_mask( \
      (__v32hf)(__m512h)(A), \
      (int)(((C) << 2) | (B)), \
      (__v32hf)(__m512h)(W), \
      (__mmask32)(U), (int)(R)))

/// As _mm512_getmant_round_ph, with a zero pass-through vector and mask \a U.
#define _mm512_maskz_getmant_round_ph(U, A, B, C, R) \
  ((__m512h)__builtin_ia32_getmantph512_mask( \
      (__v32hf)(__m512h)(A), \
      (int)(((C) << 2) | (B)), \
      (__v32hf)_mm512_setzero_ph(), \
      (__mmask32)(U), (int)(R)))
1032 
1033 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_getexp_ph(__m512h __A) {
1034   return (__m512h)__builtin_ia32_getexpph512_mask(
1035       (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1,
1036       _MM_FROUND_CUR_DIRECTION);
1037 }
1038 
1039 static __inline__ __m512h __DEFAULT_FN_ATTRS512
1040 _mm512_mask_getexp_ph(__m512h __W, __mmask32 __U, __m512h __A) {
1041   return (__m512h)__builtin_ia32_getexpph512_mask(
1042       (__v32hf)__A, (__v32hf)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
1043 }
1044 
1045 static __inline__ __m512h __DEFAULT_FN_ATTRS512
1046 _mm512_maskz_getexp_ph(__mmask32 __U, __m512h __A) {
1047   return (__m512h)__builtin_ia32_getexpph512_mask(
1048       (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
1049       _MM_FROUND_CUR_DIRECTION);
1050 }
1051 
/// getexpph512_mask builtin on \a A with an undefined pass-through vector, an
/// all-ones mask and immediate \a R as last operand.
#define _mm512_getexp_round_ph(A, R) \
  ((__m512h)__builtin_ia32_getexpph512_mask( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)_mm512_undefined_ph(), \
      (__mmask32)-1, (int)(R)))

/// getexpph512_mask builtin on \a A with pass-through vector \a W, mask \a U
/// and immediate \a R as last operand.
#define _mm512_mask_getexp_round_ph(W, U, A, R) \
  ((__m512h)__builtin_ia32_getexpph512_mask( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(W), \
      (__mmask32)(U), (int)(R)))

/// getexpph512_mask builtin on \a A with a zero pass-through vector, mask
/// \a U and immediate \a R as last operand.
#define _mm512_maskz_getexp_round_ph(U, A, R) \
  ((__m512h)__builtin_ia32_getexpph512_mask( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)_mm512_setzero_ph(), \
      (__mmask32)(U), (int)(R)))
1065 
1066 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_scalef_ph(__m512h __A,
1067                                                                  __m512h __B) {
1068   return (__m512h)__builtin_ia32_scalefph512_mask(
1069       (__v32hf)__A, (__v32hf)__B, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1,
1070       _MM_FROUND_CUR_DIRECTION);
1071 }
1072 
1073 static __inline__ __m512h __DEFAULT_FN_ATTRS512
1074 _mm512_mask_scalef_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
1075   return (__m512h)__builtin_ia32_scalefph512_mask((__v32hf)__A, (__v32hf)__B,
1076                                                   (__v32hf)__W, (__mmask32)__U,
1077                                                   _MM_FROUND_CUR_DIRECTION);
1078 }
1079 
1080 static __inline__ __m512h __DEFAULT_FN_ATTRS512
1081 _mm512_maskz_scalef_ph(__mmask32 __U, __m512h __A, __m512h __B) {
1082   return (__m512h)__builtin_ia32_scalefph512_mask(
1083       (__v32hf)__A, (__v32hf)__B, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
1084       _MM_FROUND_CUR_DIRECTION);
1085 }
1086 
/// scalefph512_mask builtin on \a A and \a B with an undefined pass-through
/// vector, an all-ones mask and rounding-control immediate \a R.
#define _mm512_scalef_round_ph(A, B, R) \
  ((__m512h)__builtin_ia32_scalefph512_mask( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      (__v32hf)_mm512_undefined_ph(), \
      (__mmask32)-1, (int)(R)))

/// scalefph512_mask builtin on \a A and \a B with pass-through vector \a W,
/// mask \a U and rounding-control immediate \a R.
#define _mm512_mask_scalef_round_ph(W, U, A, B, R) \
  ((__m512h)__builtin_ia32_scalefph512_mask( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      (__v32hf)(__m512h)(W), \
      (__mmask32)(U), (int)(R)))

/// scalefph512_mask builtin on \a A and \a B with a zero pass-through vector,
/// mask \a U and rounding-control immediate \a R.
#define _mm512_maskz_scalef_round_ph(U, A, B, R) \
  ((__m512h)__builtin_ia32_scalefph512_mask( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      (__v32hf)_mm512_setzero_ph(), \
      (__mmask32)(U), (int)(R)))
1101 
/// rndscaleph_mask builtin on \a A with immediate \a B, an all-ones mask and
/// _MM_FROUND_CUR_DIRECTION.
///
/// The pass-through operand is _mm512_undefined_ph(), as in
/// _mm512_roundscale_round_ph, so that \a A is expanded (and therefore
/// evaluated) exactly once. With an all-ones mask no element is taken from
/// the pass-through operand.
#define _mm512_roundscale_ph(A, B)                                             \
  ((__m512h)__builtin_ia32_rndscaleph_mask(                                    \
      (__v32hf)(__m512h)(A), (int)(B), (__v32hf)_mm512_undefined_ph(),         \
      (__mmask32)-1, _MM_FROUND_CUR_DIRECTION))
1106 
/// rndscaleph_mask builtin on source \a C with immediate \a imm, pass-through
/// vector \a A, mask \a B and _MM_FROUND_CUR_DIRECTION.
#define _mm512_mask_roundscale_ph(A, B, C, imm) \
  ((__m512h)__builtin_ia32_rndscaleph_mask( \
      (__v32hf)(__m512h)(C), \
      (int)(imm), \
      (__v32hf)(__m512h)(A), \
      (__mmask32)(B), _MM_FROUND_CUR_DIRECTION))

/// rndscaleph_mask builtin on source \a B with immediate \a imm, a zero
/// pass-through vector, mask \a A and _MM_FROUND_CUR_DIRECTION.
#define _mm512_maskz_roundscale_ph(A, B, imm) \
  ((__m512h)__builtin_ia32_rndscaleph_mask( \
      (__v32hf)(__m512h)(B), \
      (int)(imm), \
      (__v32hf)_mm512_setzero_ph(), \
      (__mmask32)(A), _MM_FROUND_CUR_DIRECTION))

/// rndscaleph_mask builtin on source \a C with immediate \a imm, pass-through
/// vector \a A, mask \a B and immediate \a R as last operand.
#define _mm512_mask_roundscale_round_ph(A, B, C, imm, R) \
  ((__m512h)__builtin_ia32_rndscaleph_mask( \
      (__v32hf)(__m512h)(C), \
      (int)(imm), \
      (__v32hf)(__m512h)(A), \
      (__mmask32)(B), (int)(R)))

/// rndscaleph_mask builtin on source \a B with immediate \a imm, a zero
/// pass-through vector, mask \a A and immediate \a R as last operand.
#define _mm512_maskz_roundscale_round_ph(A, B, imm, R) \
  ((__m512h)__builtin_ia32_rndscaleph_mask( \
      (__v32hf)(__m512h)(B), \
      (int)(imm), \
      (__v32hf)_mm512_setzero_ph(), \
      (__mmask32)(A), (int)(R)))

/// rndscaleph_mask builtin on source \a A with immediate \a imm, an undefined
/// pass-through vector, an all-ones mask and immediate \a R as last operand.
#define _mm512_roundscale_round_ph(A, imm, R) \
  ((__m512h)__builtin_ia32_rndscaleph_mask( \
      (__v32hf)(__m512h)(A), \
      (int)(imm), \
      (__v32hf)_mm512_undefined_ph(), \
      (__mmask32)-1, (int)(R)))
1131 
/// reduceph512_mask builtin on \a A with immediate \a imm, an undefined
/// pass-through vector, an all-ones mask and _MM_FROUND_CUR_DIRECTION.
#define _mm512_reduce_ph(A, imm) \
  ((__m512h)__builtin_ia32_reduceph512_mask( \
      (__v32hf)(__m512h)(A), \
      (int)(imm), \
      (__v32hf)_mm512_undefined_ph(), \
      (__mmask32)-1, _MM_FROUND_CUR_DIRECTION))

/// reduceph512_mask builtin on \a A with immediate \a imm, pass-through
/// vector \a W, mask \a U and _MM_FROUND_CUR_DIRECTION.
#define _mm512_mask_reduce_ph(W, U, A, imm) \
  ((__m512h)__builtin_ia32_reduceph512_mask( \
      (__v32hf)(__m512h)(A), \
      (int)(imm), \
      (__v32hf)(__m512h)(W), \
      (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))

/// reduceph512_mask builtin on \a A with immediate \a imm, a zero
/// pass-through vector, mask \a U and _MM_FROUND_CUR_DIRECTION.
#define _mm512_maskz_reduce_ph(U, A, imm) \
  ((__m512h)__builtin_ia32_reduceph512_mask( \
      (__v32hf)(__m512h)(A), \
      (int)(imm), \
      (__v32hf)_mm512_setzero_ph(), \
      (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))

/// reduceph512_mask builtin on \a A with immediate \a imm, pass-through
/// vector \a W, mask \a U and immediate \a R as last operand.
#define _mm512_mask_reduce_round_ph(W, U, A, imm, R) \
  ((__m512h)__builtin_ia32_reduceph512_mask( \
      (__v32hf)(__m512h)(A), \
      (int)(imm), \
      (__v32hf)(__m512h)(W), \
      (__mmask32)(U), (int)(R)))

/// reduceph512_mask builtin on \a A with immediate \a imm, a zero
/// pass-through vector, mask \a U and immediate \a R as last operand.
#define _mm512_maskz_reduce_round_ph(U, A, imm, R) \
  ((__m512h)__builtin_ia32_reduceph512_mask( \
      (__v32hf)(__m512h)(A), \
      (int)(imm), \
      (__v32hf)_mm512_setzero_ph(), \
      (__mmask32)(U), (int)(R)))

/// reduceph512_mask builtin on \a A with immediate \a imm, an undefined
/// pass-through vector, an all-ones mask and immediate \a R as last operand.
#define _mm512_reduce_round_ph(A, imm, R) \
  ((__m512h)__builtin_ia32_reduceph512_mask( \
      (__v32hf)(__m512h)(A), \
      (int)(imm), \
      (__v32hf)_mm512_undefined_ph(), \
      (__mmask32)-1, (int)(R)))
1161 
1162 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_rcp_sh(__m128h __A,
1163                                                            __m128h __B) {
1164   return (__m128h)__builtin_ia32_rcpsh_mask(
1165       (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1);
1166 }
1167 
1168 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_rcp_sh(__m128h __W,
1169                                                                 __mmask8 __U,
1170                                                                 __m128h __A,
1171                                                                 __m128h __B) {
1172   return (__m128h)__builtin_ia32_rcpsh_mask((__v8hf)__A, (__v8hf)__B,
1173                                             (__v8hf)__W, (__mmask8)__U);
1174 }
1175 
1176 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_rcp_sh(__mmask8 __U,
1177                                                                  __m128h __A,
1178                                                                  __m128h __B) {
1179   return (__m128h)__builtin_ia32_rcpsh_mask(
1180       (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
1181 }
1182 
1183 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_rsqrt_sh(__m128h __A,
1184                                                              __m128h __B) {
1185   return (__m128h)__builtin_ia32_rsqrtsh_mask(
1186       (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1);
1187 }
1188 
1189 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_rsqrt_sh(__m128h __W,
1190                                                                   __mmask8 __U,
1191                                                                   __m128h __A,
1192                                                                   __m128h __B) {
1193   return (__m128h)__builtin_ia32_rsqrtsh_mask((__v8hf)__A, (__v8hf)__B,
1194                                               (__v8hf)__W, (__mmask8)__U);
1195 }
1196 
1197 static __inline__ __m128h __DEFAULT_FN_ATTRS128
1198 _mm_maskz_rsqrt_sh(__mmask8 __U, __m128h __A, __m128h __B) {
1199   return (__m128h)__builtin_ia32_rsqrtsh_mask(
1200       (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
1201 }
1202 
/// getmantsh_round_mask builtin on \a A and \a B. The immediate operand is
/// built as ((D) << 2) | (C). Uses a zero pass-through vector, an all-ones
/// mask and immediate \a R as last operand.
#define _mm_getmant_round_sh(A, B, C, D, R) \
  ((__m128h)__builtin_ia32_getmantsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (int)(((D) << 2) | (C)), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, (int)(R)))

/// As _mm_getmant_round_sh, with _MM_FROUND_CUR_DIRECTION as last operand.
#define _mm_getmant_sh(A, B, C, D) \
  ((__m128h)__builtin_ia32_getmantsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (int)(((D) << 2) | (C)), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, _MM_FROUND_CUR_DIRECTION))

/// As _mm_getmant_sh, with pass-through vector \a W and mask \a U.
#define _mm_mask_getmant_sh(W, U, A, B, C, D) \
  ((__m128h)__builtin_ia32_getmantsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (int)(((D) << 2) | (C)), \
      (__v8hf)(__m128h)(W), \
      (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))

/// As _mm_getmant_round_sh, with pass-through vector \a W and mask \a U.
#define _mm_mask_getmant_round_sh(W, U, A, B, C, D, R) \
  ((__m128h)__builtin_ia32_getmantsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (int)(((D) << 2) | (C)), \
      (__v8hf)(__m128h)(W), \
      (__mmask8)(U), (int)(R)))

/// As _mm_getmant_sh, with a zero pass-through vector and mask \a U.
#define _mm_maskz_getmant_sh(U, A, B, C, D) \
  ((__m128h)__builtin_ia32_getmantsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (int)(((D) << 2) | (C)), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))

/// As _mm_getmant_round_sh, with a zero pass-through vector and mask \a U.
#define _mm_maskz_getmant_round_sh(U, A, B, C, D, R) \
  ((__m128h)__builtin_ia32_getmantsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (int)(((D) << 2) | (C)), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), (int)(R)))
1232 
/// getexpsh128_round_mask builtin on \a A and \a B with a zero pass-through
/// vector, an all-ones mask and immediate \a R as last operand.
#define _mm_getexp_round_sh(A, B, R) \
  ((__m128h)__builtin_ia32_getexpsh128_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, (int)(R)))
1237 
1238 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_getexp_sh(__m128h __A,
1239                                                               __m128h __B) {
1240   return (__m128h)__builtin_ia32_getexpsh128_round_mask(
1241       (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
1242       _MM_FROUND_CUR_DIRECTION);
1243 }
1244 
1245 static __inline__ __m128h __DEFAULT_FN_ATTRS128
1246 _mm_mask_getexp_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
1247   return (__m128h)__builtin_ia32_getexpsh128_round_mask(
1248       (__v8hf)__A, (__v8hf)__B, (__v8hf)__W, (__mmask8)__U,
1249       _MM_FROUND_CUR_DIRECTION);
1250 }
1251 
/// getexpsh128_round_mask builtin on \a A and \a B with pass-through vector
/// \a W, mask \a U and immediate \a R as last operand.
#define _mm_mask_getexp_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_getexpsh128_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)(__m128h)(W), \
      (__mmask8)(U), (int)(R)))
1256 
1257 static __inline__ __m128h __DEFAULT_FN_ATTRS128
1258 _mm_maskz_getexp_sh(__mmask8 __U, __m128h __A, __m128h __B) {
1259   return (__m128h)__builtin_ia32_getexpsh128_round_mask(
1260       (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
1261       _MM_FROUND_CUR_DIRECTION);
1262 }
1263 
/// getexpsh128_round_mask builtin on \a A and \a B with a zero pass-through
/// vector, mask \a U and immediate \a R as last operand.
#define _mm_maskz_getexp_round_sh(U, A, B, R) \
  ((__m128h)__builtin_ia32_getexpsh128_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), (int)(R)))
1268 
/* Forwards A and B to __builtin_ia32_scalefsh_round_mask together with a zero
 * vector, the all-ones mask (__mmask8)-1 and R converted to int. */
#define _mm_scalef_round_sh(A, B, R) \
  ((__m128h)__builtin_ia32_scalefsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, \
      (int)(R)))
1273 
1274 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_scalef_sh(__m128h __A,
1275                                                               __m128h __B) {
1276   return (__m128h)__builtin_ia32_scalefsh_round_mask(
1277       (__v8hf)__A, (__v8hf)(__B), (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
1278       _MM_FROUND_CUR_DIRECTION);
1279 }
1280 
1281 static __inline__ __m128h __DEFAULT_FN_ATTRS128
1282 _mm_mask_scalef_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
1283   return (__m128h)__builtin_ia32_scalefsh_round_mask((__v8hf)__A, (__v8hf)__B,
1284                                                      (__v8hf)__W, (__mmask8)__U,
1285                                                      _MM_FROUND_CUR_DIRECTION);
1286 }
1287 
/* Forwards A and B to __builtin_ia32_scalefsh_round_mask with W as the third
 * vector operand, U as the mask and R converted to int. */
#define _mm_mask_scalef_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_scalefsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)(__m128h)(W), \
      (__mmask8)(U), \
      (int)(R)))
1292 
1293 static __inline__ __m128h __DEFAULT_FN_ATTRS128
1294 _mm_maskz_scalef_sh(__mmask8 __U, __m128h __A, __m128h __B) {
1295   return (__m128h)__builtin_ia32_scalefsh_round_mask(
1296       (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
1297       _MM_FROUND_CUR_DIRECTION);
1298 }
1299 
/* Forwards A and B to __builtin_ia32_scalefsh_round_mask with a zero vector
 * as the third operand, U as the mask and R converted to int. */
#define _mm_maskz_scalef_round_sh(U, A, B, R) \
  ((__m128h)__builtin_ia32_scalefsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), \
      (int)(R)))
1304 
/* Forwards A and B to __builtin_ia32_rndscalesh_round_mask with a zero
 * vector, the all-ones mask (__mmask8)-1, and imm and R each converted to
 * int. */
#define _mm_roundscale_round_sh(A, B, imm, R) \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, \
      (int)(imm), \
      (int)(R)))
1309 
/* Forwards A and B to __builtin_ia32_rndscalesh_round_mask with a zero
 * vector, the all-ones mask (__mmask8)-1, imm converted to int and
 * _MM_FROUND_CUR_DIRECTION as the final argument. */
#define _mm_roundscale_sh(A, B, imm) \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, \
      (int)(imm), \
      _MM_FROUND_CUR_DIRECTION))
1314 
/* Forwards A and B to __builtin_ia32_rndscalesh_round_mask with W as the
 * third vector operand, U as the mask, I converted to int and
 * _MM_FROUND_CUR_DIRECTION as the final argument. */
#define _mm_mask_roundscale_sh(W, U, A, B, I) \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)(__m128h)(W), \
      (__mmask8)(U), \
      (int)(I), \
      _MM_FROUND_CUR_DIRECTION))
1319 
/* Forwards A and B to __builtin_ia32_rndscalesh_round_mask with W as the
 * third vector operand, U as the mask, and I and R each converted to int. */
#define _mm_mask_roundscale_round_sh(W, U, A, B, I, R) \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)(__m128h)(W), \
      (__mmask8)(U), \
      (int)(I), \
      (int)(R)))
1324 
/* Forwards A and B to __builtin_ia32_rndscalesh_round_mask with a zero
 * vector as the third operand, U as the mask, I converted to int and
 * _MM_FROUND_CUR_DIRECTION as the final argument. */
#define _mm_maskz_roundscale_sh(U, A, B, I) \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), \
      (int)(I), \
      _MM_FROUND_CUR_DIRECTION))
1329 
/* Forwards A and B to __builtin_ia32_rndscalesh_round_mask with a zero
 * vector as the third operand, U as the mask, and I and R each converted to
 * int. */
#define _mm_maskz_roundscale_round_sh(U, A, B, I, R) \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), \
      (int)(I), \
      (int)(R)))
1334 
/* Forwards A and B to __builtin_ia32_reducesh_mask with a zero vector, the
 * all-ones mask (__mmask8)-1, C converted to int and
 * _MM_FROUND_CUR_DIRECTION as the final argument. */
#define _mm_reduce_sh(A, B, C) \
  ((__m128h)__builtin_ia32_reducesh_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, \
      (int)(C), \
      _MM_FROUND_CUR_DIRECTION))
1339 
/* Forwards A and B to __builtin_ia32_reducesh_mask with W as the third vector
 * operand, U as the mask, C converted to int and _MM_FROUND_CUR_DIRECTION as
 * the final argument. */
#define _mm_mask_reduce_sh(W, U, A, B, C) \
  ((__m128h)__builtin_ia32_reducesh_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)(__m128h)(W), \
      (__mmask8)(U), \
      (int)(C), \
      _MM_FROUND_CUR_DIRECTION))
1344 
/* Forwards A and B to __builtin_ia32_reducesh_mask with a zero vector as the
 * third operand, U as the mask, C converted to int and
 * _MM_FROUND_CUR_DIRECTION as the final argument. */
#define _mm_maskz_reduce_sh(U, A, B, C) \
  ((__m128h)__builtin_ia32_reducesh_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), \
      (int)(C), \
      _MM_FROUND_CUR_DIRECTION))
1349 
/* Forwards A and B to __builtin_ia32_reducesh_mask with a zero vector, the
 * all-ones mask (__mmask8)-1, and C and R each converted to int. */
#define _mm_reduce_round_sh(A, B, C, R) \
  ((__m128h)__builtin_ia32_reducesh_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, \
      (int)(C), \
      (int)(R)))
1354 
/* Forwards A and B to __builtin_ia32_reducesh_mask with W as the third vector
 * operand, U as the mask, and C and R each converted to int. */
#define _mm_mask_reduce_round_sh(W, U, A, B, C, R) \
  ((__m128h)__builtin_ia32_reducesh_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)(__m128h)(W), \
      (__mmask8)(U), \
      (int)(C), \
      (int)(R)))
1359 
/* Forwards A and B to __builtin_ia32_reducesh_mask with a zero vector as the
 * third operand, U as the mask, and C and R each converted to int. */
#define _mm_maskz_reduce_round_sh(U, A, B, C, R) \
  ((__m128h)__builtin_ia32_reducesh_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), \
      (int)(C), \
      (int)(R)))
1364 
/* Expands to __builtin_ia32_sqrtph512 applied to A, with R converted to int
 * as the second argument. */
#define _mm512_sqrt_round_ph(A, R) \
  ((__m512h)__builtin_ia32_sqrtph512( \
      (__v32hf)(__m512h)(A), \
      (int)(R)))
1367 
/* Passes mask U, the result of _mm512_sqrt_round_ph(A, R) and W, in that
 * order, to __builtin_ia32_selectph_512. */
#define _mm512_mask_sqrt_round_ph(W, U, A, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), \
      (__v32hf)_mm512_sqrt_round_ph((A), (R)), \
      (__v32hf)(__m512h)(W)))
1372 
/* Passes mask U, the result of _mm512_sqrt_round_ph(A, R) and a zero vector,
 * in that order, to __builtin_ia32_selectph_512. */
#define _mm512_maskz_sqrt_round_ph(U, A, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), \
      (__v32hf)_mm512_sqrt_round_ph((A), (R)), \
      (__v32hf)_mm512_setzero_ph()))
1377 
1378 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_sqrt_ph(__m512h __A) {
1379   return (__m512h)__builtin_ia32_sqrtph512((__v32hf)__A,
1380                                            _MM_FROUND_CUR_DIRECTION);
1381 }
1382 
1383 static __inline__ __m512h __DEFAULT_FN_ATTRS512
1384 _mm512_mask_sqrt_ph(__m512h __W, __mmask32 __U, __m512h __A) {
1385   return (__m512h)__builtin_ia32_selectph_512(
1386       (__mmask32)(__U),
1387       (__v32hf)__builtin_ia32_sqrtph512((__A), (_MM_FROUND_CUR_DIRECTION)),
1388       (__v32hf)(__m512h)(__W));
1389 }
1390 
1391 static __inline__ __m512h __DEFAULT_FN_ATTRS512
1392 _mm512_maskz_sqrt_ph(__mmask32 __U, __m512h __A) {
1393   return (__m512h)__builtin_ia32_selectph_512(
1394       (__mmask32)(__U),
1395       (__v32hf)__builtin_ia32_sqrtph512((__A), (_MM_FROUND_CUR_DIRECTION)),
1396       (__v32hf)_mm512_setzero_ph());
1397 }
1398 
/* Forwards A and B to __builtin_ia32_sqrtsh_round_mask together with a zero
 * vector, the all-ones mask (__mmask8)-1 and R converted to int. */
#define _mm_sqrt_round_sh(A, B, R) \
  ((__m128h)__builtin_ia32_sqrtsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, \
      (int)(R)))
1403 
/* Forwards A and B to __builtin_ia32_sqrtsh_round_mask with W as the third
 * vector operand, U as the mask and R converted to int. */
#define _mm_mask_sqrt_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_sqrtsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)(__m128h)(W), \
      (__mmask8)(U), \
      (int)(R)))
1408 
/* Forwards A and B to __builtin_ia32_sqrtsh_round_mask with a zero vector as
 * the third operand, U as the mask and R converted to int. */
#define _mm_maskz_sqrt_round_sh(U, A, B, R) \
  ((__m128h)__builtin_ia32_sqrtsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), \
      (int)(R)))
1413 
1414 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sqrt_sh(__m128h __A,
1415                                                             __m128h __B) {
1416   return (__m128h)__builtin_ia32_sqrtsh_round_mask(
1417       (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)_mm_setzero_ph(),
1418       (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
1419 }
1420 
1421 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sqrt_sh(__m128h __W,
1422                                                                  __mmask32 __U,
1423                                                                  __m128h __A,
1424                                                                  __m128h __B) {
1425   return (__m128h)__builtin_ia32_sqrtsh_round_mask(
1426       (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)(__m128h)(__W),
1427       (__mmask8)(__U), _MM_FROUND_CUR_DIRECTION);
1428 }
1429 
1430 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sqrt_sh(__mmask32 __U,
1431                                                                   __m128h __A,
1432                                                                   __m128h __B) {
1433   return (__m128h)__builtin_ia32_sqrtsh_round_mask(
1434       (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)_mm_setzero_ph(),
1435       (__mmask8)(__U), _MM_FROUND_CUR_DIRECTION);
1436 }
1437 
/* Expands to __builtin_ia32_fpclassph512_mask on A with imm converted to int
 * and U as the mask argument; the result is cast to __mmask32. */
#define _mm512_mask_fpclass_ph_mask(U, A, imm) \
  ((__mmask32)__builtin_ia32_fpclassph512_mask( \
      (__v32hf)(__m512h)(A), \
      (int)(imm), \
      (__mmask32)(U)))
1441 
/* Expands to __builtin_ia32_fpclassph512_mask on A with imm converted to int
 * and the all-ones mask (__mmask32)-1; the result is cast to __mmask32. */
#define _mm512_fpclass_ph_mask(A, imm) \
  ((__mmask32)__builtin_ia32_fpclassph512_mask( \
      (__v32hf)(__m512h)(A), \
      (int)(imm), \
      (__mmask32)-1))
1445 
/* Expands to __builtin_ia32_fpclasssh_mask on A with imm converted to int and
 * the all-ones mask (__mmask8)-1; the result is cast to __mmask8. */
#define _mm_fpclass_sh_mask(A, imm) \
  ((__mmask8)__builtin_ia32_fpclasssh_mask( \
      (__v8hf)(__m128h)(A), \
      (int)(imm), \
      (__mmask8)-1))
1449 
/* Expands to __builtin_ia32_fpclasssh_mask on A with imm converted to int and
 * U as the mask argument; the result is cast to __mmask8. */
#define _mm_mask_fpclass_sh_mask(U, A, imm) \
  ((__mmask8)__builtin_ia32_fpclasssh_mask( \
      (__v8hf)(__m128h)(A), \
      (int)(imm), \
      (__mmask8)(U)))
1453 
/* Forwards A (as __v8df) to __builtin_ia32_vcvtpd2ph512_mask with
 * _mm_undefined_ph() as the second operand, the all-ones mask and R converted
 * to int; yields an __m128h. */
#define _mm512_cvt_roundpd_ph(A, R) \
  ((__m128h)__builtin_ia32_vcvtpd2ph512_mask( \
      (__v8df)(A), \
      (__v8hf)_mm_undefined_ph(), \
      (__mmask8)(-1), \
      (int)(R)))
1457 
/* Forwards A (as __v8df) to __builtin_ia32_vcvtpd2ph512_mask with W as the
 * second operand, U as the mask and R converted to int; yields an __m128h. */
#define _mm512_mask_cvt_roundpd_ph(W, U, A, R) \
  ((__m128h)__builtin_ia32_vcvtpd2ph512_mask( \
      (__v8df)(A), \
      (__v8hf)(W), \
      (__mmask8)(U), \
      (int)(R)))
1461 
/* Forwards A (as __v8df) to __builtin_ia32_vcvtpd2ph512_mask with a zero
 * vector as the second operand, U as the mask and R converted to int; yields
 * an __m128h. */
#define _mm512_maskz_cvt_roundpd_ph(U, A, R) \
  ((__m128h)__builtin_ia32_vcvtpd2ph512_mask( \
      (__v8df)(A), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), \
      (int)(R)))
1465 
1466 static __inline__ __m128h __DEFAULT_FN_ATTRS512 _mm512_cvtpd_ph(__m512d __A) {
1467   return (__m128h)__builtin_ia32_vcvtpd2ph512_mask(
1468       (__v8df)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
1469       _MM_FROUND_CUR_DIRECTION);
1470 }
1471 
1472 static __inline__ __m128h __DEFAULT_FN_ATTRS512
1473 _mm512_mask_cvtpd_ph(__m128h __W, __mmask8 __U, __m512d __A) {
1474   return (__m128h)__builtin_ia32_vcvtpd2ph512_mask(
1475       (__v8df)__A, (__v8hf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
1476 }
1477 
1478 static __inline__ __m128h __DEFAULT_FN_ATTRS512
1479 _mm512_maskz_cvtpd_ph(__mmask8 __U, __m512d __A) {
1480   return (__m128h)__builtin_ia32_vcvtpd2ph512_mask(
1481       (__v8df)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
1482       _MM_FROUND_CUR_DIRECTION);
1483 }
1484 
/* Forwards A (as __v8hf) to __builtin_ia32_vcvtph2pd512_mask with
 * _mm512_undefined_pd() as the second operand, the all-ones mask and R
 * converted to int; yields an __m512d. */
#define _mm512_cvt_roundph_pd(A, R) \
  ((__m512d)__builtin_ia32_vcvtph2pd512_mask( \
      (__v8hf)(A), \
      (__v8df)_mm512_undefined_pd(), \
      (__mmask8)(-1), \
      (int)(R)))
1488 
/* Forwards A (as __v8hf) to __builtin_ia32_vcvtph2pd512_mask with W as the
 * second operand, U as the mask and R converted to int; yields an __m512d. */
#define _mm512_mask_cvt_roundph_pd(W, U, A, R) \
  ((__m512d)__builtin_ia32_vcvtph2pd512_mask( \
      (__v8hf)(A), \
      (__v8df)(W), \
      (__mmask8)(U), \
      (int)(R)))
1492 
/* Forwards A (as __v8hf) to __builtin_ia32_vcvtph2pd512_mask with a zero
 * vector as the second operand, U as the mask and R converted to int; yields
 * an __m512d. */
#define _mm512_maskz_cvt_roundph_pd(U, A, R) \
  ((__m512d)__builtin_ia32_vcvtph2pd512_mask( \
      (__v8hf)(A), \
      (__v8df)_mm512_setzero_pd(), \
      (__mmask8)(U), \
      (int)(R)))
1496 
1497 static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_cvtph_pd(__m128h __A) {
1498   return (__m512d)__builtin_ia32_vcvtph2pd512_mask(
1499       (__v8hf)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)-1,
1500       _MM_FROUND_CUR_DIRECTION);
1501 }
1502 
1503 static __inline__ __m512d __DEFAULT_FN_ATTRS512
1504 _mm512_mask_cvtph_pd(__m512d __W, __mmask8 __U, __m128h __A) {
1505   return (__m512d)__builtin_ia32_vcvtph2pd512_mask(
1506       (__v8hf)__A, (__v8df)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
1507 }
1508 
1509 static __inline__ __m512d __DEFAULT_FN_ATTRS512
1510 _mm512_maskz_cvtph_pd(__mmask8 __U, __m128h __A) {
1511   return (__m512d)__builtin_ia32_vcvtph2pd512_mask(
1512       (__v8hf)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U,
1513       _MM_FROUND_CUR_DIRECTION);
1514 }
1515 
/* Forwards A (as __v4sf) and B (as __v8hf) to
 * __builtin_ia32_vcvtsh2ss_round_mask with _mm_undefined_ps() as the third
 * operand, the all-ones mask and R converted to int; yields an __m128. */
#define _mm_cvt_roundsh_ss(A, B, R) \
  ((__m128)__builtin_ia32_vcvtsh2ss_round_mask( \
      (__v4sf)(A), \
      (__v8hf)(B), \
      (__v4sf)_mm_undefined_ps(), \
      (__mmask8)(-1), \
      (int)(R)))
1520 
/* Forwards A (as __v4sf) and B (as __v8hf) to
 * __builtin_ia32_vcvtsh2ss_round_mask with W as the third operand, U as the
 * mask and R converted to int; yields an __m128. */
#define _mm_mask_cvt_roundsh_ss(W, U, A, B, R) \
  ((__m128)__builtin_ia32_vcvtsh2ss_round_mask( \
      (__v4sf)(A), \
      (__v8hf)(B), \
      (__v4sf)(W), \
      (__mmask8)(U), \
      (int)(R)))
1524 
/* Forwards A (as __v4sf) and B (as __v8hf) to
 * __builtin_ia32_vcvtsh2ss_round_mask with a zero vector as the third
 * operand, U as the mask and R converted to int; yields an __m128. */
#define _mm_maskz_cvt_roundsh_ss(U, A, B, R) \
  ((__m128)__builtin_ia32_vcvtsh2ss_round_mask( \
      (__v4sf)(A), \
      (__v8hf)(B), \
      (__v4sf)_mm_setzero_ps(), \
      (__mmask8)(U), \
      (int)(R)))
1529 
1530 static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtsh_ss(__m128 __A,
1531                                                             __m128h __B) {
1532   return (__m128)__builtin_ia32_vcvtsh2ss_round_mask(
1533       (__v4sf)__A, (__v8hf)__B, (__v4sf)_mm_undefined_ps(), (__mmask8)-1,
1534       _MM_FROUND_CUR_DIRECTION);
1535 }
1536 
1537 static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_cvtsh_ss(__m128 __W,
1538                                                                  __mmask8 __U,
1539                                                                  __m128 __A,
1540                                                                  __m128h __B) {
1541   return (__m128)__builtin_ia32_vcvtsh2ss_round_mask((__v4sf)__A, (__v8hf)__B,
1542                                                      (__v4sf)__W, (__mmask8)__U,
1543                                                      _MM_FROUND_CUR_DIRECTION);
1544 }
1545 
1546 static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_cvtsh_ss(__mmask8 __U,
1547                                                                   __m128 __A,
1548                                                                   __m128h __B) {
1549   return (__m128)__builtin_ia32_vcvtsh2ss_round_mask(
1550       (__v4sf)__A, (__v8hf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U,
1551       _MM_FROUND_CUR_DIRECTION);
1552 }
1553 
/* Forwards A (as __v8hf) and B (as __v4sf) to
 * __builtin_ia32_vcvtss2sh_round_mask with _mm_undefined_ph() as the third
 * operand, the all-ones mask and R converted to int; yields an __m128h. */
#define _mm_cvt_roundss_sh(A, B, R) \
  ((__m128h)__builtin_ia32_vcvtss2sh_round_mask( \
      (__v8hf)(A), \
      (__v4sf)(B), \
      (__v8hf)_mm_undefined_ph(), \
      (__mmask8)(-1), \
      (int)(R)))
1558 
/* Forwards A (as __v8hf) and B (as __v4sf) to
 * __builtin_ia32_vcvtss2sh_round_mask with W as the third operand, U as the
 * mask and R converted to int; yields an __m128h. */
#define _mm_mask_cvt_roundss_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_vcvtss2sh_round_mask( \
      (__v8hf)(A), \
      (__v4sf)(B), \
      (__v8hf)(W), \
      (__mmask8)(U), \
      (int)(R)))
1562 
/* Forwards A (as __v8hf) and B (as __v4sf) to
 * __builtin_ia32_vcvtss2sh_round_mask with a zero vector as the third
 * operand, U as the mask and R converted to int; yields an __m128h. */
#define _mm_maskz_cvt_roundss_sh(U, A, B, R) \
  ((__m128h)__builtin_ia32_vcvtss2sh_round_mask( \
      (__v8hf)(A), \
      (__v4sf)(B), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), \
      (int)(R)))
1567 
1568 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtss_sh(__m128h __A,
1569                                                              __m128 __B) {
1570   return (__m128h)__builtin_ia32_vcvtss2sh_round_mask(
1571       (__v8hf)__A, (__v4sf)__B, (__v8hf)_mm_undefined_ph(), (__mmask8)-1,
1572       _MM_FROUND_CUR_DIRECTION);
1573 }
1574 
1575 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_cvtss_sh(__m128h __W,
1576                                                                   __mmask8 __U,
1577                                                                   __m128h __A,
1578                                                                   __m128 __B) {
1579   return (__m128h)__builtin_ia32_vcvtss2sh_round_mask(
1580       (__v8hf)__A, (__v4sf)__B, (__v8hf)__W, (__mmask8)__U,
1581       _MM_FROUND_CUR_DIRECTION);
1582 }
1583 
1584 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_cvtss_sh(__mmask8 __U,
1585                                                                    __m128h __A,
1586                                                                    __m128 __B) {
1587   return (__m128h)__builtin_ia32_vcvtss2sh_round_mask(
1588       (__v8hf)__A, (__v4sf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
1589       _MM_FROUND_CUR_DIRECTION);
1590 }
1591 
/* Forwards A (as __v8hf) and B (as __v2df) to
 * __builtin_ia32_vcvtsd2sh_round_mask with _mm_undefined_ph() as the third
 * operand, the all-ones mask and R converted to int; yields an __m128h. */
#define _mm_cvt_roundsd_sh(A, B, R) \
  ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask( \
      (__v8hf)(A), \
      (__v2df)(B), \
      (__v8hf)_mm_undefined_ph(), \
      (__mmask8)(-1), \
      (int)(R)))
1596 
/* Forwards A (as __v8hf) and B (as __v2df) to
 * __builtin_ia32_vcvtsd2sh_round_mask with W as the third operand, U as the
 * mask and R converted to int; yields an __m128h. */
#define _mm_mask_cvt_roundsd_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask( \
      (__v8hf)(A), \
      (__v2df)(B), \
      (__v8hf)(W), \
      (__mmask8)(U), \
      (int)(R)))
1600 
/* Forwards A (as __v8hf) and B (as __v2df) to
 * __builtin_ia32_vcvtsd2sh_round_mask with a zero vector as the third
 * operand, U as the mask and R converted to int; yields an __m128h. */
#define _mm_maskz_cvt_roundsd_sh(U, A, B, R) \
  ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask( \
      (__v8hf)(A), \
      (__v2df)(B), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), \
      (int)(R)))
1605 
1606 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtsd_sh(__m128h __A,
1607                                                              __m128d __B) {
1608   return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask(
1609       (__v8hf)__A, (__v2df)__B, (__v8hf)_mm_undefined_ph(), (__mmask8)-1,
1610       _MM_FROUND_CUR_DIRECTION);
1611 }
1612 
1613 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_cvtsd_sh(__m128h __W,
1614                                                                   __mmask8 __U,
1615                                                                   __m128h __A,
1616                                                                   __m128d __B) {
1617   return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask(
1618       (__v8hf)__A, (__v2df)__B, (__v8hf)__W, (__mmask8)__U,
1619       _MM_FROUND_CUR_DIRECTION);
1620 }
1621 
1622 static __inline__ __m128h __DEFAULT_FN_ATTRS128
1623 _mm_maskz_cvtsd_sh(__mmask8 __U, __m128h __A, __m128d __B) {
1624   return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask(
1625       (__v8hf)__A, (__v2df)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
1626       _MM_FROUND_CUR_DIRECTION);
1627 }
1628 
/* Forwards A (as __v2df) and B (as __v8hf) to
 * __builtin_ia32_vcvtsh2sd_round_mask with _mm_undefined_pd() as the third
 * operand, the all-ones mask and R converted to int; yields an __m128d. */
#define _mm_cvt_roundsh_sd(A, B, R) \
  ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask( \
      (__v2df)(A), \
      (__v8hf)(B), \
      (__v2df)_mm_undefined_pd(), \
      (__mmask8)(-1), \
      (int)(R)))
1633 
/* Forwards A (as __v2df) and B (as __v8hf) to
 * __builtin_ia32_vcvtsh2sd_round_mask with W as the third operand, U as the
 * mask and R converted to int; yields an __m128d. */
#define _mm_mask_cvt_roundsh_sd(W, U, A, B, R) \
  ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask( \
      (__v2df)(A), \
      (__v8hf)(B), \
      (__v2df)(W), \
      (__mmask8)(U), \
      (int)(R)))
1637 
/* Forwards A (as __v2df) and B (as __v8hf) to
 * __builtin_ia32_vcvtsh2sd_round_mask with a zero vector as the third
 * operand, U as the mask and R converted to int; yields an __m128d. */
#define _mm_maskz_cvt_roundsh_sd(U, A, B, R) \
  ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask( \
      (__v2df)(A), \
      (__v8hf)(B), \
      (__v2df)_mm_setzero_pd(), \
      (__mmask8)(U), \
      (int)(R)))
1642 
1643 static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_cvtsh_sd(__m128d __A,
1644                                                              __m128h __B) {
1645   return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask(
1646       (__v2df)__A, (__v8hf)__B, (__v2df)_mm_undefined_pd(), (__mmask8)-1,
1647       _MM_FROUND_CUR_DIRECTION);
1648 }
1649 
1650 static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_cvtsh_sd(__m128d __W,
1651                                                                   __mmask8 __U,
1652                                                                   __m128d __A,
1653                                                                   __m128h __B) {
1654   return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask(
1655       (__v2df)__A, (__v8hf)__B, (__v2df)__W, (__mmask8)__U,
1656       _MM_FROUND_CUR_DIRECTION);
1657 }
1658 
1659 static __inline__ __m128d __DEFAULT_FN_ATTRS128
1660 _mm_maskz_cvtsh_sd(__mmask8 __U, __m128d __A, __m128h __B) {
1661   return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask(
1662       (__v2df)__A, (__v8hf)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U,
1663       _MM_FROUND_CUR_DIRECTION);
1664 }
1665 
/* Forwards A (as __v32hf) to __builtin_ia32_vcvtph2w512_mask with
 * _mm512_undefined_epi32() viewed as __v32hi, the all-ones mask and R
 * converted to int; yields an __m512i. */
#define _mm512_cvt_roundph_epi16(A, R) \
  ((__m512i)__builtin_ia32_vcvtph2w512_mask( \
      (__v32hf)(A), \
      (__v32hi)_mm512_undefined_epi32(), \
      (__mmask32)(-1), \
      (int)(R)))
1670 
/* Forwards A (as __v32hf) to __builtin_ia32_vcvtph2w512_mask with W (as
 * __v32hi) as the second operand, U as the mask and R converted to int;
 * yields an __m512i. */
#define _mm512_mask_cvt_roundph_epi16(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2w512_mask( \
      (__v32hf)(A), \
      (__v32hi)(W), \
      (__mmask32)(U), \
      (int)(R)))
1674 
/* Forwards A (as __v32hf) to __builtin_ia32_vcvtph2w512_mask with
 * _mm512_setzero_epi32() viewed as __v32hi, U as the mask and R converted to
 * int; yields an __m512i. */
#define _mm512_maskz_cvt_roundph_epi16(U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2w512_mask( \
      (__v32hf)(A), \
      (__v32hi)_mm512_setzero_epi32(), \
      (__mmask32)(U), \
      (int)(R)))
1679 
1680 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1681 _mm512_cvtph_epi16(__m512h __A) {
1682   return (__m512i)__builtin_ia32_vcvtph2w512_mask(
1683       (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)-1,
1684       _MM_FROUND_CUR_DIRECTION);
1685 }
1686 
1687 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1688 _mm512_mask_cvtph_epi16(__m512i __W, __mmask32 __U, __m512h __A) {
1689   return (__m512i)__builtin_ia32_vcvtph2w512_mask(
1690       (__v32hf)__A, (__v32hi)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
1691 }
1692 
1693 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1694 _mm512_maskz_cvtph_epi16(__mmask32 __U, __m512h __A) {
1695   return (__m512i)__builtin_ia32_vcvtph2w512_mask(
1696       (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)__U,
1697       _MM_FROUND_CUR_DIRECTION);
1698 }
1699 
/* Forwards A (as __v32hf) to __builtin_ia32_vcvttph2w512_mask with
 * _mm512_undefined_epi32() viewed as __v32hi, the all-ones mask and R
 * converted to int; yields an __m512i. */
#define _mm512_cvtt_roundph_epi16(A, R) \
  ((__m512i)__builtin_ia32_vcvttph2w512_mask( \
      (__v32hf)(A), \
      (__v32hi)_mm512_undefined_epi32(), \
      (__mmask32)(-1), \
      (int)(R)))
1704 
/* Forwards A (as __v32hf) to __builtin_ia32_vcvttph2w512_mask with W (as
 * __v32hi) as the second operand, U as the mask and R converted to int;
 * yields an __m512i. */
#define _mm512_mask_cvtt_roundph_epi16(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvttph2w512_mask( \
      (__v32hf)(A), \
      (__v32hi)(W), \
      (__mmask32)(U), \
      (int)(R)))
1708 
/* Forwards A (as __v32hf) to __builtin_ia32_vcvttph2w512_mask with
 * _mm512_setzero_epi32() viewed as __v32hi, U as the mask and R converted to
 * int; yields an __m512i. */
#define _mm512_maskz_cvtt_roundph_epi16(U, A, R) \
  ((__m512i)__builtin_ia32_vcvttph2w512_mask( \
      (__v32hf)(A), \
      (__v32hi)_mm512_setzero_epi32(), \
      (__mmask32)(U), \
      (int)(R)))
1713 
1714 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1715 _mm512_cvttph_epi16(__m512h __A) {
1716   return (__m512i)__builtin_ia32_vcvttph2w512_mask(
1717       (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)-1,
1718       _MM_FROUND_CUR_DIRECTION);
1719 }
1720 
1721 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1722 _mm512_mask_cvttph_epi16(__m512i __W, __mmask32 __U, __m512h __A) {
1723   return (__m512i)__builtin_ia32_vcvttph2w512_mask(
1724       (__v32hf)__A, (__v32hi)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
1725 }
1726 
1727 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1728 _mm512_maskz_cvttph_epi16(__mmask32 __U, __m512h __A) {
1729   return (__m512i)__builtin_ia32_vcvttph2w512_mask(
1730       (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)__U,
1731       _MM_FROUND_CUR_DIRECTION);
1732 }
1733 
/* Forwards A (as __v32hi) to __builtin_ia32_vcvtw2ph512_mask with
 * _mm512_undefined_ph() as the second operand, the all-ones mask and R
 * converted to int; yields an __m512h. */
#define _mm512_cvt_roundepi16_ph(A, R) \
  ((__m512h)__builtin_ia32_vcvtw2ph512_mask( \
      (__v32hi)(A), \
      (__v32hf)_mm512_undefined_ph(), \
      (__mmask32)(-1), \
      (int)(R)))
1738 
/* Forwards A (as __v32hi) to __builtin_ia32_vcvtw2ph512_mask with W (as
 * __v32hf) as the second operand, U as the mask and R converted to int;
 * yields an __m512h. */
#define _mm512_mask_cvt_roundepi16_ph(W, U, A, R) \
  ((__m512h)__builtin_ia32_vcvtw2ph512_mask( \
      (__v32hi)(A), \
      (__v32hf)(W), \
      (__mmask32)(U), \
      (int)(R)))
1742 
/* Forwards A (as __v32hi) to __builtin_ia32_vcvtw2ph512_mask with a zero
 * vector as the second operand, U as the mask and R converted to int; yields
 * an __m512h. */
#define _mm512_maskz_cvt_roundepi16_ph(U, A, R) \
  ((__m512h)__builtin_ia32_vcvtw2ph512_mask( \
      (__v32hi)(A), \
      (__v32hf)_mm512_setzero_ph(), \
      (__mmask32)(U), \
      (int)(R)))
1746 
1747 static __inline__ __m512h __DEFAULT_FN_ATTRS512
1748 _mm512_cvtepi16_ph(__m512i __A) {
1749   return (__m512h)__builtin_ia32_vcvtw2ph512_mask(
1750       (__v32hi)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)-1,
1751       _MM_FROUND_CUR_DIRECTION);
1752 }
1753 
1754 static __inline__ __m512h __DEFAULT_FN_ATTRS512
1755 _mm512_mask_cvtepi16_ph(__m512h __W, __mmask32 __U, __m512i __A) {
1756   return (__m512h)__builtin_ia32_vcvtw2ph512_mask(
1757       (__v32hi)__A, (__v32hf)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
1758 }
1759 
1760 static __inline__ __m512h __DEFAULT_FN_ATTRS512
1761 _mm512_maskz_cvtepi16_ph(__mmask32 __U, __m512i __A) {
1762   return (__m512h)__builtin_ia32_vcvtw2ph512_mask(
1763       (__v32hi)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
1764       _MM_FROUND_CUR_DIRECTION);
1765 }
1766 
/* Forwards A (as __v32hf) to __builtin_ia32_vcvtph2uw512_mask with
 * _mm512_undefined_epi32() viewed as __v32hu, the all-ones mask and R
 * converted to int; yields an __m512i. */
#define _mm512_cvt_roundph_epu16(A, R) \
  ((__m512i)__builtin_ia32_vcvtph2uw512_mask( \
      (__v32hf)(A), \
      (__v32hu)_mm512_undefined_epi32(), \
      (__mmask32)(-1), \
      (int)(R)))
1771 
/* Forwards A (as __v32hf) to __builtin_ia32_vcvtph2uw512_mask with W (as
 * __v32hu) as the second operand, U as the mask and R converted to int;
 * yields an __m512i. */
#define _mm512_mask_cvt_roundph_epu16(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2uw512_mask( \
      (__v32hf)(A), \
      (__v32hu)(W), \
      (__mmask32)(U), \
      (int)(R)))
1775 
/* Forwards A (as __v32hf) to __builtin_ia32_vcvtph2uw512_mask with
 * _mm512_setzero_epi32() viewed as __v32hu, U as the mask and R converted to
 * int; yields an __m512i. */
#define _mm512_maskz_cvt_roundph_epu16(U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2uw512_mask( \
      (__v32hf)(A), \
      (__v32hu)_mm512_setzero_epi32(), \
      (__mmask32)(U), \
      (int)(R)))
1780 
1781 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1782 _mm512_cvtph_epu16(__m512h __A) {
1783   return (__m512i)__builtin_ia32_vcvtph2uw512_mask(
1784       (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)-1,
1785       _MM_FROUND_CUR_DIRECTION);
1786 }
1787 
1788 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1789 _mm512_mask_cvtph_epu16(__m512i __W, __mmask32 __U, __m512h __A) {
1790   return (__m512i)__builtin_ia32_vcvtph2uw512_mask(
1791       (__v32hf)__A, (__v32hu)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
1792 }
1793 
1794 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1795 _mm512_maskz_cvtph_epu16(__mmask32 __U, __m512h __A) {
1796   return (__m512i)__builtin_ia32_vcvtph2uw512_mask(
1797       (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)__U,
1798       _MM_FROUND_CUR_DIRECTION);
1799 }
1800 
/* Forwards A (as __v32hf) to __builtin_ia32_vcvttph2uw512_mask with
 * _mm512_undefined_epi32() viewed as __v32hu, the all-ones mask and R
 * converted to int; yields an __m512i. */
#define _mm512_cvtt_roundph_epu16(A, R) \
  ((__m512i)__builtin_ia32_vcvttph2uw512_mask( \
      (__v32hf)(A), \
      (__v32hu)_mm512_undefined_epi32(), \
      (__mmask32)(-1), \
      (int)(R)))
1805 
/* Forwards A (as __v32hf) to __builtin_ia32_vcvttph2uw512_mask with W (as
 * __v32hu) as the second operand, U as the mask and R converted to int;
 * yields an __m512i. */
#define _mm512_mask_cvtt_roundph_epu16(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvttph2uw512_mask( \
      (__v32hf)(A), \
      (__v32hu)(W), \
      (__mmask32)(U), \
      (int)(R)))
1809 
/* Forwards A (as __v32hf) to __builtin_ia32_vcvttph2uw512_mask with
 * _mm512_setzero_epi32() viewed as __v32hu, U as the mask and R converted to
 * int; yields an __m512i. */
#define _mm512_maskz_cvtt_roundph_epu16(U, A, R) \
  ((__m512i)__builtin_ia32_vcvttph2uw512_mask( \
      (__v32hf)(A), \
      (__v32hu)_mm512_setzero_epi32(), \
      (__mmask32)(U), \
      (int)(R)))
1814 
1815 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1816 _mm512_cvttph_epu16(__m512h __A) {
1817   return (__m512i)__builtin_ia32_vcvttph2uw512_mask(
1818       (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)-1,
1819       _MM_FROUND_CUR_DIRECTION);
1820 }
1821 
1822 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1823 _mm512_mask_cvttph_epu16(__m512i __W, __mmask32 __U, __m512h __A) {
1824   return (__m512i)__builtin_ia32_vcvttph2uw512_mask(
1825       (__v32hf)__A, (__v32hu)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
1826 }
1827 
1828 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1829 _mm512_maskz_cvttph_epu16(__mmask32 __U, __m512h __A) {
1830   return (__m512i)__builtin_ia32_vcvttph2uw512_mask(
1831       (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)__U,
1832       _MM_FROUND_CUR_DIRECTION);
1833 }
1834 
/* Forwards A (as __v32hu) to __builtin_ia32_vcvtuw2ph512_mask with
 * _mm512_undefined_ph() as the second operand, the all-ones mask and R
 * converted to int; yields an __m512h. */
#define _mm512_cvt_roundepu16_ph(A, R) \
  ((__m512h)__builtin_ia32_vcvtuw2ph512_mask( \
      (__v32hu)(A), \
      (__v32hf)_mm512_undefined_ph(), \
      (__mmask32)(-1), \
      (int)(R)))
1839 
/* Forwards A (as __v32hu) to __builtin_ia32_vcvtuw2ph512_mask with W (as
 * __v32hf) as the second operand, U as the mask and R converted to int;
 * yields an __m512h. */
#define _mm512_mask_cvt_roundepu16_ph(W, U, A, R) \
  ((__m512h)__builtin_ia32_vcvtuw2ph512_mask( \
      (__v32hu)(A), \
      (__v32hf)(W), \
      (__mmask32)(U), \
      (int)(R)))
1843 
/* Forwards A (as __v32hu) to __builtin_ia32_vcvtuw2ph512_mask with a zero
 * vector as the second operand, U as the mask and R converted to int; yields
 * an __m512h. */
#define _mm512_maskz_cvt_roundepu16_ph(U, A, R) \
  ((__m512h)__builtin_ia32_vcvtuw2ph512_mask( \
      (__v32hu)(A), \
      (__v32hf)_mm512_setzero_ph(), \
      (__mmask32)(U), \
      (int)(R)))
1847 
1848 static __inline__ __m512h __DEFAULT_FN_ATTRS512
1849 _mm512_cvtepu16_ph(__m512i __A) {
1850   return (__m512h)__builtin_ia32_vcvtuw2ph512_mask(
1851       (__v32hu)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)-1,
1852       _MM_FROUND_CUR_DIRECTION);
1853 }
1854 
1855 static __inline__ __m512h __DEFAULT_FN_ATTRS512
1856 _mm512_mask_cvtepu16_ph(__m512h __W, __mmask32 __U, __m512i __A) {
1857   return (__m512h)__builtin_ia32_vcvtuw2ph512_mask(
1858       (__v32hu)__A, (__v32hf)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
1859 }
1860 
1861 static __inline__ __m512h __DEFAULT_FN_ATTRS512
1862 _mm512_maskz_cvtepu16_ph(__mmask32 __U, __m512i __A) {
1863   return (__m512h)__builtin_ia32_vcvtuw2ph512_mask(
1864       (__v32hu)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
1865       _MM_FROUND_CUR_DIRECTION);
1866 }
1867 
/* FP16 -> signed 32-bit conversion (vcvtph2dq) with all lanes enabled, an
   undefined pass-through and R forwarded as the rounding immediate. */
#define _mm512_cvt_roundph_epi32(A, R) \
  ((__m512i)__builtin_ia32_vcvtph2dq512_mask( \
      (__v16hf)(A), \
      (__v16si)_mm512_undefined_epi32(), \
      (__mmask16)(-1), \
      (int)(R)))
1872 
/* Merge-masked FP16 -> signed 32-bit conversion: W is the pass-through
   operand, U the write mask, R the rounding immediate. */
#define _mm512_mask_cvt_roundph_epi32(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2dq512_mask( \
      (__v16hf)(A), \
      (__v16si)(W), \
      (__mmask16)(U), \
      (int)(R)))
1876 
/* Zero-masked FP16 -> signed 32-bit conversion: zeroed pass-through, write
   mask U, rounding immediate R. */
#define _mm512_maskz_cvt_roundph_epi32(U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2dq512_mask( \
      (__v16hf)(A), \
      (__v16si)_mm512_setzero_epi32(), \
      (__mmask16)(U), \
      (int)(R)))
1881 
1882 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1883 _mm512_cvtph_epi32(__m256h __A) {
1884   return (__m512i)__builtin_ia32_vcvtph2dq512_mask(
1885       (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)-1,
1886       _MM_FROUND_CUR_DIRECTION);
1887 }
1888 
1889 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1890 _mm512_mask_cvtph_epi32(__m512i __W, __mmask16 __U, __m256h __A) {
1891   return (__m512i)__builtin_ia32_vcvtph2dq512_mask(
1892       (__v16hf)__A, (__v16si)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
1893 }
1894 
1895 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1896 _mm512_maskz_cvtph_epi32(__mmask16 __U, __m256h __A) {
1897   return (__m512i)__builtin_ia32_vcvtph2dq512_mask(
1898       (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)__U,
1899       _MM_FROUND_CUR_DIRECTION);
1900 }
1901 
/* FP16 -> unsigned 32-bit conversion (vcvtph2udq) with all lanes enabled, an
   undefined pass-through and R forwarded as the rounding immediate. */
#define _mm512_cvt_roundph_epu32(A, R) \
  ((__m512i)__builtin_ia32_vcvtph2udq512_mask( \
      (__v16hf)(A), \
      (__v16su)_mm512_undefined_epi32(), \
      (__mmask16)(-1), \
      (int)(R)))
1906 
/* Merge-masked FP16 -> unsigned 32-bit conversion: W is the pass-through
   operand, U the write mask, R the rounding immediate. */
#define _mm512_mask_cvt_roundph_epu32(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2udq512_mask( \
      (__v16hf)(A), \
      (__v16su)(W), \
      (__mmask16)(U), \
      (int)(R)))
1910 
/* Zero-masked FP16 -> unsigned 32-bit conversion: zeroed pass-through, write
   mask U, rounding immediate R. */
#define _mm512_maskz_cvt_roundph_epu32(U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2udq512_mask( \
      (__v16hf)(A), \
      (__v16su)_mm512_setzero_epi32(), \
      (__mmask16)(U), \
      (int)(R)))
1915 
1916 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1917 _mm512_cvtph_epu32(__m256h __A) {
1918   return (__m512i)__builtin_ia32_vcvtph2udq512_mask(
1919       (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)-1,
1920       _MM_FROUND_CUR_DIRECTION);
1921 }
1922 
1923 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1924 _mm512_mask_cvtph_epu32(__m512i __W, __mmask16 __U, __m256h __A) {
1925   return (__m512i)__builtin_ia32_vcvtph2udq512_mask(
1926       (__v16hf)__A, (__v16su)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
1927 }
1928 
1929 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1930 _mm512_maskz_cvtph_epu32(__mmask16 __U, __m256h __A) {
1931   return (__m512i)__builtin_ia32_vcvtph2udq512_mask(
1932       (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)__U,
1933       _MM_FROUND_CUR_DIRECTION);
1934 }
1935 
/* Signed 32-bit -> FP16 conversion (vcvtdq2ph) with all lanes enabled, an
   undefined 256-bit pass-through and R forwarded as the rounding immediate. */
#define _mm512_cvt_roundepi32_ph(A, R) \
  ((__m256h)__builtin_ia32_vcvtdq2ph512_mask( \
      (__v16si)(A), \
      (__v16hf)_mm256_undefined_ph(), \
      (__mmask16)(-1), \
      (int)(R)))
1940 
/* Merge-masked signed 32-bit -> FP16 conversion: W is the pass-through
   operand, U the write mask, R the rounding immediate. */
#define _mm512_mask_cvt_roundepi32_ph(W, U, A, R) \
  ((__m256h)__builtin_ia32_vcvtdq2ph512_mask( \
      (__v16si)(A), \
      (__v16hf)(W), \
      (__mmask16)(U), \
      (int)(R)))
1944 
/* Zero-masked signed 32-bit -> FP16 conversion: zeroed pass-through, write
   mask U, rounding immediate R. */
#define _mm512_maskz_cvt_roundepi32_ph(U, A, R) \
  ((__m256h)__builtin_ia32_vcvtdq2ph512_mask( \
      (__v16si)(A), \
      (__v16hf)_mm256_setzero_ph(), \
      (__mmask16)(U), \
      (int)(R)))
1948 
1949 static __inline__ __m256h __DEFAULT_FN_ATTRS512
1950 _mm512_cvtepi32_ph(__m512i __A) {
1951   return (__m256h)__builtin_ia32_vcvtdq2ph512_mask(
1952       (__v16si)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1,
1953       _MM_FROUND_CUR_DIRECTION);
1954 }
1955 
1956 static __inline__ __m256h __DEFAULT_FN_ATTRS512
1957 _mm512_mask_cvtepi32_ph(__m256h __W, __mmask16 __U, __m512i __A) {
1958   return (__m256h)__builtin_ia32_vcvtdq2ph512_mask(
1959       (__v16si)__A, (__v16hf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
1960 }
1961 
1962 static __inline__ __m256h __DEFAULT_FN_ATTRS512
1963 _mm512_maskz_cvtepi32_ph(__mmask16 __U, __m512i __A) {
1964   return (__m256h)__builtin_ia32_vcvtdq2ph512_mask(
1965       (__v16si)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U,
1966       _MM_FROUND_CUR_DIRECTION);
1967 }
1968 
/* Unsigned 32-bit -> FP16 conversion (vcvtudq2ph) with all lanes enabled, an
   undefined 256-bit pass-through and R forwarded as the rounding immediate. */
#define _mm512_cvt_roundepu32_ph(A, R) \
  ((__m256h)__builtin_ia32_vcvtudq2ph512_mask( \
      (__v16su)(A), \
      (__v16hf)_mm256_undefined_ph(), \
      (__mmask16)(-1), \
      (int)(R)))
1973 
/* Merge-masked unsigned 32-bit -> FP16 conversion: W is the pass-through
   operand, U the write mask, R the rounding immediate. */
#define _mm512_mask_cvt_roundepu32_ph(W, U, A, R) \
  ((__m256h)__builtin_ia32_vcvtudq2ph512_mask( \
      (__v16su)(A), \
      (__v16hf)(W), \
      (__mmask16)(U), \
      (int)(R)))
1977 
/* Zero-masked unsigned 32-bit -> FP16 conversion: zeroed pass-through, write
   mask U, rounding immediate R. */
#define _mm512_maskz_cvt_roundepu32_ph(U, A, R) \
  ((__m256h)__builtin_ia32_vcvtudq2ph512_mask( \
      (__v16su)(A), \
      (__v16hf)_mm256_setzero_ph(), \
      (__mmask16)(U), \
      (int)(R)))
1981 
1982 static __inline__ __m256h __DEFAULT_FN_ATTRS512
1983 _mm512_cvtepu32_ph(__m512i __A) {
1984   return (__m256h)__builtin_ia32_vcvtudq2ph512_mask(
1985       (__v16su)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1,
1986       _MM_FROUND_CUR_DIRECTION);
1987 }
1988 
1989 static __inline__ __m256h __DEFAULT_FN_ATTRS512
1990 _mm512_mask_cvtepu32_ph(__m256h __W, __mmask16 __U, __m512i __A) {
1991   return (__m256h)__builtin_ia32_vcvtudq2ph512_mask(
1992       (__v16su)__A, (__v16hf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
1993 }
1994 
1995 static __inline__ __m256h __DEFAULT_FN_ATTRS512
1996 _mm512_maskz_cvtepu32_ph(__mmask16 __U, __m512i __A) {
1997   return (__m256h)__builtin_ia32_vcvtudq2ph512_mask(
1998       (__v16su)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U,
1999       _MM_FROUND_CUR_DIRECTION);
2000 }
2001 
/* FP16 -> signed 32-bit truncating conversion (vcvttph2dq) with all lanes
   enabled, an undefined pass-through and R forwarded as the final immediate. */
#define _mm512_cvtt_roundph_epi32(A, R) \
  ((__m512i)__builtin_ia32_vcvttph2dq512_mask( \
      (__v16hf)(A), \
      (__v16si)_mm512_undefined_epi32(), \
      (__mmask16)(-1), \
      (int)(R)))
2006 
/* Merge-masked FP16 -> signed 32-bit truncating conversion: W is the
   pass-through operand, U the write mask, R the final immediate. */
#define _mm512_mask_cvtt_roundph_epi32(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvttph2dq512_mask( \
      (__v16hf)(A), \
      (__v16si)(W), \
      (__mmask16)(U), \
      (int)(R)))
2010 
/* Zero-masked FP16 -> signed 32-bit truncating conversion: zeroed
   pass-through, write mask U, final immediate R. */
#define _mm512_maskz_cvtt_roundph_epi32(U, A, R) \
  ((__m512i)__builtin_ia32_vcvttph2dq512_mask( \
      (__v16hf)(A), \
      (__v16si)_mm512_setzero_epi32(), \
      (__mmask16)(U), \
      (int)(R)))
2015 
2016 static __inline__ __m512i __DEFAULT_FN_ATTRS512
2017 _mm512_cvttph_epi32(__m256h __A) {
2018   return (__m512i)__builtin_ia32_vcvttph2dq512_mask(
2019       (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)-1,
2020       _MM_FROUND_CUR_DIRECTION);
2021 }
2022 
2023 static __inline__ __m512i __DEFAULT_FN_ATTRS512
2024 _mm512_mask_cvttph_epi32(__m512i __W, __mmask16 __U, __m256h __A) {
2025   return (__m512i)__builtin_ia32_vcvttph2dq512_mask(
2026       (__v16hf)__A, (__v16si)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
2027 }
2028 
2029 static __inline__ __m512i __DEFAULT_FN_ATTRS512
2030 _mm512_maskz_cvttph_epi32(__mmask16 __U, __m256h __A) {
2031   return (__m512i)__builtin_ia32_vcvttph2dq512_mask(
2032       (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)__U,
2033       _MM_FROUND_CUR_DIRECTION);
2034 }
2035 
/* FP16 -> unsigned 32-bit truncating conversion (vcvttph2udq) with all lanes
   enabled, an undefined pass-through and R forwarded as the final immediate. */
#define _mm512_cvtt_roundph_epu32(A, R) \
  ((__m512i)__builtin_ia32_vcvttph2udq512_mask( \
      (__v16hf)(A), \
      (__v16su)_mm512_undefined_epi32(), \
      (__mmask16)(-1), \
      (int)(R)))
2040 
/* Merge-masked FP16 -> unsigned 32-bit truncating conversion: W is the
   pass-through operand, U the write mask, R the final immediate. */
#define _mm512_mask_cvtt_roundph_epu32(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvttph2udq512_mask( \
      (__v16hf)(A), \
      (__v16su)(W), \
      (__mmask16)(U), \
      (int)(R)))
2044 
/* Zero-masked FP16 -> unsigned 32-bit truncating conversion: zeroed
   pass-through, write mask U, final immediate R. */
#define _mm512_maskz_cvtt_roundph_epu32(U, A, R) \
  ((__m512i)__builtin_ia32_vcvttph2udq512_mask( \
      (__v16hf)(A), \
      (__v16su)_mm512_setzero_epi32(), \
      (__mmask16)(U), \
      (int)(R)))
2049 
2050 static __inline__ __m512i __DEFAULT_FN_ATTRS512
2051 _mm512_cvttph_epu32(__m256h __A) {
2052   return (__m512i)__builtin_ia32_vcvttph2udq512_mask(
2053       (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)-1,
2054       _MM_FROUND_CUR_DIRECTION);
2055 }
2056 
2057 static __inline__ __m512i __DEFAULT_FN_ATTRS512
2058 _mm512_mask_cvttph_epu32(__m512i __W, __mmask16 __U, __m256h __A) {
2059   return (__m512i)__builtin_ia32_vcvttph2udq512_mask(
2060       (__v16hf)__A, (__v16su)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
2061 }
2062 
2063 static __inline__ __m512i __DEFAULT_FN_ATTRS512
2064 _mm512_maskz_cvttph_epu32(__mmask16 __U, __m256h __A) {
2065   return (__m512i)__builtin_ia32_vcvttph2udq512_mask(
2066       (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)__U,
2067       _MM_FROUND_CUR_DIRECTION);
2068 }
2069 
/* Signed 64-bit -> FP16 conversion (vcvtqq2ph) with all lanes enabled, an
   undefined 128-bit pass-through and R forwarded as the rounding immediate. */
#define _mm512_cvt_roundepi64_ph(A, R) \
  ((__m128h)__builtin_ia32_vcvtqq2ph512_mask( \
      (__v8di)(A), \
      (__v8hf)_mm_undefined_ph(), \
      (__mmask8)(-1), \
      (int)(R)))
2073 
/* Merge-masked signed 64-bit -> FP16 conversion: W is the pass-through
   operand, U the write mask, R the rounding immediate. */
#define _mm512_mask_cvt_roundepi64_ph(W, U, A, R) \
  ((__m128h)__builtin_ia32_vcvtqq2ph512_mask( \
      (__v8di)(A), \
      (__v8hf)(W), \
      (__mmask8)(U), \
      (int)(R)))
2077 
/* Zero-masked signed 64-bit -> FP16 conversion: zeroed pass-through, write
   mask U, rounding immediate R. */
#define _mm512_maskz_cvt_roundepi64_ph(U, A, R) \
  ((__m128h)__builtin_ia32_vcvtqq2ph512_mask( \
      (__v8di)(A), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), \
      (int)(R)))
2081 
2082 static __inline__ __m128h __DEFAULT_FN_ATTRS512
2083 _mm512_cvtepi64_ph(__m512i __A) {
2084   return (__m128h)__builtin_ia32_vcvtqq2ph512_mask(
2085       (__v8di)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
2086       _MM_FROUND_CUR_DIRECTION);
2087 }
2088 
2089 static __inline__ __m128h __DEFAULT_FN_ATTRS512
2090 _mm512_mask_cvtepi64_ph(__m128h __W, __mmask8 __U, __m512i __A) {
2091   return (__m128h)__builtin_ia32_vcvtqq2ph512_mask(
2092       (__v8di)__A, (__v8hf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2093 }
2094 
2095 static __inline__ __m128h __DEFAULT_FN_ATTRS512
2096 _mm512_maskz_cvtepi64_ph(__mmask8 __U, __m512i __A) {
2097   return (__m128h)__builtin_ia32_vcvtqq2ph512_mask(
2098       (__v8di)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
2099       _MM_FROUND_CUR_DIRECTION);
2100 }
2101 
/* FP16 -> signed 64-bit conversion (vcvtph2qq) with all lanes enabled, an
   undefined pass-through and R forwarded as the rounding immediate. */
#define _mm512_cvt_roundph_epi64(A, R) \
  ((__m512i)__builtin_ia32_vcvtph2qq512_mask( \
      (__v8hf)(A), \
      (__v8di)_mm512_undefined_epi32(), \
      (__mmask8)(-1), \
      (int)(R)))
2106 
/* Merge-masked FP16 -> signed 64-bit conversion: W is the pass-through
   operand, U the write mask, R the rounding immediate. */
#define _mm512_mask_cvt_roundph_epi64(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2qq512_mask( \
      (__v8hf)(A), \
      (__v8di)(W), \
      (__mmask8)(U), \
      (int)(R)))
2110 
/* Zero-masked FP16 -> signed 64-bit conversion: zeroed pass-through, write
   mask U, rounding immediate R. */
#define _mm512_maskz_cvt_roundph_epi64(U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2qq512_mask( \
      (__v8hf)(A), \
      (__v8di)_mm512_setzero_epi32(), \
      (__mmask8)(U), \
      (int)(R)))
2114 
2115 static __inline__ __m512i __DEFAULT_FN_ATTRS512
2116 _mm512_cvtph_epi64(__m128h __A) {
2117   return (__m512i)__builtin_ia32_vcvtph2qq512_mask(
2118       (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)-1,
2119       _MM_FROUND_CUR_DIRECTION);
2120 }
2121 
2122 static __inline__ __m512i __DEFAULT_FN_ATTRS512
2123 _mm512_mask_cvtph_epi64(__m512i __W, __mmask8 __U, __m128h __A) {
2124   return (__m512i)__builtin_ia32_vcvtph2qq512_mask(
2125       (__v8hf)__A, (__v8di)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2126 }
2127 
2128 static __inline__ __m512i __DEFAULT_FN_ATTRS512
2129 _mm512_maskz_cvtph_epi64(__mmask8 __U, __m128h __A) {
2130   return (__m512i)__builtin_ia32_vcvtph2qq512_mask(
2131       (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)__U,
2132       _MM_FROUND_CUR_DIRECTION);
2133 }
2134 
/* Unsigned 64-bit -> FP16 conversion (vcvtuqq2ph) with all lanes enabled, an
   undefined 128-bit pass-through and R forwarded as the rounding immediate. */
#define _mm512_cvt_roundepu64_ph(A, R) \
  ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask( \
      (__v8du)(A), \
      (__v8hf)_mm_undefined_ph(), \
      (__mmask8)(-1), \
      (int)(R)))
2138 
/* Merge-masked unsigned 64-bit -> FP16 conversion: W is the pass-through
   operand, U the write mask, R the rounding immediate. */
#define _mm512_mask_cvt_roundepu64_ph(W, U, A, R) \
  ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask( \
      (__v8du)(A), \
      (__v8hf)(W), \
      (__mmask8)(U), \
      (int)(R)))
2142 
/* Zero-masked unsigned 64-bit -> FP16 conversion: zeroed pass-through, write
   mask U, rounding immediate R. */
#define _mm512_maskz_cvt_roundepu64_ph(U, A, R) \
  ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask( \
      (__v8du)(A), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), \
      (int)(R)))
2146 
2147 static __inline__ __m128h __DEFAULT_FN_ATTRS512
2148 _mm512_cvtepu64_ph(__m512i __A) {
2149   return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask(
2150       (__v8du)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
2151       _MM_FROUND_CUR_DIRECTION);
2152 }
2153 
2154 static __inline__ __m128h __DEFAULT_FN_ATTRS512
2155 _mm512_mask_cvtepu64_ph(__m128h __W, __mmask8 __U, __m512i __A) {
2156   return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask(
2157       (__v8du)__A, (__v8hf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2158 }
2159 
2160 static __inline__ __m128h __DEFAULT_FN_ATTRS512
2161 _mm512_maskz_cvtepu64_ph(__mmask8 __U, __m512i __A) {
2162   return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask(
2163       (__v8du)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
2164       _MM_FROUND_CUR_DIRECTION);
2165 }
2166 
/* FP16 -> unsigned 64-bit conversion (vcvtph2uqq) with all lanes enabled, an
   undefined pass-through and R forwarded as the rounding immediate. */
#define _mm512_cvt_roundph_epu64(A, R) \
  ((__m512i)__builtin_ia32_vcvtph2uqq512_mask( \
      (__v8hf)(A), \
      (__v8du)_mm512_undefined_epi32(), \
      (__mmask8)(-1), \
      (int)(R)))
2171 
/* Merge-masked FP16 -> unsigned 64-bit conversion: W is the pass-through
   operand, U the write mask, R the rounding immediate. */
#define _mm512_mask_cvt_roundph_epu64(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2uqq512_mask( \
      (__v8hf)(A), \
      (__v8du)(W), \
      (__mmask8)(U), \
      (int)(R)))
2175 
/* Zero-masked FP16 -> unsigned 64-bit conversion: zeroed pass-through, write
   mask U, rounding immediate R. */
#define _mm512_maskz_cvt_roundph_epu64(U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2uqq512_mask( \
      (__v8hf)(A), \
      (__v8du)_mm512_setzero_epi32(), \
      (__mmask8)(U), \
      (int)(R)))
2179 
2180 static __inline__ __m512i __DEFAULT_FN_ATTRS512
2181 _mm512_cvtph_epu64(__m128h __A) {
2182   return (__m512i)__builtin_ia32_vcvtph2uqq512_mask(
2183       (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)-1,
2184       _MM_FROUND_CUR_DIRECTION);
2185 }
2186 
2187 static __inline__ __m512i __DEFAULT_FN_ATTRS512
2188 _mm512_mask_cvtph_epu64(__m512i __W, __mmask8 __U, __m128h __A) {
2189   return (__m512i)__builtin_ia32_vcvtph2uqq512_mask(
2190       (__v8hf)__A, (__v8du)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2191 }
2192 
2193 static __inline__ __m512i __DEFAULT_FN_ATTRS512
2194 _mm512_maskz_cvtph_epu64(__mmask8 __U, __m128h __A) {
2195   return (__m512i)__builtin_ia32_vcvtph2uqq512_mask(
2196       (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)__U,
2197       _MM_FROUND_CUR_DIRECTION);
2198 }
2199 
/* FP16 -> signed 64-bit truncating conversion (vcvttph2qq) with all lanes
   enabled, an undefined pass-through and R forwarded as the final immediate. */
#define _mm512_cvtt_roundph_epi64(A, R) \
  ((__m512i)__builtin_ia32_vcvttph2qq512_mask( \
      (__v8hf)(A), \
      (__v8di)_mm512_undefined_epi32(), \
      (__mmask8)(-1), \
      (int)(R)))
2204 
/* Merge-masked FP16 -> signed 64-bit truncating conversion: W is the
   pass-through operand, U the write mask, R the final immediate. */
#define _mm512_mask_cvtt_roundph_epi64(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvttph2qq512_mask( \
      (__v8hf)(A), \
      (__v8di)(W), \
      (__mmask8)(U), \
      (int)(R)))
2208 
/* Zero-masked FP16 -> signed 64-bit truncating conversion: zeroed
   pass-through, write mask U, final immediate R. */
#define _mm512_maskz_cvtt_roundph_epi64(U, A, R) \
  ((__m512i)__builtin_ia32_vcvttph2qq512_mask( \
      (__v8hf)(A), \
      (__v8di)_mm512_setzero_epi32(), \
      (__mmask8)(U), \
      (int)(R)))
2212 
2213 static __inline__ __m512i __DEFAULT_FN_ATTRS512
2214 _mm512_cvttph_epi64(__m128h __A) {
2215   return (__m512i)__builtin_ia32_vcvttph2qq512_mask(
2216       (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)-1,
2217       _MM_FROUND_CUR_DIRECTION);
2218 }
2219 
2220 static __inline__ __m512i __DEFAULT_FN_ATTRS512
2221 _mm512_mask_cvttph_epi64(__m512i __W, __mmask8 __U, __m128h __A) {
2222   return (__m512i)__builtin_ia32_vcvttph2qq512_mask(
2223       (__v8hf)__A, (__v8di)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2224 }
2225 
2226 static __inline__ __m512i __DEFAULT_FN_ATTRS512
2227 _mm512_maskz_cvttph_epi64(__mmask8 __U, __m128h __A) {
2228   return (__m512i)__builtin_ia32_vcvttph2qq512_mask(
2229       (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)__U,
2230       _MM_FROUND_CUR_DIRECTION);
2231 }
2232 
/* FP16 -> unsigned 64-bit truncating conversion (vcvttph2uqq) with all lanes
   enabled, an undefined pass-through and R forwarded as the final immediate. */
#define _mm512_cvtt_roundph_epu64(A, R) \
  ((__m512i)__builtin_ia32_vcvttph2uqq512_mask( \
      (__v8hf)(A), \
      (__v8du)_mm512_undefined_epi32(), \
      (__mmask8)(-1), \
      (int)(R)))
2237 
/* Merge-masked FP16 -> unsigned 64-bit truncating conversion: W is the
   pass-through operand, U the write mask, R the final immediate. */
#define _mm512_mask_cvtt_roundph_epu64(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvttph2uqq512_mask( \
      (__v8hf)(A), \
      (__v8du)(W), \
      (__mmask8)(U), \
      (int)(R)))
2241 
/* Zero-masked FP16 -> unsigned 64-bit truncating conversion: zeroed
   pass-through, write mask U, final immediate R. */
#define _mm512_maskz_cvtt_roundph_epu64(U, A, R) \
  ((__m512i)__builtin_ia32_vcvttph2uqq512_mask( \
      (__v8hf)(A), \
      (__v8du)_mm512_setzero_epi32(), \
      (__mmask8)(U), \
      (int)(R)))
2245 
2246 static __inline__ __m512i __DEFAULT_FN_ATTRS512
2247 _mm512_cvttph_epu64(__m128h __A) {
2248   return (__m512i)__builtin_ia32_vcvttph2uqq512_mask(
2249       (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)-1,
2250       _MM_FROUND_CUR_DIRECTION);
2251 }
2252 
2253 static __inline__ __m512i __DEFAULT_FN_ATTRS512
2254 _mm512_mask_cvttph_epu64(__m512i __W, __mmask8 __U, __m128h __A) {
2255   return (__m512i)__builtin_ia32_vcvttph2uqq512_mask(
2256       (__v8hf)__A, (__v8du)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2257 }
2258 
2259 static __inline__ __m512i __DEFAULT_FN_ATTRS512
2260 _mm512_maskz_cvttph_epu64(__mmask8 __U, __m128h __A) {
2261   return (__m512i)__builtin_ia32_vcvttph2uqq512_mask(
2262       (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)__U,
2263       _MM_FROUND_CUR_DIRECTION);
2264 }
2265 
/* Scalar FP16 -> int conversion (vcvtsh2si32) of A, with R forwarded as the
   rounding immediate. */
#define _mm_cvt_roundsh_i32(A, R) \
  ((int)__builtin_ia32_vcvtsh2si32( \
      (__v8hf)(A), \
      (int)(R)))
2268 
2269 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_cvtsh_i32(__m128h __A) {
2270   return (int)__builtin_ia32_vcvtsh2si32((__v8hf)__A, _MM_FROUND_CUR_DIRECTION);
2271 }
2272 
/* Scalar FP16 -> unsigned int conversion (vcvtsh2usi32) of A, with R
   forwarded as the rounding immediate. */
#define _mm_cvt_roundsh_u32(A, R) \
  ((unsigned int)__builtin_ia32_vcvtsh2usi32( \
      (__v8hf)(A), \
      (int)(R)))
2275 
2276 static __inline__ unsigned int __DEFAULT_FN_ATTRS128
2277 _mm_cvtsh_u32(__m128h __A) {
2278   return (unsigned int)__builtin_ia32_vcvtsh2usi32((__v8hf)__A,
2279                                                    _MM_FROUND_CUR_DIRECTION);
2280 }
2281 
2282 #ifdef __x86_64__
/* Scalar FP16 -> long long conversion (vcvtsh2si64) of A, with R forwarded
   as the rounding immediate. */
#define _mm_cvt_roundsh_i64(A, R) \
  ((long long)__builtin_ia32_vcvtsh2si64( \
      (__v8hf)(A), \
      (int)(R)))
2285 
2286 static __inline__ long long __DEFAULT_FN_ATTRS128 _mm_cvtsh_i64(__m128h __A) {
2287   return (long long)__builtin_ia32_vcvtsh2si64((__v8hf)__A,
2288                                                _MM_FROUND_CUR_DIRECTION);
2289 }
2290 
/* Scalar FP16 -> unsigned long long conversion (vcvtsh2usi64) of A, with R
   forwarded as the rounding immediate. */
#define _mm_cvt_roundsh_u64(A, R) \
  ((unsigned long long)__builtin_ia32_vcvtsh2usi64( \
      (__v8hf)(A), \
      (int)(R)))
2293 
2294 static __inline__ unsigned long long __DEFAULT_FN_ATTRS128
2295 _mm_cvtsh_u64(__m128h __A) {
2296   return (unsigned long long)__builtin_ia32_vcvtsh2usi64(
2297       (__v8hf)__A, _MM_FROUND_CUR_DIRECTION);
2298 }
2299 #endif // __x86_64__
2300 
/* Unsigned int -> scalar FP16 conversion (vcvtusi2sh): A is the vector
   operand, B the integer source, R the rounding immediate. */
#define _mm_cvt_roundu32_sh(A, B, R) \
  ((__m128h)__builtin_ia32_vcvtusi2sh( \
      (__v8hf)(A), \
      (unsigned int)(B), \
      (int)(R)))
2303 
2304 static __inline__ __m128h __DEFAULT_FN_ATTRS128
2305 _mm_cvtu32_sh(__m128h __A, unsigned int __B) {
2306   __A[0] = __B;
2307   return __A;
2308 }
2309 
2310 #ifdef __x86_64__
/* Unsigned long long -> scalar FP16 conversion (vcvtusi642sh): A is the
   vector operand, B the integer source, R the rounding immediate. */
#define _mm_cvt_roundu64_sh(A, B, R) \
  ((__m128h)__builtin_ia32_vcvtusi642sh( \
      (__v8hf)(A), \
      (unsigned long long)(B), \
      (int)(R)))
2314 
2315 static __inline__ __m128h __DEFAULT_FN_ATTRS128
2316 _mm_cvtu64_sh(__m128h __A, unsigned long long __B) {
2317   __A[0] = __B;
2318   return __A;
2319 }
2320 #endif
2321 
/* Int -> scalar FP16 conversion (vcvtsi2sh): A is the vector operand, B the
   integer source, R the rounding immediate. */
#define _mm_cvt_roundi32_sh(A, B, R) \
  ((__m128h)__builtin_ia32_vcvtsi2sh( \
      (__v8hf)(A), \
      (int)(B), \
      (int)(R)))
2324 
2325 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvti32_sh(__m128h __A,
2326                                                               int __B) {
2327   __A[0] = __B;
2328   return __A;
2329 }
2330 
2331 #ifdef __x86_64__
/* Long long -> scalar FP16 conversion (vcvtsi642sh): A is the vector
   operand, B the integer source, R the rounding immediate. */
#define _mm_cvt_roundi64_sh(A, B, R) \
  ((__m128h)__builtin_ia32_vcvtsi642sh( \
      (__v8hf)(A), \
      (long long)(B), \
      (int)(R)))
2334 
2335 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvti64_sh(__m128h __A,
2336                                                               long long __B) {
2337   __A[0] = __B;
2338   return __A;
2339 }
2340 #endif
2341 
/* Scalar FP16 -> int truncating conversion (vcvttsh2si32) of A, with R
   forwarded as the final immediate. */
#define _mm_cvtt_roundsh_i32(A, R) \
  ((int)__builtin_ia32_vcvttsh2si32( \
      (__v8hf)(A), \
      (int)(R)))
2344 
2345 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_cvttsh_i32(__m128h __A) {
2346   return (int)__builtin_ia32_vcvttsh2si32((__v8hf)__A,
2347                                           _MM_FROUND_CUR_DIRECTION);
2348 }
2349 
2350 #ifdef __x86_64__
/* Scalar FP16 -> long long truncating conversion (vcvttsh2si64) of A, with R
   forwarded as the final immediate. */
#define _mm_cvtt_roundsh_i64(A, R) \
  ((long long)__builtin_ia32_vcvttsh2si64( \
      (__v8hf)(A), \
      (int)(R)))
2353 
2354 static __inline__ long long __DEFAULT_FN_ATTRS128 _mm_cvttsh_i64(__m128h __A) {
2355   return (long long)__builtin_ia32_vcvttsh2si64((__v8hf)__A,
2356                                                 _MM_FROUND_CUR_DIRECTION);
2357 }
2358 #endif
2359 
/* Scalar FP16 -> unsigned int truncating conversion (vcvttsh2usi32) of A,
   with R forwarded as the final immediate. */
#define _mm_cvtt_roundsh_u32(A, R) \
  ((unsigned int)__builtin_ia32_vcvttsh2usi32( \
      (__v8hf)(A), \
      (int)(R)))
2362 
2363 static __inline__ unsigned int __DEFAULT_FN_ATTRS128
2364 _mm_cvttsh_u32(__m128h __A) {
2365   return (unsigned int)__builtin_ia32_vcvttsh2usi32((__v8hf)__A,
2366                                                     _MM_FROUND_CUR_DIRECTION);
2367 }
2368 
2369 #ifdef __x86_64__
/* Scalar FP16 -> unsigned long long truncating conversion (vcvttsh2usi64) of
   A, with R forwarded as the final immediate. */
#define _mm_cvtt_roundsh_u64(A, R) \
  ((unsigned long long)__builtin_ia32_vcvttsh2usi64( \
      (__v8hf)(A), \
      (int)(R)))
2372 
2373 static __inline__ unsigned long long __DEFAULT_FN_ATTRS128
2374 _mm_cvttsh_u64(__m128h __A) {
2375   return (unsigned long long)__builtin_ia32_vcvttsh2usi64(
2376       (__v8hf)__A, _MM_FROUND_CUR_DIRECTION);
2377 }
2378 #endif
2379 
/* FP16 -> FP32 conversion (vcvtph2psx) with all lanes enabled, an undefined
   pass-through and R forwarded as the final immediate. */
#define _mm512_cvtx_roundph_ps(A, R) \
  ((__m512)__builtin_ia32_vcvtph2psx512_mask( \
      (__v16hf)(A), \
      (__v16sf)_mm512_undefined_ps(), \
      (__mmask16)(-1), \
      (int)(R)))
2384 
/* Merge-masked FP16 -> FP32 conversion: W is the pass-through operand, U the
   write mask, R the final immediate. */
#define _mm512_mask_cvtx_roundph_ps(W, U, A, R) \
  ((__m512)__builtin_ia32_vcvtph2psx512_mask( \
      (__v16hf)(A), \
      (__v16sf)(W), \
      (__mmask16)(U), \
      (int)(R)))
2388 
/* Zero-masked FP16 -> FP32 conversion: zeroed pass-through, write mask U,
   final immediate R. */
#define _mm512_maskz_cvtx_roundph_ps(U, A, R) \
  ((__m512)__builtin_ia32_vcvtph2psx512_mask( \
      (__v16hf)(A), \
      (__v16sf)_mm512_setzero_ps(), \
      (__mmask16)(U), \
      (int)(R)))
2392 
2393 static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_cvtxph_ps(__m256h __A) {
2394   return (__m512)__builtin_ia32_vcvtph2psx512_mask(
2395       (__v16hf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)-1,
2396       _MM_FROUND_CUR_DIRECTION);
2397 }
2398 
2399 static __inline__ __m512 __DEFAULT_FN_ATTRS512
2400 _mm512_mask_cvtxph_ps(__m512 __W, __mmask16 __U, __m256h __A) {
2401   return (__m512)__builtin_ia32_vcvtph2psx512_mask(
2402       (__v16hf)__A, (__v16sf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
2403 }
2404 
2405 static __inline__ __m512 __DEFAULT_FN_ATTRS512
2406 _mm512_maskz_cvtxph_ps(__mmask16 __U, __m256h __A) {
2407   return (__m512)__builtin_ia32_vcvtph2psx512_mask(
2408       (__v16hf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U,
2409       _MM_FROUND_CUR_DIRECTION);
2410 }
2411 
/* FP32 -> FP16 conversion (vcvtps2phx) with all lanes enabled, an undefined
   256-bit pass-through and R forwarded as the rounding immediate. */
#define _mm512_cvtx_roundps_ph(A, R) \
  ((__m256h)__builtin_ia32_vcvtps2phx512_mask( \
      (__v16sf)(A), \
      (__v16hf)_mm256_undefined_ph(), \
      (__mmask16)(-1), \
      (int)(R)))
2416 
/* Merge-masked FP32 -> FP16 conversion: W is the pass-through operand, U the
   write mask, R the rounding immediate. */
#define _mm512_mask_cvtx_roundps_ph(W, U, A, R) \
  ((__m256h)__builtin_ia32_vcvtps2phx512_mask( \
      (__v16sf)(A), \
      (__v16hf)(W), \
      (__mmask16)(U), \
      (int)(R)))
2420 
/* Zero-masked FP32 -> FP16 conversion: zeroed pass-through, write mask U,
   rounding immediate R. */
#define _mm512_maskz_cvtx_roundps_ph(U, A, R) \
  ((__m256h)__builtin_ia32_vcvtps2phx512_mask( \
      (__v16sf)(A), \
      (__v16hf)_mm256_setzero_ph(), \
      (__mmask16)(U), \
      (int)(R)))
2424 
2425 static __inline__ __m256h __DEFAULT_FN_ATTRS512 _mm512_cvtxps_ph(__m512 __A) {
2426   return (__m256h)__builtin_ia32_vcvtps2phx512_mask(
2427       (__v16sf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1,
2428       _MM_FROUND_CUR_DIRECTION);
2429 }
2430 
2431 static __inline__ __m256h __DEFAULT_FN_ATTRS512
2432 _mm512_mask_cvtxps_ph(__m256h __W, __mmask16 __U, __m512 __A) {
2433   return (__m256h)__builtin_ia32_vcvtps2phx512_mask(
2434       (__v16sf)__A, (__v16hf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
2435 }
2436 
2437 static __inline__ __m256h __DEFAULT_FN_ATTRS512
2438 _mm512_maskz_cvtxps_ph(__mmask16 __U, __m512 __A) {
2439   return (__m256h)__builtin_ia32_vcvtps2phx512_mask(
2440       (__v16sf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U,
2441       _MM_FROUND_CUR_DIRECTION);
2442 }
2443 
/* FP16 fused multiply-add of A, B, C via the _mask builtin with all lanes
   enabled (mask -1); R is the rounding immediate. */
#define _mm512_fmadd_round_ph(A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_mask( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      (__v32hf)(__m512h)(C), \
      (__mmask32)-1, \
      (int)(R)))
2448 
/* FP16 fused multiply-add of A, B, C via the _mask builtin with write mask U;
   R is the rounding immediate. */
#define _mm512_mask_fmadd_round_ph(A, U, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_mask( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      (__v32hf)(__m512h)(C), \
      (__mmask32)(U), \
      (int)(R)))
2453 
/* FP16 fused multiply-add of A, B, C via the _mask3 builtin with write mask
   U; R is the rounding immediate. */
#define _mm512_mask3_fmadd_round_ph(A, B, C, U, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_mask3( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      (__v32hf)(__m512h)(C), \
      (__mmask32)(U), \
      (int)(R)))
2458 
/* FP16 fused multiply-add of A, B, C via the _maskz builtin with write mask
   U; R is the rounding immediate. */
#define _mm512_maskz_fmadd_round_ph(U, A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_maskz( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      (__v32hf)(__m512h)(C), \
      (__mmask32)(U), \
      (int)(R)))
2463 
/* FP16 fused multiply-subtract: the _mask FMA builtin is called with C
   negated, all lanes enabled (mask -1) and R as the rounding immediate. */
#define _mm512_fmsub_round_ph(A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_mask( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      -(__v32hf)(__m512h)(C), \
      (__mmask32)-1, \
      (int)(R)))
2468 
/* FP16 fused multiply-subtract: the _mask FMA builtin is called with C
   negated, write mask U and R as the rounding immediate. */
#define _mm512_mask_fmsub_round_ph(A, U, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_mask( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      -(__v32hf)(__m512h)(C), \
      (__mmask32)(U), \
      (int)(R)))
2473 
/* FP16 fused multiply-subtract: the _maskz FMA builtin is called with C
   negated, write mask U and R as the rounding immediate. */
#define _mm512_maskz_fmsub_round_ph(U, A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_maskz( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      -(__v32hf)(__m512h)(C), \
      (__mmask32)(U), \
      (int)(R)))
2478 
/* FP16 negated multiply-add: the _mask FMA builtin is called with B negated,
   all lanes enabled (mask -1) and R as the rounding immediate. */
#define _mm512_fnmadd_round_ph(A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_mask( \
      (__v32hf)(__m512h)(A), \
      -(__v32hf)(__m512h)(B), \
      (__v32hf)(__m512h)(C), \
      (__mmask32)-1, \
      (int)(R)))
2483 
/* FP16 negated multiply-add: the _mask3 FMA builtin is called with A negated,
   write mask U and R as the rounding immediate. */
#define _mm512_mask3_fnmadd_round_ph(A, B, C, U, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_mask3( \
      -(__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      (__v32hf)(__m512h)(C), \
      (__mmask32)(U), \
      (int)(R)))
2488 
/* fnmadd, "maskz" form, explicit rounding R: vfmaddph512 maskz builtin with
 * the first operand A negated and mask U. */
#define _mm512_maskz_fnmadd_round_ph(U, A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_maskz( \
      -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), \
      (__v32hf)(__m512h)(C), (__mmask32)(U), (int)(R)))
2493 
/* fnmsub, unmasked, explicit rounding R: vfmaddph512 mask builtin with both
 * the second operand B and third operand C negated; the mask is all ones. */
#define _mm512_fnmsub_round_ph(A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_mask( \
      (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), \
      -(__v32hf)(__m512h)(C), (__mmask32)-1, (int)(R)))
2498 
/* fnmsub, "maskz" form, explicit rounding R: vfmaddph512 maskz builtin with
 * the first operand A and third operand C negated, and mask U. */
#define _mm512_maskz_fnmsub_round_ph(U, A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_maskz( \
      -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), \
      -(__v32hf)(__m512h)(C), (__mmask32)(U), (int)(R)))
2503 
2504 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmadd_ph(__m512h __A,
2505                                                                 __m512h __B,
2506                                                                 __m512h __C) {
2507   return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
2508                                                   (__v32hf)__C, (__mmask32)-1,
2509                                                   _MM_FROUND_CUR_DIRECTION);
2510 }
2511 
2512 static __inline__ __m512h __DEFAULT_FN_ATTRS512
2513 _mm512_mask_fmadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
2514   return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
2515                                                   (__v32hf)__C, (__mmask32)__U,
2516                                                   _MM_FROUND_CUR_DIRECTION);
2517 }
2518 
2519 static __inline__ __m512h __DEFAULT_FN_ATTRS512
2520 _mm512_mask3_fmadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
2521   return (__m512h)__builtin_ia32_vfmaddph512_mask3((__v32hf)__A, (__v32hf)__B,
2522                                                    (__v32hf)__C, (__mmask32)__U,
2523                                                    _MM_FROUND_CUR_DIRECTION);
2524 }
2525 
2526 static __inline__ __m512h __DEFAULT_FN_ATTRS512
2527 _mm512_maskz_fmadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2528   return (__m512h)__builtin_ia32_vfmaddph512_maskz((__v32hf)__A, (__v32hf)__B,
2529                                                    (__v32hf)__C, (__mmask32)__U,
2530                                                    _MM_FROUND_CUR_DIRECTION);
2531 }
2532 
2533 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmsub_ph(__m512h __A,
2534                                                                 __m512h __B,
2535                                                                 __m512h __C) {
2536   return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
2537                                                   -(__v32hf)__C, (__mmask32)-1,
2538                                                   _MM_FROUND_CUR_DIRECTION);
2539 }
2540 
2541 static __inline__ __m512h __DEFAULT_FN_ATTRS512
2542 _mm512_mask_fmsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
2543   return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
2544                                                   -(__v32hf)__C, (__mmask32)__U,
2545                                                   _MM_FROUND_CUR_DIRECTION);
2546 }
2547 
2548 static __inline__ __m512h __DEFAULT_FN_ATTRS512
2549 _mm512_maskz_fmsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2550   return (__m512h)__builtin_ia32_vfmaddph512_maskz(
2551       (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
2552       _MM_FROUND_CUR_DIRECTION);
2553 }
2554 
2555 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fnmadd_ph(__m512h __A,
2556                                                                  __m512h __B,
2557                                                                  __m512h __C) {
2558   return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
2559                                                   (__v32hf)__C, (__mmask32)-1,
2560                                                   _MM_FROUND_CUR_DIRECTION);
2561 }
2562 
2563 static __inline__ __m512h __DEFAULT_FN_ATTRS512
2564 _mm512_mask3_fnmadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
2565   return (__m512h)__builtin_ia32_vfmaddph512_mask3(-(__v32hf)__A, (__v32hf)__B,
2566                                                    (__v32hf)__C, (__mmask32)__U,
2567                                                    _MM_FROUND_CUR_DIRECTION);
2568 }
2569 
2570 static __inline__ __m512h __DEFAULT_FN_ATTRS512
2571 _mm512_maskz_fnmadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2572   return (__m512h)__builtin_ia32_vfmaddph512_maskz(-(__v32hf)__A, (__v32hf)__B,
2573                                                    (__v32hf)__C, (__mmask32)__U,
2574                                                    _MM_FROUND_CUR_DIRECTION);
2575 }
2576 
2577 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fnmsub_ph(__m512h __A,
2578                                                                  __m512h __B,
2579                                                                  __m512h __C) {
2580   return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
2581                                                   -(__v32hf)__C, (__mmask32)-1,
2582                                                   _MM_FROUND_CUR_DIRECTION);
2583 }
2584 
2585 static __inline__ __m512h __DEFAULT_FN_ATTRS512
2586 _mm512_maskz_fnmsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2587   return (__m512h)__builtin_ia32_vfmaddph512_maskz(
2588       -(__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
2589       _MM_FROUND_CUR_DIRECTION);
2590 }
2591 
/* fmaddsub, unmasked, explicit rounding R: passes A, B, C to the
 * vfmaddsubph512 mask builtin with an all-ones mask. */
#define _mm512_fmaddsub_round_ph(A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddsubph512_mask( \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), \
      (__v32hf)(__m512h)(C), (__mmask32)-1, (int)(R)))
2596 
/* fmaddsub, "mask" form, explicit rounding R: passes A, B, C and mask U to
 * the vfmaddsubph512 mask builtin. */
#define _mm512_mask_fmaddsub_round_ph(A, U, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddsubph512_mask( \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), \
      (__v32hf)(__m512h)(C), (__mmask32)(U), (int)(R)))
2601 
/* fmaddsub, "mask3" form, explicit rounding R: passes A, B, C and mask U to
 * the vfmaddsubph512 mask3 builtin. */
#define _mm512_mask3_fmaddsub_round_ph(A, B, C, U, R) \
  ((__m512h)__builtin_ia32_vfmaddsubph512_mask3( \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), \
      (__v32hf)(__m512h)(C), (__mmask32)(U), (int)(R)))
2606 
/* fmaddsub, "maskz" form, explicit rounding R: passes A, B, C and mask U to
 * the vfmaddsubph512 maskz builtin. */
#define _mm512_maskz_fmaddsub_round_ph(U, A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddsubph512_maskz( \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), \
      (__v32hf)(__m512h)(C), (__mmask32)(U), (int)(R)))
2611 
/* fmsubadd, unmasked, explicit rounding R: built on the vfmaddsubph512 mask
 * builtin by negating the third operand C; the mask is all ones. */
#define _mm512_fmsubadd_round_ph(A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddsubph512_mask( \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), \
      -(__v32hf)(__m512h)(C), (__mmask32)-1, (int)(R)))
2616 
/* fmsubadd, "mask" form, explicit rounding R: vfmaddsubph512 mask builtin
 * with the third operand C negated and mask U. */
#define _mm512_mask_fmsubadd_round_ph(A, U, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddsubph512_mask( \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), \
      -(__v32hf)(__m512h)(C), (__mmask32)(U), (int)(R)))
2621 
/* fmsubadd, "maskz" form, explicit rounding R: vfmaddsubph512 maskz builtin
 * with the third operand C negated and mask U. */
#define _mm512_maskz_fmsubadd_round_ph(U, A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddsubph512_maskz( \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), \
      -(__v32hf)(__m512h)(C), (__mmask32)(U), (int)(R)))
2626 
2627 static __inline__ __m512h __DEFAULT_FN_ATTRS512
2628 _mm512_fmaddsub_ph(__m512h __A, __m512h __B, __m512h __C) {
2629   return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
2630       (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)-1,
2631       _MM_FROUND_CUR_DIRECTION);
2632 }
2633 
2634 static __inline__ __m512h __DEFAULT_FN_ATTRS512
2635 _mm512_mask_fmaddsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
2636   return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
2637       (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
2638       _MM_FROUND_CUR_DIRECTION);
2639 }
2640 
2641 static __inline__ __m512h __DEFAULT_FN_ATTRS512
2642 _mm512_mask3_fmaddsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
2643   return (__m512h)__builtin_ia32_vfmaddsubph512_mask3(
2644       (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
2645       _MM_FROUND_CUR_DIRECTION);
2646 }
2647 
2648 static __inline__ __m512h __DEFAULT_FN_ATTRS512
2649 _mm512_maskz_fmaddsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2650   return (__m512h)__builtin_ia32_vfmaddsubph512_maskz(
2651       (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
2652       _MM_FROUND_CUR_DIRECTION);
2653 }
2654 
2655 static __inline__ __m512h __DEFAULT_FN_ATTRS512
2656 _mm512_fmsubadd_ph(__m512h __A, __m512h __B, __m512h __C) {
2657   return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
2658       (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)-1,
2659       _MM_FROUND_CUR_DIRECTION);
2660 }
2661 
2662 static __inline__ __m512h __DEFAULT_FN_ATTRS512
2663 _mm512_mask_fmsubadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
2664   return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
2665       (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
2666       _MM_FROUND_CUR_DIRECTION);
2667 }
2668 
2669 static __inline__ __m512h __DEFAULT_FN_ATTRS512
2670 _mm512_maskz_fmsubadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2671   return (__m512h)__builtin_ia32_vfmaddsubph512_maskz(
2672       (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
2673       _MM_FROUND_CUR_DIRECTION);
2674 }
2675 
/* fmsub, "mask3" form, explicit rounding R: uses the dedicated vfmsubph512
 * mask3 builtin, so no operand is negated here. */
#define _mm512_mask3_fmsub_round_ph(A, B, C, U, R) \
  ((__m512h)__builtin_ia32_vfmsubph512_mask3( \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), \
      (__v32hf)(__m512h)(C), (__mmask32)(U), (int)(R)))
2680 
2681 static __inline__ __m512h __DEFAULT_FN_ATTRS512
2682 _mm512_mask3_fmsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
2683   return (__m512h)__builtin_ia32_vfmsubph512_mask3((__v32hf)__A, (__v32hf)__B,
2684                                                    (__v32hf)__C, (__mmask32)__U,
2685                                                    _MM_FROUND_CUR_DIRECTION);
2686 }
2687 
/* fmsubadd, "mask3" form, explicit rounding R: uses the dedicated
 * vfmsubaddph512 mask3 builtin, so no operand is negated here. */
#define _mm512_mask3_fmsubadd_round_ph(A, B, C, U, R) \
  ((__m512h)__builtin_ia32_vfmsubaddph512_mask3( \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), \
      (__v32hf)(__m512h)(C), (__mmask32)(U), (int)(R)))
2692 
2693 static __inline__ __m512h __DEFAULT_FN_ATTRS512
2694 _mm512_mask3_fmsubadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
2695   return (__m512h)__builtin_ia32_vfmsubaddph512_mask3(
2696       (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
2697       _MM_FROUND_CUR_DIRECTION);
2698 }
2699 
/* fnmadd, "mask" form, explicit rounding R: vfmaddph512 mask builtin with the
 * second operand B negated and mask U. */
#define _mm512_mask_fnmadd_round_ph(A, U, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_mask( \
      (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), \
      (__v32hf)(__m512h)(C), (__mmask32)(U), (int)(R)))
2704 
2705 static __inline__ __m512h __DEFAULT_FN_ATTRS512
2706 _mm512_mask_fnmadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
2707   return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
2708                                                   (__v32hf)__C, (__mmask32)__U,
2709                                                   _MM_FROUND_CUR_DIRECTION);
2710 }
2711 
/* fnmsub, "mask" form, explicit rounding R: vfmaddph512 mask builtin with the
 * second operand B and third operand C negated, and mask U. */
#define _mm512_mask_fnmsub_round_ph(A, U, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_mask( \
      (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), \
      -(__v32hf)(__m512h)(C), (__mmask32)(U), (int)(R)))
2716 
/* fnmsub, "mask3" form, explicit rounding R: vfmsubph512 mask3 builtin with
 * the first operand A negated and mask U. */
#define _mm512_mask3_fnmsub_round_ph(A, B, C, U, R) \
  ((__m512h)__builtin_ia32_vfmsubph512_mask3( \
      -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), \
      (__v32hf)(__m512h)(C), (__mmask32)(U), (int)(R)))
2721 
2722 static __inline__ __m512h __DEFAULT_FN_ATTRS512
2723 _mm512_mask_fnmsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
2724   return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
2725                                                   -(__v32hf)__C, (__mmask32)__U,
2726                                                   _MM_FROUND_CUR_DIRECTION);
2727 }
2728 
2729 static __inline__ __m512h __DEFAULT_FN_ATTRS512
2730 _mm512_mask3_fnmsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
2731   return (__m512h)__builtin_ia32_vfmsubph512_mask3(-(__v32hf)__A, (__v32hf)__B,
2732                                                    (__v32hf)__C, (__mmask32)__U,
2733                                                    _MM_FROUND_CUR_DIRECTION);
2734 }
2735 
2736 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_sh(__m128h __W,
2737                                                              __m128h __A,
2738                                                              __m128h __B) {
2739   return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A, (__v8hf)__B,
2740                                        (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
2741 }
2742 
2743 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmadd_sh(__m128h __W,
2744                                                                   __mmask8 __U,
2745                                                                   __m128h __A,
2746                                                                   __m128h __B) {
2747   return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A, (__v8hf)__B,
2748                                        (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2749 }
2750 
/* Scalar-half fmadd, unmasked, explicit rounding R: passes A, B, C to the
 * vfmaddsh3 mask builtin with an all-ones mask. */
#define _mm_fmadd_round_sh(A, B, C, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), \
      (__v8hf)(__m128h)(C), (__mmask8)-1, (int)(R)))
2755 
/* Scalar-half fmadd, "mask" form, explicit rounding R: passes W, A, B and
 * mask U to the vfmaddsh3 mask builtin. */
#define _mm_mask_fmadd_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
      (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), (__mmask8)(U), (int)(R)))
2760 
2761 static __inline__ __m128h __DEFAULT_FN_ATTRS128
2762 _mm_maskz_fmadd_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2763   return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, (__v8hf)__B, (__v8hf)__C,
2764                                         (__mmask8)__U,
2765                                         _MM_FROUND_CUR_DIRECTION);
2766 }
2767 
/* Scalar-half fmadd, "maskz" form, explicit rounding R: passes A, B, C and
 * mask U to the vfmaddsh3 maskz builtin. */
#define _mm_maskz_fmadd_round_sh(U, A, B, C, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), \
      (__v8hf)(__m128h)(C), (__mmask8)(U), (int)(R)))
2772 
2773 static __inline__ __m128h __DEFAULT_FN_ATTRS128
2774 _mm_mask3_fmadd_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
2775   return __builtin_ia32_vfmaddsh3_mask3((__v8hf)__W, (__v8hf)__X, (__v8hf)__Y,
2776                                         (__mmask8)__U,
2777                                         _MM_FROUND_CUR_DIRECTION);
2778 }
2779 
/* Scalar-half fmadd, "mask3" form, explicit rounding R: passes W, X, Y and
 * mask U to the vfmaddsh3 mask3 builtin. */
#define _mm_mask3_fmadd_round_sh(W, X, Y, U, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask3( \
      (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(X), \
      (__v8hf)(__m128h)(Y), (__mmask8)(U), (int)(R)))
2784 
2785 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmsub_sh(__m128h __W,
2786                                                              __m128h __A,
2787                                                              __m128h __B) {
2788   return (__m128h)__builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A,
2789                                                 -(__v8hf)__B, (__mmask8)-1,
2790                                                 _MM_FROUND_CUR_DIRECTION);
2791 }
2792 
2793 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmsub_sh(__m128h __W,
2794                                                                   __mmask8 __U,
2795                                                                   __m128h __A,
2796                                                                   __m128h __B) {
2797   return (__m128h)__builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A,
2798                                                 -(__v8hf)__B, (__mmask8)__U,
2799                                                 _MM_FROUND_CUR_DIRECTION);
2800 }
2801 
/* Scalar-half fmsub, unmasked, explicit rounding R: vfmaddsh3 mask builtin
 * with the third operand C negated and an all-ones mask. */
#define _mm_fmsub_round_sh(A, B, C, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), \
      -(__v8hf)(__m128h)(C), (__mmask8)-1, (int)(R)))
2806 
/* Scalar-half fmsub, "mask" form, explicit rounding R: vfmaddsh3 mask builtin
 * with the third operand B negated and mask U. */
#define _mm_mask_fmsub_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
      (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(A), \
      -(__v8hf)(__m128h)(B), (__mmask8)(U), (int)(R)))
2811 
2812 static __inline__ __m128h __DEFAULT_FN_ATTRS128
2813 _mm_maskz_fmsub_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2814   return (__m128h)__builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, (__v8hf)__B,
2815                                                  -(__v8hf)__C, (__mmask8)__U,
2816                                                  _MM_FROUND_CUR_DIRECTION);
2817 }
2818 
/* Scalar-half fmsub, "maskz" form, explicit rounding R: vfmaddsh3 maskz
 * builtin with the third operand C negated and mask U.
 *
 * R is parenthesized before the int cast, as in the sibling *_round_sh
 * macros, so that an expression argument is cast as a whole rather than
 * having the cast bind only to its first operand. */
#define _mm_maskz_fmsub_round_sh(U, A, B, C, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), \
      -(__v8hf)(__m128h)(C), (__mmask8)(U), (int)(R)))
2823 
2824 static __inline__ __m128h __DEFAULT_FN_ATTRS128
2825 _mm_mask3_fmsub_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
2826   return __builtin_ia32_vfmsubsh3_mask3((__v8hf)__W, (__v8hf)__X, (__v8hf)__Y,
2827                                         (__mmask8)__U,
2828                                         _MM_FROUND_CUR_DIRECTION);
2829 }
2830 
/* Scalar-half fmsub, "mask3" form, explicit rounding R: passes W, X, Y and
 * mask U unmodified to the vfmsubsh3 mask3 builtin. */
#define _mm_mask3_fmsub_round_sh(W, X, Y, U, R) \
  ((__m128h)__builtin_ia32_vfmsubsh3_mask3( \
      (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(X), \
      (__v8hf)(__m128h)(Y), (__mmask8)(U), (int)(R)))
2835 
2836 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmadd_sh(__m128h __W,
2837                                                               __m128h __A,
2838                                                               __m128h __B) {
2839   return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, (__v8hf)__B,
2840                                        (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
2841 }
2842 
2843 static __inline__ __m128h __DEFAULT_FN_ATTRS128
2844 _mm_mask_fnmadd_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
2845   return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, (__v8hf)__B,
2846                                        (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2847 }
2848 
/* Scalar-half fnmadd, unmasked, explicit rounding R: vfmaddsh3 mask builtin
 * with the second operand B negated and an all-ones mask. */
#define _mm_fnmadd_round_sh(A, B, C, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
      (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), \
      (__v8hf)(__m128h)(C), (__mmask8)-1, (int)(R)))
2853 
/* Scalar-half fnmadd, "mask" form, explicit rounding R: vfmaddsh3 mask
 * builtin with the second operand A negated and mask U. */
#define _mm_mask_fnmadd_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
      (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), (__mmask8)(U), (int)(R)))
2858 
2859 static __inline__ __m128h __DEFAULT_FN_ATTRS128
2860 _mm_maskz_fnmadd_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2861   return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, -(__v8hf)__B, (__v8hf)__C,
2862                                         (__mmask8)__U,
2863                                         _MM_FROUND_CUR_DIRECTION);
2864 }
2865 
/* Scalar-half fnmadd, "maskz" form, explicit rounding R: vfmaddsh3 maskz
 * builtin with the second operand B negated and mask U. */
#define _mm_maskz_fnmadd_round_sh(U, A, B, C, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \
      (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), \
      (__v8hf)(__m128h)(C), (__mmask8)(U), (int)(R)))
2870 
2871 static __inline__ __m128h __DEFAULT_FN_ATTRS128
2872 _mm_mask3_fnmadd_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
2873   return __builtin_ia32_vfmaddsh3_mask3((__v8hf)__W, -(__v8hf)__X, (__v8hf)__Y,
2874                                         (__mmask8)__U,
2875                                         _MM_FROUND_CUR_DIRECTION);
2876 }
2877 
/* Scalar-half fnmadd, "mask3" form, explicit rounding R: vfmaddsh3 mask3
 * builtin with the second operand X negated and mask U. */
#define _mm_mask3_fnmadd_round_sh(W, X, Y, U, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask3( \
      (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(X), \
      (__v8hf)(__m128h)(Y), (__mmask8)(U), (int)(R)))
2882 
2883 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmsub_sh(__m128h __W,
2884                                                               __m128h __A,
2885                                                               __m128h __B) {
2886   return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, -(__v8hf)__B,
2887                                        (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
2888 }
2889 
2890 static __inline__ __m128h __DEFAULT_FN_ATTRS128
2891 _mm_mask_fnmsub_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
2892   return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, -(__v8hf)__B,
2893                                        (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2894 }
2895 
/* Scalar-half fnmsub, unmasked, explicit rounding R: vfmaddsh3 mask builtin
 * with the second operand B and third operand C negated; all-ones mask. */
#define _mm_fnmsub_round_sh(A, B, C, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
      (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), \
      -(__v8hf)(__m128h)(C), (__mmask8)-1, (int)(R)))
2900 
/* Scalar-half fnmsub, "mask" form, explicit rounding R: vfmaddsh3 mask
 * builtin with the second operand A and third operand B negated; mask U. */
#define _mm_mask_fnmsub_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
      (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(A), \
      -(__v8hf)(__m128h)(B), (__mmask8)(U), (int)(R)))
2905 
2906 static __inline__ __m128h __DEFAULT_FN_ATTRS128
2907 _mm_maskz_fnmsub_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2908   return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, -(__v8hf)__B, -(__v8hf)__C,
2909                                         (__mmask8)__U,
2910                                         _MM_FROUND_CUR_DIRECTION);
2911 }
2912 
/* Scalar-half fnmsub, "maskz" form, explicit rounding R: vfmaddsh3 maskz
 * builtin with the second operand B and third operand C negated; mask U. */
#define _mm_maskz_fnmsub_round_sh(U, A, B, C, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \
      (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), \
      -(__v8hf)(__m128h)(C), (__mmask8)(U), (int)(R)))
2917 
2918 static __inline__ __m128h __DEFAULT_FN_ATTRS128
2919 _mm_mask3_fnmsub_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
2920   return __builtin_ia32_vfmsubsh3_mask3((__v8hf)__W, -(__v8hf)__X, (__v8hf)__Y,
2921                                         (__mmask8)__U,
2922                                         _MM_FROUND_CUR_DIRECTION);
2923 }
2924 
/* Scalar-half fnmsub, "mask3" form, explicit rounding R: vfmsubsh3 mask3
 * builtin with the second operand X negated and mask U. */
#define _mm_mask3_fnmsub_round_sh(W, X, Y, U, R) \
  ((__m128h)__builtin_ia32_vfmsubsh3_mask3( \
      (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(X), \
      (__v8hf)(__m128h)(Y), (__mmask8)(U), (int)(R)))
2929 
2930 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmadd_sch(__m128h __A,
2931                                                                __m128h __B,
2932                                                                __m128h __C) {
2933   return (__m128h)__builtin_ia32_vfcmaddcsh_mask((__v4sf)__A, (__v4sf)__B,
2934                                                  (__v4sf)__C, (__mmask8)-1,
2935                                                  _MM_FROUND_CUR_DIRECTION);
2936 }
2937 
2938 static __inline__ __m128h __DEFAULT_FN_ATTRS128
2939 _mm_mask_fcmadd_sch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
2940   return (__m128h)__builtin_ia32_vfcmaddcsh_round_mask(
2941       (__v4sf)__A, (__v4sf)(__B), (__v4sf)(__C), __U, _MM_FROUND_CUR_DIRECTION);
2942 }
2943 
2944 static __inline__ __m128h __DEFAULT_FN_ATTRS128
2945 _mm_maskz_fcmadd_sch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2946   return (__m128h)__builtin_ia32_vfcmaddcsh_maskz((__v4sf)__A, (__v4sf)__B,
2947                                                   (__v4sf)__C, (__mmask8)__U,
2948                                                   _MM_FROUND_CUR_DIRECTION);
2949 }
2950 
2951 static __inline__ __m128h __DEFAULT_FN_ATTRS128
2952 _mm_mask3_fcmadd_sch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
2953   return (__m128h)__builtin_ia32_vfcmaddcsh_round_mask3(
2954       (__v4sf)__A, (__v4sf)__B, (__v4sf)__C, __U, _MM_FROUND_CUR_DIRECTION);
2955 }
2956 
/* Scalar fcmadd_sch, unmasked, explicit rounding R: operands are cast to
 * __v4sf and passed to the vfcmaddcsh mask builtin with an all-ones mask. */
#define _mm_fcmadd_round_sch(A, B, C, R) \
  ((__m128h)__builtin_ia32_vfcmaddcsh_mask( \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \
      (__v4sf)(__m128h)(C), (__mmask8)-1, (int)(R)))
2961 
/* Scalar fcmadd_sch, "mask" form, explicit rounding R: operands are cast to
 * __v4sf and passed with mask U to the vfcmaddcsh round_mask builtin. */
#define _mm_mask_fcmadd_round_sch(A, U, B, C, R) \
  ((__m128h)__builtin_ia32_vfcmaddcsh_round_mask( \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \
      (__v4sf)(__m128h)(C), (__mmask8)(U), (int)(R)))
2966 
/* Scalar fcmadd_sch, "maskz" form, explicit rounding R: operands are cast to
 * __v4sf and passed with mask U to the vfcmaddcsh maskz builtin. */
#define _mm_maskz_fcmadd_round_sch(U, A, B, C, R) \
  ((__m128h)__builtin_ia32_vfcmaddcsh_maskz( \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \
      (__v4sf)(__m128h)(C), (__mmask8)(U), (int)(R)))
2971 
/* Scalar fcmadd_sch, "mask3" form, explicit rounding R: operands are cast to
 * __v4sf and passed with mask U to the vfcmaddcsh round_mask3 builtin. */
#define _mm_mask3_fcmadd_round_sch(A, B, C, U, R) \
  ((__m128h)__builtin_ia32_vfcmaddcsh_round_mask3( \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \
      (__v4sf)(__m128h)(C), (__mmask8)(U), (int)(R)))
2976 
2977 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_sch(__m128h __A,
2978                                                               __m128h __B,
2979                                                               __m128h __C) {
2980   return (__m128h)__builtin_ia32_vfmaddcsh_mask((__v4sf)__A, (__v4sf)__B,
2981                                                 (__v4sf)__C, (__mmask8)-1,
2982                                                 _MM_FROUND_CUR_DIRECTION);
2983 }
2984 
2985 static __inline__ __m128h __DEFAULT_FN_ATTRS128
2986 _mm_mask_fmadd_sch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
2987   return (__m128h)__builtin_ia32_vfmaddcsh_round_mask(
2988       (__v4sf)__A, (__v4sf)(__B), (__v4sf)(__C), __U, _MM_FROUND_CUR_DIRECTION);
2989 }
2990 
2991 static __inline__ __m128h __DEFAULT_FN_ATTRS128
2992 _mm_maskz_fmadd_sch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2993   return (__m128h)__builtin_ia32_vfmaddcsh_maskz((__v4sf)__A, (__v4sf)__B,
2994                                                  (__v4sf)__C, (__mmask8)__U,
2995                                                  _MM_FROUND_CUR_DIRECTION);
2996 }
2997 
2998 static __inline__ __m128h __DEFAULT_FN_ATTRS128
2999 _mm_mask3_fmadd_sch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
3000   return (__m128h)__builtin_ia32_vfmaddcsh_round_mask3(
3001       (__v4sf)__A, (__v4sf)__B, (__v4sf)__C, __U, _MM_FROUND_CUR_DIRECTION);
3002 }
3003 
/* Scalar fmadd_sch, unmasked, explicit rounding R: operands are cast to
 * __v4sf and passed to the vfmaddcsh mask builtin with an all-ones mask. */
#define _mm_fmadd_round_sch(A, B, C, R) \
  ((__m128h)__builtin_ia32_vfmaddcsh_mask( \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \
      (__v4sf)(__m128h)(C), (__mmask8)-1, (int)(R)))
3008 
/* Scalar fmadd_sch, "mask" form, explicit rounding R: operands are cast to
 * __v4sf and passed with mask U to the vfmaddcsh round_mask builtin. */
#define _mm_mask_fmadd_round_sch(A, U, B, C, R) \
  ((__m128h)__builtin_ia32_vfmaddcsh_round_mask( \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \
      (__v4sf)(__m128h)(C), (__mmask8)(U), (int)(R)))
3013 
/* Scalar fmadd_sch, "maskz" form, explicit rounding R: operands are cast to
 * __v4sf and passed with mask U to the vfmaddcsh maskz builtin. */
#define _mm_maskz_fmadd_round_sch(U, A, B, C, R) \
  ((__m128h)__builtin_ia32_vfmaddcsh_maskz( \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \
      (__v4sf)(__m128h)(C), (__mmask8)(U), (int)(R)))
3018 
/* Scalar fmadd_sch, "mask3" form, explicit rounding R: operands are cast to
 * __v4sf and passed with mask U to the vfmaddcsh round_mask3 builtin. */
#define _mm_mask3_fmadd_round_sch(A, B, C, U, R) \
  ((__m128h)__builtin_ia32_vfmaddcsh_round_mask3( \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \
      (__v4sf)(__m128h)(C), (__mmask8)(U), (int)(R)))
3023 
3024 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmul_sch(__m128h __A,
3025                                                               __m128h __B) {
3026   return (__m128h)__builtin_ia32_vfcmulcsh_mask(
3027       (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1,
3028       _MM_FROUND_CUR_DIRECTION);
3029 }
3030 
3031 static __inline__ __m128h __DEFAULT_FN_ATTRS128
3032 _mm_mask_fcmul_sch(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
3033   return (__m128h)__builtin_ia32_vfcmulcsh_mask((__v4sf)__A, (__v4sf)__B,
3034                                                 (__v4sf)__W, (__mmask8)__U,
3035                                                 _MM_FROUND_CUR_DIRECTION);
3036 }
3037 
3038 static __inline__ __m128h __DEFAULT_FN_ATTRS128
3039 _mm_maskz_fcmul_sch(__mmask8 __U, __m128h __A, __m128h __B) {
3040   return (__m128h)__builtin_ia32_vfcmulcsh_mask(
3041       (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U,
3042       _MM_FROUND_CUR_DIRECTION);
3043 }
3044 
/* Scalar fcmul_sch, unmasked, explicit rounding R: vfcmulcsh mask builtin on
 * A and B with _mm_undefined_ph() as third operand and an all-ones mask. */
#define _mm_fcmul_round_sch(A, B, R) \
  ((__m128h)__builtin_ia32_vfcmulcsh_mask( \
      (__v4sf)(__m128h)(A), \
      (__v4sf)(__m128h)(B), \
      (__v4sf)(__m128h)_mm_undefined_ph(), \
      (__mmask8)-1, (int)(R)))
3049 
/* Scalar fcmul_sch, "mask" form, explicit rounding R: vfcmulcsh mask builtin
 * on A and B with W as third operand and mask U. */
#define _mm_mask_fcmul_round_sch(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_vfcmulcsh_mask( \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \
      (__v4sf)(__m128h)(W), (__mmask8)(U), (int)(R)))
3054 
/* Scalar fcmul_sch, "maskz" form, explicit rounding R: vfcmulcsh mask builtin
 * on A and B with _mm_setzero_ph() as third operand and mask U. */
#define _mm_maskz_fcmul_round_sch(U, A, B, R) \
  ((__m128h)__builtin_ia32_vfcmulcsh_mask( \
      (__v4sf)(__m128h)(A), \
      (__v4sf)(__m128h)(B), \
      (__v4sf)(__m128h)_mm_setzero_ph(), \
      (__mmask8)(U), (int)(R)))
3059 
3060 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmul_sch(__m128h __A,
3061                                                              __m128h __B) {
3062   return (__m128h)__builtin_ia32_vfmulcsh_mask(
3063       (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1,
3064       _MM_FROUND_CUR_DIRECTION);
3065 }
3066 
3067 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmul_sch(__m128h __W,
3068                                                                   __mmask8 __U,
3069                                                                   __m128h __A,
3070                                                                   __m128h __B) {
3071   return (__m128h)__builtin_ia32_vfmulcsh_mask((__v4sf)__A, (__v4sf)__B,
3072                                                (__v4sf)__W, (__mmask8)__U,
3073                                                _MM_FROUND_CUR_DIRECTION);
3074 }
3075 
3076 static __inline__ __m128h __DEFAULT_FN_ATTRS128
3077 _mm_maskz_fmul_sch(__mmask8 __U, __m128h __A, __m128h __B) {
3078   return (__m128h)__builtin_ia32_vfmulcsh_mask(
3079       (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U,
3080       _MM_FROUND_CUR_DIRECTION);
3081 }
3082 
/* Explicit-rounding forms of the scalar fmul wrappers. Each macro expands to
 * __builtin_ia32_vfmulcsh_mask with R (cast to int) as the last argument.
 * They differ only in the third operand and the mask:
 *   plain  - undefined vector, all-ones mask
 *   mask   - W, mask U
 *   maskz  - zero vector, mask U
 */
#define _mm_fmul_round_sch(A, B, R)                                            \
  ((__m128h)__builtin_ia32_vfmulcsh_mask(                                      \
      (__v4sf)(__m128h)(A),                                                    \
      (__v4sf)(__m128h)(B),                                                    \
      (__v4sf)(__m128h)_mm_undefined_ph(),                                     \
      (__mmask8)-1,                                                            \
      (int)(R)))

#define _mm_mask_fmul_round_sch(W, U, A, B, R)                                 \
  ((__m128h)__builtin_ia32_vfmulcsh_mask(                                      \
      (__v4sf)(__m128h)(A),                                                    \
      (__v4sf)(__m128h)(B),                                                    \
      (__v4sf)(__m128h)(W),                                                    \
      (__mmask8)(U),                                                           \
      (int)(R)))

#define _mm_maskz_fmul_round_sch(U, A, B, R)                                   \
  ((__m128h)__builtin_ia32_vfmulcsh_mask(                                      \
      (__v4sf)(__m128h)(A),                                                    \
      (__v4sf)(__m128h)(B),                                                    \
      (__v4sf)(__m128h)_mm_setzero_ph(),                                       \
      (__mmask8)(U),                                                           \
      (int)(R)))
3097 
3098 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fcmul_pch(__m512h __A,
3099                                                                  __m512h __B) {
3100   return (__m512h)__builtin_ia32_vfcmulcph512_mask(
3101       (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_undefined_ph(), (__mmask16)-1,
3102       _MM_FROUND_CUR_DIRECTION);
3103 }
3104 
3105 static __inline__ __m512h __DEFAULT_FN_ATTRS512
3106 _mm512_mask_fcmul_pch(__m512h __W, __mmask16 __U, __m512h __A, __m512h __B) {
3107   return (__m512h)__builtin_ia32_vfcmulcph512_mask((__v16sf)__A, (__v16sf)__B,
3108                                                    (__v16sf)__W, (__mmask16)__U,
3109                                                    _MM_FROUND_CUR_DIRECTION);
3110 }
3111 
3112 static __inline__ __m512h __DEFAULT_FN_ATTRS512
3113 _mm512_maskz_fcmul_pch(__mmask16 __U, __m512h __A, __m512h __B) {
3114   return (__m512h)__builtin_ia32_vfcmulcph512_mask(
3115       (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ph(), (__mmask16)__U,
3116       _MM_FROUND_CUR_DIRECTION);
3117 }
3118 
/* Explicit-rounding forms of the 512-bit fcmul wrappers. Each macro expands
 * to __builtin_ia32_vfcmulcph512_mask with R (cast to int) as the last
 * argument. They differ only in the third operand and the mask:
 *   plain  - undefined vector, all-ones mask
 *   mask   - W, mask U
 *   maskz  - zero vector, mask U
 */
#define _mm512_fcmul_round_pch(A, B, R)                                        \
  ((__m512h)__builtin_ia32_vfcmulcph512_mask(                                  \
      (__v16sf)(__m512h)(A),                                                   \
      (__v16sf)(__m512h)(B),                                                   \
      (__v16sf)(__m512h)_mm512_undefined_ph(),                                 \
      (__mmask16)-1,                                                           \
      (int)(R)))

#define _mm512_mask_fcmul_round_pch(W, U, A, B, R)                             \
  ((__m512h)__builtin_ia32_vfcmulcph512_mask(                                  \
      (__v16sf)(__m512h)(A),                                                   \
      (__v16sf)(__m512h)(B),                                                   \
      (__v16sf)(__m512h)(W),                                                   \
      (__mmask16)(U),                                                          \
      (int)(R)))

#define _mm512_maskz_fcmul_round_pch(U, A, B, R)                               \
  ((__m512h)__builtin_ia32_vfcmulcph512_mask(                                  \
      (__v16sf)(__m512h)(A),                                                   \
      (__v16sf)(__m512h)(B),                                                   \
      (__v16sf)(__m512h)_mm512_setzero_ph(),                                   \
      (__mmask16)(U),                                                          \
      (int)(R)))
3133 
3134 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmul_pch(__m512h __A,
3135                                                                 __m512h __B) {
3136   return (__m512h)__builtin_ia32_vfmulcph512_mask(
3137       (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_undefined_ph(), (__mmask16)-1,
3138       _MM_FROUND_CUR_DIRECTION);
3139 }
3140 
3141 static __inline__ __m512h __DEFAULT_FN_ATTRS512
3142 _mm512_mask_fmul_pch(__m512h __W, __mmask16 __U, __m512h __A, __m512h __B) {
3143   return (__m512h)__builtin_ia32_vfmulcph512_mask((__v16sf)__A, (__v16sf)__B,
3144                                                   (__v16sf)__W, (__mmask16)__U,
3145                                                   _MM_FROUND_CUR_DIRECTION);
3146 }
3147 
3148 static __inline__ __m512h __DEFAULT_FN_ATTRS512
3149 _mm512_maskz_fmul_pch(__mmask16 __U, __m512h __A, __m512h __B) {
3150   return (__m512h)__builtin_ia32_vfmulcph512_mask(
3151       (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ph(), (__mmask16)__U,
3152       _MM_FROUND_CUR_DIRECTION);
3153 }
3154 
/* Explicit-rounding forms of the 512-bit fmul wrappers. Each macro expands to
 * __builtin_ia32_vfmulcph512_mask with R (cast to int) as the last argument.
 * They differ only in the third operand and the mask:
 *   plain  - undefined vector, all-ones mask
 *   mask   - W, mask U
 *   maskz  - zero vector, mask U
 */
#define _mm512_fmul_round_pch(A, B, R)                                         \
  ((__m512h)__builtin_ia32_vfmulcph512_mask(                                   \
      (__v16sf)(__m512h)(A),                                                   \
      (__v16sf)(__m512h)(B),                                                   \
      (__v16sf)(__m512h)_mm512_undefined_ph(),                                 \
      (__mmask16)-1,                                                           \
      (int)(R)))

#define _mm512_mask_fmul_round_pch(W, U, A, B, R)                              \
  ((__m512h)__builtin_ia32_vfmulcph512_mask(                                   \
      (__v16sf)(__m512h)(A),                                                   \
      (__v16sf)(__m512h)(B),                                                   \
      (__v16sf)(__m512h)(W),                                                   \
      (__mmask16)(U),                                                          \
      (int)(R)))

#define _mm512_maskz_fmul_round_pch(U, A, B, R)                                \
  ((__m512h)__builtin_ia32_vfmulcph512_mask(                                   \
      (__v16sf)(__m512h)(A),                                                   \
      (__v16sf)(__m512h)(B),                                                   \
      (__v16sf)(__m512h)_mm512_setzero_ph(),                                   \
      (__mmask16)(U),                                                          \
      (int)(R)))
3169 
3170 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fcmadd_pch(__m512h __A,
3171                                                                   __m512h __B,
3172                                                                   __m512h __C) {
3173   return (__m512h)__builtin_ia32_vfcmaddcph512_mask3(
3174       (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)-1,
3175       _MM_FROUND_CUR_DIRECTION);
3176 }
3177 
3178 static __inline__ __m512h __DEFAULT_FN_ATTRS512
3179 _mm512_mask_fcmadd_pch(__m512h __A, __mmask16 __U, __m512h __B, __m512h __C) {
3180   return (__m512h)__builtin_ia32_vfcmaddcph512_mask(
3181       (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
3182       _MM_FROUND_CUR_DIRECTION);
3183 }
3184 
3185 static __inline__ __m512h __DEFAULT_FN_ATTRS512
3186 _mm512_mask3_fcmadd_pch(__m512h __A, __m512h __B, __m512h __C, __mmask16 __U) {
3187   return (__m512h)__builtin_ia32_vfcmaddcph512_mask3(
3188       (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
3189       _MM_FROUND_CUR_DIRECTION);
3190 }
3191 
3192 static __inline__ __m512h __DEFAULT_FN_ATTRS512
3193 _mm512_maskz_fcmadd_pch(__mmask16 __U, __m512h __A, __m512h __B, __m512h __C) {
3194   return (__m512h)__builtin_ia32_vfcmaddcph512_maskz(
3195       (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
3196       _MM_FROUND_CUR_DIRECTION);
3197 }
3198 
/* Explicit-rounding forms of the 512-bit fcmadd wrappers. A, B and C are cast
 * to __v16sf and passed in that order, followed by the mask and R (cast to
 * int). The variants select the builtin and the mask:
 *   plain  - ..._mask3 builtin, all-ones mask
 *   mask   - ..._mask  builtin, mask U
 *   mask3  - ..._mask3 builtin, mask U
 *   maskz  - ..._maskz builtin, mask U
 */
#define _mm512_fcmadd_round_pch(A, B, C, R)                                    \
  ((__m512h)__builtin_ia32_vfcmaddcph512_mask3(                                \
      (__v16sf)(__m512h)(A),                                                   \
      (__v16sf)(__m512h)(B),                                                   \
      (__v16sf)(__m512h)(C),                                                   \
      (__mmask16)-1,                                                           \
      (int)(R)))

#define _mm512_mask_fcmadd_round_pch(A, U, B, C, R)                            \
  ((__m512h)__builtin_ia32_vfcmaddcph512_mask(                                 \
      (__v16sf)(__m512h)(A),                                                   \
      (__v16sf)(__m512h)(B),                                                   \
      (__v16sf)(__m512h)(C),                                                   \
      (__mmask16)(U),                                                          \
      (int)(R)))

#define _mm512_mask3_fcmadd_round_pch(A, B, C, U, R)                           \
  ((__m512h)__builtin_ia32_vfcmaddcph512_mask3(                                \
      (__v16sf)(__m512h)(A),                                                   \
      (__v16sf)(__m512h)(B),                                                   \
      (__v16sf)(__m512h)(C),                                                   \
      (__mmask16)(U),                                                          \
      (int)(R)))

#define _mm512_maskz_fcmadd_round_pch(U, A, B, C, R)                           \
  ((__m512h)__builtin_ia32_vfcmaddcph512_maskz(                                \
      (__v16sf)(__m512h)(A),                                                   \
      (__v16sf)(__m512h)(B),                                                   \
      (__v16sf)(__m512h)(C),                                                   \
      (__mmask16)(U),                                                          \
      (int)(R)))
3218 
3219 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmadd_pch(__m512h __A,
3220                                                                  __m512h __B,
3221                                                                  __m512h __C) {
3222   return (__m512h)__builtin_ia32_vfmaddcph512_mask3((__v16sf)__A, (__v16sf)__B,
3223                                                     (__v16sf)__C, (__mmask16)-1,
3224                                                     _MM_FROUND_CUR_DIRECTION);
3225 }
3226 
3227 static __inline__ __m512h __DEFAULT_FN_ATTRS512
3228 _mm512_mask_fmadd_pch(__m512h __A, __mmask16 __U, __m512h __B, __m512h __C) {
3229   return (__m512h)__builtin_ia32_vfmaddcph512_mask((__v16sf)__A, (__v16sf)__B,
3230                                                    (__v16sf)__C, (__mmask16)__U,
3231                                                    _MM_FROUND_CUR_DIRECTION);
3232 }
3233 
3234 static __inline__ __m512h __DEFAULT_FN_ATTRS512
3235 _mm512_mask3_fmadd_pch(__m512h __A, __m512h __B, __m512h __C, __mmask16 __U) {
3236   return (__m512h)__builtin_ia32_vfmaddcph512_mask3(
3237       (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
3238       _MM_FROUND_CUR_DIRECTION);
3239 }
3240 
3241 static __inline__ __m512h __DEFAULT_FN_ATTRS512
3242 _mm512_maskz_fmadd_pch(__mmask16 __U, __m512h __A, __m512h __B, __m512h __C) {
3243   return (__m512h)__builtin_ia32_vfmaddcph512_maskz(
3244       (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
3245       _MM_FROUND_CUR_DIRECTION);
3246 }
3247 
/* Explicit-rounding forms of the 512-bit fmadd wrappers. A, B and C are cast
 * to __v16sf and passed in that order, followed by the mask and R (cast to
 * int). The variants select the builtin and the mask:
 *   plain  - ..._mask3 builtin, all-ones mask
 *   mask   - ..._mask  builtin, mask U
 *   mask3  - ..._mask3 builtin, mask U
 *   maskz  - ..._maskz builtin, mask U
 */
#define _mm512_fmadd_round_pch(A, B, C, R)                                     \
  ((__m512h)__builtin_ia32_vfmaddcph512_mask3(                                 \
      (__v16sf)(__m512h)(A),                                                   \
      (__v16sf)(__m512h)(B),                                                   \
      (__v16sf)(__m512h)(C),                                                   \
      (__mmask16)-1,                                                           \
      (int)(R)))

#define _mm512_mask_fmadd_round_pch(A, U, B, C, R)                             \
  ((__m512h)__builtin_ia32_vfmaddcph512_mask(                                  \
      (__v16sf)(__m512h)(A),                                                   \
      (__v16sf)(__m512h)(B),                                                   \
      (__v16sf)(__m512h)(C),                                                   \
      (__mmask16)(U),                                                          \
      (int)(R)))

#define _mm512_mask3_fmadd_round_pch(A, B, C, U, R)                            \
  ((__m512h)__builtin_ia32_vfmaddcph512_mask3(                                 \
      (__v16sf)(__m512h)(A),                                                   \
      (__v16sf)(__m512h)(B),                                                   \
      (__v16sf)(__m512h)(C),                                                   \
      (__mmask16)(U),                                                          \
      (int)(R)))

#define _mm512_maskz_fmadd_round_pch(U, A, B, C, R)                            \
  ((__m512h)__builtin_ia32_vfmaddcph512_maskz(                                 \
      (__v16sf)(__m512h)(A),                                                   \
      (__v16sf)(__m512h)(B),                                                   \
      (__v16sf)(__m512h)(C),                                                   \
      (__mmask16)(U),                                                          \
      (int)(R)))
3267 
3268 static __inline__ _Float16 __DEFAULT_FN_ATTRS512
3269 _mm512_reduce_add_ph(__m512h __W) {
3270   return __builtin_ia32_reduce_fadd_ph512(-0.0f16, __W);
3271 }
3272 
3273 static __inline__ _Float16 __DEFAULT_FN_ATTRS512
3274 _mm512_reduce_mul_ph(__m512h __W) {
3275   return __builtin_ia32_reduce_fmul_ph512(1.0f16, __W);
3276 }
3277 
3278 static __inline__ _Float16 __DEFAULT_FN_ATTRS512
3279 _mm512_reduce_max_ph(__m512h __V) {
3280   return __builtin_ia32_reduce_fmax_ph512(__V);
3281 }
3282 
3283 static __inline__ _Float16 __DEFAULT_FN_ATTRS512
3284 _mm512_reduce_min_ph(__m512h __V) {
3285   return __builtin_ia32_reduce_fmin_ph512(__V);
3286 }
3287 
3288 static __inline__ __m512h __DEFAULT_FN_ATTRS512
3289 _mm512_mask_blend_ph(__mmask32 __U, __m512h __A, __m512h __W) {
3290   return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U, (__v32hf)__W,
3291                                               (__v32hf)__A);
3292 }
3293 
3294 static __inline__ __m512h __DEFAULT_FN_ATTRS512
3295 _mm512_permutex2var_ph(__m512h __A, __m512i __I, __m512h __B) {
3296   return (__m512h)__builtin_ia32_vpermi2varhi512((__v32hi)__A, (__v32hi)__I,
3297                                                  (__v32hi)__B);
3298 }
3299 
3300 static __inline__ __m512h __DEFAULT_FN_ATTRS512
3301 _mm512_permutexvar_ph(__m512i __A, __m512h __B) {
3302   return (__m512h)__builtin_ia32_permvarhi512((__v32hi)__B, (__v32hi)__A);
3303 }
3304 
// The intrinsics below are aliases for the f*mul_*ch intrinsics: every
// *_mul_* name expands to the matching *_fmul_* intrinsic and every *_cmul_*
// name to the matching *_fcmul_* intrinsic, with all arguments forwarded
// unchanged.

// 512-bit packed: mul -> fmul
#define _mm512_mul_pch(A, B) _mm512_fmul_pch(A, B)
#define _mm512_mask_mul_pch(W, U, A, B) _mm512_mask_fmul_pch(W, U, A, B)
#define _mm512_maskz_mul_pch(U, A, B) _mm512_maskz_fmul_pch(U, A, B)
#define _mm512_mul_round_pch(A, B, R) _mm512_fmul_round_pch(A, B, R)
#define _mm512_mask_mul_round_pch(W, U, A, B, R)                               \
  _mm512_mask_fmul_round_pch(W, U, A, B, R)
#define _mm512_maskz_mul_round_pch(U, A, B, R)                                 \
  _mm512_maskz_fmul_round_pch(U, A, B, R)

// 512-bit packed: cmul -> fcmul
#define _mm512_cmul_pch(A, B) _mm512_fcmul_pch(A, B)
#define _mm512_mask_cmul_pch(W, U, A, B) _mm512_mask_fcmul_pch(W, U, A, B)
#define _mm512_maskz_cmul_pch(U, A, B) _mm512_maskz_fcmul_pch(U, A, B)
#define _mm512_cmul_round_pch(A, B, R) _mm512_fcmul_round_pch(A, B, R)
#define _mm512_mask_cmul_round_pch(W, U, A, B, R)                              \
  _mm512_mask_fcmul_round_pch(W, U, A, B, R)
#define _mm512_maskz_cmul_round_pch(U, A, B, R)                                \
  _mm512_maskz_fcmul_round_pch(U, A, B, R)

// 128-bit scalar: mul -> fmul
#define _mm_mul_sch(A, B) _mm_fmul_sch(A, B)
#define _mm_mask_mul_sch(W, U, A, B) _mm_mask_fmul_sch(W, U, A, B)
#define _mm_maskz_mul_sch(U, A, B) _mm_maskz_fmul_sch(U, A, B)
#define _mm_mul_round_sch(A, B, R) _mm_fmul_round_sch(A, B, R)
#define _mm_mask_mul_round_sch(W, U, A, B, R)                                  \
  _mm_mask_fmul_round_sch(W, U, A, B, R)
#define _mm_maskz_mul_round_sch(U, A, B, R)                                    \
  _mm_maskz_fmul_round_sch(U, A, B, R)

// 128-bit scalar: cmul -> fcmul
#define _mm_cmul_sch(A, B) _mm_fcmul_sch(A, B)
#define _mm_mask_cmul_sch(W, U, A, B) _mm_mask_fcmul_sch(W, U, A, B)
#define _mm_maskz_cmul_sch(U, A, B) _mm_maskz_fcmul_sch(U, A, B)
#define _mm_cmul_round_sch(A, B, R) _mm_fcmul_round_sch(A, B, R)
#define _mm_mask_cmul_round_sch(W, U, A, B, R)                                 \
  _mm_mask_fcmul_round_sch(W, U, A, B, R)
#define _mm_maskz_cmul_round_sch(U, A, B, R)                                   \
  _mm_maskz_fcmul_round_sch(U, A, B, R)
3340 
3341 #undef __DEFAULT_FN_ATTRS128
3342 #undef __DEFAULT_FN_ATTRS256
3343 #undef __DEFAULT_FN_ATTRS512
3344 
3345 #endif
3346 #endif
3347