xref: /freebsd/contrib/llvm-project/clang/lib/Headers/avx512fp16intrin.h (revision 3a56015a2f5d630910177fa79a522bb95511ccf7)
1 /*===----------- avx512fp16intrin.h - AVX512-FP16 intrinsics ---------------===
2  *
3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4  * See https://llvm.org/LICENSE.txt for license information.
5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6  *
7  *===-----------------------------------------------------------------------===
8  */
9 #ifndef __IMMINTRIN_H
10 #error "Never use <avx512fp16intrin.h> directly; include <immintrin.h> instead."
11 #endif
12 
13 #ifdef __SSE2__
14 
15 #ifndef __AVX512FP16INTRIN_H
16 #define __AVX512FP16INTRIN_H
17 
/* Define the 512-bit half-precision vector types used in this file. */
19 typedef _Float16 __v32hf __attribute__((__vector_size__(64), __aligned__(64)));
20 typedef _Float16 __m512h __attribute__((__vector_size__(64), __aligned__(64)));
21 typedef _Float16 __m512h_u __attribute__((__vector_size__(64), __aligned__(1)));
22 
/* Default attributes for the functions in this file.
 *
 * All three variants are always-inline, nodebug and target avx512fp16. The
 * 512-bit variant additionally enables evex512; the 256- and 128-bit variants
 * use no-evex512. Each declares the matching minimum vector width. */
#define __DEFAULT_FN_ATTRS512 \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("avx512fp16,evex512"), \
                 __min_vector_width__(512)))
#define __DEFAULT_FN_ATTRS256 \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("avx512fp16,no-evex512"), \
                 __min_vector_width__(256)))
#define __DEFAULT_FN_ATTRS128 \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("avx512fp16,no-evex512"), \
                 __min_vector_width__(128)))
35 
36 static __inline__ _Float16 __DEFAULT_FN_ATTRS512 _mm512_cvtsh_h(__m512h __a) {
37   return __a[0];
38 }
39 
40 static __inline __m128h __DEFAULT_FN_ATTRS128 _mm_setzero_ph(void) {
41   return (__m128h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
42 }
43 
44 static __inline __m256h __DEFAULT_FN_ATTRS256 _mm256_setzero_ph(void) {
45   return (__m256h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
46                    0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
47 }
48 
49 static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_undefined_ph(void) {
50   return (__m256h)__builtin_ia32_undef256();
51 }
52 
53 static __inline __m512h __DEFAULT_FN_ATTRS512 _mm512_setzero_ph(void) {
54   return (__m512h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
55                    0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
56                    0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
57 }
58 
59 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_undefined_ph(void) {
60   return (__m128h)__builtin_ia32_undef128();
61 }
62 
63 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_undefined_ph(void) {
64   return (__m512h)__builtin_ia32_undef512();
65 }
66 
67 static __inline __m512h __DEFAULT_FN_ATTRS512 _mm512_set1_ph(_Float16 __h) {
68   return (__m512h)(__v32hf){__h, __h, __h, __h, __h, __h, __h, __h,
69                             __h, __h, __h, __h, __h, __h, __h, __h,
70                             __h, __h, __h, __h, __h, __h, __h, __h,
71                             __h, __h, __h, __h, __h, __h, __h, __h};
72 }
73 
74 static __inline __m512h __DEFAULT_FN_ATTRS512
75 _mm512_set_ph(_Float16 __h1, _Float16 __h2, _Float16 __h3, _Float16 __h4,
76               _Float16 __h5, _Float16 __h6, _Float16 __h7, _Float16 __h8,
77               _Float16 __h9, _Float16 __h10, _Float16 __h11, _Float16 __h12,
78               _Float16 __h13, _Float16 __h14, _Float16 __h15, _Float16 __h16,
79               _Float16 __h17, _Float16 __h18, _Float16 __h19, _Float16 __h20,
80               _Float16 __h21, _Float16 __h22, _Float16 __h23, _Float16 __h24,
81               _Float16 __h25, _Float16 __h26, _Float16 __h27, _Float16 __h28,
82               _Float16 __h29, _Float16 __h30, _Float16 __h31, _Float16 __h32) {
83   return (__m512h)(__v32hf){__h32, __h31, __h30, __h29, __h28, __h27, __h26,
84                             __h25, __h24, __h23, __h22, __h21, __h20, __h19,
85                             __h18, __h17, __h16, __h15, __h14, __h13, __h12,
86                             __h11, __h10, __h9,  __h8,  __h7,  __h6,  __h5,
87                             __h4,  __h3,  __h2,  __h1};
88 }
89 
/* Reversed-order form of _mm512_set_ph: forwards its arguments to
 * _mm512_set_ph in the opposite order, so h1 ends up in element 0. */
#define _mm512_setr_ph(h1, h2, h3, h4, h5, h6, h7, h8, \
                       h9, h10, h11, h12, h13, h14, h15, h16, \
                       h17, h18, h19, h20, h21, h22, h23, h24, \
                       h25, h26, h27, h28, h29, h30, h31, h32) \
  _mm512_set_ph((h32), (h31), (h30), (h29), (h28), (h27), (h26), (h25), \
                (h24), (h23), (h22), (h21), (h20), (h19), (h18), (h17), \
                (h16), (h15), (h14), (h13), (h12), (h11), (h10), (h9), \
                (h8), (h7), (h6), (h5), (h4), (h3), (h2), (h1))
97 
98 static __inline __m512h __DEFAULT_FN_ATTRS512
99 _mm512_set1_pch(_Float16 _Complex __h) {
100   return (__m512h)_mm512_set1_ps(__builtin_bit_cast(float, __h));
101 }
102 
103 static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_castph_ps(__m128h __a) {
104   return (__m128)__a;
105 }
106 
107 static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_castph_ps(__m256h __a) {
108   return (__m256)__a;
109 }
110 
111 static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_castph_ps(__m512h __a) {
112   return (__m512)__a;
113 }
114 
115 static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_castph_pd(__m128h __a) {
116   return (__m128d)__a;
117 }
118 
119 static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_castph_pd(__m256h __a) {
120   return (__m256d)__a;
121 }
122 
123 static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_castph_pd(__m512h __a) {
124   return (__m512d)__a;
125 }
126 
127 static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_castph_si128(__m128h __a) {
128   return (__m128i)__a;
129 }
130 
131 static __inline__ __m256i __DEFAULT_FN_ATTRS256
132 _mm256_castph_si256(__m256h __a) {
133   return (__m256i)__a;
134 }
135 
136 static __inline__ __m512i __DEFAULT_FN_ATTRS512
137 _mm512_castph_si512(__m512h __a) {
138   return (__m512i)__a;
139 }
140 
141 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_castps_ph(__m128 __a) {
142   return (__m128h)__a;
143 }
144 
145 static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_castps_ph(__m256 __a) {
146   return (__m256h)__a;
147 }
148 
149 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_castps_ph(__m512 __a) {
150   return (__m512h)__a;
151 }
152 
153 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_castpd_ph(__m128d __a) {
154   return (__m128h)__a;
155 }
156 
157 static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_castpd_ph(__m256d __a) {
158   return (__m256h)__a;
159 }
160 
161 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_castpd_ph(__m512d __a) {
162   return (__m512h)__a;
163 }
164 
165 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_castsi128_ph(__m128i __a) {
166   return (__m128h)__a;
167 }
168 
169 static __inline__ __m256h __DEFAULT_FN_ATTRS256
170 _mm256_castsi256_ph(__m256i __a) {
171   return (__m256h)__a;
172 }
173 
174 static __inline__ __m512h __DEFAULT_FN_ATTRS512
175 _mm512_castsi512_ph(__m512i __a) {
176   return (__m512h)__a;
177 }
178 
179 static __inline__ __m128h __DEFAULT_FN_ATTRS256
180 _mm256_castph256_ph128(__m256h __a) {
181   return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7);
182 }
183 
184 static __inline__ __m128h __DEFAULT_FN_ATTRS512
185 _mm512_castph512_ph128(__m512h __a) {
186   return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7);
187 }
188 
189 static __inline__ __m256h __DEFAULT_FN_ATTRS512
190 _mm512_castph512_ph256(__m512h __a) {
191   return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
192                                  12, 13, 14, 15);
193 }
194 
195 static __inline__ __m256h __DEFAULT_FN_ATTRS256
196 _mm256_castph128_ph256(__m128h __a) {
197   return __builtin_shufflevector(__a, __builtin_nondeterministic_value(__a),
198                                   0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
199 }
200 
201 static __inline__ __m512h __DEFAULT_FN_ATTRS512
202 _mm512_castph128_ph512(__m128h __a) {
203   __m256h __b = __builtin_nondeterministic_value(__b);
204   return __builtin_shufflevector(
205       __builtin_shufflevector(__a, __builtin_nondeterministic_value(__a),
206                               0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15),
207       __b, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
208       20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
209 }
210 
211 static __inline__ __m512h __DEFAULT_FN_ATTRS512
212 _mm512_castph256_ph512(__m256h __a) {
213   return __builtin_shufflevector(__a, __builtin_nondeterministic_value(__a), 0,
214                                  1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
215                                  15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
216                                  27, 28, 29, 30, 31);
217 }
218 
/// Constructs a 256-bit floating-point vector of [16 x half] from a
///    128-bit floating-point vector of [8 x half]. The lower 128 bits
///    contain the value of the source vector. The upper 128 bits are set
///    to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic has no corresponding instruction.
///
/// \param __a
///    A 128-bit vector of [8 x half].
/// \returns A 256-bit floating-point vector of [16 x half]. The lower 128 bits
///    contain the value of the parameter. The upper 128 bits are set to zero.
232 static __inline__ __m256h __DEFAULT_FN_ATTRS256
233 _mm256_zextph128_ph256(__m128h __a) {
234   return __builtin_shufflevector(__a, (__v8hf)_mm_setzero_ph(), 0, 1, 2, 3, 4,
235                                  5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
236 }
237 
238 /// Constructs a 512-bit floating-point vector of [32 x half] from a
239 ///    128-bit floating-point vector of [8 x half]. The lower 128 bits
240 ///    contain the value of the source vector. The upper 384 bits are set
241 ///    to zero.
242 ///
243 /// \headerfile <x86intrin.h>
244 ///
245 /// This intrinsic has no corresponding instruction.
246 ///
247 /// \param __a
248 ///    A 128-bit vector of [8 x half].
249 /// \returns A 512-bit floating-point vector of [32 x half]. The lower 128 bits
250 ///    contain the value of the parameter. The upper 384 bits are set to zero.
251 static __inline__ __m512h __DEFAULT_FN_ATTRS512
252 _mm512_zextph128_ph512(__m128h __a) {
253   return __builtin_shufflevector(
254       __a, (__v8hf)_mm_setzero_ph(), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
255       13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15);
256 }
257 
258 /// Constructs a 512-bit floating-point vector of [32 x half] from a
259 ///    256-bit floating-point vector of [16 x half]. The lower 256 bits
260 ///    contain the value of the source vector. The upper 256 bits are set
261 ///    to zero.
262 ///
263 /// \headerfile <x86intrin.h>
264 ///
265 /// This intrinsic has no corresponding instruction.
266 ///
267 /// \param __a
268 ///    A 256-bit vector of [16 x half].
269 /// \returns A 512-bit floating-point vector of [32 x half]. The lower 256 bits
270 ///    contain the value of the parameter. The upper 256 bits are set to zero.
271 static __inline__ __m512h __DEFAULT_FN_ATTRS512
272 _mm512_zextph256_ph512(__m256h __a) {
273   return __builtin_shufflevector(__a, (__v16hf)_mm256_setzero_ph(), 0, 1, 2, 3,
274                                  4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
275                                  17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
276                                  29, 30, 31);
277 }
278 
/* Scalar half-precision compare. Forwards A and B to __builtin_ia32_vcomish
 * with comparison predicate P and rounding argument R, and yields the
 * builtin's int result.
 *
 * Every argument is parenthesized before it is cast, and the whole expansion
 * is wrapped in parentheses. This lets callers pass arbitrary expressions
 * (for example a conditional expression) without the cast binding to only
 * part of the argument, and matches the other macros in this file. */
#define _mm_comi_round_sh(A, B, P, R)                                          \
  (__builtin_ia32_vcomish((__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B),          \
                          (int)(P), (int)(R)))

/* Same compare using the current rounding direction. */
#define _mm_comi_sh(A, B, pred)                                                \
  _mm_comi_round_sh((A), (B), (pred), _MM_FROUND_CUR_DIRECTION)
284 
285 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comieq_sh(__m128h __A,
286                                                           __m128h __B) {
287   return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_EQ_OS,
288                                 _MM_FROUND_CUR_DIRECTION);
289 }
290 
291 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comilt_sh(__m128h __A,
292                                                           __m128h __B) {
293   return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_LT_OS,
294                                 _MM_FROUND_CUR_DIRECTION);
295 }
296 
297 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comile_sh(__m128h __A,
298                                                           __m128h __B) {
299   return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_LE_OS,
300                                 _MM_FROUND_CUR_DIRECTION);
301 }
302 
303 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comigt_sh(__m128h __A,
304                                                           __m128h __B) {
305   return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_GT_OS,
306                                 _MM_FROUND_CUR_DIRECTION);
307 }
308 
309 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comige_sh(__m128h __A,
310                                                           __m128h __B) {
311   return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_GE_OS,
312                                 _MM_FROUND_CUR_DIRECTION);
313 }
314 
315 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comineq_sh(__m128h __A,
316                                                            __m128h __B) {
317   return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_NEQ_US,
318                                 _MM_FROUND_CUR_DIRECTION);
319 }
320 
321 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomieq_sh(__m128h __A,
322                                                            __m128h __B) {
323   return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_EQ_OQ,
324                                 _MM_FROUND_CUR_DIRECTION);
325 }
326 
327 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomilt_sh(__m128h __A,
328                                                            __m128h __B) {
329   return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_LT_OQ,
330                                 _MM_FROUND_CUR_DIRECTION);
331 }
332 
333 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomile_sh(__m128h __A,
334                                                            __m128h __B) {
335   return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_LE_OQ,
336                                 _MM_FROUND_CUR_DIRECTION);
337 }
338 
339 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomigt_sh(__m128h __A,
340                                                            __m128h __B) {
341   return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_GT_OQ,
342                                 _MM_FROUND_CUR_DIRECTION);
343 }
344 
345 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomige_sh(__m128h __A,
346                                                            __m128h __B) {
347   return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_GE_OQ,
348                                 _MM_FROUND_CUR_DIRECTION);
349 }
350 
351 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomineq_sh(__m128h __A,
352                                                             __m128h __B) {
353   return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_NEQ_UQ,
354                                 _MM_FROUND_CUR_DIRECTION);
355 }
356 
357 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_add_ph(__m512h __A,
358                                                               __m512h __B) {
359   return (__m512h)((__v32hf)__A + (__v32hf)__B);
360 }
361 
362 static __inline__ __m512h __DEFAULT_FN_ATTRS512
363 _mm512_mask_add_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
364   return (__m512h)__builtin_ia32_selectph_512(
365       (__mmask32)__U, (__v32hf)_mm512_add_ph(__A, __B), (__v32hf)__W);
366 }
367 
368 static __inline__ __m512h __DEFAULT_FN_ATTRS512
369 _mm512_maskz_add_ph(__mmask32 __U, __m512h __A, __m512h __B) {
370   return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
371                                               (__v32hf)_mm512_add_ph(__A, __B),
372                                               (__v32hf)_mm512_setzero_ph());
373 }
374 
/* Add with an explicit rounding argument R, forwarded to
 * __builtin_ia32_addph512. */
#define _mm512_add_round_ph(A, B, R) \
  ((__m512h)__builtin_ia32_addph512( \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (int)(R)))

/* Masked form: selects between the rounded sum and W under mask U. */
#define _mm512_mask_add_round_ph(W, U, A, B, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), \
      (__v32hf)_mm512_add_round_ph((A), (B), (R)), \
      (__v32hf)(__m512h)(W)))

/* Zero-masked form: selects between the rounded sum and zero under mask U. */
#define _mm512_maskz_add_round_ph(U, A, B, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), \
      (__v32hf)_mm512_add_round_ph((A), (B), (R)), \
      (__v32hf)_mm512_setzero_ph()))
388 
389 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_sub_ph(__m512h __A,
390                                                               __m512h __B) {
391   return (__m512h)((__v32hf)__A - (__v32hf)__B);
392 }
393 
394 static __inline__ __m512h __DEFAULT_FN_ATTRS512
395 _mm512_mask_sub_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
396   return (__m512h)__builtin_ia32_selectph_512(
397       (__mmask32)__U, (__v32hf)_mm512_sub_ph(__A, __B), (__v32hf)__W);
398 }
399 
400 static __inline__ __m512h __DEFAULT_FN_ATTRS512
401 _mm512_maskz_sub_ph(__mmask32 __U, __m512h __A, __m512h __B) {
402   return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
403                                               (__v32hf)_mm512_sub_ph(__A, __B),
404                                               (__v32hf)_mm512_setzero_ph());
405 }
406 
/* Subtract with an explicit rounding argument R, forwarded to
 * __builtin_ia32_subph512. */
#define _mm512_sub_round_ph(A, B, R) \
  ((__m512h)__builtin_ia32_subph512( \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (int)(R)))

/* Masked form: selects between the rounded difference and W under mask U. */
#define _mm512_mask_sub_round_ph(W, U, A, B, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), \
      (__v32hf)_mm512_sub_round_ph((A), (B), (R)), \
      (__v32hf)(__m512h)(W)))

/* Zero-masked form: selects between the rounded difference and zero under
 * mask U. */
#define _mm512_maskz_sub_round_ph(U, A, B, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), \
      (__v32hf)_mm512_sub_round_ph((A), (B), (R)), \
      (__v32hf)_mm512_setzero_ph()))
420 
421 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_mul_ph(__m512h __A,
422                                                               __m512h __B) {
423   return (__m512h)((__v32hf)__A * (__v32hf)__B);
424 }
425 
426 static __inline__ __m512h __DEFAULT_FN_ATTRS512
427 _mm512_mask_mul_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
428   return (__m512h)__builtin_ia32_selectph_512(
429       (__mmask32)__U, (__v32hf)_mm512_mul_ph(__A, __B), (__v32hf)__W);
430 }
431 
432 static __inline__ __m512h __DEFAULT_FN_ATTRS512
433 _mm512_maskz_mul_ph(__mmask32 __U, __m512h __A, __m512h __B) {
434   return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
435                                               (__v32hf)_mm512_mul_ph(__A, __B),
436                                               (__v32hf)_mm512_setzero_ph());
437 }
438 
/* Multiply with an explicit rounding argument R, forwarded to
 * __builtin_ia32_mulph512. */
#define _mm512_mul_round_ph(A, B, R) \
  ((__m512h)__builtin_ia32_mulph512( \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (int)(R)))

/* Masked form: selects between the rounded product and W under mask U. */
#define _mm512_mask_mul_round_ph(W, U, A, B, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), \
      (__v32hf)_mm512_mul_round_ph((A), (B), (R)), \
      (__v32hf)(__m512h)(W)))

/* Zero-masked form: selects between the rounded product and zero under
 * mask U. */
#define _mm512_maskz_mul_round_ph(U, A, B, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), \
      (__v32hf)_mm512_mul_round_ph((A), (B), (R)), \
      (__v32hf)_mm512_setzero_ph()))
452 
453 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_div_ph(__m512h __A,
454                                                               __m512h __B) {
455   return (__m512h)((__v32hf)__A / (__v32hf)__B);
456 }
457 
458 static __inline__ __m512h __DEFAULT_FN_ATTRS512
459 _mm512_mask_div_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
460   return (__m512h)__builtin_ia32_selectph_512(
461       (__mmask32)__U, (__v32hf)_mm512_div_ph(__A, __B), (__v32hf)__W);
462 }
463 
464 static __inline__ __m512h __DEFAULT_FN_ATTRS512
465 _mm512_maskz_div_ph(__mmask32 __U, __m512h __A, __m512h __B) {
466   return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
467                                               (__v32hf)_mm512_div_ph(__A, __B),
468                                               (__v32hf)_mm512_setzero_ph());
469 }
470 
/* Divide with an explicit rounding argument R, forwarded to
 * __builtin_ia32_divph512. */
#define _mm512_div_round_ph(A, B, R) \
  ((__m512h)__builtin_ia32_divph512( \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (int)(R)))

/* Masked form: selects between the rounded quotient and W under mask U. */
#define _mm512_mask_div_round_ph(W, U, A, B, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), \
      (__v32hf)_mm512_div_round_ph((A), (B), (R)), \
      (__v32hf)(__m512h)(W)))

/* Zero-masked form: selects between the rounded quotient and zero under
 * mask U. */
#define _mm512_maskz_div_round_ph(U, A, B, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), \
      (__v32hf)_mm512_div_round_ph((A), (B), (R)), \
      (__v32hf)_mm512_setzero_ph()))
484 
485 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_min_ph(__m512h __A,
486                                                               __m512h __B) {
487   return (__m512h)__builtin_ia32_minph512((__v32hf)__A, (__v32hf)__B,
488                                           _MM_FROUND_CUR_DIRECTION);
489 }
490 
491 static __inline__ __m512h __DEFAULT_FN_ATTRS512
492 _mm512_mask_min_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
493   return (__m512h)__builtin_ia32_selectph_512(
494       (__mmask32)__U, (__v32hf)_mm512_min_ph(__A, __B), (__v32hf)__W);
495 }
496 
497 static __inline__ __m512h __DEFAULT_FN_ATTRS512
498 _mm512_maskz_min_ph(__mmask32 __U, __m512h __A, __m512h __B) {
499   return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
500                                               (__v32hf)_mm512_min_ph(__A, __B),
501                                               (__v32hf)_mm512_setzero_ph());
502 }
503 
/* Minimum with an explicit third argument R, forwarded to
 * __builtin_ia32_minph512. */
#define _mm512_min_round_ph(A, B, R) \
  ((__m512h)__builtin_ia32_minph512( \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (int)(R)))

/* Masked form: selects between the minimum and W under mask U. */
#define _mm512_mask_min_round_ph(W, U, A, B, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), \
      (__v32hf)_mm512_min_round_ph((A), (B), (R)), \
      (__v32hf)(__m512h)(W)))

/* Zero-masked form: selects between the minimum and zero under mask U. */
#define _mm512_maskz_min_round_ph(U, A, B, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), \
      (__v32hf)_mm512_min_round_ph((A), (B), (R)), \
      (__v32hf)_mm512_setzero_ph()))
517 
518 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_max_ph(__m512h __A,
519                                                               __m512h __B) {
520   return (__m512h)__builtin_ia32_maxph512((__v32hf)__A, (__v32hf)__B,
521                                           _MM_FROUND_CUR_DIRECTION);
522 }
523 
524 static __inline__ __m512h __DEFAULT_FN_ATTRS512
525 _mm512_mask_max_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
526   return (__m512h)__builtin_ia32_selectph_512(
527       (__mmask32)__U, (__v32hf)_mm512_max_ph(__A, __B), (__v32hf)__W);
528 }
529 
530 static __inline__ __m512h __DEFAULT_FN_ATTRS512
531 _mm512_maskz_max_ph(__mmask32 __U, __m512h __A, __m512h __B) {
532   return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
533                                               (__v32hf)_mm512_max_ph(__A, __B),
534                                               (__v32hf)_mm512_setzero_ph());
535 }
536 
/* Maximum with an explicit third argument R, forwarded to
 * __builtin_ia32_maxph512. */
#define _mm512_max_round_ph(A, B, R) \
  ((__m512h)__builtin_ia32_maxph512( \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (int)(R)))

/* Masked form: selects between the maximum and W under mask U. */
#define _mm512_mask_max_round_ph(W, U, A, B, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), \
      (__v32hf)_mm512_max_round_ph((A), (B), (R)), \
      (__v32hf)(__m512h)(W)))

/* Zero-masked form: selects between the maximum and zero under mask U. */
#define _mm512_maskz_max_round_ph(U, A, B, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), \
      (__v32hf)_mm512_max_round_ph((A), (B), (R)), \
      (__v32hf)_mm512_setzero_ph()))
550 
551 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_abs_ph(__m512h __A) {
552   return (__m512h)_mm512_and_epi32(_mm512_set1_epi32(0x7FFF7FFF), (__m512i)__A);
553 }
554 
555 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_conj_pch(__m512h __A) {
556   return (__m512h)_mm512_xor_ps((__m512)__A, _mm512_set1_ps(-0.0f));
557 }
558 
559 static __inline__ __m512h __DEFAULT_FN_ATTRS512
560 _mm512_mask_conj_pch(__m512h __W, __mmask16 __U, __m512h __A) {
561   return (__m512h)__builtin_ia32_selectps_512(
562       (__mmask16)__U, (__v16sf)_mm512_conj_pch(__A), (__v16sf)__W);
563 }
564 
565 static __inline__ __m512h __DEFAULT_FN_ATTRS512
566 _mm512_maskz_conj_pch(__mmask16 __U, __m512h __A) {
567   return (__m512h)__builtin_ia32_selectps_512((__mmask16)__U,
568                                               (__v16sf)_mm512_conj_pch(__A),
569                                               (__v16sf)_mm512_setzero_ps());
570 }
571 
572 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_add_sh(__m128h __A,
573                                                            __m128h __B) {
574   __A[0] += __B[0];
575   return __A;
576 }
577 
578 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_add_sh(__m128h __W,
579                                                                 __mmask8 __U,
580                                                                 __m128h __A,
581                                                                 __m128h __B) {
582   __A = _mm_add_sh(__A, __B);
583   return __builtin_ia32_selectsh_128(__U, __A, __W);
584 }
585 
586 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_add_sh(__mmask8 __U,
587                                                                  __m128h __A,
588                                                                  __m128h __B) {
589   __A = _mm_add_sh(__A, __B);
590   return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
591 }
592 
/* Scalar add with explicit rounding argument R. Calls
 * __builtin_ia32_addsh_round_mask with a zero pass-through vector and an
 * all-ones mask. */
#define _mm_add_round_sh(A, B, R) \
  ((__m128h)__builtin_ia32_addsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, \
      (int)(R)))

/* Masked form: W is the pass-through vector and U the mask. */
#define _mm_mask_add_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_addsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)(__m128h)(W), \
      (__mmask8)(U), \
      (int)(R)))

/* Zero-masked form: a zero vector is the pass-through and U the mask. */
#define _mm_maskz_add_round_sh(U, A, B, R) \
  ((__m128h)__builtin_ia32_addsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), \
      (int)(R)))
607 
608 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sub_sh(__m128h __A,
609                                                            __m128h __B) {
610   __A[0] -= __B[0];
611   return __A;
612 }
613 
614 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sub_sh(__m128h __W,
615                                                                 __mmask8 __U,
616                                                                 __m128h __A,
617                                                                 __m128h __B) {
618   __A = _mm_sub_sh(__A, __B);
619   return __builtin_ia32_selectsh_128(__U, __A, __W);
620 }
621 
622 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sub_sh(__mmask8 __U,
623                                                                  __m128h __A,
624                                                                  __m128h __B) {
625   __A = _mm_sub_sh(__A, __B);
626   return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
627 }
628 
/* Scalar subtract with explicit rounding argument R. Calls
 * __builtin_ia32_subsh_round_mask with a zero pass-through vector and an
 * all-ones mask. */
#define _mm_sub_round_sh(A, B, R) \
  ((__m128h)__builtin_ia32_subsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, \
      (int)(R)))

/* Masked form: W is the pass-through vector and U the mask. */
#define _mm_mask_sub_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_subsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)(__m128h)(W), \
      (__mmask8)(U), \
      (int)(R)))

/* Zero-masked form: a zero vector is the pass-through and U the mask. */
#define _mm_maskz_sub_round_sh(U, A, B, R) \
  ((__m128h)__builtin_ia32_subsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), \
      (int)(R)))
643 
644 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mul_sh(__m128h __A,
645                                                            __m128h __B) {
646   __A[0] *= __B[0];
647   return __A;
648 }
649 
650 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_mul_sh(__m128h __W,
651                                                                 __mmask8 __U,
652                                                                 __m128h __A,
653                                                                 __m128h __B) {
654   __A = _mm_mul_sh(__A, __B);
655   return __builtin_ia32_selectsh_128(__U, __A, __W);
656 }
657 
658 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_mul_sh(__mmask8 __U,
659                                                                  __m128h __A,
660                                                                  __m128h __B) {
661   __A = _mm_mul_sh(__A, __B);
662   return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
663 }
664 
// Scalar half-precision multiply macros that take an explicit rounding
// argument R. All three expand to __builtin_ia32_mulsh_round_mask on A and B
// and differ only in the passthrough vector and mask they pass along.

// Unmasked: zero passthrough, all-ones mask.
#define _mm_mul_round_sh(A, B, R) \
  ((__m128h)__builtin_ia32_mulsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, \
      (int)(R)))

// Merge-masked: W is the passthrough, U the mask.
#define _mm_mask_mul_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_mulsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)(__m128h)(W), \
      (__mmask8)(U), \
      (int)(R)))

// Zero-masked: zero passthrough, U the mask.
#define _mm_maskz_mul_round_sh(U, A, B, R) \
  ((__m128h)__builtin_ia32_mulsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), \
      (int)(R)))
679 
680 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_div_sh(__m128h __A,
681                                                            __m128h __B) {
682   __A[0] /= __B[0];
683   return __A;
684 }
685 
686 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_div_sh(__m128h __W,
687                                                                 __mmask8 __U,
688                                                                 __m128h __A,
689                                                                 __m128h __B) {
690   __A = _mm_div_sh(__A, __B);
691   return __builtin_ia32_selectsh_128(__U, __A, __W);
692 }
693 
694 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_div_sh(__mmask8 __U,
695                                                                  __m128h __A,
696                                                                  __m128h __B) {
697   __A = _mm_div_sh(__A, __B);
698   return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
699 }
700 
// Scalar half-precision divide macros that take an explicit rounding
// argument R. All three expand to __builtin_ia32_divsh_round_mask on A and B
// and differ only in the passthrough vector and mask they pass along.

// Unmasked: zero passthrough, all-ones mask.
#define _mm_div_round_sh(A, B, R) \
  ((__m128h)__builtin_ia32_divsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, \
      (int)(R)))

// Merge-masked: W is the passthrough, U the mask.
#define _mm_mask_div_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_divsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)(__m128h)(W), \
      (__mmask8)(U), \
      (int)(R)))

// Zero-masked: zero passthrough, U the mask.
#define _mm_maskz_div_round_sh(U, A, B, R) \
  ((__m128h)__builtin_ia32_divsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), \
      (int)(R)))
715 
716 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_min_sh(__m128h __A,
717                                                            __m128h __B) {
718   return (__m128h)__builtin_ia32_minsh_round_mask(
719       (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
720       _MM_FROUND_CUR_DIRECTION);
721 }
722 
723 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_min_sh(__m128h __W,
724                                                                 __mmask8 __U,
725                                                                 __m128h __A,
726                                                                 __m128h __B) {
727   return (__m128h)__builtin_ia32_minsh_round_mask((__v8hf)__A, (__v8hf)__B,
728                                                   (__v8hf)__W, (__mmask8)__U,
729                                                   _MM_FROUND_CUR_DIRECTION);
730 }
731 
732 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_min_sh(__mmask8 __U,
733                                                                  __m128h __A,
734                                                                  __m128h __B) {
735   return (__m128h)__builtin_ia32_minsh_round_mask(
736       (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
737       _MM_FROUND_CUR_DIRECTION);
738 }
739 
// Scalar half-precision minimum macros that take an explicit argument R.
// All three expand to __builtin_ia32_minsh_round_mask on A and B and differ
// only in the passthrough vector and mask they pass along.

// Unmasked: zero passthrough, all-ones mask.
#define _mm_min_round_sh(A, B, R) \
  ((__m128h)__builtin_ia32_minsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, \
      (int)(R)))

// Merge-masked: W is the passthrough, U the mask.
#define _mm_mask_min_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_minsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)(__m128h)(W), \
      (__mmask8)(U), \
      (int)(R)))

// Zero-masked: zero passthrough, U the mask.
#define _mm_maskz_min_round_sh(U, A, B, R) \
  ((__m128h)__builtin_ia32_minsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), \
      (int)(R)))
754 
755 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_max_sh(__m128h __A,
756                                                            __m128h __B) {
757   return (__m128h)__builtin_ia32_maxsh_round_mask(
758       (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
759       _MM_FROUND_CUR_DIRECTION);
760 }
761 
762 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_max_sh(__m128h __W,
763                                                                 __mmask8 __U,
764                                                                 __m128h __A,
765                                                                 __m128h __B) {
766   return (__m128h)__builtin_ia32_maxsh_round_mask((__v8hf)__A, (__v8hf)__B,
767                                                   (__v8hf)__W, (__mmask8)__U,
768                                                   _MM_FROUND_CUR_DIRECTION);
769 }
770 
771 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_max_sh(__mmask8 __U,
772                                                                  __m128h __A,
773                                                                  __m128h __B) {
774   return (__m128h)__builtin_ia32_maxsh_round_mask(
775       (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
776       _MM_FROUND_CUR_DIRECTION);
777 }
778 
// Scalar half-precision maximum macros that take an explicit argument R.
// All three expand to __builtin_ia32_maxsh_round_mask on A and B and differ
// only in the passthrough vector and mask they pass along.

// Unmasked: zero passthrough, all-ones mask.
#define _mm_max_round_sh(A, B, R) \
  ((__m128h)__builtin_ia32_maxsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, \
      (int)(R)))

// Merge-masked: W is the passthrough, U the mask.
#define _mm_mask_max_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_maxsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)(__m128h)(W), \
      (__mmask8)(U), \
      (int)(R)))

// Zero-masked: zero passthrough, U the mask.
#define _mm_maskz_max_round_sh(U, A, B, R) \
  ((__m128h)__builtin_ia32_maxsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), \
      (int)(R)))
793 
// 512-bit half-precision compare macros producing a __mmask32. P is passed
// through as the int predicate argument of __builtin_ia32_cmpph512_mask.

// Unmasked compare with explicit argument R: all-ones input mask.
#define _mm512_cmp_round_ph_mask(A, B, P, R) \
  ((__mmask32)__builtin_ia32_cmpph512_mask( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      (int)(P), \
      (__mmask32)-1, \
      (int)(R)))

// Masked compare with explicit argument R: U is the input mask.
#define _mm512_mask_cmp_round_ph_mask(U, A, B, P, R) \
  ((__mmask32)__builtin_ia32_cmpph512_mask( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      (int)(P), \
      (__mmask32)(U), \
      (int)(R)))

// Unmasked compare; delegates with _MM_FROUND_CUR_DIRECTION.
#define _mm512_cmp_ph_mask(A, B, P) \
  _mm512_cmp_round_ph_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION)

// Masked compare; delegates with _MM_FROUND_CUR_DIRECTION.
#define _mm512_mask_cmp_ph_mask(U, A, B, P) \
  _mm512_mask_cmp_round_ph_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION)
809 
// Scalar half-precision compare macros producing a __mmask8. P is passed
// through as the int predicate argument of __builtin_ia32_cmpsh_mask.

// Unmasked compare with explicit argument R: all-ones input mask.
#define _mm_cmp_round_sh_mask(X, Y, P, R) \
  ((__mmask8)__builtin_ia32_cmpsh_mask( \
      (__v8hf)(__m128h)(X), \
      (__v8hf)(__m128h)(Y), \
      (int)(P), \
      (__mmask8)-1, \
      (int)(R)))

// Masked compare with explicit argument R: M is the input mask.
#define _mm_mask_cmp_round_sh_mask(M, X, Y, P, R) \
  ((__mmask8)__builtin_ia32_cmpsh_mask( \
      (__v8hf)(__m128h)(X), \
      (__v8hf)(__m128h)(Y), \
      (int)(P), \
      (__mmask8)(M), \
      (int)(R)))

// Unmasked compare using _MM_FROUND_CUR_DIRECTION.
#define _mm_cmp_sh_mask(X, Y, P) \
  ((__mmask8)__builtin_ia32_cmpsh_mask( \
      (__v8hf)(__m128h)(X), \
      (__v8hf)(__m128h)(Y), \
      (int)(P), \
      (__mmask8)-1, \
      _MM_FROUND_CUR_DIRECTION))

// Masked compare using _MM_FROUND_CUR_DIRECTION.
#define _mm_mask_cmp_sh_mask(M, X, Y, P) \
  ((__mmask8)__builtin_ia32_cmpsh_mask( \
      (__v8hf)(__m128h)(X), \
      (__v8hf)(__m128h)(Y), \
      (int)(P), \
      (__mmask8)(M), \
      _MM_FROUND_CUR_DIRECTION))
829 // loads with vmovsh:
830 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_load_sh(void const *__dp) {
831   struct __mm_load_sh_struct {
832     _Float16 __u;
833   } __attribute__((__packed__, __may_alias__));
834   _Float16 __u = ((const struct __mm_load_sh_struct *)__dp)->__u;
835   return (__m128h){__u, 0, 0, 0, 0, 0, 0, 0};
836 }
837 
838 static __inline__ __m128h __DEFAULT_FN_ATTRS128
839 _mm_mask_load_sh(__m128h __W, __mmask8 __U, const void *__A) {
840   __m128h src = (__v8hf)__builtin_shufflevector(
841       (__v8hf)__W, (__v8hf)_mm_setzero_ph(), 0, 8, 8, 8, 8, 8, 8, 8);
842 
843   return (__m128h)__builtin_ia32_loadsh128_mask((const __v8hf *)__A, src, __U & 1);
844 }
845 
846 static __inline__ __m128h __DEFAULT_FN_ATTRS128
847 _mm_maskz_load_sh(__mmask8 __U, const void *__A) {
848   return (__m128h)__builtin_ia32_loadsh128_mask(
849       (const __v8hf *)__A, (__v8hf)_mm_setzero_ph(), __U & 1);
850 }
851 
852 static __inline__ __m512h __DEFAULT_FN_ATTRS512
853 _mm512_load_ph(void const *__p) {
854   return *(const __m512h *)__p;
855 }
856 
857 static __inline__ __m256h __DEFAULT_FN_ATTRS256
858 _mm256_load_ph(void const *__p) {
859   return *(const __m256h *)__p;
860 }
861 
862 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_load_ph(void const *__p) {
863   return *(const __m128h *)__p;
864 }
865 
866 static __inline__ __m512h __DEFAULT_FN_ATTRS512
867 _mm512_loadu_ph(void const *__p) {
868   struct __loadu_ph {
869     __m512h_u __v;
870   } __attribute__((__packed__, __may_alias__));
871   return ((const struct __loadu_ph *)__p)->__v;
872 }
873 
874 static __inline__ __m256h __DEFAULT_FN_ATTRS256
875 _mm256_loadu_ph(void const *__p) {
876   struct __loadu_ph {
877     __m256h_u __v;
878   } __attribute__((__packed__, __may_alias__));
879   return ((const struct __loadu_ph *)__p)->__v;
880 }
881 
882 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_loadu_ph(void const *__p) {
883   struct __loadu_ph {
884     __m128h_u __v;
885   } __attribute__((__packed__, __may_alias__));
886   return ((const struct __loadu_ph *)__p)->__v;
887 }
888 
889 // stores with vmovsh:
890 static __inline__ void __DEFAULT_FN_ATTRS128 _mm_store_sh(void *__dp,
891                                                           __m128h __a) {
892   struct __mm_store_sh_struct {
893     _Float16 __u;
894   } __attribute__((__packed__, __may_alias__));
895   ((struct __mm_store_sh_struct *)__dp)->__u = __a[0];
896 }
897 
898 static __inline__ void __DEFAULT_FN_ATTRS128 _mm_mask_store_sh(void *__W,
899                                                                __mmask8 __U,
900                                                                __m128h __A) {
901   __builtin_ia32_storesh128_mask((__v8hf *)__W, __A, __U & 1);
902 }
903 
904 static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_store_ph(void *__P,
905                                                              __m512h __A) {
906   *(__m512h *)__P = __A;
907 }
908 
909 static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_store_ph(void *__P,
910                                                              __m256h __A) {
911   *(__m256h *)__P = __A;
912 }
913 
914 static __inline__ void __DEFAULT_FN_ATTRS128 _mm_store_ph(void *__P,
915                                                           __m128h __A) {
916   *(__m128h *)__P = __A;
917 }
918 
919 static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_storeu_ph(void *__P,
920                                                               __m512h __A) {
921   struct __storeu_ph {
922     __m512h_u __v;
923   } __attribute__((__packed__, __may_alias__));
924   ((struct __storeu_ph *)__P)->__v = __A;
925 }
926 
927 static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_storeu_ph(void *__P,
928                                                               __m256h __A) {
929   struct __storeu_ph {
930     __m256h_u __v;
931   } __attribute__((__packed__, __may_alias__));
932   ((struct __storeu_ph *)__P)->__v = __A;
933 }
934 
935 static __inline__ void __DEFAULT_FN_ATTRS128 _mm_storeu_ph(void *__P,
936                                                            __m128h __A) {
937   struct __storeu_ph {
938     __m128h_u __v;
939   } __attribute__((__packed__, __may_alias__));
940   ((struct __storeu_ph *)__P)->__v = __A;
941 }
942 
943 // moves with vmovsh:
944 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_move_sh(__m128h __a,
945                                                             __m128h __b) {
946   __a[0] = __b[0];
947   return __a;
948 }
949 
950 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_move_sh(__m128h __W,
951                                                                  __mmask8 __U,
952                                                                  __m128h __A,
953                                                                  __m128h __B) {
954   return __builtin_ia32_selectsh_128(__U, _mm_move_sh(__A, __B), __W);
955 }
956 
957 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_move_sh(__mmask8 __U,
958                                                                   __m128h __A,
959                                                                   __m128h __B) {
960   return __builtin_ia32_selectsh_128(__U, _mm_move_sh(__A, __B),
961                                      _mm_setzero_ph());
962 }
963 
964 // vmovw:
965 static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtsi16_si128(short __a) {
966   return (__m128i)(__v8hi){__a, 0, 0, 0, 0, 0, 0, 0};
967 }
968 
969 static __inline__ short __DEFAULT_FN_ATTRS128 _mm_cvtsi128_si16(__m128i __a) {
970   __v8hi __b = (__v8hi)__a;
971   return __b[0];
972 }
973 
974 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_rcp_ph(__m512h __A) {
975   return (__m512h)__builtin_ia32_rcpph512_mask(
976       (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1);
977 }
978 
979 static __inline__ __m512h __DEFAULT_FN_ATTRS512
980 _mm512_mask_rcp_ph(__m512h __W, __mmask32 __U, __m512h __A) {
981   return (__m512h)__builtin_ia32_rcpph512_mask((__v32hf)__A, (__v32hf)__W,
982                                                (__mmask32)__U);
983 }
984 
985 static __inline__ __m512h __DEFAULT_FN_ATTRS512
986 _mm512_maskz_rcp_ph(__mmask32 __U, __m512h __A) {
987   return (__m512h)__builtin_ia32_rcpph512_mask(
988       (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U);
989 }
990 
991 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_rsqrt_ph(__m512h __A) {
992   return (__m512h)__builtin_ia32_rsqrtph512_mask(
993       (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1);
994 }
995 
996 static __inline__ __m512h __DEFAULT_FN_ATTRS512
997 _mm512_mask_rsqrt_ph(__m512h __W, __mmask32 __U, __m512h __A) {
998   return (__m512h)__builtin_ia32_rsqrtph512_mask((__v32hf)__A, (__v32hf)__W,
999                                                  (__mmask32)__U);
1000 }
1001 
1002 static __inline__ __m512h __DEFAULT_FN_ATTRS512
1003 _mm512_maskz_rsqrt_ph(__mmask32 __U, __m512h __A) {
1004   return (__m512h)__builtin_ia32_rsqrtph512_mask(
1005       (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U);
1006 }
1007 
// 512-bit getmant macros. Each expands to __builtin_ia32_getmantph512_mask.
// The two selector arguments are packed into one int immediate as
// ((C) << 2) | (B). Variants differ in passthrough, mask and last argument.

// Unmasked, _MM_FROUND_CUR_DIRECTION: undefined passthrough, all-ones mask.
#define _mm512_getmant_ph(A, B, C) \
  ((__m512h)__builtin_ia32_getmantph512_mask( \
      (__v32hf)(__m512h)(A), \
      (int)(((C) << 2) | (B)), \
      (__v32hf)_mm512_undefined_ph(), \
      (__mmask32)-1, \
      _MM_FROUND_CUR_DIRECTION))

// Merge-masked, _MM_FROUND_CUR_DIRECTION: W is the passthrough, U the mask.
#define _mm512_mask_getmant_ph(W, U, A, B, C) \
  ((__m512h)__builtin_ia32_getmantph512_mask( \
      (__v32hf)(__m512h)(A), \
      (int)(((C) << 2) | (B)), \
      (__v32hf)(__m512h)(W), \
      (__mmask32)(U), \
      _MM_FROUND_CUR_DIRECTION))

// Zero-masked, _MM_FROUND_CUR_DIRECTION: zero passthrough, U the mask.
#define _mm512_maskz_getmant_ph(U, A, B, C) \
  ((__m512h)__builtin_ia32_getmantph512_mask( \
      (__v32hf)(__m512h)(A), \
      (int)(((C) << 2) | (B)), \
      (__v32hf)_mm512_setzero_ph(), \
      (__mmask32)(U), \
      _MM_FROUND_CUR_DIRECTION))

// Unmasked with explicit argument R.
#define _mm512_getmant_round_ph(A, B, C, R) \
  ((__m512h)__builtin_ia32_getmantph512_mask( \
      (__v32hf)(__m512h)(A), \
      (int)(((C) << 2) | (B)), \
      (__v32hf)_mm512_undefined_ph(), \
      (__mmask32)-1, \
      (int)(R)))

// Merge-masked with explicit argument R.
#define _mm512_mask_getmant_round_ph(W, U, A, B, C, R) \
  ((__m512h)__builtin_ia32_getmantph512_mask( \
      (__v32hf)(__m512h)(A), \
      (int)(((C) << 2) | (B)), \
      (__v32hf)(__m512h)(W), \
      (__mmask32)(U), \
      (int)(R)))

// Zero-masked with explicit argument R.
#define _mm512_maskz_getmant_round_ph(U, A, B, C, R) \
  ((__m512h)__builtin_ia32_getmantph512_mask( \
      (__v32hf)(__m512h)(A), \
      (int)(((C) << 2) | (B)), \
      (__v32hf)_mm512_setzero_ph(), \
      (__mmask32)(U), \
      (int)(R)))
1038 
1039 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_getexp_ph(__m512h __A) {
1040   return (__m512h)__builtin_ia32_getexpph512_mask(
1041       (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1,
1042       _MM_FROUND_CUR_DIRECTION);
1043 }
1044 
1045 static __inline__ __m512h __DEFAULT_FN_ATTRS512
1046 _mm512_mask_getexp_ph(__m512h __W, __mmask32 __U, __m512h __A) {
1047   return (__m512h)__builtin_ia32_getexpph512_mask(
1048       (__v32hf)__A, (__v32hf)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
1049 }
1050 
1051 static __inline__ __m512h __DEFAULT_FN_ATTRS512
1052 _mm512_maskz_getexp_ph(__mmask32 __U, __m512h __A) {
1053   return (__m512h)__builtin_ia32_getexpph512_mask(
1054       (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
1055       _MM_FROUND_CUR_DIRECTION);
1056 }
1057 
// 512-bit getexp macros with an explicit last argument R. Each expands to
// __builtin_ia32_getexpph512_mask; variants differ in passthrough and mask.

// Unmasked: undefined passthrough, all-ones mask.
#define _mm512_getexp_round_ph(A, R) \
  ((__m512h)__builtin_ia32_getexpph512_mask( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)_mm512_undefined_ph(), \
      (__mmask32)-1, \
      (int)(R)))

// Merge-masked: W is the passthrough, U the mask.
#define _mm512_mask_getexp_round_ph(W, U, A, R) \
  ((__m512h)__builtin_ia32_getexpph512_mask( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(W), \
      (__mmask32)(U), \
      (int)(R)))

// Zero-masked: zero passthrough, U the mask.
#define _mm512_maskz_getexp_round_ph(U, A, R) \
  ((__m512h)__builtin_ia32_getexpph512_mask( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)_mm512_setzero_ph(), \
      (__mmask32)(U), \
      (int)(R)))
1071 
1072 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_scalef_ph(__m512h __A,
1073                                                                  __m512h __B) {
1074   return (__m512h)__builtin_ia32_scalefph512_mask(
1075       (__v32hf)__A, (__v32hf)__B, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1,
1076       _MM_FROUND_CUR_DIRECTION);
1077 }
1078 
1079 static __inline__ __m512h __DEFAULT_FN_ATTRS512
1080 _mm512_mask_scalef_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
1081   return (__m512h)__builtin_ia32_scalefph512_mask((__v32hf)__A, (__v32hf)__B,
1082                                                   (__v32hf)__W, (__mmask32)__U,
1083                                                   _MM_FROUND_CUR_DIRECTION);
1084 }
1085 
1086 static __inline__ __m512h __DEFAULT_FN_ATTRS512
1087 _mm512_maskz_scalef_ph(__mmask32 __U, __m512h __A, __m512h __B) {
1088   return (__m512h)__builtin_ia32_scalefph512_mask(
1089       (__v32hf)__A, (__v32hf)__B, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
1090       _MM_FROUND_CUR_DIRECTION);
1091 }
1092 
// 512-bit scalef macros with an explicit last argument R. Each expands to
// __builtin_ia32_scalefph512_mask; variants differ in passthrough and mask.

// Unmasked: undefined passthrough, all-ones mask.
#define _mm512_scalef_round_ph(A, B, R) \
  ((__m512h)__builtin_ia32_scalefph512_mask( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      (__v32hf)_mm512_undefined_ph(), \
      (__mmask32)-1, \
      (int)(R)))

// Merge-masked: W is the passthrough, U the mask.
#define _mm512_mask_scalef_round_ph(W, U, A, B, R) \
  ((__m512h)__builtin_ia32_scalefph512_mask( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      (__v32hf)(__m512h)(W), \
      (__mmask32)(U), \
      (int)(R)))

// Zero-masked: zero passthrough, U the mask.
#define _mm512_maskz_scalef_round_ph(U, A, B, R) \
  ((__m512h)__builtin_ia32_scalefph512_mask( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      (__v32hf)_mm512_setzero_ph(), \
      (__mmask32)(U), \
      (int)(R)))
1107 
// 512-bit roundscale macros. Each expands to __builtin_ia32_rndscaleph_mask
// with arguments (source vector, int immediate, passthrough, mask, rounding).
// Note the parameter order of the masked forms: the first macro parameter is
// the passthrough (mask_*) or the mask (maskz_*), not the source vector.

// Unmasked, _MM_FROUND_CUR_DIRECTION. A is the source, B the immediate.
// The passthrough is _mm512_undefined_ph() (as in _mm512_roundscale_round_ph)
// so that A is expanded exactly once; the mask is all-ones.
#define _mm512_roundscale_ph(A, B)                                             \
  ((__m512h)__builtin_ia32_rndscaleph_mask(                                    \
      (__v32hf)(__m512h)(A), (int)(B), (__v32hf)_mm512_undefined_ph(),         \
      (__mmask32)-1, _MM_FROUND_CUR_DIRECTION))

// Merge-masked, _MM_FROUND_CUR_DIRECTION. A is the passthrough, B the mask,
// C the source vector.
#define _mm512_mask_roundscale_ph(A, B, C, imm)                                \
  ((__m512h)__builtin_ia32_rndscaleph_mask(                                    \
      (__v32hf)(__m512h)(C), (int)(imm), (__v32hf)(__m512h)(A),                \
      (__mmask32)(B), _MM_FROUND_CUR_DIRECTION))

// Zero-masked, _MM_FROUND_CUR_DIRECTION. A is the mask, B the source vector.
#define _mm512_maskz_roundscale_ph(A, B, imm)                                  \
  ((__m512h)__builtin_ia32_rndscaleph_mask(                                    \
      (__v32hf)(__m512h)(B), (int)(imm), (__v32hf)_mm512_setzero_ph(),         \
      (__mmask32)(A), _MM_FROUND_CUR_DIRECTION))

// Merge-masked with explicit argument R. A is the passthrough, B the mask,
// C the source vector.
#define _mm512_mask_roundscale_round_ph(A, B, C, imm, R)                       \
  ((__m512h)__builtin_ia32_rndscaleph_mask((__v32hf)(__m512h)(C), (int)(imm),  \
                                           (__v32hf)(__m512h)(A),              \
                                           (__mmask32)(B), (int)(R)))

// Zero-masked with explicit argument R. A is the mask, B the source vector.
#define _mm512_maskz_roundscale_round_ph(A, B, imm, R)                         \
  ((__m512h)__builtin_ia32_rndscaleph_mask((__v32hf)(__m512h)(B), (int)(imm),  \
                                           (__v32hf)_mm512_setzero_ph(),       \
                                           (__mmask32)(A), (int)(R)))

// Unmasked with explicit argument R: undefined passthrough, all-ones mask.
#define _mm512_roundscale_round_ph(A, imm, R)                                  \
  ((__m512h)__builtin_ia32_rndscaleph_mask((__v32hf)(__m512h)(A), (int)(imm),  \
                                           (__v32hf)_mm512_undefined_ph(),     \
                                           (__mmask32)-1, (int)(R)))
1137 
// 512-bit reduce macros. Each expands to __builtin_ia32_reduceph512_mask with
// arguments (source vector, int immediate, passthrough, mask, last argument).

// Unmasked, _MM_FROUND_CUR_DIRECTION: undefined passthrough, all-ones mask.
#define _mm512_reduce_ph(A, imm) \
  ((__m512h)__builtin_ia32_reduceph512_mask( \
      (__v32hf)(__m512h)(A), \
      (int)(imm), \
      (__v32hf)_mm512_undefined_ph(), \
      (__mmask32)-1, \
      _MM_FROUND_CUR_DIRECTION))

// Merge-masked, _MM_FROUND_CUR_DIRECTION: W is the passthrough, U the mask.
#define _mm512_mask_reduce_ph(W, U, A, imm) \
  ((__m512h)__builtin_ia32_reduceph512_mask( \
      (__v32hf)(__m512h)(A), \
      (int)(imm), \
      (__v32hf)(__m512h)(W), \
      (__mmask32)(U), \
      _MM_FROUND_CUR_DIRECTION))

// Zero-masked, _MM_FROUND_CUR_DIRECTION: zero passthrough, U the mask.
#define _mm512_maskz_reduce_ph(U, A, imm) \
  ((__m512h)__builtin_ia32_reduceph512_mask( \
      (__v32hf)(__m512h)(A), \
      (int)(imm), \
      (__v32hf)_mm512_setzero_ph(), \
      (__mmask32)(U), \
      _MM_FROUND_CUR_DIRECTION))

// Merge-masked with explicit argument R.
#define _mm512_mask_reduce_round_ph(W, U, A, imm, R) \
  ((__m512h)__builtin_ia32_reduceph512_mask( \
      (__v32hf)(__m512h)(A), \
      (int)(imm), \
      (__v32hf)(__m512h)(W), \
      (__mmask32)(U), \
      (int)(R)))

// Zero-masked with explicit argument R.
#define _mm512_maskz_reduce_round_ph(U, A, imm, R) \
  ((__m512h)__builtin_ia32_reduceph512_mask( \
      (__v32hf)(__m512h)(A), \
      (int)(imm), \
      (__v32hf)_mm512_setzero_ph(), \
      (__mmask32)(U), \
      (int)(R)))

// Unmasked with explicit argument R.
#define _mm512_reduce_round_ph(A, imm, R) \
  ((__m512h)__builtin_ia32_reduceph512_mask( \
      (__v32hf)(__m512h)(A), \
      (int)(imm), \
      (__v32hf)_mm512_undefined_ph(), \
      (__mmask32)-1, \
      (int)(R)))
1167 
1168 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_rcp_sh(__m128h __A,
1169                                                            __m128h __B) {
1170   return (__m128h)__builtin_ia32_rcpsh_mask(
1171       (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1);
1172 }
1173 
1174 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_rcp_sh(__m128h __W,
1175                                                                 __mmask8 __U,
1176                                                                 __m128h __A,
1177                                                                 __m128h __B) {
1178   return (__m128h)__builtin_ia32_rcpsh_mask((__v8hf)__A, (__v8hf)__B,
1179                                             (__v8hf)__W, (__mmask8)__U);
1180 }
1181 
1182 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_rcp_sh(__mmask8 __U,
1183                                                                  __m128h __A,
1184                                                                  __m128h __B) {
1185   return (__m128h)__builtin_ia32_rcpsh_mask(
1186       (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
1187 }
1188 
1189 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_rsqrt_sh(__m128h __A,
1190                                                              __m128h __B) {
1191   return (__m128h)__builtin_ia32_rsqrtsh_mask(
1192       (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1);
1193 }
1194 
1195 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_rsqrt_sh(__m128h __W,
1196                                                                   __mmask8 __U,
1197                                                                   __m128h __A,
1198                                                                   __m128h __B) {
1199   return (__m128h)__builtin_ia32_rsqrtsh_mask((__v8hf)__A, (__v8hf)__B,
1200                                               (__v8hf)__W, (__mmask8)__U);
1201 }
1202 
1203 static __inline__ __m128h __DEFAULT_FN_ATTRS128
1204 _mm_maskz_rsqrt_sh(__mmask8 __U, __m128h __A, __m128h __B) {
1205   return (__m128h)__builtin_ia32_rsqrtsh_mask(
1206       (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
1207 }
1208 
// Scalar getmant macros. Each expands to __builtin_ia32_getmantsh_round_mask
// on A and B. The two selector arguments are packed into one int immediate
// as ((D) << 2) | (C). Variants differ in passthrough, mask and last argument.

// Unmasked with explicit argument R: zero passthrough, all-ones mask.
#define _mm_getmant_round_sh(A, B, C, D, R) \
  ((__m128h)__builtin_ia32_getmantsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (int)(((D) << 2) | (C)), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, \
      (int)(R)))

// Unmasked, _MM_FROUND_CUR_DIRECTION.
#define _mm_getmant_sh(A, B, C, D) \
  ((__m128h)__builtin_ia32_getmantsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (int)(((D) << 2) | (C)), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, \
      _MM_FROUND_CUR_DIRECTION))

// Merge-masked, _MM_FROUND_CUR_DIRECTION: W is the passthrough, U the mask.
#define _mm_mask_getmant_sh(W, U, A, B, C, D) \
  ((__m128h)__builtin_ia32_getmantsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (int)(((D) << 2) | (C)), \
      (__v8hf)(__m128h)(W), \
      (__mmask8)(U), \
      _MM_FROUND_CUR_DIRECTION))

// Merge-masked with explicit argument R.
#define _mm_mask_getmant_round_sh(W, U, A, B, C, D, R) \
  ((__m128h)__builtin_ia32_getmantsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (int)(((D) << 2) | (C)), \
      (__v8hf)(__m128h)(W), \
      (__mmask8)(U), \
      (int)(R)))

// Zero-masked, _MM_FROUND_CUR_DIRECTION: zero passthrough, U the mask.
#define _mm_maskz_getmant_sh(U, A, B, C, D) \
  ((__m128h)__builtin_ia32_getmantsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (int)(((D) << 2) | (C)), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), \
      _MM_FROUND_CUR_DIRECTION))

// Zero-masked with explicit argument R.
#define _mm_maskz_getmant_round_sh(U, A, B, C, D, R) \
  ((__m128h)__builtin_ia32_getmantsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (int)(((D) << 2) | (C)), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), \
      (int)(R)))
1238 
// Unmasked scalar getexp with explicit argument R: expands to
// __builtin_ia32_getexpsh128_round_mask on A and B with a zero passthrough
// and an all-ones mask.
#define _mm_getexp_round_sh(A, B, R) \
  ((__m128h)__builtin_ia32_getexpsh128_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, \
      (int)(R)))
1243 
1244 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_getexp_sh(__m128h __A,
1245                                                               __m128h __B) {
1246   return (__m128h)__builtin_ia32_getexpsh128_round_mask(
1247       (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
1248       _MM_FROUND_CUR_DIRECTION);
1249 }
1250 
1251 static __inline__ __m128h __DEFAULT_FN_ATTRS128
1252 _mm_mask_getexp_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
1253   return (__m128h)__builtin_ia32_getexpsh128_round_mask(
1254       (__v8hf)__A, (__v8hf)__B, (__v8hf)__W, (__mmask8)__U,
1255       _MM_FROUND_CUR_DIRECTION);
1256 }
1257 
// Merge-masked scalar getexp with explicit argument R: expands to
// __builtin_ia32_getexpsh128_round_mask on A and B with W as the passthrough
// and U as the mask.
#define _mm_mask_getexp_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_getexpsh128_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)(__m128h)(W), \
      (__mmask8)(U), \
      (int)(R)))
1262 
1263 static __inline__ __m128h __DEFAULT_FN_ATTRS128
1264 _mm_maskz_getexp_sh(__mmask8 __U, __m128h __A, __m128h __B) {
1265   return (__m128h)__builtin_ia32_getexpsh128_round_mask(
1266       (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
1267       _MM_FROUND_CUR_DIRECTION);
1268 }
1269 
1270 #define _mm_maskz_getexp_round_sh(U, A, B, R)                                  \
1271   ((__m128h)__builtin_ia32_getexpsh128_round_mask(                             \
1272       (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
1273       (__mmask8)(U), (int)(R)))
1274 
1275 #define _mm_scalef_round_sh(A, B, R)                                           \
1276   ((__m128h)__builtin_ia32_scalefsh_round_mask(                                \
1277       (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
1278       (__mmask8)-1, (int)(R)))
1279 
1280 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_scalef_sh(__m128h __A,
1281                                                               __m128h __B) {
1282   return (__m128h)__builtin_ia32_scalefsh_round_mask(
1283       (__v8hf)__A, (__v8hf)(__B), (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
1284       _MM_FROUND_CUR_DIRECTION);
1285 }
1286 
1287 static __inline__ __m128h __DEFAULT_FN_ATTRS128
1288 _mm_mask_scalef_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
1289   return (__m128h)__builtin_ia32_scalefsh_round_mask((__v8hf)__A, (__v8hf)__B,
1290                                                      (__v8hf)__W, (__mmask8)__U,
1291                                                      _MM_FROUND_CUR_DIRECTION);
1292 }
1293 
1294 #define _mm_mask_scalef_round_sh(W, U, A, B, R)                                \
1295   ((__m128h)__builtin_ia32_scalefsh_round_mask(                                \
1296       (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
1297       (__mmask8)(U), (int)(R)))
1298 
1299 static __inline__ __m128h __DEFAULT_FN_ATTRS128
1300 _mm_maskz_scalef_sh(__mmask8 __U, __m128h __A, __m128h __B) {
1301   return (__m128h)__builtin_ia32_scalefsh_round_mask(
1302       (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
1303       _MM_FROUND_CUR_DIRECTION);
1304 }
1305 
1306 #define _mm_maskz_scalef_round_sh(U, A, B, R)                                  \
1307   ((__m128h)__builtin_ia32_scalefsh_round_mask(                                \
1308       (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
1309       (__mmask8)(U), (int)(R)))
1310 
/* Scalar FP16 roundscale wrappers around
 * __builtin_ia32_rndscalesh_round_mask, which takes
 * (A, B, source vector, 8-bit mask, int immediate, rounding control).
 * All six variants are macros, so the immediate and the rounding argument
 * reach the builtin as written by the caller. */

/* Merge-masked form with explicit rounding control R. */
#define _mm_mask_roundscale_round_sh(W, U, A, B, I, R) \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)(__m128h)(W), \
      (__mmask8)(U), \
      (int)(I), \
      (int)(R)))

/* Unmasked form: all-ones mask, zero vector in the source slot. */
#define _mm_roundscale_round_sh(A, B, imm, R) \
  _mm_mask_roundscale_round_sh(_mm_setzero_ph(), (__mmask8)-1, (A), (B), \
                               (imm), (R))

/* Zero-masked form: caller's mask U, zero vector in the source slot. */
#define _mm_maskz_roundscale_round_sh(U, A, B, I, R) \
  _mm_mask_roundscale_round_sh(_mm_setzero_ph(), (U), (A), (B), (I), (R))

/* The three variants below fix the rounding argument to
 * _MM_FROUND_CUR_DIRECTION. */
#define _mm_roundscale_sh(A, B, imm) \
  _mm_roundscale_round_sh((A), (B), (imm), _MM_FROUND_CUR_DIRECTION)

#define _mm_mask_roundscale_sh(W, U, A, B, I) \
  _mm_mask_roundscale_round_sh((W), (U), (A), (B), (I), \
                               _MM_FROUND_CUR_DIRECTION)

#define _mm_maskz_roundscale_sh(U, A, B, I) \
  _mm_maskz_roundscale_round_sh((U), (A), (B), (I), _MM_FROUND_CUR_DIRECTION)
1340 
/* Scalar FP16 reduce wrappers around __builtin_ia32_reducesh_mask, which
 * takes (A, B, source vector, 8-bit mask, int immediate, rounding control).
 * All six variants are macros, so the immediate C and the rounding argument
 * reach the builtin as written by the caller. */

/* Merge-masked form with explicit rounding control R. */
#define _mm_mask_reduce_round_sh(W, U, A, B, C, R) \
  ((__m128h)__builtin_ia32_reducesh_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)(__m128h)(W), \
      (__mmask8)(U), \
      (int)(C), \
      (int)(R)))

/* Unmasked form: all-ones mask, zero vector in the source slot. */
#define _mm_reduce_round_sh(A, B, C, R) \
  _mm_mask_reduce_round_sh(_mm_setzero_ph(), (__mmask8)-1, (A), (B), (C), (R))

/* Zero-masked form: caller's mask U, zero vector in the source slot. */
#define _mm_maskz_reduce_round_sh(U, A, B, C, R) \
  _mm_mask_reduce_round_sh(_mm_setzero_ph(), (U), (A), (B), (C), (R))

/* The three variants below fix the rounding argument to
 * _MM_FROUND_CUR_DIRECTION. */
#define _mm_reduce_sh(A, B, C) \
  _mm_reduce_round_sh((A), (B), (C), _MM_FROUND_CUR_DIRECTION)

#define _mm_mask_reduce_sh(W, U, A, B, C) \
  _mm_mask_reduce_round_sh((W), (U), (A), (B), (C), _MM_FROUND_CUR_DIRECTION)

#define _mm_maskz_reduce_sh(U, A, B, C) \
  _mm_maskz_reduce_round_sh((U), (A), (B), (C), _MM_FROUND_CUR_DIRECTION)
1370 
/* 512-bit FP16 square root with explicit rounding control R. The builtin has
 * no mask operand; the masked variants feed its result through
 * __builtin_ia32_selectph_512 together with the mask U. */
#define _mm512_sqrt_round_ph(A, R) \
  ((__m512h)__builtin_ia32_sqrtph512( \
      (__v32hf)(__m512h)(A), \
      (int)(R)))

/* Select between the square root and W under mask U. */
#define _mm512_mask_sqrt_round_ph(W, U, A, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), \
      (__v32hf)__builtin_ia32_sqrtph512((__v32hf)(__m512h)(A), (int)(R)), \
      (__v32hf)(__m512h)(W)))

/* Select between the square root and an all-zero vector under mask U. */
#define _mm512_maskz_sqrt_round_ph(U, A, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), \
      (__v32hf)__builtin_ia32_sqrtph512((__v32hf)(__m512h)(A), (int)(R)), \
      (__v32hf)_mm512_setzero_ph()))
1383 
1384 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_sqrt_ph(__m512h __A) {
1385   return (__m512h)__builtin_ia32_sqrtph512((__v32hf)__A,
1386                                            _MM_FROUND_CUR_DIRECTION);
1387 }
1388 
1389 static __inline__ __m512h __DEFAULT_FN_ATTRS512
1390 _mm512_mask_sqrt_ph(__m512h __W, __mmask32 __U, __m512h __A) {
1391   return (__m512h)__builtin_ia32_selectph_512(
1392       (__mmask32)(__U),
1393       (__v32hf)__builtin_ia32_sqrtph512((__A), (_MM_FROUND_CUR_DIRECTION)),
1394       (__v32hf)(__m512h)(__W));
1395 }
1396 
1397 static __inline__ __m512h __DEFAULT_FN_ATTRS512
1398 _mm512_maskz_sqrt_ph(__mmask32 __U, __m512h __A) {
1399   return (__m512h)__builtin_ia32_selectph_512(
1400       (__mmask32)(__U),
1401       (__v32hf)__builtin_ia32_sqrtph512((__A), (_MM_FROUND_CUR_DIRECTION)),
1402       (__v32hf)_mm512_setzero_ph());
1403 }
1404 
/* Scalar FP16 square root with explicit rounding control R, wrapping
 * __builtin_ia32_sqrtsh_round_mask(A, B, source vector, 8-bit mask, R). */

/* Merge-masked form: W is the source vector, U (narrowed to __mmask8) the
 * mask. */
#define _mm_mask_sqrt_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_sqrtsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)(__m128h)(W), \
      (__mmask8)(U), \
      (int)(R)))

/* Unmasked form: all-ones mask, zero vector in the source slot. */
#define _mm_sqrt_round_sh(A, B, R) \
  _mm_mask_sqrt_round_sh(_mm_setzero_ph(), (__mmask8)-1, (A), (B), (R))

/* Zero-masked form: caller's mask U, zero vector in the source slot. */
#define _mm_maskz_sqrt_round_sh(U, A, B, R) \
  _mm_mask_sqrt_round_sh(_mm_setzero_ph(), (U), (A), (B), (R))
1419 
1420 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sqrt_sh(__m128h __A,
1421                                                             __m128h __B) {
1422   return (__m128h)__builtin_ia32_sqrtsh_round_mask(
1423       (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)_mm_setzero_ph(),
1424       (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
1425 }
1426 
1427 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sqrt_sh(__m128h __W,
1428                                                                  __mmask32 __U,
1429                                                                  __m128h __A,
1430                                                                  __m128h __B) {
1431   return (__m128h)__builtin_ia32_sqrtsh_round_mask(
1432       (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)(__m128h)(__W),
1433       (__mmask8)(__U), _MM_FROUND_CUR_DIRECTION);
1434 }
1435 
1436 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sqrt_sh(__mmask32 __U,
1437                                                                   __m128h __A,
1438                                                                   __m128h __B) {
1439   return (__m128h)__builtin_ia32_sqrtsh_round_mask(
1440       (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)_mm_setzero_ph(),
1441       (__mmask8)(__U), _MM_FROUND_CUR_DIRECTION);
1442 }
1443 
/* 512-bit FP16 fpclass: passes A, the int immediate imm and the 32-bit mask U
 * to __builtin_ia32_fpclassph512_mask and yields a __mmask32. */
#define _mm512_mask_fpclass_ph_mask(U, A, imm) \
  ((__mmask32)__builtin_ia32_fpclassph512_mask( \
      (__v32hf)(__m512h)(A), \
      (int)(imm), \
      (__mmask32)(U)))

/* Unmasked form: same call with an all-ones mask. */
#define _mm512_fpclass_ph_mask(A, imm) \
  _mm512_mask_fpclass_ph_mask((__mmask32)-1, (A), (imm))
1451 
/* Scalar FP16 fpclass: passes A, the int immediate imm and the 8-bit mask U
 * to __builtin_ia32_fpclasssh_mask and yields a __mmask8. */
#define _mm_mask_fpclass_sh_mask(U, A, imm) \
  ((__mmask8)__builtin_ia32_fpclasssh_mask( \
      (__v8hf)(__m128h)(A), \
      (int)(imm), \
      (__mmask8)(U)))

/* Unmasked form: same call with an all-ones mask. */
#define _mm_fpclass_sh_mask(A, imm) \
  _mm_mask_fpclass_sh_mask((__mmask8)-1, (A), (imm))
1459 
/* __m512d -> __m128h conversion with explicit rounding control R, wrapping
 * __builtin_ia32_vcvtpd2ph512_mask(A, source vector, 8-bit mask, R). */

/* Merge-masked form: W is the source vector, U the mask. */
#define _mm512_mask_cvt_roundpd_ph(W, U, A, R) \
  ((__m128h)__builtin_ia32_vcvtpd2ph512_mask( \
      (__v8df)(A), \
      (__v8hf)(W), \
      (__mmask8)(U), \
      (int)(R)))

/* Unmasked form: all-ones mask, _mm_undefined_ph() in the source slot. */
#define _mm512_cvt_roundpd_ph(A, R) \
  _mm512_mask_cvt_roundpd_ph(_mm_undefined_ph(), (__mmask8)(-1), (A), (R))

/* Zero-masked form: caller's mask U, zero vector in the source slot. */
#define _mm512_maskz_cvt_roundpd_ph(U, A, R) \
  _mm512_mask_cvt_roundpd_ph(_mm_setzero_ph(), (U), (A), (R))
1471 
1472 static __inline__ __m128h __DEFAULT_FN_ATTRS512 _mm512_cvtpd_ph(__m512d __A) {
1473   return (__m128h)__builtin_ia32_vcvtpd2ph512_mask(
1474       (__v8df)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
1475       _MM_FROUND_CUR_DIRECTION);
1476 }
1477 
1478 static __inline__ __m128h __DEFAULT_FN_ATTRS512
1479 _mm512_mask_cvtpd_ph(__m128h __W, __mmask8 __U, __m512d __A) {
1480   return (__m128h)__builtin_ia32_vcvtpd2ph512_mask(
1481       (__v8df)__A, (__v8hf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
1482 }
1483 
1484 static __inline__ __m128h __DEFAULT_FN_ATTRS512
1485 _mm512_maskz_cvtpd_ph(__mmask8 __U, __m512d __A) {
1486   return (__m128h)__builtin_ia32_vcvtpd2ph512_mask(
1487       (__v8df)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
1488       _MM_FROUND_CUR_DIRECTION);
1489 }
1490 
/* __m128h -> __m512d conversion with explicit rounding control R, wrapping
 * __builtin_ia32_vcvtph2pd512_mask(A, source vector, 8-bit mask, R). */

/* Merge-masked form: W is the source vector, U the mask. */
#define _mm512_mask_cvt_roundph_pd(W, U, A, R) \
  ((__m512d)__builtin_ia32_vcvtph2pd512_mask( \
      (__v8hf)(A), \
      (__v8df)(W), \
      (__mmask8)(U), \
      (int)(R)))

/* Unmasked form: all-ones mask, _mm512_undefined_pd() in the source slot. */
#define _mm512_cvt_roundph_pd(A, R) \
  _mm512_mask_cvt_roundph_pd(_mm512_undefined_pd(), (__mmask8)(-1), (A), (R))

/* Zero-masked form: caller's mask U, zero vector in the source slot. */
#define _mm512_maskz_cvt_roundph_pd(U, A, R) \
  _mm512_mask_cvt_roundph_pd(_mm512_setzero_pd(), (U), (A), (R))
1502 
1503 static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_cvtph_pd(__m128h __A) {
1504   return (__m512d)__builtin_ia32_vcvtph2pd512_mask(
1505       (__v8hf)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)-1,
1506       _MM_FROUND_CUR_DIRECTION);
1507 }
1508 
1509 static __inline__ __m512d __DEFAULT_FN_ATTRS512
1510 _mm512_mask_cvtph_pd(__m512d __W, __mmask8 __U, __m128h __A) {
1511   return (__m512d)__builtin_ia32_vcvtph2pd512_mask(
1512       (__v8hf)__A, (__v8df)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
1513 }
1514 
1515 static __inline__ __m512d __DEFAULT_FN_ATTRS512
1516 _mm512_maskz_cvtph_pd(__mmask8 __U, __m128h __A) {
1517   return (__m512d)__builtin_ia32_vcvtph2pd512_mask(
1518       (__v8hf)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U,
1519       _MM_FROUND_CUR_DIRECTION);
1520 }
1521 
/* Scalar FP16 -> float conversion with explicit rounding control R, wrapping
 * __builtin_ia32_vcvtsh2ss_round_mask(A, B, source vector, 8-bit mask, R).
 * A is viewed as __v4sf and B as __v8hf. */

/* Merge-masked form: W is the source vector, U the mask. */
#define _mm_mask_cvt_roundsh_ss(W, U, A, B, R) \
  ((__m128)__builtin_ia32_vcvtsh2ss_round_mask( \
      (__v4sf)(A), \
      (__v8hf)(B), \
      (__v4sf)(W), \
      (__mmask8)(U), \
      (int)(R)))

/* Unmasked form: all-ones mask, _mm_undefined_ps() in the source slot. */
#define _mm_cvt_roundsh_ss(A, B, R) \
  _mm_mask_cvt_roundsh_ss(_mm_undefined_ps(), (__mmask8)(-1), (A), (B), (R))

/* Zero-masked form: caller's mask U, zero vector in the source slot. */
#define _mm_maskz_cvt_roundsh_ss(U, A, B, R) \
  _mm_mask_cvt_roundsh_ss(_mm_setzero_ps(), (U), (A), (B), (R))
1535 
1536 static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtsh_ss(__m128 __A,
1537                                                             __m128h __B) {
1538   return (__m128)__builtin_ia32_vcvtsh2ss_round_mask(
1539       (__v4sf)__A, (__v8hf)__B, (__v4sf)_mm_undefined_ps(), (__mmask8)-1,
1540       _MM_FROUND_CUR_DIRECTION);
1541 }
1542 
1543 static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_cvtsh_ss(__m128 __W,
1544                                                                  __mmask8 __U,
1545                                                                  __m128 __A,
1546                                                                  __m128h __B) {
1547   return (__m128)__builtin_ia32_vcvtsh2ss_round_mask((__v4sf)__A, (__v8hf)__B,
1548                                                      (__v4sf)__W, (__mmask8)__U,
1549                                                      _MM_FROUND_CUR_DIRECTION);
1550 }
1551 
1552 static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_cvtsh_ss(__mmask8 __U,
1553                                                                   __m128 __A,
1554                                                                   __m128h __B) {
1555   return (__m128)__builtin_ia32_vcvtsh2ss_round_mask(
1556       (__v4sf)__A, (__v8hf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U,
1557       _MM_FROUND_CUR_DIRECTION);
1558 }
1559 
/* Scalar float -> FP16 conversion with explicit rounding control R, wrapping
 * __builtin_ia32_vcvtss2sh_round_mask(A, B, source vector, 8-bit mask, R).
 * A is viewed as __v8hf and B as __v4sf. */

/* Merge-masked form: W is the source vector, U the mask. */
#define _mm_mask_cvt_roundss_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_vcvtss2sh_round_mask( \
      (__v8hf)(A), \
      (__v4sf)(B), \
      (__v8hf)(W), \
      (__mmask8)(U), \
      (int)(R)))

/* Unmasked form: all-ones mask, _mm_undefined_ph() in the source slot. */
#define _mm_cvt_roundss_sh(A, B, R) \
  _mm_mask_cvt_roundss_sh(_mm_undefined_ph(), (__mmask8)(-1), (A), (B), (R))

/* Zero-masked form: caller's mask U, zero vector in the source slot. */
#define _mm_maskz_cvt_roundss_sh(U, A, B, R) \
  _mm_mask_cvt_roundss_sh(_mm_setzero_ph(), (U), (A), (B), (R))
1573 
1574 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtss_sh(__m128h __A,
1575                                                              __m128 __B) {
1576   return (__m128h)__builtin_ia32_vcvtss2sh_round_mask(
1577       (__v8hf)__A, (__v4sf)__B, (__v8hf)_mm_undefined_ph(), (__mmask8)-1,
1578       _MM_FROUND_CUR_DIRECTION);
1579 }
1580 
1581 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_cvtss_sh(__m128h __W,
1582                                                                   __mmask8 __U,
1583                                                                   __m128h __A,
1584                                                                   __m128 __B) {
1585   return (__m128h)__builtin_ia32_vcvtss2sh_round_mask(
1586       (__v8hf)__A, (__v4sf)__B, (__v8hf)__W, (__mmask8)__U,
1587       _MM_FROUND_CUR_DIRECTION);
1588 }
1589 
1590 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_cvtss_sh(__mmask8 __U,
1591                                                                    __m128h __A,
1592                                                                    __m128 __B) {
1593   return (__m128h)__builtin_ia32_vcvtss2sh_round_mask(
1594       (__v8hf)__A, (__v4sf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
1595       _MM_FROUND_CUR_DIRECTION);
1596 }
1597 
/* Scalar double -> FP16 conversion with explicit rounding control R, wrapping
 * __builtin_ia32_vcvtsd2sh_round_mask(A, B, source vector, 8-bit mask, R).
 * A is viewed as __v8hf and B as __v2df. */

/* Merge-masked form: W is the source vector, U the mask. */
#define _mm_mask_cvt_roundsd_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask( \
      (__v8hf)(A), \
      (__v2df)(B), \
      (__v8hf)(W), \
      (__mmask8)(U), \
      (int)(R)))

/* Unmasked form: all-ones mask, _mm_undefined_ph() in the source slot. */
#define _mm_cvt_roundsd_sh(A, B, R) \
  _mm_mask_cvt_roundsd_sh(_mm_undefined_ph(), (__mmask8)(-1), (A), (B), (R))

/* Zero-masked form: caller's mask U, zero vector in the source slot. */
#define _mm_maskz_cvt_roundsd_sh(U, A, B, R) \
  _mm_mask_cvt_roundsd_sh(_mm_setzero_ph(), (U), (A), (B), (R))
1611 
1612 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtsd_sh(__m128h __A,
1613                                                              __m128d __B) {
1614   return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask(
1615       (__v8hf)__A, (__v2df)__B, (__v8hf)_mm_undefined_ph(), (__mmask8)-1,
1616       _MM_FROUND_CUR_DIRECTION);
1617 }
1618 
1619 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_cvtsd_sh(__m128h __W,
1620                                                                   __mmask8 __U,
1621                                                                   __m128h __A,
1622                                                                   __m128d __B) {
1623   return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask(
1624       (__v8hf)__A, (__v2df)__B, (__v8hf)__W, (__mmask8)__U,
1625       _MM_FROUND_CUR_DIRECTION);
1626 }
1627 
1628 static __inline__ __m128h __DEFAULT_FN_ATTRS128
1629 _mm_maskz_cvtsd_sh(__mmask8 __U, __m128h __A, __m128d __B) {
1630   return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask(
1631       (__v8hf)__A, (__v2df)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
1632       _MM_FROUND_CUR_DIRECTION);
1633 }
1634 
/* Scalar FP16 -> double conversion with explicit rounding control R, wrapping
 * __builtin_ia32_vcvtsh2sd_round_mask(A, B, source vector, 8-bit mask, R).
 * A is viewed as __v2df and B as __v8hf. */

/* Merge-masked form: W is the source vector, U the mask. */
#define _mm_mask_cvt_roundsh_sd(W, U, A, B, R) \
  ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask( \
      (__v2df)(A), \
      (__v8hf)(B), \
      (__v2df)(W), \
      (__mmask8)(U), \
      (int)(R)))

/* Unmasked form: all-ones mask, _mm_undefined_pd() in the source slot. */
#define _mm_cvt_roundsh_sd(A, B, R) \
  _mm_mask_cvt_roundsh_sd(_mm_undefined_pd(), (__mmask8)(-1), (A), (B), (R))

/* Zero-masked form: caller's mask U, zero vector in the source slot. */
#define _mm_maskz_cvt_roundsh_sd(U, A, B, R) \
  _mm_mask_cvt_roundsh_sd(_mm_setzero_pd(), (U), (A), (B), (R))
1648 
1649 static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_cvtsh_sd(__m128d __A,
1650                                                              __m128h __B) {
1651   return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask(
1652       (__v2df)__A, (__v8hf)__B, (__v2df)_mm_undefined_pd(), (__mmask8)-1,
1653       _MM_FROUND_CUR_DIRECTION);
1654 }
1655 
1656 static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_cvtsh_sd(__m128d __W,
1657                                                                   __mmask8 __U,
1658                                                                   __m128d __A,
1659                                                                   __m128h __B) {
1660   return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask(
1661       (__v2df)__A, (__v8hf)__B, (__v2df)__W, (__mmask8)__U,
1662       _MM_FROUND_CUR_DIRECTION);
1663 }
1664 
1665 static __inline__ __m128d __DEFAULT_FN_ATTRS128
1666 _mm_maskz_cvtsh_sd(__mmask8 __U, __m128d __A, __m128h __B) {
1667   return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask(
1668       (__v2df)__A, (__v8hf)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U,
1669       _MM_FROUND_CUR_DIRECTION);
1670 }
1671 
/* 512-bit FP16 -> __v32hi conversion with explicit rounding control R,
 * wrapping __builtin_ia32_vcvtph2w512_mask(A, source vector, 32-bit mask, R). */

/* Merge-masked form: W is the source vector, U the mask. */
#define _mm512_mask_cvt_roundph_epi16(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2w512_mask( \
      (__v32hf)(A), \
      (__v32hi)(W), \
      (__mmask32)(U), \
      (int)(R)))

/* Unmasked form: all-ones mask, _mm512_undefined_epi32() in the source slot. */
#define _mm512_cvt_roundph_epi16(A, R) \
  _mm512_mask_cvt_roundph_epi16(_mm512_undefined_epi32(), (__mmask32)(-1), \
                                (A), (R))

/* Zero-masked form: caller's mask U, zero vector in the source slot. */
#define _mm512_maskz_cvt_roundph_epi16(U, A, R) \
  _mm512_mask_cvt_roundph_epi16(_mm512_setzero_epi32(), (U), (A), (R))
1685 
1686 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1687 _mm512_cvtph_epi16(__m512h __A) {
1688   return (__m512i)__builtin_ia32_vcvtph2w512_mask(
1689       (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)-1,
1690       _MM_FROUND_CUR_DIRECTION);
1691 }
1692 
1693 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1694 _mm512_mask_cvtph_epi16(__m512i __W, __mmask32 __U, __m512h __A) {
1695   return (__m512i)__builtin_ia32_vcvtph2w512_mask(
1696       (__v32hf)__A, (__v32hi)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
1697 }
1698 
1699 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1700 _mm512_maskz_cvtph_epi16(__mmask32 __U, __m512h __A) {
1701   return (__m512i)__builtin_ia32_vcvtph2w512_mask(
1702       (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)__U,
1703       _MM_FROUND_CUR_DIRECTION);
1704 }
1705 
/* 512-bit FP16 -> __v32hi conversion through the "cvtt" builtin
 * __builtin_ia32_vcvttph2w512_mask(A, source vector, 32-bit mask, R), with
 * the caller-supplied R as its last argument. */

/* Merge-masked form: W is the source vector, U the mask. */
#define _mm512_mask_cvtt_roundph_epi16(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvttph2w512_mask( \
      (__v32hf)(A), \
      (__v32hi)(W), \
      (__mmask32)(U), \
      (int)(R)))

/* Unmasked form: all-ones mask, _mm512_undefined_epi32() in the source slot. */
#define _mm512_cvtt_roundph_epi16(A, R) \
  _mm512_mask_cvtt_roundph_epi16(_mm512_undefined_epi32(), (__mmask32)(-1), \
                                 (A), (R))

/* Zero-masked form: caller's mask U, zero vector in the source slot. */
#define _mm512_maskz_cvtt_roundph_epi16(U, A, R) \
  _mm512_mask_cvtt_roundph_epi16(_mm512_setzero_epi32(), (U), (A), (R))
1719 
1720 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1721 _mm512_cvttph_epi16(__m512h __A) {
1722   return (__m512i)__builtin_ia32_vcvttph2w512_mask(
1723       (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)-1,
1724       _MM_FROUND_CUR_DIRECTION);
1725 }
1726 
1727 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1728 _mm512_mask_cvttph_epi16(__m512i __W, __mmask32 __U, __m512h __A) {
1729   return (__m512i)__builtin_ia32_vcvttph2w512_mask(
1730       (__v32hf)__A, (__v32hi)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
1731 }
1732 
1733 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1734 _mm512_maskz_cvttph_epi16(__mmask32 __U, __m512h __A) {
1735   return (__m512i)__builtin_ia32_vcvttph2w512_mask(
1736       (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)__U,
1737       _MM_FROUND_CUR_DIRECTION);
1738 }
1739 
/* 512-bit __v32hi -> FP16 conversion with explicit rounding control R,
 * wrapping __builtin_ia32_vcvtw2ph512_mask(A, source vector, 32-bit mask, R). */

/* Merge-masked form: W is the source vector, U the mask. */
#define _mm512_mask_cvt_roundepi16_ph(W, U, A, R) \
  ((__m512h)__builtin_ia32_vcvtw2ph512_mask( \
      (__v32hi)(A), \
      (__v32hf)(W), \
      (__mmask32)(U), \
      (int)(R)))

/* Unmasked form: all-ones mask, _mm512_undefined_ph() in the source slot. */
#define _mm512_cvt_roundepi16_ph(A, R) \
  _mm512_mask_cvt_roundepi16_ph(_mm512_undefined_ph(), (__mmask32)(-1), \
                                (A), (R))

/* Zero-masked form: caller's mask U, zero vector in the source slot. */
#define _mm512_maskz_cvt_roundepi16_ph(U, A, R) \
  _mm512_mask_cvt_roundepi16_ph(_mm512_setzero_ph(), (U), (A), (R))
1752 
1753 static __inline__ __m512h __DEFAULT_FN_ATTRS512
1754 _mm512_cvtepi16_ph(__m512i __A) {
1755   return (__m512h)__builtin_ia32_vcvtw2ph512_mask(
1756       (__v32hi)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)-1,
1757       _MM_FROUND_CUR_DIRECTION);
1758 }
1759 
1760 static __inline__ __m512h __DEFAULT_FN_ATTRS512
1761 _mm512_mask_cvtepi16_ph(__m512h __W, __mmask32 __U, __m512i __A) {
1762   return (__m512h)__builtin_ia32_vcvtw2ph512_mask(
1763       (__v32hi)__A, (__v32hf)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
1764 }
1765 
1766 static __inline__ __m512h __DEFAULT_FN_ATTRS512
1767 _mm512_maskz_cvtepi16_ph(__mmask32 __U, __m512i __A) {
1768   return (__m512h)__builtin_ia32_vcvtw2ph512_mask(
1769       (__v32hi)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
1770       _MM_FROUND_CUR_DIRECTION);
1771 }
1772 
/* 512-bit FP16 -> __v32hu conversion with explicit rounding control R,
 * wrapping __builtin_ia32_vcvtph2uw512_mask(A, source vector, 32-bit mask,
 * R). */

/* Merge-masked form: W is the source vector, U the mask. */
#define _mm512_mask_cvt_roundph_epu16(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2uw512_mask( \
      (__v32hf)(A), \
      (__v32hu)(W), \
      (__mmask32)(U), \
      (int)(R)))

/* Unmasked form: all-ones mask, _mm512_undefined_epi32() in the source slot. */
#define _mm512_cvt_roundph_epu16(A, R) \
  _mm512_mask_cvt_roundph_epu16(_mm512_undefined_epi32(), (__mmask32)(-1), \
                                (A), (R))

/* Zero-masked form: caller's mask U, zero vector in the source slot. */
#define _mm512_maskz_cvt_roundph_epu16(U, A, R) \
  _mm512_mask_cvt_roundph_epu16(_mm512_setzero_epi32(), (U), (A), (R))
1786 
1787 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1788 _mm512_cvtph_epu16(__m512h __A) {
1789   return (__m512i)__builtin_ia32_vcvtph2uw512_mask(
1790       (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)-1,
1791       _MM_FROUND_CUR_DIRECTION);
1792 }
1793 
1794 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1795 _mm512_mask_cvtph_epu16(__m512i __W, __mmask32 __U, __m512h __A) {
1796   return (__m512i)__builtin_ia32_vcvtph2uw512_mask(
1797       (__v32hf)__A, (__v32hu)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
1798 }
1799 
1800 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1801 _mm512_maskz_cvtph_epu16(__mmask32 __U, __m512h __A) {
1802   return (__m512i)__builtin_ia32_vcvtph2uw512_mask(
1803       (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)__U,
1804       _MM_FROUND_CUR_DIRECTION);
1805 }
1806 
/* 512-bit FP16 -> __v32hu conversion through the "cvtt" builtin
 * __builtin_ia32_vcvttph2uw512_mask(A, source vector, 32-bit mask, R), with
 * the caller-supplied R as its last argument. */

/* Merge-masked form: W is the source vector, U the mask. */
#define _mm512_mask_cvtt_roundph_epu16(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvttph2uw512_mask( \
      (__v32hf)(A), \
      (__v32hu)(W), \
      (__mmask32)(U), \
      (int)(R)))

/* Unmasked form: all-ones mask, _mm512_undefined_epi32() in the source slot. */
#define _mm512_cvtt_roundph_epu16(A, R) \
  _mm512_mask_cvtt_roundph_epu16(_mm512_undefined_epi32(), (__mmask32)(-1), \
                                 (A), (R))

/* Zero-masked form: caller's mask U, zero vector in the source slot. */
#define _mm512_maskz_cvtt_roundph_epu16(U, A, R) \
  _mm512_mask_cvtt_roundph_epu16(_mm512_setzero_epi32(), (U), (A), (R))
1820 
1821 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1822 _mm512_cvttph_epu16(__m512h __A) {
1823   return (__m512i)__builtin_ia32_vcvttph2uw512_mask(
1824       (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)-1,
1825       _MM_FROUND_CUR_DIRECTION);
1826 }
1827 
1828 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1829 _mm512_mask_cvttph_epu16(__m512i __W, __mmask32 __U, __m512h __A) {
1830   return (__m512i)__builtin_ia32_vcvttph2uw512_mask(
1831       (__v32hf)__A, (__v32hu)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
1832 }
1833 
1834 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1835 _mm512_maskz_cvttph_epu16(__mmask32 __U, __m512h __A) {
1836   return (__m512i)__builtin_ia32_vcvttph2uw512_mask(
1837       (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)__U,
1838       _MM_FROUND_CUR_DIRECTION);
1839 }
1840 
/* 512-bit __v32hu -> FP16 conversion with explicit rounding control R,
 * wrapping __builtin_ia32_vcvtuw2ph512_mask(A, source vector, 32-bit mask,
 * R). */

/* Merge-masked form: W is the source vector, U the mask. */
#define _mm512_mask_cvt_roundepu16_ph(W, U, A, R) \
  ((__m512h)__builtin_ia32_vcvtuw2ph512_mask( \
      (__v32hu)(A), \
      (__v32hf)(W), \
      (__mmask32)(U), \
      (int)(R)))

/* Unmasked form: all-ones mask, _mm512_undefined_ph() in the source slot. */
#define _mm512_cvt_roundepu16_ph(A, R) \
  _mm512_mask_cvt_roundepu16_ph(_mm512_undefined_ph(), (__mmask32)(-1), \
                                (A), (R))

/* Zero-masked form: caller's mask U, zero vector in the source slot. */
#define _mm512_maskz_cvt_roundepu16_ph(U, A, R) \
  _mm512_mask_cvt_roundepu16_ph(_mm512_setzero_ph(), (U), (A), (R))
1853 
1854 static __inline__ __m512h __DEFAULT_FN_ATTRS512
1855 _mm512_cvtepu16_ph(__m512i __A) {
1856   return (__m512h)__builtin_ia32_vcvtuw2ph512_mask(
1857       (__v32hu)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)-1,
1858       _MM_FROUND_CUR_DIRECTION);
1859 }
1860 
1861 static __inline__ __m512h __DEFAULT_FN_ATTRS512
1862 _mm512_mask_cvtepu16_ph(__m512h __W, __mmask32 __U, __m512i __A) {
1863   return (__m512h)__builtin_ia32_vcvtuw2ph512_mask(
1864       (__v32hu)__A, (__v32hf)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
1865 }
1866 
1867 static __inline__ __m512h __DEFAULT_FN_ATTRS512
1868 _mm512_maskz_cvtepu16_ph(__mmask32 __U, __m512i __A) {
1869   return (__m512h)__builtin_ia32_vcvtuw2ph512_mask(
1870       (__v32hu)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
1871       _MM_FROUND_CUR_DIRECTION);
1872 }
1873 
/* Wrappers over __builtin_ia32_vcvtph2dq512_mask that forward R as the
 * builtin's final int argument.  The plain form passes
 * _mm512_undefined_epi32() with an all-ones mask, the mask form passes W and
 * U, and the maskz form passes _mm512_setzero_epi32() with U. */
#define _mm512_cvt_roundph_epi32(A, R) \
  ((__m512i)__builtin_ia32_vcvtph2dq512_mask( \
      (__v16hf)(A), \
      (__v16si)_mm512_undefined_epi32(), \
      (__mmask16)-1, \
      (int)(R)))

#define _mm512_mask_cvt_roundph_epi32(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2dq512_mask( \
      (__v16hf)(A), \
      (__v16si)(W), \
      (__mmask16)(U), \
      (int)(R)))

#define _mm512_maskz_cvt_roundph_epi32(U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2dq512_mask( \
      (__v16hf)(A), \
      (__v16si)_mm512_setzero_epi32(), \
      (__mmask16)(U), \
      (int)(R)))
1887 
1888 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1889 _mm512_cvtph_epi32(__m256h __A) {
1890   return (__m512i)__builtin_ia32_vcvtph2dq512_mask(
1891       (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)-1,
1892       _MM_FROUND_CUR_DIRECTION);
1893 }
1894 
1895 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1896 _mm512_mask_cvtph_epi32(__m512i __W, __mmask16 __U, __m256h __A) {
1897   return (__m512i)__builtin_ia32_vcvtph2dq512_mask(
1898       (__v16hf)__A, (__v16si)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
1899 }
1900 
1901 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1902 _mm512_maskz_cvtph_epi32(__mmask16 __U, __m256h __A) {
1903   return (__m512i)__builtin_ia32_vcvtph2dq512_mask(
1904       (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)__U,
1905       _MM_FROUND_CUR_DIRECTION);
1906 }
1907 
/* Wrappers over __builtin_ia32_vcvtph2udq512_mask that forward R as the
 * builtin's final int argument.  The plain form passes
 * _mm512_undefined_epi32() with an all-ones mask, the mask form passes W and
 * U, and the maskz form passes _mm512_setzero_epi32() with U. */
#define _mm512_cvt_roundph_epu32(A, R) \
  ((__m512i)__builtin_ia32_vcvtph2udq512_mask( \
      (__v16hf)(A), \
      (__v16su)_mm512_undefined_epi32(), \
      (__mmask16)-1, \
      (int)(R)))

#define _mm512_mask_cvt_roundph_epu32(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2udq512_mask( \
      (__v16hf)(A), \
      (__v16su)(W), \
      (__mmask16)(U), \
      (int)(R)))

#define _mm512_maskz_cvt_roundph_epu32(U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2udq512_mask( \
      (__v16hf)(A), \
      (__v16su)_mm512_setzero_epi32(), \
      (__mmask16)(U), \
      (int)(R)))
1921 
1922 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1923 _mm512_cvtph_epu32(__m256h __A) {
1924   return (__m512i)__builtin_ia32_vcvtph2udq512_mask(
1925       (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)-1,
1926       _MM_FROUND_CUR_DIRECTION);
1927 }
1928 
1929 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1930 _mm512_mask_cvtph_epu32(__m512i __W, __mmask16 __U, __m256h __A) {
1931   return (__m512i)__builtin_ia32_vcvtph2udq512_mask(
1932       (__v16hf)__A, (__v16su)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
1933 }
1934 
1935 static __inline__ __m512i __DEFAULT_FN_ATTRS512
1936 _mm512_maskz_cvtph_epu32(__mmask16 __U, __m256h __A) {
1937   return (__m512i)__builtin_ia32_vcvtph2udq512_mask(
1938       (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)__U,
1939       _MM_FROUND_CUR_DIRECTION);
1940 }
1941 
/* Wrappers over __builtin_ia32_vcvtdq2ph512_mask that forward R as the
 * builtin's final int argument.  The plain form passes _mm256_undefined_ph()
 * with an all-ones mask, the mask form passes W and U, and the maskz form
 * passes _mm256_setzero_ph() with U. */
#define _mm512_cvt_roundepi32_ph(A, R) \
  ((__m256h)__builtin_ia32_vcvtdq2ph512_mask( \
      (__v16si)(A), \
      (__v16hf)_mm256_undefined_ph(), \
      (__mmask16)-1, \
      (int)(R)))

#define _mm512_mask_cvt_roundepi32_ph(W, U, A, R) \
  ((__m256h)__builtin_ia32_vcvtdq2ph512_mask( \
      (__v16si)(A), \
      (__v16hf)(W), \
      (__mmask16)(U), \
      (int)(R)))

#define _mm512_maskz_cvt_roundepi32_ph(U, A, R) \
  ((__m256h)__builtin_ia32_vcvtdq2ph512_mask( \
      (__v16si)(A), \
      (__v16hf)_mm256_setzero_ph(), \
      (__mmask16)(U), \
      (int)(R)))
1954 
1955 static __inline__ __m256h __DEFAULT_FN_ATTRS512
1956 _mm512_cvtepi32_ph(__m512i __A) {
1957   return (__m256h)__builtin_ia32_vcvtdq2ph512_mask(
1958       (__v16si)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1,
1959       _MM_FROUND_CUR_DIRECTION);
1960 }
1961 
1962 static __inline__ __m256h __DEFAULT_FN_ATTRS512
1963 _mm512_mask_cvtepi32_ph(__m256h __W, __mmask16 __U, __m512i __A) {
1964   return (__m256h)__builtin_ia32_vcvtdq2ph512_mask(
1965       (__v16si)__A, (__v16hf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
1966 }
1967 
1968 static __inline__ __m256h __DEFAULT_FN_ATTRS512
1969 _mm512_maskz_cvtepi32_ph(__mmask16 __U, __m512i __A) {
1970   return (__m256h)__builtin_ia32_vcvtdq2ph512_mask(
1971       (__v16si)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U,
1972       _MM_FROUND_CUR_DIRECTION);
1973 }
1974 
/* Wrappers over __builtin_ia32_vcvtudq2ph512_mask that forward R as the
 * builtin's final int argument.  The plain form passes _mm256_undefined_ph()
 * with an all-ones mask, the mask form passes W and U, and the maskz form
 * passes _mm256_setzero_ph() with U. */
#define _mm512_cvt_roundepu32_ph(A, R) \
  ((__m256h)__builtin_ia32_vcvtudq2ph512_mask( \
      (__v16su)(A), \
      (__v16hf)_mm256_undefined_ph(), \
      (__mmask16)-1, \
      (int)(R)))

#define _mm512_mask_cvt_roundepu32_ph(W, U, A, R) \
  ((__m256h)__builtin_ia32_vcvtudq2ph512_mask( \
      (__v16su)(A), \
      (__v16hf)(W), \
      (__mmask16)(U), \
      (int)(R)))

#define _mm512_maskz_cvt_roundepu32_ph(U, A, R) \
  ((__m256h)__builtin_ia32_vcvtudq2ph512_mask( \
      (__v16su)(A), \
      (__v16hf)_mm256_setzero_ph(), \
      (__mmask16)(U), \
      (int)(R)))
1987 
1988 static __inline__ __m256h __DEFAULT_FN_ATTRS512
1989 _mm512_cvtepu32_ph(__m512i __A) {
1990   return (__m256h)__builtin_ia32_vcvtudq2ph512_mask(
1991       (__v16su)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1,
1992       _MM_FROUND_CUR_DIRECTION);
1993 }
1994 
1995 static __inline__ __m256h __DEFAULT_FN_ATTRS512
1996 _mm512_mask_cvtepu32_ph(__m256h __W, __mmask16 __U, __m512i __A) {
1997   return (__m256h)__builtin_ia32_vcvtudq2ph512_mask(
1998       (__v16su)__A, (__v16hf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
1999 }
2000 
2001 static __inline__ __m256h __DEFAULT_FN_ATTRS512
2002 _mm512_maskz_cvtepu32_ph(__mmask16 __U, __m512i __A) {
2003   return (__m256h)__builtin_ia32_vcvtudq2ph512_mask(
2004       (__v16su)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U,
2005       _MM_FROUND_CUR_DIRECTION);
2006 }
2007 
/* Wrappers over __builtin_ia32_vcvttph2dq512_mask that forward R as the
 * builtin's final int argument.  The plain form passes
 * _mm512_undefined_epi32() with an all-ones mask, the mask form passes W and
 * U, and the maskz form passes _mm512_setzero_epi32() with U. */
#define _mm512_cvtt_roundph_epi32(A, R) \
  ((__m512i)__builtin_ia32_vcvttph2dq512_mask( \
      (__v16hf)(A), \
      (__v16si)_mm512_undefined_epi32(), \
      (__mmask16)-1, \
      (int)(R)))

#define _mm512_mask_cvtt_roundph_epi32(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvttph2dq512_mask( \
      (__v16hf)(A), \
      (__v16si)(W), \
      (__mmask16)(U), \
      (int)(R)))

#define _mm512_maskz_cvtt_roundph_epi32(U, A, R) \
  ((__m512i)__builtin_ia32_vcvttph2dq512_mask( \
      (__v16hf)(A), \
      (__v16si)_mm512_setzero_epi32(), \
      (__mmask16)(U), \
      (int)(R)))
2021 
2022 static __inline__ __m512i __DEFAULT_FN_ATTRS512
2023 _mm512_cvttph_epi32(__m256h __A) {
2024   return (__m512i)__builtin_ia32_vcvttph2dq512_mask(
2025       (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)-1,
2026       _MM_FROUND_CUR_DIRECTION);
2027 }
2028 
2029 static __inline__ __m512i __DEFAULT_FN_ATTRS512
2030 _mm512_mask_cvttph_epi32(__m512i __W, __mmask16 __U, __m256h __A) {
2031   return (__m512i)__builtin_ia32_vcvttph2dq512_mask(
2032       (__v16hf)__A, (__v16si)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
2033 }
2034 
2035 static __inline__ __m512i __DEFAULT_FN_ATTRS512
2036 _mm512_maskz_cvttph_epi32(__mmask16 __U, __m256h __A) {
2037   return (__m512i)__builtin_ia32_vcvttph2dq512_mask(
2038       (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)__U,
2039       _MM_FROUND_CUR_DIRECTION);
2040 }
2041 
/* Wrappers over __builtin_ia32_vcvttph2udq512_mask that forward R as the
 * builtin's final int argument.  The plain form passes
 * _mm512_undefined_epi32() with an all-ones mask, the mask form passes W and
 * U, and the maskz form passes _mm512_setzero_epi32() with U. */
#define _mm512_cvtt_roundph_epu32(A, R) \
  ((__m512i)__builtin_ia32_vcvttph2udq512_mask( \
      (__v16hf)(A), \
      (__v16su)_mm512_undefined_epi32(), \
      (__mmask16)-1, \
      (int)(R)))

#define _mm512_mask_cvtt_roundph_epu32(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvttph2udq512_mask( \
      (__v16hf)(A), \
      (__v16su)(W), \
      (__mmask16)(U), \
      (int)(R)))

#define _mm512_maskz_cvtt_roundph_epu32(U, A, R) \
  ((__m512i)__builtin_ia32_vcvttph2udq512_mask( \
      (__v16hf)(A), \
      (__v16su)_mm512_setzero_epi32(), \
      (__mmask16)(U), \
      (int)(R)))
2055 
2056 static __inline__ __m512i __DEFAULT_FN_ATTRS512
2057 _mm512_cvttph_epu32(__m256h __A) {
2058   return (__m512i)__builtin_ia32_vcvttph2udq512_mask(
2059       (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)-1,
2060       _MM_FROUND_CUR_DIRECTION);
2061 }
2062 
2063 static __inline__ __m512i __DEFAULT_FN_ATTRS512
2064 _mm512_mask_cvttph_epu32(__m512i __W, __mmask16 __U, __m256h __A) {
2065   return (__m512i)__builtin_ia32_vcvttph2udq512_mask(
2066       (__v16hf)__A, (__v16su)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
2067 }
2068 
2069 static __inline__ __m512i __DEFAULT_FN_ATTRS512
2070 _mm512_maskz_cvttph_epu32(__mmask16 __U, __m256h __A) {
2071   return (__m512i)__builtin_ia32_vcvttph2udq512_mask(
2072       (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)__U,
2073       _MM_FROUND_CUR_DIRECTION);
2074 }
2075 
/* Wrappers over __builtin_ia32_vcvtqq2ph512_mask that forward R as the
 * builtin's final int argument.  The plain form passes _mm_undefined_ph()
 * with an all-ones mask, the mask form passes W and U, and the maskz form
 * passes _mm_setzero_ph() with U. */
#define _mm512_cvt_roundepi64_ph(A, R) \
  ((__m128h)__builtin_ia32_vcvtqq2ph512_mask( \
      (__v8di)(A), \
      (__v8hf)_mm_undefined_ph(), \
      (__mmask8)-1, \
      (int)(R)))

#define _mm512_mask_cvt_roundepi64_ph(W, U, A, R) \
  ((__m128h)__builtin_ia32_vcvtqq2ph512_mask( \
      (__v8di)(A), \
      (__v8hf)(W), \
      (__mmask8)(U), \
      (int)(R)))

#define _mm512_maskz_cvt_roundepi64_ph(U, A, R) \
  ((__m128h)__builtin_ia32_vcvtqq2ph512_mask( \
      (__v8di)(A), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), \
      (int)(R)))
2087 
2088 static __inline__ __m128h __DEFAULT_FN_ATTRS512
2089 _mm512_cvtepi64_ph(__m512i __A) {
2090   return (__m128h)__builtin_ia32_vcvtqq2ph512_mask(
2091       (__v8di)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
2092       _MM_FROUND_CUR_DIRECTION);
2093 }
2094 
2095 static __inline__ __m128h __DEFAULT_FN_ATTRS512
2096 _mm512_mask_cvtepi64_ph(__m128h __W, __mmask8 __U, __m512i __A) {
2097   return (__m128h)__builtin_ia32_vcvtqq2ph512_mask(
2098       (__v8di)__A, (__v8hf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2099 }
2100 
2101 static __inline__ __m128h __DEFAULT_FN_ATTRS512
2102 _mm512_maskz_cvtepi64_ph(__mmask8 __U, __m512i __A) {
2103   return (__m128h)__builtin_ia32_vcvtqq2ph512_mask(
2104       (__v8di)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
2105       _MM_FROUND_CUR_DIRECTION);
2106 }
2107 
/* Wrappers over __builtin_ia32_vcvtph2qq512_mask that forward R as the
 * builtin's final int argument.  The plain form passes
 * _mm512_undefined_epi32() with an all-ones mask, the mask form passes W and
 * U, and the maskz form passes _mm512_setzero_epi32() with U. */
#define _mm512_cvt_roundph_epi64(A, R) \
  ((__m512i)__builtin_ia32_vcvtph2qq512_mask( \
      (__v8hf)(A), \
      (__v8di)_mm512_undefined_epi32(), \
      (__mmask8)-1, \
      (int)(R)))

#define _mm512_mask_cvt_roundph_epi64(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2qq512_mask( \
      (__v8hf)(A), \
      (__v8di)(W), \
      (__mmask8)(U), \
      (int)(R)))

#define _mm512_maskz_cvt_roundph_epi64(U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2qq512_mask( \
      (__v8hf)(A), \
      (__v8di)_mm512_setzero_epi32(), \
      (__mmask8)(U), \
      (int)(R)))
2120 
2121 static __inline__ __m512i __DEFAULT_FN_ATTRS512
2122 _mm512_cvtph_epi64(__m128h __A) {
2123   return (__m512i)__builtin_ia32_vcvtph2qq512_mask(
2124       (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)-1,
2125       _MM_FROUND_CUR_DIRECTION);
2126 }
2127 
2128 static __inline__ __m512i __DEFAULT_FN_ATTRS512
2129 _mm512_mask_cvtph_epi64(__m512i __W, __mmask8 __U, __m128h __A) {
2130   return (__m512i)__builtin_ia32_vcvtph2qq512_mask(
2131       (__v8hf)__A, (__v8di)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2132 }
2133 
2134 static __inline__ __m512i __DEFAULT_FN_ATTRS512
2135 _mm512_maskz_cvtph_epi64(__mmask8 __U, __m128h __A) {
2136   return (__m512i)__builtin_ia32_vcvtph2qq512_mask(
2137       (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)__U,
2138       _MM_FROUND_CUR_DIRECTION);
2139 }
2140 
/* Wrappers over __builtin_ia32_vcvtuqq2ph512_mask that forward R as the
 * builtin's final int argument.  The plain form passes _mm_undefined_ph()
 * with an all-ones mask, the mask form passes W and U, and the maskz form
 * passes _mm_setzero_ph() with U. */
#define _mm512_cvt_roundepu64_ph(A, R) \
  ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask( \
      (__v8du)(A), \
      (__v8hf)_mm_undefined_ph(), \
      (__mmask8)-1, \
      (int)(R)))

#define _mm512_mask_cvt_roundepu64_ph(W, U, A, R) \
  ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask( \
      (__v8du)(A), \
      (__v8hf)(W), \
      (__mmask8)(U), \
      (int)(R)))

#define _mm512_maskz_cvt_roundepu64_ph(U, A, R) \
  ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask( \
      (__v8du)(A), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), \
      (int)(R)))
2152 
2153 static __inline__ __m128h __DEFAULT_FN_ATTRS512
2154 _mm512_cvtepu64_ph(__m512i __A) {
2155   return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask(
2156       (__v8du)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
2157       _MM_FROUND_CUR_DIRECTION);
2158 }
2159 
2160 static __inline__ __m128h __DEFAULT_FN_ATTRS512
2161 _mm512_mask_cvtepu64_ph(__m128h __W, __mmask8 __U, __m512i __A) {
2162   return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask(
2163       (__v8du)__A, (__v8hf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2164 }
2165 
2166 static __inline__ __m128h __DEFAULT_FN_ATTRS512
2167 _mm512_maskz_cvtepu64_ph(__mmask8 __U, __m512i __A) {
2168   return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask(
2169       (__v8du)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
2170       _MM_FROUND_CUR_DIRECTION);
2171 }
2172 
/* Wrappers over __builtin_ia32_vcvtph2uqq512_mask that forward R as the
 * builtin's final int argument.  The plain form passes
 * _mm512_undefined_epi32() with an all-ones mask, the mask form passes W and
 * U, and the maskz form passes _mm512_setzero_epi32() with U. */
#define _mm512_cvt_roundph_epu64(A, R) \
  ((__m512i)__builtin_ia32_vcvtph2uqq512_mask( \
      (__v8hf)(A), \
      (__v8du)_mm512_undefined_epi32(), \
      (__mmask8)-1, \
      (int)(R)))

#define _mm512_mask_cvt_roundph_epu64(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2uqq512_mask( \
      (__v8hf)(A), \
      (__v8du)(W), \
      (__mmask8)(U), \
      (int)(R)))

#define _mm512_maskz_cvt_roundph_epu64(U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2uqq512_mask( \
      (__v8hf)(A), \
      (__v8du)_mm512_setzero_epi32(), \
      (__mmask8)(U), \
      (int)(R)))
2185 
2186 static __inline__ __m512i __DEFAULT_FN_ATTRS512
2187 _mm512_cvtph_epu64(__m128h __A) {
2188   return (__m512i)__builtin_ia32_vcvtph2uqq512_mask(
2189       (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)-1,
2190       _MM_FROUND_CUR_DIRECTION);
2191 }
2192 
2193 static __inline__ __m512i __DEFAULT_FN_ATTRS512
2194 _mm512_mask_cvtph_epu64(__m512i __W, __mmask8 __U, __m128h __A) {
2195   return (__m512i)__builtin_ia32_vcvtph2uqq512_mask(
2196       (__v8hf)__A, (__v8du)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2197 }
2198 
2199 static __inline__ __m512i __DEFAULT_FN_ATTRS512
2200 _mm512_maskz_cvtph_epu64(__mmask8 __U, __m128h __A) {
2201   return (__m512i)__builtin_ia32_vcvtph2uqq512_mask(
2202       (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)__U,
2203       _MM_FROUND_CUR_DIRECTION);
2204 }
2205 
/* Wrappers over __builtin_ia32_vcvttph2qq512_mask that forward R as the
 * builtin's final int argument.  The plain form passes
 * _mm512_undefined_epi32() with an all-ones mask, the mask form passes W and
 * U, and the maskz form passes _mm512_setzero_epi32() with U. */
#define _mm512_cvtt_roundph_epi64(A, R) \
  ((__m512i)__builtin_ia32_vcvttph2qq512_mask( \
      (__v8hf)(A), \
      (__v8di)_mm512_undefined_epi32(), \
      (__mmask8)-1, \
      (int)(R)))

#define _mm512_mask_cvtt_roundph_epi64(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvttph2qq512_mask( \
      (__v8hf)(A), \
      (__v8di)(W), \
      (__mmask8)(U), \
      (int)(R)))

#define _mm512_maskz_cvtt_roundph_epi64(U, A, R) \
  ((__m512i)__builtin_ia32_vcvttph2qq512_mask( \
      (__v8hf)(A), \
      (__v8di)_mm512_setzero_epi32(), \
      (__mmask8)(U), \
      (int)(R)))
2218 
2219 static __inline__ __m512i __DEFAULT_FN_ATTRS512
2220 _mm512_cvttph_epi64(__m128h __A) {
2221   return (__m512i)__builtin_ia32_vcvttph2qq512_mask(
2222       (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)-1,
2223       _MM_FROUND_CUR_DIRECTION);
2224 }
2225 
2226 static __inline__ __m512i __DEFAULT_FN_ATTRS512
2227 _mm512_mask_cvttph_epi64(__m512i __W, __mmask8 __U, __m128h __A) {
2228   return (__m512i)__builtin_ia32_vcvttph2qq512_mask(
2229       (__v8hf)__A, (__v8di)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2230 }
2231 
2232 static __inline__ __m512i __DEFAULT_FN_ATTRS512
2233 _mm512_maskz_cvttph_epi64(__mmask8 __U, __m128h __A) {
2234   return (__m512i)__builtin_ia32_vcvttph2qq512_mask(
2235       (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)__U,
2236       _MM_FROUND_CUR_DIRECTION);
2237 }
2238 
/* Wrappers over __builtin_ia32_vcvttph2uqq512_mask that forward R as the
 * builtin's final int argument.  The plain form passes
 * _mm512_undefined_epi32() with an all-ones mask, the mask form passes W and
 * U, and the maskz form passes _mm512_setzero_epi32() with U. */
#define _mm512_cvtt_roundph_epu64(A, R) \
  ((__m512i)__builtin_ia32_vcvttph2uqq512_mask( \
      (__v8hf)(A), \
      (__v8du)_mm512_undefined_epi32(), \
      (__mmask8)-1, \
      (int)(R)))

#define _mm512_mask_cvtt_roundph_epu64(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvttph2uqq512_mask( \
      (__v8hf)(A), \
      (__v8du)(W), \
      (__mmask8)(U), \
      (int)(R)))

#define _mm512_maskz_cvtt_roundph_epu64(U, A, R) \
  ((__m512i)__builtin_ia32_vcvttph2uqq512_mask( \
      (__v8hf)(A), \
      (__v8du)_mm512_setzero_epi32(), \
      (__mmask8)(U), \
      (int)(R)))
2251 
2252 static __inline__ __m512i __DEFAULT_FN_ATTRS512
2253 _mm512_cvttph_epu64(__m128h __A) {
2254   return (__m512i)__builtin_ia32_vcvttph2uqq512_mask(
2255       (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)-1,
2256       _MM_FROUND_CUR_DIRECTION);
2257 }
2258 
2259 static __inline__ __m512i __DEFAULT_FN_ATTRS512
2260 _mm512_mask_cvttph_epu64(__m512i __W, __mmask8 __U, __m128h __A) {
2261   return (__m512i)__builtin_ia32_vcvttph2uqq512_mask(
2262       (__v8hf)__A, (__v8du)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2263 }
2264 
2265 static __inline__ __m512i __DEFAULT_FN_ATTRS512
2266 _mm512_maskz_cvttph_epu64(__mmask8 __U, __m128h __A) {
2267   return (__m512i)__builtin_ia32_vcvttph2uqq512_mask(
2268       (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)__U,
2269       _MM_FROUND_CUR_DIRECTION);
2270 }
2271 
/* Passes A (cast to __v8hf) and R (cast to int) to
 * __builtin_ia32_vcvtsh2si32 and yields the result as int. */
#define _mm_cvt_roundsh_i32(A, R) \
  ((int)__builtin_ia32_vcvtsh2si32( \
      (__v8hf)(A), \
      (int)(R)))
2274 
2275 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_cvtsh_i32(__m128h __A) {
2276   return (int)__builtin_ia32_vcvtsh2si32((__v8hf)__A, _MM_FROUND_CUR_DIRECTION);
2277 }
2278 
/* Passes A (cast to __v8hf) and R (cast to int) to
 * __builtin_ia32_vcvtsh2usi32 and yields the result as unsigned int. */
#define _mm_cvt_roundsh_u32(A, R) \
  ((unsigned int)__builtin_ia32_vcvtsh2usi32( \
      (__v8hf)(A), \
      (int)(R)))
2281 
2282 static __inline__ unsigned int __DEFAULT_FN_ATTRS128
2283 _mm_cvtsh_u32(__m128h __A) {
2284   return (unsigned int)__builtin_ia32_vcvtsh2usi32((__v8hf)__A,
2285                                                    _MM_FROUND_CUR_DIRECTION);
2286 }
2287 
2288 #ifdef __x86_64__
2289 #define _mm_cvt_roundsh_i64(A, R)                                              \
2290   ((long long)__builtin_ia32_vcvtsh2si64((__v8hf)(A), (int)(R)))
2291 
2292 static __inline__ long long __DEFAULT_FN_ATTRS128 _mm_cvtsh_i64(__m128h __A) {
2293   return (long long)__builtin_ia32_vcvtsh2si64((__v8hf)__A,
2294                                                _MM_FROUND_CUR_DIRECTION);
2295 }
2296 
2297 #define _mm_cvt_roundsh_u64(A, R)                                              \
2298   ((unsigned long long)__builtin_ia32_vcvtsh2usi64((__v8hf)(A), (int)(R)))
2299 
2300 static __inline__ unsigned long long __DEFAULT_FN_ATTRS128
2301 _mm_cvtsh_u64(__m128h __A) {
2302   return (unsigned long long)__builtin_ia32_vcvtsh2usi64(
2303       (__v8hf)__A, _MM_FROUND_CUR_DIRECTION);
2304 }
2305 #endif // __x86_64__
2306 
/* Passes A (cast to __v8hf), B (cast to unsigned int) and R (cast to int) to
 * __builtin_ia32_vcvtusi2sh and yields the result as __m128h. */
#define _mm_cvt_roundu32_sh(A, B, R) \
  ((__m128h)__builtin_ia32_vcvtusi2sh( \
      (__v8hf)(A), \
      (unsigned int)(B), \
      (int)(R)))
2309 
2310 static __inline__ __m128h __DEFAULT_FN_ATTRS128
2311 _mm_cvtu32_sh(__m128h __A, unsigned int __B) {
2312   __A[0] = __B;
2313   return __A;
2314 }
2315 
2316 #ifdef __x86_64__
2317 #define _mm_cvt_roundu64_sh(A, B, R)                                           \
2318   ((__m128h)__builtin_ia32_vcvtusi642sh((__v8hf)(A), (unsigned long long)(B),  \
2319                                         (int)(R)))
2320 
2321 static __inline__ __m128h __DEFAULT_FN_ATTRS128
2322 _mm_cvtu64_sh(__m128h __A, unsigned long long __B) {
2323   __A[0] = __B;
2324   return __A;
2325 }
2326 #endif
2327 
/* Passes A (cast to __v8hf), B (cast to int) and R (cast to int) to
 * __builtin_ia32_vcvtsi2sh and yields the result as __m128h. */
#define _mm_cvt_roundi32_sh(A, B, R) \
  ((__m128h)__builtin_ia32_vcvtsi2sh( \
      (__v8hf)(A), \
      (int)(B), \
      (int)(R)))
2330 
2331 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvti32_sh(__m128h __A,
2332                                                               int __B) {
2333   __A[0] = __B;
2334   return __A;
2335 }
2336 
2337 #ifdef __x86_64__
2338 #define _mm_cvt_roundi64_sh(A, B, R)                                           \
2339   ((__m128h)__builtin_ia32_vcvtsi642sh((__v8hf)(A), (long long)(B), (int)(R)))
2340 
2341 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvti64_sh(__m128h __A,
2342                                                               long long __B) {
2343   __A[0] = __B;
2344   return __A;
2345 }
2346 #endif
2347 
/* Passes A (cast to __v8hf) and R (cast to int) to
 * __builtin_ia32_vcvttsh2si32 and yields the result as int. */
#define _mm_cvtt_roundsh_i32(A, R) \
  ((int)__builtin_ia32_vcvttsh2si32( \
      (__v8hf)(A), \
      (int)(R)))
2350 
2351 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_cvttsh_i32(__m128h __A) {
2352   return (int)__builtin_ia32_vcvttsh2si32((__v8hf)__A,
2353                                           _MM_FROUND_CUR_DIRECTION);
2354 }
2355 
2356 #ifdef __x86_64__
2357 #define _mm_cvtt_roundsh_i64(A, R)                                             \
2358   ((long long)__builtin_ia32_vcvttsh2si64((__v8hf)(A), (int)(R)))
2359 
2360 static __inline__ long long __DEFAULT_FN_ATTRS128 _mm_cvttsh_i64(__m128h __A) {
2361   return (long long)__builtin_ia32_vcvttsh2si64((__v8hf)__A,
2362                                                 _MM_FROUND_CUR_DIRECTION);
2363 }
2364 #endif
2365 
/* Passes A (cast to __v8hf) and R (cast to int) to
 * __builtin_ia32_vcvttsh2usi32 and yields the result as unsigned int. */
#define _mm_cvtt_roundsh_u32(A, R) \
  ((unsigned int)__builtin_ia32_vcvttsh2usi32( \
      (__v8hf)(A), \
      (int)(R)))
2368 
2369 static __inline__ unsigned int __DEFAULT_FN_ATTRS128
2370 _mm_cvttsh_u32(__m128h __A) {
2371   return (unsigned int)__builtin_ia32_vcvttsh2usi32((__v8hf)__A,
2372                                                     _MM_FROUND_CUR_DIRECTION);
2373 }
2374 
2375 #ifdef __x86_64__
2376 #define _mm_cvtt_roundsh_u64(A, R)                                             \
2377   ((unsigned long long)__builtin_ia32_vcvttsh2usi64((__v8hf)(A), (int)(R)))
2378 
2379 static __inline__ unsigned long long __DEFAULT_FN_ATTRS128
2380 _mm_cvttsh_u64(__m128h __A) {
2381   return (unsigned long long)__builtin_ia32_vcvttsh2usi64(
2382       (__v8hf)__A, _MM_FROUND_CUR_DIRECTION);
2383 }
2384 #endif
2385 
/* Wrappers over __builtin_ia32_vcvtph2psx512_mask that forward R as the
 * builtin's final int argument.  The plain form passes _mm512_undefined_ps()
 * with an all-ones mask, the mask form passes W and U, and the maskz form
 * passes _mm512_setzero_ps() with U. */
#define _mm512_cvtx_roundph_ps(A, R) \
  ((__m512)__builtin_ia32_vcvtph2psx512_mask( \
      (__v16hf)(A), \
      (__v16sf)_mm512_undefined_ps(), \
      (__mmask16)-1, \
      (int)(R)))

#define _mm512_mask_cvtx_roundph_ps(W, U, A, R) \
  ((__m512)__builtin_ia32_vcvtph2psx512_mask( \
      (__v16hf)(A), \
      (__v16sf)(W), \
      (__mmask16)(U), \
      (int)(R)))

#define _mm512_maskz_cvtx_roundph_ps(U, A, R) \
  ((__m512)__builtin_ia32_vcvtph2psx512_mask( \
      (__v16hf)(A), \
      (__v16sf)_mm512_setzero_ps(), \
      (__mmask16)(U), \
      (int)(R)))
2398 
2399 static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_cvtxph_ps(__m256h __A) {
2400   return (__m512)__builtin_ia32_vcvtph2psx512_mask(
2401       (__v16hf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)-1,
2402       _MM_FROUND_CUR_DIRECTION);
2403 }
2404 
2405 static __inline__ __m512 __DEFAULT_FN_ATTRS512
2406 _mm512_mask_cvtxph_ps(__m512 __W, __mmask16 __U, __m256h __A) {
2407   return (__m512)__builtin_ia32_vcvtph2psx512_mask(
2408       (__v16hf)__A, (__v16sf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
2409 }
2410 
2411 static __inline__ __m512 __DEFAULT_FN_ATTRS512
2412 _mm512_maskz_cvtxph_ps(__mmask16 __U, __m256h __A) {
2413   return (__m512)__builtin_ia32_vcvtph2psx512_mask(
2414       (__v16hf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U,
2415       _MM_FROUND_CUR_DIRECTION);
2416 }
2417 
/* Wrappers over __builtin_ia32_vcvtps2phx512_mask that forward R as the
 * builtin's final int argument.  The plain form passes _mm256_undefined_ph()
 * with an all-ones mask, the mask form passes W and U, and the maskz form
 * passes _mm256_setzero_ph() with U. */
#define _mm512_cvtx_roundps_ph(A, R) \
  ((__m256h)__builtin_ia32_vcvtps2phx512_mask( \
      (__v16sf)(A), \
      (__v16hf)_mm256_undefined_ph(), \
      (__mmask16)-1, \
      (int)(R)))

#define _mm512_mask_cvtx_roundps_ph(W, U, A, R) \
  ((__m256h)__builtin_ia32_vcvtps2phx512_mask( \
      (__v16sf)(A), \
      (__v16hf)(W), \
      (__mmask16)(U), \
      (int)(R)))

#define _mm512_maskz_cvtx_roundps_ph(U, A, R) \
  ((__m256h)__builtin_ia32_vcvtps2phx512_mask( \
      (__v16sf)(A), \
      (__v16hf)_mm256_setzero_ph(), \
      (__mmask16)(U), \
      (int)(R)))
2430 
2431 static __inline__ __m256h __DEFAULT_FN_ATTRS512 _mm512_cvtxps_ph(__m512 __A) {
2432   return (__m256h)__builtin_ia32_vcvtps2phx512_mask(
2433       (__v16sf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1,
2434       _MM_FROUND_CUR_DIRECTION);
2435 }
2436 
2437 static __inline__ __m256h __DEFAULT_FN_ATTRS512
2438 _mm512_mask_cvtxps_ph(__m256h __W, __mmask16 __U, __m512 __A) {
2439   return (__m256h)__builtin_ia32_vcvtps2phx512_mask(
2440       (__v16sf)__A, (__v16hf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
2441 }
2442 
2443 static __inline__ __m256h __DEFAULT_FN_ATTRS512
2444 _mm512_maskz_cvtxps_ph(__mmask16 __U, __m512 __A) {
2445   return (__m256h)__builtin_ia32_vcvtps2phx512_mask(
2446       (__v16sf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U,
2447       _MM_FROUND_CUR_DIRECTION);
2448 }
2449 
/* fmadd with an explicit R argument.  Each macro casts A, B and C through
 * __m512h to __v32hf and forwards them unchanged, followed by a mask and
 * (int)(R).  The plain form (all-ones mask) and the mask form (U) use
 * __builtin_ia32_vfmaddph512_mask; mask3 and maskz use the _mask3 and _maskz
 * builtins respectively. */
#define _mm512_fmadd_round_ph(A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_mask( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      (__v32hf)(__m512h)(C), \
      (__mmask32)(-1), \
      (int)(R)))

#define _mm512_mask_fmadd_round_ph(A, U, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_mask( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      (__v32hf)(__m512h)(C), \
      (__mmask32)(U), \
      (int)(R)))

#define _mm512_mask3_fmadd_round_ph(A, B, C, U, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_mask3( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      (__v32hf)(__m512h)(C), \
      (__mmask32)(U), \
      (int)(R)))

#define _mm512_maskz_fmadd_round_ph(U, A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_maskz( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      (__v32hf)(__m512h)(C), \
      (__mmask32)(U), \
      (int)(R)))
2469 
/* fmsub with an explicit R argument, expressed through the vfmaddph512
 * builtins by negating the C operand.  The plain form (all-ones mask) and the
 * mask form (U) use __builtin_ia32_vfmaddph512_mask; the maskz form uses
 * __builtin_ia32_vfmaddph512_maskz. */
#define _mm512_fmsub_round_ph(A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_mask( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      -(__v32hf)(__m512h)(C), \
      (__mmask32)(-1), \
      (int)(R)))

#define _mm512_mask_fmsub_round_ph(A, U, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_mask( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      -(__v32hf)(__m512h)(C), \
      (__mmask32)(U), \
      (int)(R)))

#define _mm512_maskz_fmsub_round_ph(U, A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_maskz( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      -(__v32hf)(__m512h)(C), \
      (__mmask32)(U), \
      (int)(R)))
2484 
/* Expands to __builtin_ia32_vfmaddph512_mask with B negated, an all-ones
 * mask, and R forwarded as an int. */
#define _mm512_fnmadd_round_ph(A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_mask( \
      (__v32hf)(__m512h)(A), \
      -(__v32hf)(__m512h)(B), \
      (__v32hf)(__m512h)(C), \
      (__mmask32)-1, \
      (int)(R)))
2489 
/* Expands to __builtin_ia32_vfmaddph512_mask3 with A negated, mask U, and R
 * forwarded as an int. */
#define _mm512_mask3_fnmadd_round_ph(A, B, C, U, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_mask3( \
      -(__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      (__v32hf)(__m512h)(C), \
      (__mmask32)(U), \
      (int)(R)))
2494 
/* Expands to __builtin_ia32_vfmaddph512_maskz with A negated, mask U, and R
 * forwarded as an int. */
#define _mm512_maskz_fnmadd_round_ph(U, A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_maskz( \
      -(__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      (__v32hf)(__m512h)(C), \
      (__mmask32)(U), \
      (int)(R)))
2499 
/* Expands to __builtin_ia32_vfmaddph512_mask with B and C negated, an
 * all-ones mask, and R forwarded as an int. */
#define _mm512_fnmsub_round_ph(A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_mask( \
      (__v32hf)(__m512h)(A), \
      -(__v32hf)(__m512h)(B), \
      -(__v32hf)(__m512h)(C), \
      (__mmask32)-1, \
      (int)(R)))
2504 
/* Expands to __builtin_ia32_vfmaddph512_maskz with A and C negated, mask U,
 * and R forwarded as an int. */
#define _mm512_maskz_fnmsub_round_ph(U, A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_maskz( \
      -(__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      -(__v32hf)(__m512h)(C), \
      (__mmask32)(U), \
      (int)(R)))
2509 
2510 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmadd_ph(__m512h __A,
2511                                                                 __m512h __B,
2512                                                                 __m512h __C) {
2513   return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
2514                                                   (__v32hf)__C, (__mmask32)-1,
2515                                                   _MM_FROUND_CUR_DIRECTION);
2516 }
2517 
2518 static __inline__ __m512h __DEFAULT_FN_ATTRS512
2519 _mm512_mask_fmadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
2520   return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
2521                                                   (__v32hf)__C, (__mmask32)__U,
2522                                                   _MM_FROUND_CUR_DIRECTION);
2523 }
2524 
2525 static __inline__ __m512h __DEFAULT_FN_ATTRS512
2526 _mm512_mask3_fmadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
2527   return (__m512h)__builtin_ia32_vfmaddph512_mask3((__v32hf)__A, (__v32hf)__B,
2528                                                    (__v32hf)__C, (__mmask32)__U,
2529                                                    _MM_FROUND_CUR_DIRECTION);
2530 }
2531 
2532 static __inline__ __m512h __DEFAULT_FN_ATTRS512
2533 _mm512_maskz_fmadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2534   return (__m512h)__builtin_ia32_vfmaddph512_maskz((__v32hf)__A, (__v32hf)__B,
2535                                                    (__v32hf)__C, (__mmask32)__U,
2536                                                    _MM_FROUND_CUR_DIRECTION);
2537 }
2538 
2539 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmsub_ph(__m512h __A,
2540                                                                 __m512h __B,
2541                                                                 __m512h __C) {
2542   return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
2543                                                   -(__v32hf)__C, (__mmask32)-1,
2544                                                   _MM_FROUND_CUR_DIRECTION);
2545 }
2546 
2547 static __inline__ __m512h __DEFAULT_FN_ATTRS512
2548 _mm512_mask_fmsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
2549   return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
2550                                                   -(__v32hf)__C, (__mmask32)__U,
2551                                                   _MM_FROUND_CUR_DIRECTION);
2552 }
2553 
2554 static __inline__ __m512h __DEFAULT_FN_ATTRS512
2555 _mm512_maskz_fmsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2556   return (__m512h)__builtin_ia32_vfmaddph512_maskz(
2557       (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
2558       _MM_FROUND_CUR_DIRECTION);
2559 }
2560 
2561 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fnmadd_ph(__m512h __A,
2562                                                                  __m512h __B,
2563                                                                  __m512h __C) {
2564   return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
2565                                                   (__v32hf)__C, (__mmask32)-1,
2566                                                   _MM_FROUND_CUR_DIRECTION);
2567 }
2568 
2569 static __inline__ __m512h __DEFAULT_FN_ATTRS512
2570 _mm512_mask3_fnmadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
2571   return (__m512h)__builtin_ia32_vfmaddph512_mask3(-(__v32hf)__A, (__v32hf)__B,
2572                                                    (__v32hf)__C, (__mmask32)__U,
2573                                                    _MM_FROUND_CUR_DIRECTION);
2574 }
2575 
2576 static __inline__ __m512h __DEFAULT_FN_ATTRS512
2577 _mm512_maskz_fnmadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2578   return (__m512h)__builtin_ia32_vfmaddph512_maskz(-(__v32hf)__A, (__v32hf)__B,
2579                                                    (__v32hf)__C, (__mmask32)__U,
2580                                                    _MM_FROUND_CUR_DIRECTION);
2581 }
2582 
2583 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fnmsub_ph(__m512h __A,
2584                                                                  __m512h __B,
2585                                                                  __m512h __C) {
2586   return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
2587                                                   -(__v32hf)__C, (__mmask32)-1,
2588                                                   _MM_FROUND_CUR_DIRECTION);
2589 }
2590 
2591 static __inline__ __m512h __DEFAULT_FN_ATTRS512
2592 _mm512_maskz_fnmsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2593   return (__m512h)__builtin_ia32_vfmaddph512_maskz(
2594       -(__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
2595       _MM_FROUND_CUR_DIRECTION);
2596 }
2597 
/* Expands to __builtin_ia32_vfmaddsubph512_mask on A, B, C (each cast to
 * __v32hf) with an all-ones mask and R forwarded as an int. */
#define _mm512_fmaddsub_round_ph(A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddsubph512_mask( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      (__v32hf)(__m512h)(C), \
      (__mmask32)-1, \
      (int)(R)))
2602 
/* Expands to __builtin_ia32_vfmaddsubph512_mask on A, B, C (each cast to
 * __v32hf) with mask U and R forwarded as an int. */
#define _mm512_mask_fmaddsub_round_ph(A, U, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddsubph512_mask( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      (__v32hf)(__m512h)(C), \
      (__mmask32)(U), \
      (int)(R)))
2607 
/* Expands to __builtin_ia32_vfmaddsubph512_mask3 on A, B, C (each cast to
 * __v32hf) with mask U and R forwarded as an int. */
#define _mm512_mask3_fmaddsub_round_ph(A, B, C, U, R) \
  ((__m512h)__builtin_ia32_vfmaddsubph512_mask3( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      (__v32hf)(__m512h)(C), \
      (__mmask32)(U), \
      (int)(R)))
2612 
/* Expands to __builtin_ia32_vfmaddsubph512_maskz on A, B, C (each cast to
 * __v32hf) with mask U and R forwarded as an int. */
#define _mm512_maskz_fmaddsub_round_ph(U, A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddsubph512_maskz( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      (__v32hf)(__m512h)(C), \
      (__mmask32)(U), \
      (int)(R)))
2617 
/* Expands to __builtin_ia32_vfmaddsubph512_mask with C negated, an all-ones
 * mask, and R forwarded as an int. */
#define _mm512_fmsubadd_round_ph(A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddsubph512_mask( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      -(__v32hf)(__m512h)(C), \
      (__mmask32)-1, \
      (int)(R)))
2622 
/* Expands to __builtin_ia32_vfmaddsubph512_mask with C negated, mask U, and
 * R forwarded as an int. */
#define _mm512_mask_fmsubadd_round_ph(A, U, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddsubph512_mask( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      -(__v32hf)(__m512h)(C), \
      (__mmask32)(U), \
      (int)(R)))
2627 
/* Expands to __builtin_ia32_vfmaddsubph512_maskz with C negated, mask U, and
 * R forwarded as an int. */
#define _mm512_maskz_fmsubadd_round_ph(U, A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddsubph512_maskz( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      -(__v32hf)(__m512h)(C), \
      (__mmask32)(U), \
      (int)(R)))
2632 
2633 static __inline__ __m512h __DEFAULT_FN_ATTRS512
2634 _mm512_fmaddsub_ph(__m512h __A, __m512h __B, __m512h __C) {
2635   return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
2636       (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)-1,
2637       _MM_FROUND_CUR_DIRECTION);
2638 }
2639 
2640 static __inline__ __m512h __DEFAULT_FN_ATTRS512
2641 _mm512_mask_fmaddsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
2642   return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
2643       (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
2644       _MM_FROUND_CUR_DIRECTION);
2645 }
2646 
2647 static __inline__ __m512h __DEFAULT_FN_ATTRS512
2648 _mm512_mask3_fmaddsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
2649   return (__m512h)__builtin_ia32_vfmaddsubph512_mask3(
2650       (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
2651       _MM_FROUND_CUR_DIRECTION);
2652 }
2653 
2654 static __inline__ __m512h __DEFAULT_FN_ATTRS512
2655 _mm512_maskz_fmaddsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2656   return (__m512h)__builtin_ia32_vfmaddsubph512_maskz(
2657       (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
2658       _MM_FROUND_CUR_DIRECTION);
2659 }
2660 
2661 static __inline__ __m512h __DEFAULT_FN_ATTRS512
2662 _mm512_fmsubadd_ph(__m512h __A, __m512h __B, __m512h __C) {
2663   return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
2664       (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)-1,
2665       _MM_FROUND_CUR_DIRECTION);
2666 }
2667 
2668 static __inline__ __m512h __DEFAULT_FN_ATTRS512
2669 _mm512_mask_fmsubadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
2670   return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
2671       (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
2672       _MM_FROUND_CUR_DIRECTION);
2673 }
2674 
2675 static __inline__ __m512h __DEFAULT_FN_ATTRS512
2676 _mm512_maskz_fmsubadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2677   return (__m512h)__builtin_ia32_vfmaddsubph512_maskz(
2678       (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
2679       _MM_FROUND_CUR_DIRECTION);
2680 }
2681 
/* Expands to __builtin_ia32_vfmsubph512_mask3 on A, B, C (each cast to
 * __v32hf) with mask U and R forwarded as an int. */
#define _mm512_mask3_fmsub_round_ph(A, B, C, U, R) \
  ((__m512h)__builtin_ia32_vfmsubph512_mask3( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      (__v32hf)(__m512h)(C), \
      (__mmask32)(U), \
      (int)(R)))
2686 
2687 static __inline__ __m512h __DEFAULT_FN_ATTRS512
2688 _mm512_mask3_fmsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
2689   return (__m512h)__builtin_ia32_vfmsubph512_mask3((__v32hf)__A, (__v32hf)__B,
2690                                                    (__v32hf)__C, (__mmask32)__U,
2691                                                    _MM_FROUND_CUR_DIRECTION);
2692 }
2693 
/* Expands to __builtin_ia32_vfmsubaddph512_mask3 on A, B, C (each cast to
 * __v32hf) with mask U and R forwarded as an int. */
#define _mm512_mask3_fmsubadd_round_ph(A, B, C, U, R) \
  ((__m512h)__builtin_ia32_vfmsubaddph512_mask3( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      (__v32hf)(__m512h)(C), \
      (__mmask32)(U), \
      (int)(R)))
2698 
2699 static __inline__ __m512h __DEFAULT_FN_ATTRS512
2700 _mm512_mask3_fmsubadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
2701   return (__m512h)__builtin_ia32_vfmsubaddph512_mask3(
2702       (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
2703       _MM_FROUND_CUR_DIRECTION);
2704 }
2705 
/* Expands to __builtin_ia32_vfmaddph512_mask with B negated, mask U, and R
 * forwarded as an int. */
#define _mm512_mask_fnmadd_round_ph(A, U, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_mask( \
      (__v32hf)(__m512h)(A), \
      -(__v32hf)(__m512h)(B), \
      (__v32hf)(__m512h)(C), \
      (__mmask32)(U), \
      (int)(R)))
2710 
2711 static __inline__ __m512h __DEFAULT_FN_ATTRS512
2712 _mm512_mask_fnmadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
2713   return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
2714                                                   (__v32hf)__C, (__mmask32)__U,
2715                                                   _MM_FROUND_CUR_DIRECTION);
2716 }
2717 
/* Expands to __builtin_ia32_vfmaddph512_mask with B and C negated, mask U,
 * and R forwarded as an int. */
#define _mm512_mask_fnmsub_round_ph(A, U, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_mask( \
      (__v32hf)(__m512h)(A), \
      -(__v32hf)(__m512h)(B), \
      -(__v32hf)(__m512h)(C), \
      (__mmask32)(U), \
      (int)(R)))
2722 
/* Expands to __builtin_ia32_vfmsubph512_mask3 with A negated, mask U, and R
 * forwarded as an int. */
#define _mm512_mask3_fnmsub_round_ph(A, B, C, U, R) \
  ((__m512h)__builtin_ia32_vfmsubph512_mask3( \
      -(__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      (__v32hf)(__m512h)(C), \
      (__mmask32)(U), \
      (int)(R)))
2727 
2728 static __inline__ __m512h __DEFAULT_FN_ATTRS512
2729 _mm512_mask_fnmsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
2730   return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
2731                                                   -(__v32hf)__C, (__mmask32)__U,
2732                                                   _MM_FROUND_CUR_DIRECTION);
2733 }
2734 
2735 static __inline__ __m512h __DEFAULT_FN_ATTRS512
2736 _mm512_mask3_fnmsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
2737   return (__m512h)__builtin_ia32_vfmsubph512_mask3(-(__v32hf)__A, (__v32hf)__B,
2738                                                    (__v32hf)__C, (__mmask32)__U,
2739                                                    _MM_FROUND_CUR_DIRECTION);
2740 }
2741 
2742 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_sh(__m128h __W,
2743                                                              __m128h __A,
2744                                                              __m128h __B) {
2745   return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A, (__v8hf)__B,
2746                                        (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
2747 }
2748 
2749 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmadd_sh(__m128h __W,
2750                                                                   __mmask8 __U,
2751                                                                   __m128h __A,
2752                                                                   __m128h __B) {
2753   return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A, (__v8hf)__B,
2754                                        (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2755 }
2756 
/* Expands to __builtin_ia32_vfmaddsh3_mask on A, B, C (each cast to __v8hf)
 * with an all-ones mask and R forwarded as an int. */
#define _mm_fmadd_round_sh(A, B, C, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)(__m128h)(C), \
      (__mmask8)-1, \
      (int)(R)))
2761 
/* Expands to __builtin_ia32_vfmaddsh3_mask on W, A, B (each cast to __v8hf)
 * with mask U and R forwarded as an int. */
#define _mm_mask_fmadd_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
      (__v8hf)(__m128h)(W), \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__mmask8)(U), \
      (int)(R)))
2766 
2767 static __inline__ __m128h __DEFAULT_FN_ATTRS128
2768 _mm_maskz_fmadd_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2769   return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, (__v8hf)__B, (__v8hf)__C,
2770                                         (__mmask8)__U,
2771                                         _MM_FROUND_CUR_DIRECTION);
2772 }
2773 
/* Expands to __builtin_ia32_vfmaddsh3_maskz on A, B, C (each cast to __v8hf)
 * with mask U and R forwarded as an int. */
#define _mm_maskz_fmadd_round_sh(U, A, B, C, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)(__m128h)(C), \
      (__mmask8)(U), \
      (int)(R)))
2778 
2779 static __inline__ __m128h __DEFAULT_FN_ATTRS128
2780 _mm_mask3_fmadd_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
2781   return __builtin_ia32_vfmaddsh3_mask3((__v8hf)__W, (__v8hf)__X, (__v8hf)__Y,
2782                                         (__mmask8)__U,
2783                                         _MM_FROUND_CUR_DIRECTION);
2784 }
2785 
/* Expands to __builtin_ia32_vfmaddsh3_mask3 on W, X, Y (each cast to __v8hf)
 * with mask U and R forwarded as an int. */
#define _mm_mask3_fmadd_round_sh(W, X, Y, U, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask3( \
      (__v8hf)(__m128h)(W), \
      (__v8hf)(__m128h)(X), \
      (__v8hf)(__m128h)(Y), \
      (__mmask8)(U), \
      (int)(R)))
2790 
2791 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmsub_sh(__m128h __W,
2792                                                              __m128h __A,
2793                                                              __m128h __B) {
2794   return (__m128h)__builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A,
2795                                                 -(__v8hf)__B, (__mmask8)-1,
2796                                                 _MM_FROUND_CUR_DIRECTION);
2797 }
2798 
2799 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmsub_sh(__m128h __W,
2800                                                                   __mmask8 __U,
2801                                                                   __m128h __A,
2802                                                                   __m128h __B) {
2803   return (__m128h)__builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A,
2804                                                 -(__v8hf)__B, (__mmask8)__U,
2805                                                 _MM_FROUND_CUR_DIRECTION);
2806 }
2807 
/* Expands to __builtin_ia32_vfmaddsh3_mask with C negated, an all-ones mask,
 * and R forwarded as an int. */
#define _mm_fmsub_round_sh(A, B, C, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      -(__v8hf)(__m128h)(C), \
      (__mmask8)-1, \
      (int)(R)))
2812 
/* Expands to __builtin_ia32_vfmaddsh3_mask with B negated, mask U, and R
 * forwarded as an int. */
#define _mm_mask_fmsub_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
      (__v8hf)(__m128h)(W), \
      (__v8hf)(__m128h)(A), \
      -(__v8hf)(__m128h)(B), \
      (__mmask8)(U), \
      (int)(R)))
2817 
2818 static __inline__ __m128h __DEFAULT_FN_ATTRS128
2819 _mm_maskz_fmsub_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2820   return (__m128h)__builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, (__v8hf)__B,
2821                                                  -(__v8hf)__C, (__mmask8)__U,
2822                                                  _MM_FROUND_CUR_DIRECTION);
2823 }
2824 
/* Expands to __builtin_ia32_vfmaddsh3_maskz with C negated, mask U, and R
 * forwarded as an int.
 *
 * R is parenthesized before the cast, as in the sibling *_round_sh macros, so
 * that a compound rounding expression is converted as a whole rather than
 * only its first operand. */
#define _mm_maskz_fmsub_round_sh(U, A, B, C, R)                                \
  ((__m128h)__builtin_ia32_vfmaddsh3_maskz(                                    \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C),       \
      (__mmask8)(U), (int)(R)))
2829 
2830 static __inline__ __m128h __DEFAULT_FN_ATTRS128
2831 _mm_mask3_fmsub_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
2832   return __builtin_ia32_vfmsubsh3_mask3((__v8hf)__W, (__v8hf)__X, (__v8hf)__Y,
2833                                         (__mmask8)__U,
2834                                         _MM_FROUND_CUR_DIRECTION);
2835 }
2836 
/* Expands to __builtin_ia32_vfmsubsh3_mask3 on W, X, Y (each cast to __v8hf)
 * with mask U and R forwarded as an int. */
#define _mm_mask3_fmsub_round_sh(W, X, Y, U, R) \
  ((__m128h)__builtin_ia32_vfmsubsh3_mask3( \
      (__v8hf)(__m128h)(W), \
      (__v8hf)(__m128h)(X), \
      (__v8hf)(__m128h)(Y), \
      (__mmask8)(U), \
      (int)(R)))
2841 
2842 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmadd_sh(__m128h __W,
2843                                                               __m128h __A,
2844                                                               __m128h __B) {
2845   return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, (__v8hf)__B,
2846                                        (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
2847 }
2848 
2849 static __inline__ __m128h __DEFAULT_FN_ATTRS128
2850 _mm_mask_fnmadd_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
2851   return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, (__v8hf)__B,
2852                                        (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2853 }
2854 
/* Expands to __builtin_ia32_vfmaddsh3_mask with B negated, an all-ones mask,
 * and R forwarded as an int. */
#define _mm_fnmadd_round_sh(A, B, C, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
      (__v8hf)(__m128h)(A), \
      -(__v8hf)(__m128h)(B), \
      (__v8hf)(__m128h)(C), \
      (__mmask8)-1, \
      (int)(R)))
2859 
/* Expands to __builtin_ia32_vfmaddsh3_mask with A negated, mask U, and R
 * forwarded as an int. */
#define _mm_mask_fnmadd_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
      (__v8hf)(__m128h)(W), \
      -(__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__mmask8)(U), \
      (int)(R)))
2864 
2865 static __inline__ __m128h __DEFAULT_FN_ATTRS128
2866 _mm_maskz_fnmadd_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2867   return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, -(__v8hf)__B, (__v8hf)__C,
2868                                         (__mmask8)__U,
2869                                         _MM_FROUND_CUR_DIRECTION);
2870 }
2871 
/* Expands to __builtin_ia32_vfmaddsh3_maskz with B negated, mask U, and R
 * forwarded as an int. */
#define _mm_maskz_fnmadd_round_sh(U, A, B, C, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \
      (__v8hf)(__m128h)(A), \
      -(__v8hf)(__m128h)(B), \
      (__v8hf)(__m128h)(C), \
      (__mmask8)(U), \
      (int)(R)))
2876 
2877 static __inline__ __m128h __DEFAULT_FN_ATTRS128
2878 _mm_mask3_fnmadd_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
2879   return __builtin_ia32_vfmaddsh3_mask3((__v8hf)__W, -(__v8hf)__X, (__v8hf)__Y,
2880                                         (__mmask8)__U,
2881                                         _MM_FROUND_CUR_DIRECTION);
2882 }
2883 
/* Expands to __builtin_ia32_vfmaddsh3_mask3 with X negated, mask U, and R
 * forwarded as an int. */
#define _mm_mask3_fnmadd_round_sh(W, X, Y, U, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask3( \
      (__v8hf)(__m128h)(W), \
      -(__v8hf)(__m128h)(X), \
      (__v8hf)(__m128h)(Y), \
      (__mmask8)(U), \
      (int)(R)))
2888 
2889 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmsub_sh(__m128h __W,
2890                                                               __m128h __A,
2891                                                               __m128h __B) {
2892   return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, -(__v8hf)__B,
2893                                        (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
2894 }
2895 
2896 static __inline__ __m128h __DEFAULT_FN_ATTRS128
2897 _mm_mask_fnmsub_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
2898   return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, -(__v8hf)__B,
2899                                        (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2900 }
2901 
/* Expands to __builtin_ia32_vfmaddsh3_mask with B and C negated, an all-ones
 * mask, and R forwarded as an int. */
#define _mm_fnmsub_round_sh(A, B, C, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
      (__v8hf)(__m128h)(A), \
      -(__v8hf)(__m128h)(B), \
      -(__v8hf)(__m128h)(C), \
      (__mmask8)-1, \
      (int)(R)))
2906 
/* Expands to __builtin_ia32_vfmaddsh3_mask with A and B negated, mask U, and
 * R forwarded as an int. */
#define _mm_mask_fnmsub_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
      (__v8hf)(__m128h)(W), \
      -(__v8hf)(__m128h)(A), \
      -(__v8hf)(__m128h)(B), \
      (__mmask8)(U), \
      (int)(R)))
2911 
2912 static __inline__ __m128h __DEFAULT_FN_ATTRS128
2913 _mm_maskz_fnmsub_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2914   return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, -(__v8hf)__B, -(__v8hf)__C,
2915                                         (__mmask8)__U,
2916                                         _MM_FROUND_CUR_DIRECTION);
2917 }
2918 
/* Expands to __builtin_ia32_vfmaddsh3_maskz with B and C negated, mask U, and
 * R forwarded as an int. */
#define _mm_maskz_fnmsub_round_sh(U, A, B, C, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \
      (__v8hf)(__m128h)(A), \
      -(__v8hf)(__m128h)(B), \
      -(__v8hf)(__m128h)(C), \
      (__mmask8)(U), \
      (int)(R)))
2923 
2924 static __inline__ __m128h __DEFAULT_FN_ATTRS128
2925 _mm_mask3_fnmsub_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
2926   return __builtin_ia32_vfmsubsh3_mask3((__v8hf)__W, -(__v8hf)__X, (__v8hf)__Y,
2927                                         (__mmask8)__U,
2928                                         _MM_FROUND_CUR_DIRECTION);
2929 }
2930 
/* Expands to __builtin_ia32_vfmsubsh3_mask3 with X negated, mask U, and R
 * forwarded as an int. */
#define _mm_mask3_fnmsub_round_sh(W, X, Y, U, R) \
  ((__m128h)__builtin_ia32_vfmsubsh3_mask3( \
      (__v8hf)(__m128h)(W), \
      -(__v8hf)(__m128h)(X), \
      (__v8hf)(__m128h)(Y), \
      (__mmask8)(U), \
      (int)(R)))
2935 
2936 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmadd_sch(__m128h __A,
2937                                                                __m128h __B,
2938                                                                __m128h __C) {
2939   return (__m128h)__builtin_ia32_vfcmaddcsh_mask((__v4sf)__A, (__v4sf)__B,
2940                                                  (__v4sf)__C, (__mmask8)-1,
2941                                                  _MM_FROUND_CUR_DIRECTION);
2942 }
2943 
2944 static __inline__ __m128h __DEFAULT_FN_ATTRS128
2945 _mm_mask_fcmadd_sch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
2946   return (__m128h)__builtin_ia32_vfcmaddcsh_round_mask(
2947       (__v4sf)__A, (__v4sf)(__B), (__v4sf)(__C), __U, _MM_FROUND_CUR_DIRECTION);
2948 }
2949 
2950 static __inline__ __m128h __DEFAULT_FN_ATTRS128
2951 _mm_maskz_fcmadd_sch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2952   return (__m128h)__builtin_ia32_vfcmaddcsh_maskz((__v4sf)__A, (__v4sf)__B,
2953                                                   (__v4sf)__C, (__mmask8)__U,
2954                                                   _MM_FROUND_CUR_DIRECTION);
2955 }
2956 
2957 static __inline__ __m128h __DEFAULT_FN_ATTRS128
2958 _mm_mask3_fcmadd_sch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
2959   return (__m128h)__builtin_ia32_vfcmaddcsh_round_mask3(
2960       (__v4sf)__A, (__v4sf)__B, (__v4sf)__C, __U, _MM_FROUND_CUR_DIRECTION);
2961 }
2962 
/* Expands to __builtin_ia32_vfcmaddcsh_mask on A, B, C (each cast to __v4sf)
 * with an all-ones mask and R forwarded as an int. */
#define _mm_fcmadd_round_sch(A, B, C, R) \
  ((__m128h)__builtin_ia32_vfcmaddcsh_mask( \
      (__v4sf)(__m128h)(A), \
      (__v4sf)(__m128h)(B), \
      (__v4sf)(__m128h)(C), \
      (__mmask8)-1, \
      (int)(R)))
2967 
/* Expands to __builtin_ia32_vfcmaddcsh_round_mask on A, B, C (each cast to
 * __v4sf) with mask U and R forwarded as an int. */
#define _mm_mask_fcmadd_round_sch(A, U, B, C, R) \
  ((__m128h)__builtin_ia32_vfcmaddcsh_round_mask( \
      (__v4sf)(__m128h)(A), \
      (__v4sf)(__m128h)(B), \
      (__v4sf)(__m128h)(C), \
      (__mmask8)(U), \
      (int)(R)))
2972 
/* Expands to __builtin_ia32_vfcmaddcsh_maskz on A, B, C (each cast to
 * __v4sf) with mask U and R forwarded as an int. */
#define _mm_maskz_fcmadd_round_sch(U, A, B, C, R) \
  ((__m128h)__builtin_ia32_vfcmaddcsh_maskz( \
      (__v4sf)(__m128h)(A), \
      (__v4sf)(__m128h)(B), \
      (__v4sf)(__m128h)(C), \
      (__mmask8)(U), \
      (int)(R)))
2977 
/* Expands to __builtin_ia32_vfcmaddcsh_round_mask3 on A, B, C (each cast to
 * __v4sf) with mask U and R forwarded as an int. */
#define _mm_mask3_fcmadd_round_sch(A, B, C, U, R) \
  ((__m128h)__builtin_ia32_vfcmaddcsh_round_mask3( \
      (__v4sf)(__m128h)(A), \
      (__v4sf)(__m128h)(B), \
      (__v4sf)(__m128h)(C), \
      (__mmask8)(U), \
      (int)(R)))
2982 
2983 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_sch(__m128h __A,
2984                                                               __m128h __B,
2985                                                               __m128h __C) {
2986   return (__m128h)__builtin_ia32_vfmaddcsh_mask((__v4sf)__A, (__v4sf)__B,
2987                                                 (__v4sf)__C, (__mmask8)-1,
2988                                                 _MM_FROUND_CUR_DIRECTION);
2989 }
2990 
2991 static __inline__ __m128h __DEFAULT_FN_ATTRS128
2992 _mm_mask_fmadd_sch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
2993   return (__m128h)__builtin_ia32_vfmaddcsh_round_mask(
2994       (__v4sf)__A, (__v4sf)(__B), (__v4sf)(__C), __U, _MM_FROUND_CUR_DIRECTION);
2995 }
2996 
2997 static __inline__ __m128h __DEFAULT_FN_ATTRS128
2998 _mm_maskz_fmadd_sch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2999   return (__m128h)__builtin_ia32_vfmaddcsh_maskz((__v4sf)__A, (__v4sf)__B,
3000                                                  (__v4sf)__C, (__mmask8)__U,
3001                                                  _MM_FROUND_CUR_DIRECTION);
3002 }
3003 
3004 static __inline__ __m128h __DEFAULT_FN_ATTRS128
3005 _mm_mask3_fmadd_sch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
3006   return (__m128h)__builtin_ia32_vfmaddcsh_round_mask3(
3007       (__v4sf)__A, (__v4sf)__B, (__v4sf)__C, __U, _MM_FROUND_CUR_DIRECTION);
3008 }
3009 
/* Expands to __builtin_ia32_vfmaddcsh_mask on A, B, C (each cast to __v4sf)
 * with an all-ones mask and R forwarded as an int. */
#define _mm_fmadd_round_sch(A, B, C, R) \
  ((__m128h)__builtin_ia32_vfmaddcsh_mask( \
      (__v4sf)(__m128h)(A), \
      (__v4sf)(__m128h)(B), \
      (__v4sf)(__m128h)(C), \
      (__mmask8)-1, \
      (int)(R)))
3014 
/* Expands to __builtin_ia32_vfmaddcsh_round_mask on A, B, C (each cast to
 * __v4sf) with mask U and R forwarded as an int. */
#define _mm_mask_fmadd_round_sch(A, U, B, C, R) \
  ((__m128h)__builtin_ia32_vfmaddcsh_round_mask( \
      (__v4sf)(__m128h)(A), \
      (__v4sf)(__m128h)(B), \
      (__v4sf)(__m128h)(C), \
      (__mmask8)(U), \
      (int)(R)))
3019 
/* Expands to __builtin_ia32_vfmaddcsh_maskz on A, B, C (each cast to __v4sf)
 * with mask U and R forwarded as an int. */
#define _mm_maskz_fmadd_round_sch(U, A, B, C, R) \
  ((__m128h)__builtin_ia32_vfmaddcsh_maskz( \
      (__v4sf)(__m128h)(A), \
      (__v4sf)(__m128h)(B), \
      (__v4sf)(__m128h)(C), \
      (__mmask8)(U), \
      (int)(R)))
3024 
/* Expands to __builtin_ia32_vfmaddcsh_round_mask3 on A, B, C (each cast to
 * __v4sf) with mask U and R forwarded as an int. */
#define _mm_mask3_fmadd_round_sch(A, B, C, U, R) \
  ((__m128h)__builtin_ia32_vfmaddcsh_round_mask3( \
      (__v4sf)(__m128h)(A), \
      (__v4sf)(__m128h)(B), \
      (__v4sf)(__m128h)(C), \
      (__mmask8)(U), \
      (int)(R)))
3029 
3030 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmul_sch(__m128h __A,
3031                                                               __m128h __B) {
3032   return (__m128h)__builtin_ia32_vfcmulcsh_mask(
3033       (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1,
3034       _MM_FROUND_CUR_DIRECTION);
3035 }
3036 
3037 static __inline__ __m128h __DEFAULT_FN_ATTRS128
3038 _mm_mask_fcmul_sch(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
3039   return (__m128h)__builtin_ia32_vfcmulcsh_mask((__v4sf)__A, (__v4sf)__B,
3040                                                 (__v4sf)__W, (__mmask8)__U,
3041                                                 _MM_FROUND_CUR_DIRECTION);
3042 }
3043 
3044 static __inline__ __m128h __DEFAULT_FN_ATTRS128
3045 _mm_maskz_fcmul_sch(__mmask8 __U, __m128h __A, __m128h __B) {
3046   return (__m128h)__builtin_ia32_vfcmulcsh_mask(
3047       (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U,
3048       _MM_FROUND_CUR_DIRECTION);
3049 }
3050 
/* Expands to __builtin_ia32_vfcmulcsh_mask on A and B (cast to __v4sf), with
 * _mm_undefined_ph() as the third vector operand, an all-ones mask and R
 * forwarded as an int. */
#define _mm_fcmul_round_sch(A, B, R) \
  ((__m128h)__builtin_ia32_vfcmulcsh_mask( \
      (__v4sf)(__m128h)(A), \
      (__v4sf)(__m128h)(B), \
      (__v4sf)(__m128h)_mm_undefined_ph(), \
      (__mmask8)-1, \
      (int)(R)))
3055 
3056 #define _mm_mask_fcmul_round_sch(W, U, A, B, R)                                \
3057   ((__m128h)__builtin_ia32_vfcmulcsh_mask(                                     \
3058       (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(W),        \
3059       (__mmask8)(U), (int)(R)))
3060 
3061 #define _mm_maskz_fcmul_round_sch(U, A, B, R)                                  \
3062   ((__m128h)__builtin_ia32_vfcmulcsh_mask(                                     \
3063       (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B),                              \
3064       (__v4sf)(__m128h)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
3065 
3066 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmul_sch(__m128h __A,
3067                                                              __m128h __B) {
3068   return (__m128h)__builtin_ia32_vfmulcsh_mask(
3069       (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1,
3070       _MM_FROUND_CUR_DIRECTION);
3071 }
3072 
3073 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmul_sch(__m128h __W,
3074                                                                   __mmask8 __U,
3075                                                                   __m128h __A,
3076                                                                   __m128h __B) {
3077   return (__m128h)__builtin_ia32_vfmulcsh_mask((__v4sf)__A, (__v4sf)__B,
3078                                                (__v4sf)__W, (__mmask8)__U,
3079                                                _MM_FROUND_CUR_DIRECTION);
3080 }
3081 
3082 static __inline__ __m128h __DEFAULT_FN_ATTRS128
3083 _mm_maskz_fmul_sch(__mmask8 __U, __m128h __A, __m128h __B) {
3084   return (__m128h)__builtin_ia32_vfmulcsh_mask(
3085       (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U,
3086       _MM_FROUND_CUR_DIRECTION);
3087 }
3088 
/* Scalar fmul with an explicit rounding argument R.
 *
 * All three macros expand to __builtin_ia32_vfmulcsh_mask and differ only in
 * the third operand and the mask:
 *   plain : _mm_undefined_ph(), mask (__mmask8)-1
 *   mask  : caller's W,         mask (__mmask8)(U)
 *   maskz : _mm_setzero_ph(),   mask (__mmask8)(U)
 * NOTE(review): kept as macros presumably because R must reach the builtin as
 * a compile-time constant -- confirm before refactoring.
 */
#define _mm_fmul_round_sch(A, B, R) \
  ((__m128h)__builtin_ia32_vfmulcsh_mask( \
      (__v4sf)(__m128h)(A), \
      (__v4sf)(__m128h)(B), \
      (__v4sf)(__m128h)_mm_undefined_ph(), \
      (__mmask8)-1, \
      (int)(R)))

#define _mm_mask_fmul_round_sch(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_vfmulcsh_mask( \
      (__v4sf)(__m128h)(A), \
      (__v4sf)(__m128h)(B), \
      (__v4sf)(__m128h)(W), \
      (__mmask8)(U), \
      (int)(R)))

#define _mm_maskz_fmul_round_sch(U, A, B, R) \
  ((__m128h)__builtin_ia32_vfmulcsh_mask( \
      (__v4sf)(__m128h)(A), \
      (__v4sf)(__m128h)(B), \
      (__v4sf)(__m128h)_mm_setzero_ph(), \
      (__mmask8)(U), \
      (int)(R)))
3103 
3104 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fcmul_pch(__m512h __A,
3105                                                                  __m512h __B) {
3106   return (__m512h)__builtin_ia32_vfcmulcph512_mask(
3107       (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_undefined_ph(), (__mmask16)-1,
3108       _MM_FROUND_CUR_DIRECTION);
3109 }
3110 
3111 static __inline__ __m512h __DEFAULT_FN_ATTRS512
3112 _mm512_mask_fcmul_pch(__m512h __W, __mmask16 __U, __m512h __A, __m512h __B) {
3113   return (__m512h)__builtin_ia32_vfcmulcph512_mask((__v16sf)__A, (__v16sf)__B,
3114                                                    (__v16sf)__W, (__mmask16)__U,
3115                                                    _MM_FROUND_CUR_DIRECTION);
3116 }
3117 
3118 static __inline__ __m512h __DEFAULT_FN_ATTRS512
3119 _mm512_maskz_fcmul_pch(__mmask16 __U, __m512h __A, __m512h __B) {
3120   return (__m512h)__builtin_ia32_vfcmulcph512_mask(
3121       (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ph(), (__mmask16)__U,
3122       _MM_FROUND_CUR_DIRECTION);
3123 }
3124 
/* 512-bit packed fcmul with an explicit rounding argument R.
 *
 * All three macros expand to __builtin_ia32_vfcmulcph512_mask and differ only
 * in the third operand and the mask:
 *   plain : _mm512_undefined_ph(), mask (__mmask16)-1
 *   mask  : caller's W,            mask (__mmask16)(U)
 *   maskz : _mm512_setzero_ph(),   mask (__mmask16)(U)
 * NOTE(review): kept as macros presumably because R must reach the builtin as
 * a compile-time constant -- confirm before refactoring.
 */
#define _mm512_fcmul_round_pch(A, B, R) \
  ((__m512h)__builtin_ia32_vfcmulcph512_mask( \
      (__v16sf)(__m512h)(A), \
      (__v16sf)(__m512h)(B), \
      (__v16sf)(__m512h)_mm512_undefined_ph(), \
      (__mmask16)-1, \
      (int)(R)))

#define _mm512_mask_fcmul_round_pch(W, U, A, B, R) \
  ((__m512h)__builtin_ia32_vfcmulcph512_mask( \
      (__v16sf)(__m512h)(A), \
      (__v16sf)(__m512h)(B), \
      (__v16sf)(__m512h)(W), \
      (__mmask16)(U), \
      (int)(R)))

#define _mm512_maskz_fcmul_round_pch(U, A, B, R) \
  ((__m512h)__builtin_ia32_vfcmulcph512_mask( \
      (__v16sf)(__m512h)(A), \
      (__v16sf)(__m512h)(B), \
      (__v16sf)(__m512h)_mm512_setzero_ph(), \
      (__mmask16)(U), \
      (int)(R)))
3139 
3140 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmul_pch(__m512h __A,
3141                                                                 __m512h __B) {
3142   return (__m512h)__builtin_ia32_vfmulcph512_mask(
3143       (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_undefined_ph(), (__mmask16)-1,
3144       _MM_FROUND_CUR_DIRECTION);
3145 }
3146 
3147 static __inline__ __m512h __DEFAULT_FN_ATTRS512
3148 _mm512_mask_fmul_pch(__m512h __W, __mmask16 __U, __m512h __A, __m512h __B) {
3149   return (__m512h)__builtin_ia32_vfmulcph512_mask((__v16sf)__A, (__v16sf)__B,
3150                                                   (__v16sf)__W, (__mmask16)__U,
3151                                                   _MM_FROUND_CUR_DIRECTION);
3152 }
3153 
3154 static __inline__ __m512h __DEFAULT_FN_ATTRS512
3155 _mm512_maskz_fmul_pch(__mmask16 __U, __m512h __A, __m512h __B) {
3156   return (__m512h)__builtin_ia32_vfmulcph512_mask(
3157       (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ph(), (__mmask16)__U,
3158       _MM_FROUND_CUR_DIRECTION);
3159 }
3160 
/* 512-bit packed fmul with an explicit rounding argument R.
 *
 * All three macros expand to __builtin_ia32_vfmulcph512_mask and differ only
 * in the third operand and the mask:
 *   plain : _mm512_undefined_ph(), mask (__mmask16)-1
 *   mask  : caller's W,            mask (__mmask16)(U)
 *   maskz : _mm512_setzero_ph(),   mask (__mmask16)(U)
 * NOTE(review): kept as macros presumably because R must reach the builtin as
 * a compile-time constant -- confirm before refactoring.
 */
#define _mm512_fmul_round_pch(A, B, R) \
  ((__m512h)__builtin_ia32_vfmulcph512_mask( \
      (__v16sf)(__m512h)(A), \
      (__v16sf)(__m512h)(B), \
      (__v16sf)(__m512h)_mm512_undefined_ph(), \
      (__mmask16)-1, \
      (int)(R)))

#define _mm512_mask_fmul_round_pch(W, U, A, B, R) \
  ((__m512h)__builtin_ia32_vfmulcph512_mask( \
      (__v16sf)(__m512h)(A), \
      (__v16sf)(__m512h)(B), \
      (__v16sf)(__m512h)(W), \
      (__mmask16)(U), \
      (int)(R)))

#define _mm512_maskz_fmul_round_pch(U, A, B, R) \
  ((__m512h)__builtin_ia32_vfmulcph512_mask( \
      (__v16sf)(__m512h)(A), \
      (__v16sf)(__m512h)(B), \
      (__v16sf)(__m512h)_mm512_setzero_ph(), \
      (__mmask16)(U), \
      (int)(R)))
3175 
3176 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fcmadd_pch(__m512h __A,
3177                                                                   __m512h __B,
3178                                                                   __m512h __C) {
3179   return (__m512h)__builtin_ia32_vfcmaddcph512_mask3(
3180       (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)-1,
3181       _MM_FROUND_CUR_DIRECTION);
3182 }
3183 
3184 static __inline__ __m512h __DEFAULT_FN_ATTRS512
3185 _mm512_mask_fcmadd_pch(__m512h __A, __mmask16 __U, __m512h __B, __m512h __C) {
3186   return (__m512h)__builtin_ia32_vfcmaddcph512_mask(
3187       (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
3188       _MM_FROUND_CUR_DIRECTION);
3189 }
3190 
3191 static __inline__ __m512h __DEFAULT_FN_ATTRS512
3192 _mm512_mask3_fcmadd_pch(__m512h __A, __m512h __B, __m512h __C, __mmask16 __U) {
3193   return (__m512h)__builtin_ia32_vfcmaddcph512_mask3(
3194       (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
3195       _MM_FROUND_CUR_DIRECTION);
3196 }
3197 
3198 static __inline__ __m512h __DEFAULT_FN_ATTRS512
3199 _mm512_maskz_fcmadd_pch(__mmask16 __U, __m512h __A, __m512h __B, __m512h __C) {
3200   return (__m512h)__builtin_ia32_vfcmaddcph512_maskz(
3201       (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
3202       _MM_FROUND_CUR_DIRECTION);
3203 }
3204 
/* 512-bit packed fcmadd with an explicit rounding argument R.
 *
 * Every macro reinterprets A, B, C as __v16sf and forwards R as (int)(R).
 * Builtin and mask per variant:
 *   plain : __builtin_ia32_vfcmaddcph512_mask3, mask (__mmask16)-1
 *   mask  : __builtin_ia32_vfcmaddcph512_mask,  mask (__mmask16)(U)
 *   mask3 : __builtin_ia32_vfcmaddcph512_mask3, mask (__mmask16)(U)
 *   maskz : __builtin_ia32_vfcmaddcph512_maskz, mask (__mmask16)(U)
 * NOTE(review): kept as macros presumably because R must reach the builtin as
 * a compile-time constant -- confirm before refactoring.
 */
#define _mm512_fcmadd_round_pch(A, B, C, R) \
  ((__m512h)__builtin_ia32_vfcmaddcph512_mask3( \
      (__v16sf)(__m512h)(A), \
      (__v16sf)(__m512h)(B), \
      (__v16sf)(__m512h)(C), \
      (__mmask16)-1, \
      (int)(R)))

#define _mm512_mask_fcmadd_round_pch(A, U, B, C, R) \
  ((__m512h)__builtin_ia32_vfcmaddcph512_mask( \
      (__v16sf)(__m512h)(A), \
      (__v16sf)(__m512h)(B), \
      (__v16sf)(__m512h)(C), \
      (__mmask16)(U), \
      (int)(R)))

#define _mm512_mask3_fcmadd_round_pch(A, B, C, U, R) \
  ((__m512h)__builtin_ia32_vfcmaddcph512_mask3( \
      (__v16sf)(__m512h)(A), \
      (__v16sf)(__m512h)(B), \
      (__v16sf)(__m512h)(C), \
      (__mmask16)(U), \
      (int)(R)))

#define _mm512_maskz_fcmadd_round_pch(U, A, B, C, R) \
  ((__m512h)__builtin_ia32_vfcmaddcph512_maskz( \
      (__v16sf)(__m512h)(A), \
      (__v16sf)(__m512h)(B), \
      (__v16sf)(__m512h)(C), \
      (__mmask16)(U), \
      (int)(R)))
3224 
3225 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmadd_pch(__m512h __A,
3226                                                                  __m512h __B,
3227                                                                  __m512h __C) {
3228   return (__m512h)__builtin_ia32_vfmaddcph512_mask3((__v16sf)__A, (__v16sf)__B,
3229                                                     (__v16sf)__C, (__mmask16)-1,
3230                                                     _MM_FROUND_CUR_DIRECTION);
3231 }
3232 
3233 static __inline__ __m512h __DEFAULT_FN_ATTRS512
3234 _mm512_mask_fmadd_pch(__m512h __A, __mmask16 __U, __m512h __B, __m512h __C) {
3235   return (__m512h)__builtin_ia32_vfmaddcph512_mask((__v16sf)__A, (__v16sf)__B,
3236                                                    (__v16sf)__C, (__mmask16)__U,
3237                                                    _MM_FROUND_CUR_DIRECTION);
3238 }
3239 
3240 static __inline__ __m512h __DEFAULT_FN_ATTRS512
3241 _mm512_mask3_fmadd_pch(__m512h __A, __m512h __B, __m512h __C, __mmask16 __U) {
3242   return (__m512h)__builtin_ia32_vfmaddcph512_mask3(
3243       (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
3244       _MM_FROUND_CUR_DIRECTION);
3245 }
3246 
3247 static __inline__ __m512h __DEFAULT_FN_ATTRS512
3248 _mm512_maskz_fmadd_pch(__mmask16 __U, __m512h __A, __m512h __B, __m512h __C) {
3249   return (__m512h)__builtin_ia32_vfmaddcph512_maskz(
3250       (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
3251       _MM_FROUND_CUR_DIRECTION);
3252 }
3253 
/* 512-bit packed complex fmadd with an explicit rounding argument R.
 *
 * Every macro reinterprets A, B, C as __v16sf and forwards R as (int)(R).
 * Builtin and mask per variant:
 *   plain : __builtin_ia32_vfmaddcph512_mask3, mask (__mmask16)-1
 *   mask  : __builtin_ia32_vfmaddcph512_mask,  mask (__mmask16)(U)
 *   mask3 : __builtin_ia32_vfmaddcph512_mask3, mask (__mmask16)(U)
 *   maskz : __builtin_ia32_vfmaddcph512_maskz, mask (__mmask16)(U)
 * NOTE(review): kept as macros presumably because R must reach the builtin as
 * a compile-time constant -- confirm before refactoring.
 */
#define _mm512_fmadd_round_pch(A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddcph512_mask3( \
      (__v16sf)(__m512h)(A), \
      (__v16sf)(__m512h)(B), \
      (__v16sf)(__m512h)(C), \
      (__mmask16)-1, \
      (int)(R)))

#define _mm512_mask_fmadd_round_pch(A, U, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddcph512_mask( \
      (__v16sf)(__m512h)(A), \
      (__v16sf)(__m512h)(B), \
      (__v16sf)(__m512h)(C), \
      (__mmask16)(U), \
      (int)(R)))

#define _mm512_mask3_fmadd_round_pch(A, B, C, U, R) \
  ((__m512h)__builtin_ia32_vfmaddcph512_mask3( \
      (__v16sf)(__m512h)(A), \
      (__v16sf)(__m512h)(B), \
      (__v16sf)(__m512h)(C), \
      (__mmask16)(U), \
      (int)(R)))

#define _mm512_maskz_fmadd_round_pch(U, A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddcph512_maskz( \
      (__v16sf)(__m512h)(A), \
      (__v16sf)(__m512h)(B), \
      (__v16sf)(__m512h)(C), \
      (__mmask16)(U), \
      (int)(R)))
3273 
3274 static __inline__ _Float16 __DEFAULT_FN_ATTRS512
3275 _mm512_reduce_add_ph(__m512h __W) {
3276   return __builtin_ia32_reduce_fadd_ph512(-0.0f16, __W);
3277 }
3278 
3279 static __inline__ _Float16 __DEFAULT_FN_ATTRS512
3280 _mm512_reduce_mul_ph(__m512h __W) {
3281   return __builtin_ia32_reduce_fmul_ph512(1.0f16, __W);
3282 }
3283 
3284 static __inline__ _Float16 __DEFAULT_FN_ATTRS512
3285 _mm512_reduce_max_ph(__m512h __V) {
3286   return __builtin_ia32_reduce_fmax_ph512(__V);
3287 }
3288 
3289 static __inline__ _Float16 __DEFAULT_FN_ATTRS512
3290 _mm512_reduce_min_ph(__m512h __V) {
3291   return __builtin_ia32_reduce_fmin_ph512(__V);
3292 }
3293 
3294 static __inline__ __m512h __DEFAULT_FN_ATTRS512
3295 _mm512_mask_blend_ph(__mmask32 __U, __m512h __A, __m512h __W) {
3296   return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U, (__v32hf)__W,
3297                                               (__v32hf)__A);
3298 }
3299 
3300 static __inline__ __m512h __DEFAULT_FN_ATTRS512
3301 _mm512_permutex2var_ph(__m512h __A, __m512i __I, __m512h __B) {
3302   return (__m512h)__builtin_ia32_vpermi2varhi512((__v32hi)__A, (__v32hi)__I,
3303                                                  (__v32hi)__B);
3304 }
3305 
3306 static __inline__ __m512h __DEFAULT_FN_ATTRS512
3307 _mm512_permutexvar_ph(__m512i __A, __m512h __B) {
3308   return (__m512h)__builtin_ia32_permvarhi512((__v32hi)__B, (__v32hi)__A);
3309 }
3310 
// The intrinsics below are aliases for the corresponding f*mul_*ch intrinsics.
/* _mm512_*mul*_pch: alternative spellings that forward, argument for
 * argument, to the matching _mm512_*fmul*_pch intrinsic. */
#define _mm512_mul_pch(A, B) _mm512_fmul_pch((A), (B))
#define _mm512_mask_mul_pch(W, U, A, B) \
  _mm512_mask_fmul_pch((W), (U), (A), (B))
#define _mm512_maskz_mul_pch(U, A, B) _mm512_maskz_fmul_pch((U), (A), (B))
#define _mm512_mul_round_pch(A, B, R) _mm512_fmul_round_pch((A), (B), (R))
#define _mm512_mask_mul_round_pch(W, U, A, B, R) \
  _mm512_mask_fmul_round_pch((W), (U), (A), (B), (R))
#define _mm512_maskz_mul_round_pch(U, A, B, R) \
  _mm512_maskz_fmul_round_pch((U), (A), (B), (R))
3320 
/* _mm512_*cmul*_pch: alternative spellings that forward, argument for
 * argument, to the matching _mm512_*fcmul*_pch intrinsic. */
#define _mm512_cmul_pch(A, B) _mm512_fcmul_pch((A), (B))
#define _mm512_mask_cmul_pch(W, U, A, B) \
  _mm512_mask_fcmul_pch((W), (U), (A), (B))
#define _mm512_maskz_cmul_pch(U, A, B) _mm512_maskz_fcmul_pch((U), (A), (B))
#define _mm512_cmul_round_pch(A, B, R) _mm512_fcmul_round_pch((A), (B), (R))
#define _mm512_mask_cmul_round_pch(W, U, A, B, R) \
  _mm512_mask_fcmul_round_pch((W), (U), (A), (B), (R))
#define _mm512_maskz_cmul_round_pch(U, A, B, R) \
  _mm512_maskz_fcmul_round_pch((U), (A), (B), (R))
3329 
/* _mm_*mul*_sch: alternative spellings that forward, argument for argument,
 * to the matching _mm_*fmul*_sch intrinsic. */
#define _mm_mul_sch(A, B) _mm_fmul_sch((A), (B))
#define _mm_mask_mul_sch(W, U, A, B) _mm_mask_fmul_sch((W), (U), (A), (B))
#define _mm_maskz_mul_sch(U, A, B) _mm_maskz_fmul_sch((U), (A), (B))
#define _mm_mul_round_sch(A, B, R) _mm_fmul_round_sch((A), (B), (R))
#define _mm_mask_mul_round_sch(W, U, A, B, R) \
  _mm_mask_fmul_round_sch((W), (U), (A), (B), (R))
#define _mm_maskz_mul_round_sch(U, A, B, R) \
  _mm_maskz_fmul_round_sch((U), (A), (B), (R))
3337 
/* _mm_*cmul*_sch: alternative spellings that forward, argument for argument,
 * to the matching _mm_*fcmul*_sch intrinsic. */
#define _mm_cmul_sch(A, B) _mm_fcmul_sch((A), (B))
#define _mm_mask_cmul_sch(W, U, A, B) _mm_mask_fcmul_sch((W), (U), (A), (B))
#define _mm_maskz_cmul_sch(U, A, B) _mm_maskz_fcmul_sch((U), (A), (B))
#define _mm_cmul_round_sch(A, B, R) _mm_fcmul_round_sch((A), (B), (R))
#define _mm_mask_cmul_round_sch(W, U, A, B, R) \
  _mm_mask_fcmul_round_sch((W), (U), (A), (B), (R))
#define _mm_maskz_cmul_round_sch(U, A, B, R) \
  _mm_maskz_fcmul_round_sch((U), (A), (B), (R))
3346 
3347 #undef __DEFAULT_FN_ATTRS128
3348 #undef __DEFAULT_FN_ATTRS256
3349 #undef __DEFAULT_FN_ATTRS512
3350 
3351 #endif
3352 #endif
3353