xref: /freebsd/contrib/llvm-project/clang/lib/Headers/avx512fp16intrin.h (revision 700637cbb5e582861067a11aaca4d053546871d2)
1 /*===----------- avx512fp16intrin.h - AVX512-FP16 intrinsics ---------------===
2  *
3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4  * See https://llvm.org/LICENSE.txt for license information.
5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6  *
7  *===-----------------------------------------------------------------------===
8  */
9 #ifndef __IMMINTRIN_H
10 #error "Never use <avx512fp16intrin.h> directly; include <immintrin.h> instead."
11 #endif
12 
13 #ifdef __SSE2__
14 
15 #ifndef __AVX512FP16INTRIN_H
16 #define __AVX512FP16INTRIN_H
17 
/* Define the 512-bit half-precision vector types used in this file. */
/* 64-byte vector of _Float16, 64-byte aligned; the type this file casts to
 * for element-wise arithmetic and builtin arguments. */
typedef _Float16 __v32hf __attribute__((__vector_size__(64), __aligned__(64)));
/* 64-byte vector of _Float16, 64-byte aligned. */
typedef _Float16 __m512h __attribute__((__vector_size__(64), __aligned__(64)));
/* Same 64-byte vector of _Float16 with the alignment lowered to 1 byte. */
typedef _Float16 __m512h_u __attribute__((__vector_size__(64), __aligned__(1)));
22 
/* Define the default attributes for the functions in this file.
 * All three variants are always-inline and nodebug. The 512 variant targets
 * "avx512fp16,evex512"; the 256 and 128 variants target
 * "avx512fp16,no-evex512". The numeric suffix equals the value given to
 * __min_vector_width__. */
#define __DEFAULT_FN_ATTRS512                                                  \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("avx512fp16,evex512"), __min_vector_width__(512)))
#define __DEFAULT_FN_ATTRS256                                                  \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("avx512fp16,no-evex512"),                          \
                 __min_vector_width__(256)))
#define __DEFAULT_FN_ATTRS128                                                  \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("avx512fp16,no-evex512"),                          \
                 __min_vector_width__(128)))
35 
_mm512_cvtsh_h(__m512h __a)36 static __inline__ _Float16 __DEFAULT_FN_ATTRS512 _mm512_cvtsh_h(__m512h __a) {
37   return __a[0];
38 }
39 
_mm_setzero_ph(void)40 static __inline __m128h __DEFAULT_FN_ATTRS128 _mm_setzero_ph(void) {
41   return (__m128h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
42 }
43 
_mm256_setzero_ph(void)44 static __inline __m256h __DEFAULT_FN_ATTRS256 _mm256_setzero_ph(void) {
45   return (__m256h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
46                    0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
47 }
48 
_mm256_undefined_ph(void)49 static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_undefined_ph(void) {
50   return (__m256h)__builtin_ia32_undef256();
51 }
52 
_mm512_setzero_ph(void)53 static __inline __m512h __DEFAULT_FN_ATTRS512 _mm512_setzero_ph(void) {
54   return (__m512h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
55                    0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
56                    0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
57 }
58 
_mm_undefined_ph(void)59 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_undefined_ph(void) {
60   return (__m128h)__builtin_ia32_undef128();
61 }
62 
_mm512_undefined_ph(void)63 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_undefined_ph(void) {
64   return (__m512h)__builtin_ia32_undef512();
65 }
66 
_mm512_set1_ph(_Float16 __h)67 static __inline __m512h __DEFAULT_FN_ATTRS512 _mm512_set1_ph(_Float16 __h) {
68   return (__m512h)(__v32hf){__h, __h, __h, __h, __h, __h, __h, __h,
69                             __h, __h, __h, __h, __h, __h, __h, __h,
70                             __h, __h, __h, __h, __h, __h, __h, __h,
71                             __h, __h, __h, __h, __h, __h, __h, __h};
72 }
73 
74 static __inline __m512h __DEFAULT_FN_ATTRS512
_mm512_set_ph(_Float16 __h1,_Float16 __h2,_Float16 __h3,_Float16 __h4,_Float16 __h5,_Float16 __h6,_Float16 __h7,_Float16 __h8,_Float16 __h9,_Float16 __h10,_Float16 __h11,_Float16 __h12,_Float16 __h13,_Float16 __h14,_Float16 __h15,_Float16 __h16,_Float16 __h17,_Float16 __h18,_Float16 __h19,_Float16 __h20,_Float16 __h21,_Float16 __h22,_Float16 __h23,_Float16 __h24,_Float16 __h25,_Float16 __h26,_Float16 __h27,_Float16 __h28,_Float16 __h29,_Float16 __h30,_Float16 __h31,_Float16 __h32)75 _mm512_set_ph(_Float16 __h1, _Float16 __h2, _Float16 __h3, _Float16 __h4,
76               _Float16 __h5, _Float16 __h6, _Float16 __h7, _Float16 __h8,
77               _Float16 __h9, _Float16 __h10, _Float16 __h11, _Float16 __h12,
78               _Float16 __h13, _Float16 __h14, _Float16 __h15, _Float16 __h16,
79               _Float16 __h17, _Float16 __h18, _Float16 __h19, _Float16 __h20,
80               _Float16 __h21, _Float16 __h22, _Float16 __h23, _Float16 __h24,
81               _Float16 __h25, _Float16 __h26, _Float16 __h27, _Float16 __h28,
82               _Float16 __h29, _Float16 __h30, _Float16 __h31, _Float16 __h32) {
83   return (__m512h)(__v32hf){__h32, __h31, __h30, __h29, __h28, __h27, __h26,
84                             __h25, __h24, __h23, __h22, __h21, __h20, __h19,
85                             __h18, __h17, __h16, __h15, __h14, __h13, __h12,
86                             __h11, __h10, __h9,  __h8,  __h7,  __h6,  __h5,
87                             __h4,  __h3,  __h2,  __h1};
88 }
89 
/* Reversed-argument form of _mm512_set_ph: h1..h32 are forwarded in the
 * opposite order, so h1 is passed as _mm512_set_ph's last parameter and h32
 * as its first. */
#define _mm512_setr_ph(h1, h2, h3, h4, h5, h6, h7, h8, h9, h10, h11, h12, h13, \
                       h14, h15, h16, h17, h18, h19, h20, h21, h22, h23, h24,  \
                       h25, h26, h27, h28, h29, h30, h31, h32)                 \
  _mm512_set_ph((h32), (h31), (h30), (h29), (h28), (h27), (h26), (h25), (h24), \
                (h23), (h22), (h21), (h20), (h19), (h18), (h17), (h16), (h15), \
                (h14), (h13), (h12), (h11), (h10), (h9), (h8), (h7), (h6),     \
                (h5), (h4), (h3), (h2), (h1))
97 
98 static __inline __m512h __DEFAULT_FN_ATTRS512
_mm512_set1_pch(_Float16 _Complex __h)99 _mm512_set1_pch(_Float16 _Complex __h) {
100   return (__m512h)_mm512_set1_ps(__builtin_bit_cast(float, __h));
101 }
102 
_mm_castph_ps(__m128h __a)103 static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_castph_ps(__m128h __a) {
104   return (__m128)__a;
105 }
106 
_mm256_castph_ps(__m256h __a)107 static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_castph_ps(__m256h __a) {
108   return (__m256)__a;
109 }
110 
_mm512_castph_ps(__m512h __a)111 static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_castph_ps(__m512h __a) {
112   return (__m512)__a;
113 }
114 
_mm_castph_pd(__m128h __a)115 static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_castph_pd(__m128h __a) {
116   return (__m128d)__a;
117 }
118 
_mm256_castph_pd(__m256h __a)119 static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_castph_pd(__m256h __a) {
120   return (__m256d)__a;
121 }
122 
_mm512_castph_pd(__m512h __a)123 static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_castph_pd(__m512h __a) {
124   return (__m512d)__a;
125 }
126 
_mm_castph_si128(__m128h __a)127 static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_castph_si128(__m128h __a) {
128   return (__m128i)__a;
129 }
130 
131 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_castph_si256(__m256h __a)132 _mm256_castph_si256(__m256h __a) {
133   return (__m256i)__a;
134 }
135 
136 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_castph_si512(__m512h __a)137 _mm512_castph_si512(__m512h __a) {
138   return (__m512i)__a;
139 }
140 
_mm_castps_ph(__m128 __a)141 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_castps_ph(__m128 __a) {
142   return (__m128h)__a;
143 }
144 
_mm256_castps_ph(__m256 __a)145 static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_castps_ph(__m256 __a) {
146   return (__m256h)__a;
147 }
148 
_mm512_castps_ph(__m512 __a)149 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_castps_ph(__m512 __a) {
150   return (__m512h)__a;
151 }
152 
_mm_castpd_ph(__m128d __a)153 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_castpd_ph(__m128d __a) {
154   return (__m128h)__a;
155 }
156 
_mm256_castpd_ph(__m256d __a)157 static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_castpd_ph(__m256d __a) {
158   return (__m256h)__a;
159 }
160 
_mm512_castpd_ph(__m512d __a)161 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_castpd_ph(__m512d __a) {
162   return (__m512h)__a;
163 }
164 
_mm_castsi128_ph(__m128i __a)165 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_castsi128_ph(__m128i __a) {
166   return (__m128h)__a;
167 }
168 
169 static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_castsi256_ph(__m256i __a)170 _mm256_castsi256_ph(__m256i __a) {
171   return (__m256h)__a;
172 }
173 
174 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_castsi512_ph(__m512i __a)175 _mm512_castsi512_ph(__m512i __a) {
176   return (__m512h)__a;
177 }
178 
179 static __inline__ __m128h __DEFAULT_FN_ATTRS256
_mm256_castph256_ph128(__m256h __a)180 _mm256_castph256_ph128(__m256h __a) {
181   return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7);
182 }
183 
184 static __inline__ __m128h __DEFAULT_FN_ATTRS512
_mm512_castph512_ph128(__m512h __a)185 _mm512_castph512_ph128(__m512h __a) {
186   return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7);
187 }
188 
189 static __inline__ __m256h __DEFAULT_FN_ATTRS512
_mm512_castph512_ph256(__m512h __a)190 _mm512_castph512_ph256(__m512h __a) {
191   return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
192                                  12, 13, 14, 15);
193 }
194 
195 static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_castph128_ph256(__m128h __a)196 _mm256_castph128_ph256(__m128h __a) {
197   return __builtin_shufflevector(__a, __builtin_nondeterministic_value(__a),
198                                   0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
199 }
200 
201 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_castph128_ph512(__m128h __a)202 _mm512_castph128_ph512(__m128h __a) {
203   __m256h __b = __builtin_nondeterministic_value(__b);
204   return __builtin_shufflevector(
205       __builtin_shufflevector(__a, __builtin_nondeterministic_value(__a),
206                               0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15),
207       __b, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
208       20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
209 }
210 
211 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_castph256_ph512(__m256h __a)212 _mm512_castph256_ph512(__m256h __a) {
213   return __builtin_shufflevector(__a, __builtin_nondeterministic_value(__a), 0,
214                                  1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
215                                  15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
216                                  27, 28, 29, 30, 31);
217 }
218 
/// Constructs a 256-bit floating-point vector of [16 x half] from a
///    128-bit floating-point vector of [8 x half]. The lower 128 bits
///    contain the value of the source vector. The upper 128 bits are set
///    to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic has no corresponding instruction.
///
/// \param __a
///    A 128-bit vector of [8 x half].
/// \returns A 256-bit floating-point vector of [16 x half]. The lower 128 bits
///    contain the value of the parameter. The upper 128 bits are set to zero.
232 static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_zextph128_ph256(__m128h __a)233 _mm256_zextph128_ph256(__m128h __a) {
234   return __builtin_shufflevector(__a, (__v8hf)_mm_setzero_ph(), 0, 1, 2, 3, 4,
235                                  5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
236 }
237 
238 /// Constructs a 512-bit floating-point vector of [32 x half] from a
239 ///    128-bit floating-point vector of [8 x half]. The lower 128 bits
240 ///    contain the value of the source vector. The upper 384 bits are set
241 ///    to zero.
242 ///
243 /// \headerfile <x86intrin.h>
244 ///
245 /// This intrinsic has no corresponding instruction.
246 ///
247 /// \param __a
248 ///    A 128-bit vector of [8 x half].
249 /// \returns A 512-bit floating-point vector of [32 x half]. The lower 128 bits
250 ///    contain the value of the parameter. The upper 384 bits are set to zero.
251 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_zextph128_ph512(__m128h __a)252 _mm512_zextph128_ph512(__m128h __a) {
253   return __builtin_shufflevector(
254       __a, (__v8hf)_mm_setzero_ph(), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
255       13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15);
256 }
257 
258 /// Constructs a 512-bit floating-point vector of [32 x half] from a
259 ///    256-bit floating-point vector of [16 x half]. The lower 256 bits
260 ///    contain the value of the source vector. The upper 256 bits are set
261 ///    to zero.
262 ///
263 /// \headerfile <x86intrin.h>
264 ///
265 /// This intrinsic has no corresponding instruction.
266 ///
267 /// \param __a
268 ///    A 256-bit vector of [16 x half].
269 /// \returns A 512-bit floating-point vector of [32 x half]. The lower 256 bits
270 ///    contain the value of the parameter. The upper 256 bits are set to zero.
271 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_zextph256_ph512(__m256h __a)272 _mm512_zextph256_ph512(__m256h __a) {
273   return __builtin_shufflevector(__a, (__v16hf)_mm256_setzero_ph(), 0, 1, 2, 3,
274                                  4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
275                                  17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
276                                  29, 30, 31);
277 }
278 
/* Calls __builtin_ia32_vcomish on A and B with P and R forwarded as int
 * arguments. A and B are parenthesized before casting so that a compound
 * argument (for example `c ? a : b`) is cast as a whole, and the expansion is
 * parenthesized so it can be used inside a larger expression. */
#define _mm_comi_round_sh(A, B, P, R)                                          \
  (__builtin_ia32_vcomish((__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B),          \
                          (int)(P), (int)(R)))
281 
/* _mm_comi_round_sh with its last argument fixed to
 * _MM_FROUND_CUR_DIRECTION. */
#define _mm_comi_sh(A, B, pred)                                                \
  _mm_comi_round_sh((A), (B), (pred), _MM_FROUND_CUR_DIRECTION)
284 
_mm_comieq_sh(__m128h __A,__m128h __B)285 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comieq_sh(__m128h __A,
286                                                           __m128h __B) {
287   return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_EQ_OS,
288                                 _MM_FROUND_CUR_DIRECTION);
289 }
290 
_mm_comilt_sh(__m128h __A,__m128h __B)291 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comilt_sh(__m128h __A,
292                                                           __m128h __B) {
293   return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_LT_OS,
294                                 _MM_FROUND_CUR_DIRECTION);
295 }
296 
_mm_comile_sh(__m128h __A,__m128h __B)297 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comile_sh(__m128h __A,
298                                                           __m128h __B) {
299   return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_LE_OS,
300                                 _MM_FROUND_CUR_DIRECTION);
301 }
302 
_mm_comigt_sh(__m128h __A,__m128h __B)303 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comigt_sh(__m128h __A,
304                                                           __m128h __B) {
305   return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_GT_OS,
306                                 _MM_FROUND_CUR_DIRECTION);
307 }
308 
_mm_comige_sh(__m128h __A,__m128h __B)309 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comige_sh(__m128h __A,
310                                                           __m128h __B) {
311   return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_GE_OS,
312                                 _MM_FROUND_CUR_DIRECTION);
313 }
314 
_mm_comineq_sh(__m128h __A,__m128h __B)315 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comineq_sh(__m128h __A,
316                                                            __m128h __B) {
317   return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_NEQ_US,
318                                 _MM_FROUND_CUR_DIRECTION);
319 }
320 
_mm_ucomieq_sh(__m128h __A,__m128h __B)321 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomieq_sh(__m128h __A,
322                                                            __m128h __B) {
323   return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_EQ_OQ,
324                                 _MM_FROUND_CUR_DIRECTION);
325 }
326 
_mm_ucomilt_sh(__m128h __A,__m128h __B)327 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomilt_sh(__m128h __A,
328                                                            __m128h __B) {
329   return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_LT_OQ,
330                                 _MM_FROUND_CUR_DIRECTION);
331 }
332 
_mm_ucomile_sh(__m128h __A,__m128h __B)333 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomile_sh(__m128h __A,
334                                                            __m128h __B) {
335   return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_LE_OQ,
336                                 _MM_FROUND_CUR_DIRECTION);
337 }
338 
_mm_ucomigt_sh(__m128h __A,__m128h __B)339 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomigt_sh(__m128h __A,
340                                                            __m128h __B) {
341   return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_GT_OQ,
342                                 _MM_FROUND_CUR_DIRECTION);
343 }
344 
_mm_ucomige_sh(__m128h __A,__m128h __B)345 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomige_sh(__m128h __A,
346                                                            __m128h __B) {
347   return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_GE_OQ,
348                                 _MM_FROUND_CUR_DIRECTION);
349 }
350 
_mm_ucomineq_sh(__m128h __A,__m128h __B)351 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomineq_sh(__m128h __A,
352                                                             __m128h __B) {
353   return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_NEQ_UQ,
354                                 _MM_FROUND_CUR_DIRECTION);
355 }
356 
_mm512_add_ph(__m512h __A,__m512h __B)357 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_add_ph(__m512h __A,
358                                                               __m512h __B) {
359   return (__m512h)((__v32hf)__A + (__v32hf)__B);
360 }
361 
362 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_add_ph(__m512h __W,__mmask32 __U,__m512h __A,__m512h __B)363 _mm512_mask_add_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
364   return (__m512h)__builtin_ia32_selectph_512(
365       (__mmask32)__U, (__v32hf)_mm512_add_ph(__A, __B), (__v32hf)__W);
366 }
367 
368 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_add_ph(__mmask32 __U,__m512h __A,__m512h __B)369 _mm512_maskz_add_ph(__mmask32 __U, __m512h __A, __m512h __B) {
370   return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
371                                               (__v32hf)_mm512_add_ph(__A, __B),
372                                               (__v32hf)_mm512_setzero_ph());
373 }
374 
/* Calls __builtin_ia32_addph512 on A and B; R is cast to int and forwarded
 * as the builtin's last argument. */
#define _mm512_add_round_ph(A, B, R)                                           \
  ((__m512h)__builtin_ia32_addph512((__v32hf)(__m512h)(A),                     \
                                    (__v32hf)(__m512h)(B), (int)(R)))

/* Passes _mm512_add_round_ph(A, B, R) and W to __builtin_ia32_selectph_512
 * under mask U. */
#define _mm512_mask_add_round_ph(W, U, A, B, R)                                \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_add_round_ph((A), (B), (R)),             \
      (__v32hf)(__m512h)(W)))

/* Passes _mm512_add_round_ph(A, B, R) and an all-zero vector to
 * __builtin_ia32_selectph_512 under mask U. */
#define _mm512_maskz_add_round_ph(U, A, B, R)                                  \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_add_round_ph((A), (B), (R)),             \
      (__v32hf)_mm512_setzero_ph()))
388 
_mm512_sub_ph(__m512h __A,__m512h __B)389 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_sub_ph(__m512h __A,
390                                                               __m512h __B) {
391   return (__m512h)((__v32hf)__A - (__v32hf)__B);
392 }
393 
394 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_sub_ph(__m512h __W,__mmask32 __U,__m512h __A,__m512h __B)395 _mm512_mask_sub_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
396   return (__m512h)__builtin_ia32_selectph_512(
397       (__mmask32)__U, (__v32hf)_mm512_sub_ph(__A, __B), (__v32hf)__W);
398 }
399 
400 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_sub_ph(__mmask32 __U,__m512h __A,__m512h __B)401 _mm512_maskz_sub_ph(__mmask32 __U, __m512h __A, __m512h __B) {
402   return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
403                                               (__v32hf)_mm512_sub_ph(__A, __B),
404                                               (__v32hf)_mm512_setzero_ph());
405 }
406 
/* Calls __builtin_ia32_subph512 on A and B; R is cast to int and forwarded
 * as the builtin's last argument. */
#define _mm512_sub_round_ph(A, B, R)                                           \
  ((__m512h)__builtin_ia32_subph512((__v32hf)(__m512h)(A),                     \
                                    (__v32hf)(__m512h)(B), (int)(R)))

/* Passes _mm512_sub_round_ph(A, B, R) and W to __builtin_ia32_selectph_512
 * under mask U. */
#define _mm512_mask_sub_round_ph(W, U, A, B, R)                                \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_sub_round_ph((A), (B), (R)),             \
      (__v32hf)(__m512h)(W)))

/* Passes _mm512_sub_round_ph(A, B, R) and an all-zero vector to
 * __builtin_ia32_selectph_512 under mask U. */
#define _mm512_maskz_sub_round_ph(U, A, B, R)                                  \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_sub_round_ph((A), (B), (R)),             \
      (__v32hf)_mm512_setzero_ph()))
420 
_mm512_mul_ph(__m512h __A,__m512h __B)421 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_mul_ph(__m512h __A,
422                                                               __m512h __B) {
423   return (__m512h)((__v32hf)__A * (__v32hf)__B);
424 }
425 
426 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_mul_ph(__m512h __W,__mmask32 __U,__m512h __A,__m512h __B)427 _mm512_mask_mul_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
428   return (__m512h)__builtin_ia32_selectph_512(
429       (__mmask32)__U, (__v32hf)_mm512_mul_ph(__A, __B), (__v32hf)__W);
430 }
431 
432 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_mul_ph(__mmask32 __U,__m512h __A,__m512h __B)433 _mm512_maskz_mul_ph(__mmask32 __U, __m512h __A, __m512h __B) {
434   return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
435                                               (__v32hf)_mm512_mul_ph(__A, __B),
436                                               (__v32hf)_mm512_setzero_ph());
437 }
438 
/* Calls __builtin_ia32_mulph512 on A and B; R is cast to int and forwarded
 * as the builtin's last argument. */
#define _mm512_mul_round_ph(A, B, R)                                           \
  ((__m512h)__builtin_ia32_mulph512((__v32hf)(__m512h)(A),                     \
                                    (__v32hf)(__m512h)(B), (int)(R)))

/* Passes _mm512_mul_round_ph(A, B, R) and W to __builtin_ia32_selectph_512
 * under mask U. */
#define _mm512_mask_mul_round_ph(W, U, A, B, R)                                \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_mul_round_ph((A), (B), (R)),             \
      (__v32hf)(__m512h)(W)))

/* Passes _mm512_mul_round_ph(A, B, R) and an all-zero vector to
 * __builtin_ia32_selectph_512 under mask U. */
#define _mm512_maskz_mul_round_ph(U, A, B, R)                                  \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_mul_round_ph((A), (B), (R)),             \
      (__v32hf)_mm512_setzero_ph()))
452 
_mm512_div_ph(__m512h __A,__m512h __B)453 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_div_ph(__m512h __A,
454                                                               __m512h __B) {
455   return (__m512h)((__v32hf)__A / (__v32hf)__B);
456 }
457 
458 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_div_ph(__m512h __W,__mmask32 __U,__m512h __A,__m512h __B)459 _mm512_mask_div_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
460   return (__m512h)__builtin_ia32_selectph_512(
461       (__mmask32)__U, (__v32hf)_mm512_div_ph(__A, __B), (__v32hf)__W);
462 }
463 
464 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_div_ph(__mmask32 __U,__m512h __A,__m512h __B)465 _mm512_maskz_div_ph(__mmask32 __U, __m512h __A, __m512h __B) {
466   return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
467                                               (__v32hf)_mm512_div_ph(__A, __B),
468                                               (__v32hf)_mm512_setzero_ph());
469 }
470 
/* Calls __builtin_ia32_divph512 on A and B; R is cast to int and forwarded
 * as the builtin's last argument. */
#define _mm512_div_round_ph(A, B, R)                                           \
  ((__m512h)__builtin_ia32_divph512((__v32hf)(__m512h)(A),                     \
                                    (__v32hf)(__m512h)(B), (int)(R)))

/* Passes _mm512_div_round_ph(A, B, R) and W to __builtin_ia32_selectph_512
 * under mask U. */
#define _mm512_mask_div_round_ph(W, U, A, B, R)                                \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_div_round_ph((A), (B), (R)),             \
      (__v32hf)(__m512h)(W)))

/* Passes _mm512_div_round_ph(A, B, R) and an all-zero vector to
 * __builtin_ia32_selectph_512 under mask U. */
#define _mm512_maskz_div_round_ph(U, A, B, R)                                  \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_div_round_ph((A), (B), (R)),             \
      (__v32hf)_mm512_setzero_ph()))
484 
_mm512_min_ph(__m512h __A,__m512h __B)485 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_min_ph(__m512h __A,
486                                                               __m512h __B) {
487   return (__m512h)__builtin_ia32_minph512((__v32hf)__A, (__v32hf)__B,
488                                           _MM_FROUND_CUR_DIRECTION);
489 }
490 
491 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_min_ph(__m512h __W,__mmask32 __U,__m512h __A,__m512h __B)492 _mm512_mask_min_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
493   return (__m512h)__builtin_ia32_selectph_512(
494       (__mmask32)__U, (__v32hf)_mm512_min_ph(__A, __B), (__v32hf)__W);
495 }
496 
497 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_min_ph(__mmask32 __U,__m512h __A,__m512h __B)498 _mm512_maskz_min_ph(__mmask32 __U, __m512h __A, __m512h __B) {
499   return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
500                                               (__v32hf)_mm512_min_ph(__A, __B),
501                                               (__v32hf)_mm512_setzero_ph());
502 }
503 
/* Calls __builtin_ia32_minph512 on A and B; R is cast to int and forwarded
 * as the builtin's last argument. */
#define _mm512_min_round_ph(A, B, R)                                           \
  ((__m512h)__builtin_ia32_minph512((__v32hf)(__m512h)(A),                     \
                                    (__v32hf)(__m512h)(B), (int)(R)))

/* Passes _mm512_min_round_ph(A, B, R) and W to __builtin_ia32_selectph_512
 * under mask U. */
#define _mm512_mask_min_round_ph(W, U, A, B, R)                                \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_min_round_ph((A), (B), (R)),             \
      (__v32hf)(__m512h)(W)))

/* Passes _mm512_min_round_ph(A, B, R) and an all-zero vector to
 * __builtin_ia32_selectph_512 under mask U. */
#define _mm512_maskz_min_round_ph(U, A, B, R)                                  \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_min_round_ph((A), (B), (R)),             \
      (__v32hf)_mm512_setzero_ph()))
517 
_mm512_max_ph(__m512h __A,__m512h __B)518 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_max_ph(__m512h __A,
519                                                               __m512h __B) {
520   return (__m512h)__builtin_ia32_maxph512((__v32hf)__A, (__v32hf)__B,
521                                           _MM_FROUND_CUR_DIRECTION);
522 }
523 
524 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_max_ph(__m512h __W,__mmask32 __U,__m512h __A,__m512h __B)525 _mm512_mask_max_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
526   return (__m512h)__builtin_ia32_selectph_512(
527       (__mmask32)__U, (__v32hf)_mm512_max_ph(__A, __B), (__v32hf)__W);
528 }
529 
530 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_max_ph(__mmask32 __U,__m512h __A,__m512h __B)531 _mm512_maskz_max_ph(__mmask32 __U, __m512h __A, __m512h __B) {
532   return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
533                                               (__v32hf)_mm512_max_ph(__A, __B),
534                                               (__v32hf)_mm512_setzero_ph());
535 }
536 
/* Calls __builtin_ia32_maxph512 on A and B; R is cast to int and forwarded
 * as the builtin's last argument. */
#define _mm512_max_round_ph(A, B, R)                                           \
  ((__m512h)__builtin_ia32_maxph512((__v32hf)(__m512h)(A),                     \
                                    (__v32hf)(__m512h)(B), (int)(R)))

/* Passes _mm512_max_round_ph(A, B, R) and W to __builtin_ia32_selectph_512
 * under mask U. */
#define _mm512_mask_max_round_ph(W, U, A, B, R)                                \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_max_round_ph((A), (B), (R)),             \
      (__v32hf)(__m512h)(W)))

/* Passes _mm512_max_round_ph(A, B, R) and an all-zero vector to
 * __builtin_ia32_selectph_512 under mask U. */
#define _mm512_maskz_max_round_ph(U, A, B, R)                                  \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_max_round_ph((A), (B), (R)),             \
      (__v32hf)_mm512_setzero_ph()))
550 
_mm512_abs_ph(__m512h __A)551 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_abs_ph(__m512h __A) {
552   return (__m512h)_mm512_and_epi32(_mm512_set1_epi32(0x7FFF7FFF), (__m512i)__A);
553 }
554 
_mm512_conj_pch(__m512h __A)555 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_conj_pch(__m512h __A) {
556   return (__m512h)_mm512_xor_epi32((__m512i)__A,
557                                    _mm512_set1_epi32(-2147483648));
558 }
559 
560 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_conj_pch(__m512h __W,__mmask16 __U,__m512h __A)561 _mm512_mask_conj_pch(__m512h __W, __mmask16 __U, __m512h __A) {
562   return (__m512h)__builtin_ia32_selectps_512(
563       (__mmask16)__U, (__v16sf)_mm512_conj_pch(__A), (__v16sf)__W);
564 }
565 
566 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_conj_pch(__mmask16 __U,__m512h __A)567 _mm512_maskz_conj_pch(__mmask16 __U, __m512h __A) {
568   return (__m512h)__builtin_ia32_selectps_512((__mmask16)__U,
569                                               (__v16sf)_mm512_conj_pch(__A),
570                                               (__v16sf)_mm512_setzero_ps());
571 }
572 
_mm_add_sh(__m128h __A,__m128h __B)573 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_add_sh(__m128h __A,
574                                                            __m128h __B) {
575   __A[0] += __B[0];
576   return __A;
577 }
578 
_mm_mask_add_sh(__m128h __W,__mmask8 __U,__m128h __A,__m128h __B)579 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_add_sh(__m128h __W,
580                                                                 __mmask8 __U,
581                                                                 __m128h __A,
582                                                                 __m128h __B) {
583   __A = _mm_add_sh(__A, __B);
584   return __builtin_ia32_selectsh_128(__U, __A, __W);
585 }
586 
_mm_maskz_add_sh(__mmask8 __U,__m128h __A,__m128h __B)587 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_add_sh(__mmask8 __U,
588                                                                  __m128h __A,
589                                                                  __m128h __B) {
590   __A = _mm_add_sh(__A, __B);
591   return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
592 }
593 
/* Calls __builtin_ia32_addsh_round_mask on A and B with an all-zero third
 * operand and an all-ones mask ((__mmask8)-1); R is cast to int and forwarded
 * as the last argument. */
#define _mm_add_round_sh(A, B, R)                                              \
  ((__m128h)__builtin_ia32_addsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(R)))

/* Same builtin call with W as the third operand and U as the mask. */
#define _mm_mask_add_round_sh(W, U, A, B, R)                                   \
  ((__m128h)__builtin_ia32_addsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(R)))

/* Same builtin call with an all-zero third operand and U as the mask. */
#define _mm_maskz_add_round_sh(U, A, B, R)                                     \
  ((__m128h)__builtin_ia32_addsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(R)))
608 
_mm_sub_sh(__m128h __A,__m128h __B)609 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sub_sh(__m128h __A,
610                                                            __m128h __B) {
611   __A[0] -= __B[0];
612   return __A;
613 }
614 
_mm_mask_sub_sh(__m128h __W,__mmask8 __U,__m128h __A,__m128h __B)615 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sub_sh(__m128h __W,
616                                                                 __mmask8 __U,
617                                                                 __m128h __A,
618                                                                 __m128h __B) {
619   __A = _mm_sub_sh(__A, __B);
620   return __builtin_ia32_selectsh_128(__U, __A, __W);
621 }
622 
_mm_maskz_sub_sh(__mmask8 __U,__m128h __A,__m128h __B)623 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sub_sh(__mmask8 __U,
624                                                                  __m128h __A,
625                                                                  __m128h __B) {
626   __A = _mm_sub_sh(__A, __B);
627   return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
628 }
629 
// Scalar half-precision subtract forwarded to
// __builtin_ia32_subsh_round_mask. Arguments are, in order: A, B, passthrough
// vector, mask, and R cast to int.
// Unmasked form: zero passthrough, all-ones mask.
#define _mm_sub_round_sh(A, B, R) \
  ((__m128h)__builtin_ia32_subsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, \
      (int)(R)))

// Masked form: W is the passthrough vector, U the mask.
#define _mm_mask_sub_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_subsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)(__m128h)(W), \
      (__mmask8)(U), \
      (int)(R)))

// Zero-masked form: zero passthrough, U the mask.
#define _mm_maskz_sub_round_sh(U, A, B, R) \
  ((__m128h)__builtin_ia32_subsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), \
      (int)(R)))
644 
_mm_mul_sh(__m128h __A,__m128h __B)645 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mul_sh(__m128h __A,
646                                                            __m128h __B) {
647   __A[0] *= __B[0];
648   return __A;
649 }
650 
_mm_mask_mul_sh(__m128h __W,__mmask8 __U,__m128h __A,__m128h __B)651 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_mul_sh(__m128h __W,
652                                                                 __mmask8 __U,
653                                                                 __m128h __A,
654                                                                 __m128h __B) {
655   __A = _mm_mul_sh(__A, __B);
656   return __builtin_ia32_selectsh_128(__U, __A, __W);
657 }
658 
_mm_maskz_mul_sh(__mmask8 __U,__m128h __A,__m128h __B)659 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_mul_sh(__mmask8 __U,
660                                                                  __m128h __A,
661                                                                  __m128h __B) {
662   __A = _mm_mul_sh(__A, __B);
663   return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
664 }
665 
// Scalar half-precision multiply forwarded to
// __builtin_ia32_mulsh_round_mask. Arguments are, in order: A, B, passthrough
// vector, mask, and R cast to int.
// Unmasked form: zero passthrough, all-ones mask.
#define _mm_mul_round_sh(A, B, R) \
  ((__m128h)__builtin_ia32_mulsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, \
      (int)(R)))

// Masked form: W is the passthrough vector, U the mask.
#define _mm_mask_mul_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_mulsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)(__m128h)(W), \
      (__mmask8)(U), \
      (int)(R)))

// Zero-masked form: zero passthrough, U the mask.
#define _mm_maskz_mul_round_sh(U, A, B, R) \
  ((__m128h)__builtin_ia32_mulsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), \
      (int)(R)))
680 
_mm_div_sh(__m128h __A,__m128h __B)681 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_div_sh(__m128h __A,
682                                                            __m128h __B) {
683   __A[0] /= __B[0];
684   return __A;
685 }
686 
_mm_mask_div_sh(__m128h __W,__mmask8 __U,__m128h __A,__m128h __B)687 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_div_sh(__m128h __W,
688                                                                 __mmask8 __U,
689                                                                 __m128h __A,
690                                                                 __m128h __B) {
691   __A = _mm_div_sh(__A, __B);
692   return __builtin_ia32_selectsh_128(__U, __A, __W);
693 }
694 
_mm_maskz_div_sh(__mmask8 __U,__m128h __A,__m128h __B)695 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_div_sh(__mmask8 __U,
696                                                                  __m128h __A,
697                                                                  __m128h __B) {
698   __A = _mm_div_sh(__A, __B);
699   return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
700 }
701 
// Scalar half-precision divide forwarded to __builtin_ia32_divsh_round_mask.
// Arguments are, in order: A, B, passthrough vector, mask, and R cast to int.
// Unmasked form: zero passthrough, all-ones mask.
#define _mm_div_round_sh(A, B, R) \
  ((__m128h)__builtin_ia32_divsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, \
      (int)(R)))

// Masked form: W is the passthrough vector, U the mask.
#define _mm_mask_div_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_divsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)(__m128h)(W), \
      (__mmask8)(U), \
      (int)(R)))

// Zero-masked form: zero passthrough, U the mask.
#define _mm_maskz_div_round_sh(U, A, B, R) \
  ((__m128h)__builtin_ia32_divsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), \
      (int)(R)))
716 
_mm_min_sh(__m128h __A,__m128h __B)717 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_min_sh(__m128h __A,
718                                                            __m128h __B) {
719   return (__m128h)__builtin_ia32_minsh_round_mask(
720       (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
721       _MM_FROUND_CUR_DIRECTION);
722 }
723 
_mm_mask_min_sh(__m128h __W,__mmask8 __U,__m128h __A,__m128h __B)724 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_min_sh(__m128h __W,
725                                                                 __mmask8 __U,
726                                                                 __m128h __A,
727                                                                 __m128h __B) {
728   return (__m128h)__builtin_ia32_minsh_round_mask((__v8hf)__A, (__v8hf)__B,
729                                                   (__v8hf)__W, (__mmask8)__U,
730                                                   _MM_FROUND_CUR_DIRECTION);
731 }
732 
_mm_maskz_min_sh(__mmask8 __U,__m128h __A,__m128h __B)733 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_min_sh(__mmask8 __U,
734                                                                  __m128h __A,
735                                                                  __m128h __B) {
736   return (__m128h)__builtin_ia32_minsh_round_mask(
737       (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
738       _MM_FROUND_CUR_DIRECTION);
739 }
740 
// Scalar half-precision minimum forwarded to
// __builtin_ia32_minsh_round_mask. Arguments are, in order: A, B, passthrough
// vector, mask, and R cast to int.
// Unmasked form: zero passthrough, all-ones mask.
#define _mm_min_round_sh(A, B, R) \
  ((__m128h)__builtin_ia32_minsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, \
      (int)(R)))

// Masked form: W is the passthrough vector, U the mask.
#define _mm_mask_min_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_minsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)(__m128h)(W), \
      (__mmask8)(U), \
      (int)(R)))

// Zero-masked form: zero passthrough, U the mask.
#define _mm_maskz_min_round_sh(U, A, B, R) \
  ((__m128h)__builtin_ia32_minsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), \
      (int)(R)))
755 
_mm_max_sh(__m128h __A,__m128h __B)756 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_max_sh(__m128h __A,
757                                                            __m128h __B) {
758   return (__m128h)__builtin_ia32_maxsh_round_mask(
759       (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
760       _MM_FROUND_CUR_DIRECTION);
761 }
762 
_mm_mask_max_sh(__m128h __W,__mmask8 __U,__m128h __A,__m128h __B)763 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_max_sh(__m128h __W,
764                                                                 __mmask8 __U,
765                                                                 __m128h __A,
766                                                                 __m128h __B) {
767   return (__m128h)__builtin_ia32_maxsh_round_mask((__v8hf)__A, (__v8hf)__B,
768                                                   (__v8hf)__W, (__mmask8)__U,
769                                                   _MM_FROUND_CUR_DIRECTION);
770 }
771 
_mm_maskz_max_sh(__mmask8 __U,__m128h __A,__m128h __B)772 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_max_sh(__mmask8 __U,
773                                                                  __m128h __A,
774                                                                  __m128h __B) {
775   return (__m128h)__builtin_ia32_maxsh_round_mask(
776       (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
777       _MM_FROUND_CUR_DIRECTION);
778 }
779 
// Scalar half-precision maximum forwarded to
// __builtin_ia32_maxsh_round_mask. Arguments are, in order: A, B, passthrough
// vector, mask, and R cast to int.
// Unmasked form: zero passthrough, all-ones mask.
#define _mm_max_round_sh(A, B, R) \
  ((__m128h)__builtin_ia32_maxsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, \
      (int)(R)))

// Masked form: W is the passthrough vector, U the mask.
#define _mm_mask_max_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_maxsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)(__m128h)(W), \
      (__mmask8)(U), \
      (int)(R)))

// Zero-masked form: zero passthrough, U the mask.
#define _mm_maskz_max_round_sh(U, A, B, R) \
  ((__m128h)__builtin_ia32_maxsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), \
      (int)(R)))
794 
// 512-bit half-precision compares producing a __mmask32, forwarded to
// __builtin_ia32_cmpph512_mask. P is the predicate (cast to int), R the
// rounding/exception argument (cast to int).
// Unmasked form: all-ones input mask.
#define _mm512_cmp_round_ph_mask(A, B, P, R) \
  ((__mmask32)__builtin_ia32_cmpph512_mask( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      (int)(P), \
      (__mmask32)-1, \
      (int)(R)))

// Masked form: U is passed as the input mask.
#define _mm512_mask_cmp_round_ph_mask(U, A, B, P, R) \
  ((__mmask32)__builtin_ia32_cmpph512_mask( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      (int)(P), \
      (__mmask32)(U), \
      (int)(R)))

// Same as _mm512_cmp_round_ph_mask with R fixed to _MM_FROUND_CUR_DIRECTION.
#define _mm512_cmp_ph_mask(A, B, P) \
  _mm512_cmp_round_ph_mask( \
      (A), (B), (P), _MM_FROUND_CUR_DIRECTION)

// Same as _mm512_mask_cmp_round_ph_mask with R fixed to
// _MM_FROUND_CUR_DIRECTION.
#define _mm512_mask_cmp_ph_mask(U, A, B, P) \
  _mm512_mask_cmp_round_ph_mask( \
      (U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION)
810 
// Scalar half-precision compares producing a __mmask8, forwarded to
// __builtin_ia32_cmpsh_mask. P is the predicate (cast to int), R the
// rounding/exception argument (cast to int).
// Unmasked form with explicit R: all-ones input mask.
#define _mm_cmp_round_sh_mask(X, Y, P, R) \
  ((__mmask8)__builtin_ia32_cmpsh_mask( \
      (__v8hf)(__m128h)(X), \
      (__v8hf)(__m128h)(Y), \
      (int)(P), \
      (__mmask8)-1, \
      (int)(R)))

// Masked form with explicit R: M is passed as the input mask.
#define _mm_mask_cmp_round_sh_mask(M, X, Y, P, R) \
  ((__mmask8)__builtin_ia32_cmpsh_mask( \
      (__v8hf)(__m128h)(X), \
      (__v8hf)(__m128h)(Y), \
      (int)(P), \
      (__mmask8)(M), \
      (int)(R)))

// Unmasked form using _MM_FROUND_CUR_DIRECTION.
#define _mm_cmp_sh_mask(X, Y, P) \
  ((__mmask8)__builtin_ia32_cmpsh_mask( \
      (__v8hf)(__m128h)(X), \
      (__v8hf)(__m128h)(Y), \
      (int)(P), \
      (__mmask8)-1, \
      _MM_FROUND_CUR_DIRECTION))

// Masked form using _MM_FROUND_CUR_DIRECTION.
#define _mm_mask_cmp_sh_mask(M, X, Y, P) \
  ((__mmask8)__builtin_ia32_cmpsh_mask( \
      (__v8hf)(__m128h)(X), \
      (__v8hf)(__m128h)(Y), \
      (int)(P), \
      (__mmask8)(M), \
      _MM_FROUND_CUR_DIRECTION))
830 // loads with vmovsh:
_mm_load_sh(void const * __dp)831 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_load_sh(void const *__dp) {
832   struct __mm_load_sh_struct {
833     _Float16 __u;
834   } __attribute__((__packed__, __may_alias__));
835   _Float16 __u = ((const struct __mm_load_sh_struct *)__dp)->__u;
836   return (__m128h){__u, 0, 0, 0, 0, 0, 0, 0};
837 }
838 
839 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_load_sh(__m128h __W,__mmask8 __U,const void * __A)840 _mm_mask_load_sh(__m128h __W, __mmask8 __U, const void *__A) {
841   __m128h src = (__v8hf)__builtin_shufflevector(
842       (__v8hf)__W, (__v8hf)_mm_setzero_ph(), 0, 8, 8, 8, 8, 8, 8, 8);
843 
844   return (__m128h)__builtin_ia32_loadsh128_mask((const __v8hf *)__A, src, __U & 1);
845 }
846 
847 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_load_sh(__mmask8 __U,const void * __A)848 _mm_maskz_load_sh(__mmask8 __U, const void *__A) {
849   return (__m128h)__builtin_ia32_loadsh128_mask(
850       (const __v8hf *)__A, (__v8hf)_mm_setzero_ph(), __U & 1);
851 }
852 
853 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_load_ph(void const * __p)854 _mm512_load_ph(void const *__p) {
855   return *(const __m512h *)__p;
856 }
857 
858 static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_load_ph(void const * __p)859 _mm256_load_ph(void const *__p) {
860   return *(const __m256h *)__p;
861 }
862 
_mm_load_ph(void const * __p)863 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_load_ph(void const *__p) {
864   return *(const __m128h *)__p;
865 }
866 
867 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_loadu_ph(void const * __p)868 _mm512_loadu_ph(void const *__p) {
869   struct __loadu_ph {
870     __m512h_u __v;
871   } __attribute__((__packed__, __may_alias__));
872   return ((const struct __loadu_ph *)__p)->__v;
873 }
874 
875 static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_loadu_ph(void const * __p)876 _mm256_loadu_ph(void const *__p) {
877   struct __loadu_ph {
878     __m256h_u __v;
879   } __attribute__((__packed__, __may_alias__));
880   return ((const struct __loadu_ph *)__p)->__v;
881 }
882 
_mm_loadu_ph(void const * __p)883 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_loadu_ph(void const *__p) {
884   struct __loadu_ph {
885     __m128h_u __v;
886   } __attribute__((__packed__, __may_alias__));
887   return ((const struct __loadu_ph *)__p)->__v;
888 }
889 
890 // stores with vmovsh:
_mm_store_sh(void * __dp,__m128h __a)891 static __inline__ void __DEFAULT_FN_ATTRS128 _mm_store_sh(void *__dp,
892                                                           __m128h __a) {
893   struct __mm_store_sh_struct {
894     _Float16 __u;
895   } __attribute__((__packed__, __may_alias__));
896   ((struct __mm_store_sh_struct *)__dp)->__u = __a[0];
897 }
898 
_mm_mask_store_sh(void * __W,__mmask8 __U,__m128h __A)899 static __inline__ void __DEFAULT_FN_ATTRS128 _mm_mask_store_sh(void *__W,
900                                                                __mmask8 __U,
901                                                                __m128h __A) {
902   __builtin_ia32_storesh128_mask((__v8hf *)__W, __A, __U & 1);
903 }
904 
_mm512_store_ph(void * __P,__m512h __A)905 static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_store_ph(void *__P,
906                                                              __m512h __A) {
907   *(__m512h *)__P = __A;
908 }
909 
_mm256_store_ph(void * __P,__m256h __A)910 static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_store_ph(void *__P,
911                                                              __m256h __A) {
912   *(__m256h *)__P = __A;
913 }
914 
_mm_store_ph(void * __P,__m128h __A)915 static __inline__ void __DEFAULT_FN_ATTRS128 _mm_store_ph(void *__P,
916                                                           __m128h __A) {
917   *(__m128h *)__P = __A;
918 }
919 
_mm512_storeu_ph(void * __P,__m512h __A)920 static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_storeu_ph(void *__P,
921                                                               __m512h __A) {
922   struct __storeu_ph {
923     __m512h_u __v;
924   } __attribute__((__packed__, __may_alias__));
925   ((struct __storeu_ph *)__P)->__v = __A;
926 }
927 
_mm256_storeu_ph(void * __P,__m256h __A)928 static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_storeu_ph(void *__P,
929                                                               __m256h __A) {
930   struct __storeu_ph {
931     __m256h_u __v;
932   } __attribute__((__packed__, __may_alias__));
933   ((struct __storeu_ph *)__P)->__v = __A;
934 }
935 
_mm_storeu_ph(void * __P,__m128h __A)936 static __inline__ void __DEFAULT_FN_ATTRS128 _mm_storeu_ph(void *__P,
937                                                            __m128h __A) {
938   struct __storeu_ph {
939     __m128h_u __v;
940   } __attribute__((__packed__, __may_alias__));
941   ((struct __storeu_ph *)__P)->__v = __A;
942 }
943 
944 // moves with vmovsh:
_mm_move_sh(__m128h __a,__m128h __b)945 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_move_sh(__m128h __a,
946                                                             __m128h __b) {
947   __a[0] = __b[0];
948   return __a;
949 }
950 
_mm_mask_move_sh(__m128h __W,__mmask8 __U,__m128h __A,__m128h __B)951 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_move_sh(__m128h __W,
952                                                                  __mmask8 __U,
953                                                                  __m128h __A,
954                                                                  __m128h __B) {
955   return __builtin_ia32_selectsh_128(__U, _mm_move_sh(__A, __B), __W);
956 }
957 
_mm_maskz_move_sh(__mmask8 __U,__m128h __A,__m128h __B)958 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_move_sh(__mmask8 __U,
959                                                                   __m128h __A,
960                                                                   __m128h __B) {
961   return __builtin_ia32_selectsh_128(__U, _mm_move_sh(__A, __B),
962                                      _mm_setzero_ph());
963 }
964 
965 // vmovw:
_mm_cvtsi16_si128(short __a)966 static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtsi16_si128(short __a) {
967   return (__m128i)(__v8hi){__a, 0, 0, 0, 0, 0, 0, 0};
968 }
969 
_mm_cvtsi128_si16(__m128i __a)970 static __inline__ short __DEFAULT_FN_ATTRS128 _mm_cvtsi128_si16(__m128i __a) {
971   __v8hi __b = (__v8hi)__a;
972   return __b[0];
973 }
974 
_mm512_rcp_ph(__m512h __A)975 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_rcp_ph(__m512h __A) {
976   return (__m512h)__builtin_ia32_rcpph512_mask(
977       (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1);
978 }
979 
980 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_rcp_ph(__m512h __W,__mmask32 __U,__m512h __A)981 _mm512_mask_rcp_ph(__m512h __W, __mmask32 __U, __m512h __A) {
982   return (__m512h)__builtin_ia32_rcpph512_mask((__v32hf)__A, (__v32hf)__W,
983                                                (__mmask32)__U);
984 }
985 
986 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_rcp_ph(__mmask32 __U,__m512h __A)987 _mm512_maskz_rcp_ph(__mmask32 __U, __m512h __A) {
988   return (__m512h)__builtin_ia32_rcpph512_mask(
989       (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U);
990 }
991 
_mm512_rsqrt_ph(__m512h __A)992 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_rsqrt_ph(__m512h __A) {
993   return (__m512h)__builtin_ia32_rsqrtph512_mask(
994       (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1);
995 }
996 
997 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_rsqrt_ph(__m512h __W,__mmask32 __U,__m512h __A)998 _mm512_mask_rsqrt_ph(__m512h __W, __mmask32 __U, __m512h __A) {
999   return (__m512h)__builtin_ia32_rsqrtph512_mask((__v32hf)__A, (__v32hf)__W,
1000                                                  (__mmask32)__U);
1001 }
1002 
1003 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_rsqrt_ph(__mmask32 __U,__m512h __A)1004 _mm512_maskz_rsqrt_ph(__mmask32 __U, __m512h __A) {
1005   return (__m512h)__builtin_ia32_rsqrtph512_mask(
1006       (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U);
1007 }
1008 
// 512-bit half-precision getmant wrappers around
// __builtin_ia32_getmantph512_mask. B and C are packed into one int
// immediate as ((C) << 2) | (B).
// Unmasked: undefined passthrough, all-ones mask, _MM_FROUND_CUR_DIRECTION.
#define _mm512_getmant_ph(A, B, C) \
  ((__m512h)__builtin_ia32_getmantph512_mask( \
      (__v32hf)(__m512h)(A), \
      (int)(((C) << 2) | (B)), \
      (__v32hf)_mm512_undefined_ph(), \
      (__mmask32)-1, \
      _MM_FROUND_CUR_DIRECTION))

// Masked: W is the passthrough vector, U the mask.
#define _mm512_mask_getmant_ph(W, U, A, B, C) \
  ((__m512h)__builtin_ia32_getmantph512_mask( \
      (__v32hf)(__m512h)(A), \
      (int)(((C) << 2) | (B)), \
      (__v32hf)(__m512h)(W), \
      (__mmask32)(U), \
      _MM_FROUND_CUR_DIRECTION))

// Zero-masked: zero passthrough, U the mask.
#define _mm512_maskz_getmant_ph(U, A, B, C) \
  ((__m512h)__builtin_ia32_getmantph512_mask( \
      (__v32hf)(__m512h)(A), \
      (int)(((C) << 2) | (B)), \
      (__v32hf)_mm512_setzero_ph(), \
      (__mmask32)(U), \
      _MM_FROUND_CUR_DIRECTION))

// Unmasked with explicit R (cast to int) as the last builtin argument.
#define _mm512_getmant_round_ph(A, B, C, R) \
  ((__m512h)__builtin_ia32_getmantph512_mask( \
      (__v32hf)(__m512h)(A), \
      (int)(((C) << 2) | (B)), \
      (__v32hf)_mm512_undefined_ph(), \
      (__mmask32)-1, \
      (int)(R)))

// Masked with explicit R.
#define _mm512_mask_getmant_round_ph(W, U, A, B, C, R) \
  ((__m512h)__builtin_ia32_getmantph512_mask( \
      (__v32hf)(__m512h)(A), \
      (int)(((C) << 2) | (B)), \
      (__v32hf)(__m512h)(W), \
      (__mmask32)(U), \
      (int)(R)))

// Zero-masked with explicit R.
#define _mm512_maskz_getmant_round_ph(U, A, B, C, R) \
  ((__m512h)__builtin_ia32_getmantph512_mask( \
      (__v32hf)(__m512h)(A), \
      (int)(((C) << 2) | (B)), \
      (__v32hf)_mm512_setzero_ph(), \
      (__mmask32)(U), \
      (int)(R)))
1039 
_mm512_getexp_ph(__m512h __A)1040 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_getexp_ph(__m512h __A) {
1041   return (__m512h)__builtin_ia32_getexpph512_mask(
1042       (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1,
1043       _MM_FROUND_CUR_DIRECTION);
1044 }
1045 
1046 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_getexp_ph(__m512h __W,__mmask32 __U,__m512h __A)1047 _mm512_mask_getexp_ph(__m512h __W, __mmask32 __U, __m512h __A) {
1048   return (__m512h)__builtin_ia32_getexpph512_mask(
1049       (__v32hf)__A, (__v32hf)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
1050 }
1051 
1052 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_getexp_ph(__mmask32 __U,__m512h __A)1053 _mm512_maskz_getexp_ph(__mmask32 __U, __m512h __A) {
1054   return (__m512h)__builtin_ia32_getexpph512_mask(
1055       (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
1056       _MM_FROUND_CUR_DIRECTION);
1057 }
1058 
// 512-bit half-precision getexp wrappers around
// __builtin_ia32_getexpph512_mask with an explicit R (cast to int) as the
// last builtin argument.
// Unmasked: undefined passthrough, all-ones mask.
#define _mm512_getexp_round_ph(A, R) \
  ((__m512h)__builtin_ia32_getexpph512_mask( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)_mm512_undefined_ph(), \
      (__mmask32)-1, \
      (int)(R)))

// Masked: W is the passthrough vector, U the mask.
#define _mm512_mask_getexp_round_ph(W, U, A, R) \
  ((__m512h)__builtin_ia32_getexpph512_mask( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(W), \
      (__mmask32)(U), \
      (int)(R)))

// Zero-masked: zero passthrough, U the mask.
#define _mm512_maskz_getexp_round_ph(U, A, R) \
  ((__m512h)__builtin_ia32_getexpph512_mask( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)_mm512_setzero_ph(), \
      (__mmask32)(U), \
      (int)(R)))
1072 
_mm512_scalef_ph(__m512h __A,__m512h __B)1073 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_scalef_ph(__m512h __A,
1074                                                                  __m512h __B) {
1075   return (__m512h)__builtin_ia32_scalefph512_mask(
1076       (__v32hf)__A, (__v32hf)__B, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1,
1077       _MM_FROUND_CUR_DIRECTION);
1078 }
1079 
1080 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_scalef_ph(__m512h __W,__mmask32 __U,__m512h __A,__m512h __B)1081 _mm512_mask_scalef_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
1082   return (__m512h)__builtin_ia32_scalefph512_mask((__v32hf)__A, (__v32hf)__B,
1083                                                   (__v32hf)__W, (__mmask32)__U,
1084                                                   _MM_FROUND_CUR_DIRECTION);
1085 }
1086 
1087 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_scalef_ph(__mmask32 __U,__m512h __A,__m512h __B)1088 _mm512_maskz_scalef_ph(__mmask32 __U, __m512h __A, __m512h __B) {
1089   return (__m512h)__builtin_ia32_scalefph512_mask(
1090       (__v32hf)__A, (__v32hf)__B, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
1091       _MM_FROUND_CUR_DIRECTION);
1092 }
1093 
// 512-bit half-precision scalef wrappers around
// __builtin_ia32_scalefph512_mask with an explicit R (cast to int) as the
// last builtin argument.
// Unmasked: undefined passthrough, all-ones mask.
#define _mm512_scalef_round_ph(A, B, R) \
  ((__m512h)__builtin_ia32_scalefph512_mask( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      (__v32hf)_mm512_undefined_ph(), \
      (__mmask32)-1, \
      (int)(R)))

// Masked: W is the passthrough vector, U the mask.
#define _mm512_mask_scalef_round_ph(W, U, A, B, R) \
  ((__m512h)__builtin_ia32_scalefph512_mask( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      (__v32hf)(__m512h)(W), \
      (__mmask32)(U), \
      (int)(R)))

// Zero-masked: zero passthrough, U the mask.
#define _mm512_maskz_scalef_round_ph(U, A, B, R) \
  ((__m512h)__builtin_ia32_scalefph512_mask( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      (__v32hf)_mm512_setzero_ph(), \
      (__mmask32)(U), \
      (int)(R)))
1108 
// Unmasked 512-bit half-precision roundscale: forwards A and the immediate to
// __builtin_ia32_rndscaleph_mask with an all-ones mask and
// _MM_FROUND_CUR_DIRECTION. The passthrough operand is an undefined vector
// rather than a second expansion of A, so A is evaluated exactly once; the
// mask is all-ones, so the passthrough is presumably never selected.
#define _mm512_roundscale_ph(A, imm)                                           \
  ((__m512h)__builtin_ia32_rndscaleph_mask(                                    \
      (__v32hf)(__m512h)(A), (int)(imm), (__v32hf)_mm512_undefined_ph(),       \
      (__mmask32)-1, _MM_FROUND_CUR_DIRECTION))
1113 
// 512-bit half-precision roundscale wrappers around
// __builtin_ia32_rndscaleph_mask. Builtin arguments are, in order: source
// vector, int immediate, passthrough vector, mask, rounding argument.
// Masked: W is the passthrough vector, U the mask.
#define _mm512_mask_roundscale_ph(W, U, A, imm) \
  ((__m512h)__builtin_ia32_rndscaleph_mask( \
      (__v32hf)(__m512h)(A), \
      (int)(imm), \
      (__v32hf)(__m512h)(W), \
      (__mmask32)(U), \
      _MM_FROUND_CUR_DIRECTION))

// Zero-masked: zero passthrough, U the mask.
#define _mm512_maskz_roundscale_ph(U, A, imm) \
  ((__m512h)__builtin_ia32_rndscaleph_mask( \
      (__v32hf)(__m512h)(A), \
      (int)(imm), \
      (__v32hf)_mm512_setzero_ph(), \
      (__mmask32)(U), \
      _MM_FROUND_CUR_DIRECTION))

// Masked with explicit R (cast to int).
#define _mm512_mask_roundscale_round_ph(W, U, A, imm, R) \
  ((__m512h)__builtin_ia32_rndscaleph_mask( \
      (__v32hf)(__m512h)(A), \
      (int)(imm), \
      (__v32hf)(__m512h)(W), \
      (__mmask32)(U), \
      (int)(R)))

// Zero-masked with explicit R.
#define _mm512_maskz_roundscale_round_ph(U, A, imm, R) \
  ((__m512h)__builtin_ia32_rndscaleph_mask( \
      (__v32hf)(__m512h)(A), \
      (int)(imm), \
      (__v32hf)_mm512_setzero_ph(), \
      (__mmask32)(U), \
      (int)(R)))

// Unmasked with explicit R: undefined passthrough, all-ones mask.
#define _mm512_roundscale_round_ph(A, imm, R) \
  ((__m512h)__builtin_ia32_rndscaleph_mask( \
      (__v32hf)(__m512h)(A), \
      (int)(imm), \
      (__v32hf)_mm512_undefined_ph(), \
      (__mmask32)-1, \
      (int)(R)))
1138 
// 512-bit half-precision reduce wrappers around
// __builtin_ia32_reduceph512_mask. Builtin arguments are, in order: source
// vector, int immediate, passthrough vector, mask, rounding argument.
// Unmasked: undefined passthrough, all-ones mask, _MM_FROUND_CUR_DIRECTION.
#define _mm512_reduce_ph(A, imm) \
  ((__m512h)__builtin_ia32_reduceph512_mask( \
      (__v32hf)(__m512h)(A), \
      (int)(imm), \
      (__v32hf)_mm512_undefined_ph(), \
      (__mmask32)-1, \
      _MM_FROUND_CUR_DIRECTION))

// Masked: W is the passthrough vector, U the mask.
#define _mm512_mask_reduce_ph(W, U, A, imm) \
  ((__m512h)__builtin_ia32_reduceph512_mask( \
      (__v32hf)(__m512h)(A), \
      (int)(imm), \
      (__v32hf)(__m512h)(W), \
      (__mmask32)(U), \
      _MM_FROUND_CUR_DIRECTION))

// Zero-masked: zero passthrough, U the mask.
#define _mm512_maskz_reduce_ph(U, A, imm) \
  ((__m512h)__builtin_ia32_reduceph512_mask( \
      (__v32hf)(__m512h)(A), \
      (int)(imm), \
      (__v32hf)_mm512_setzero_ph(), \
      (__mmask32)(U), \
      _MM_FROUND_CUR_DIRECTION))

// Masked with explicit R (cast to int).
#define _mm512_mask_reduce_round_ph(W, U, A, imm, R) \
  ((__m512h)__builtin_ia32_reduceph512_mask( \
      (__v32hf)(__m512h)(A), \
      (int)(imm), \
      (__v32hf)(__m512h)(W), \
      (__mmask32)(U), \
      (int)(R)))

// Zero-masked with explicit R.
#define _mm512_maskz_reduce_round_ph(U, A, imm, R) \
  ((__m512h)__builtin_ia32_reduceph512_mask( \
      (__v32hf)(__m512h)(A), \
      (int)(imm), \
      (__v32hf)_mm512_setzero_ph(), \
      (__mmask32)(U), \
      (int)(R)))

// Unmasked with explicit R: undefined passthrough, all-ones mask.
#define _mm512_reduce_round_ph(A, imm, R) \
  ((__m512h)__builtin_ia32_reduceph512_mask( \
      (__v32hf)(__m512h)(A), \
      (int)(imm), \
      (__v32hf)_mm512_undefined_ph(), \
      (__mmask32)-1, \
      (int)(R)))
1168 
_mm_rcp_sh(__m128h __A,__m128h __B)1169 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_rcp_sh(__m128h __A,
1170                                                            __m128h __B) {
1171   return (__m128h)__builtin_ia32_rcpsh_mask(
1172       (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1);
1173 }
1174 
_mm_mask_rcp_sh(__m128h __W,__mmask8 __U,__m128h __A,__m128h __B)1175 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_rcp_sh(__m128h __W,
1176                                                                 __mmask8 __U,
1177                                                                 __m128h __A,
1178                                                                 __m128h __B) {
1179   return (__m128h)__builtin_ia32_rcpsh_mask((__v8hf)__A, (__v8hf)__B,
1180                                             (__v8hf)__W, (__mmask8)__U);
1181 }
1182 
_mm_maskz_rcp_sh(__mmask8 __U,__m128h __A,__m128h __B)1183 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_rcp_sh(__mmask8 __U,
1184                                                                  __m128h __A,
1185                                                                  __m128h __B) {
1186   return (__m128h)__builtin_ia32_rcpsh_mask(
1187       (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
1188 }
1189 
_mm_rsqrt_sh(__m128h __A,__m128h __B)1190 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_rsqrt_sh(__m128h __A,
1191                                                              __m128h __B) {
1192   return (__m128h)__builtin_ia32_rsqrtsh_mask(
1193       (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1);
1194 }
1195 
_mm_mask_rsqrt_sh(__m128h __W,__mmask8 __U,__m128h __A,__m128h __B)1196 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_rsqrt_sh(__m128h __W,
1197                                                                   __mmask8 __U,
1198                                                                   __m128h __A,
1199                                                                   __m128h __B) {
1200   return (__m128h)__builtin_ia32_rsqrtsh_mask((__v8hf)__A, (__v8hf)__B,
1201                                               (__v8hf)__W, (__mmask8)__U);
1202 }
1203 
1204 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_rsqrt_sh(__mmask8 __U,__m128h __A,__m128h __B)1205 _mm_maskz_rsqrt_sh(__mmask8 __U, __m128h __A, __m128h __B) {
1206   return (__m128h)__builtin_ia32_rsqrtsh_mask(
1207       (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
1208 }
1209 
/* Unmasked form with rounding argument R. C and D are packed into a single
 * immediate as ((D) << 2) | (C); zero passthrough, all-ones mask. */
#define _mm_getmant_round_sh(A, B, C, D, R) \
  ((__m128h)__builtin_ia32_getmantsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (int)(((D) << 2) | (C)), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, \
      (int)(R)))
1214 
/* Unmasked form using _MM_FROUND_CUR_DIRECTION. C and D are packed into a
 * single immediate as ((D) << 2) | (C); zero passthrough, all-ones mask. */
#define _mm_getmant_sh(A, B, C, D) \
  ((__m128h)__builtin_ia32_getmantsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (int)(((D) << 2) | (C)), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, \
      _MM_FROUND_CUR_DIRECTION))
1219 
/* Masked form using _MM_FROUND_CUR_DIRECTION. C and D are packed into a
 * single immediate as ((D) << 2) | (C); W is the passthrough, U the mask. */
#define _mm_mask_getmant_sh(W, U, A, B, C, D) \
  ((__m128h)__builtin_ia32_getmantsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (int)(((D) << 2) | (C)), \
      (__v8hf)(__m128h)(W), \
      (__mmask8)(U), \
      _MM_FROUND_CUR_DIRECTION))
1224 
/* Masked form with rounding argument R. C and D are packed into a single
 * immediate as ((D) << 2) | (C); W is the passthrough, U the mask. */
#define _mm_mask_getmant_round_sh(W, U, A, B, C, D, R) \
  ((__m128h)__builtin_ia32_getmantsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (int)(((D) << 2) | (C)), \
      (__v8hf)(__m128h)(W), \
      (__mmask8)(U), \
      (int)(R)))
1229 
/* Zero-masked form using _MM_FROUND_CUR_DIRECTION. C and D are packed into a
 * single immediate as ((D) << 2) | (C); zero passthrough, U is the mask. */
#define _mm_maskz_getmant_sh(U, A, B, C, D) \
  ((__m128h)__builtin_ia32_getmantsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (int)(((D) << 2) | (C)), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), \
      _MM_FROUND_CUR_DIRECTION))
1234 
/* Zero-masked form with rounding argument R. C and D are packed into a
 * single immediate as ((D) << 2) | (C); zero passthrough, U is the mask. */
#define _mm_maskz_getmant_round_sh(U, A, B, C, D, R) \
  ((__m128h)__builtin_ia32_getmantsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (int)(((D) << 2) | (C)), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), \
      (int)(R)))
1239 
/* Unmasked form with rounding argument R: zero passthrough vector and an
 * all-ones mask. */
#define _mm_getexp_round_sh(A, B, R) \
  ((__m128h)__builtin_ia32_getexpsh128_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, \
      (int)(R)))
1244 
_mm_getexp_sh(__m128h __A,__m128h __B)1245 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_getexp_sh(__m128h __A,
1246                                                               __m128h __B) {
1247   return (__m128h)__builtin_ia32_getexpsh128_round_mask(
1248       (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
1249       _MM_FROUND_CUR_DIRECTION);
1250 }
1251 
1252 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_getexp_sh(__m128h __W,__mmask8 __U,__m128h __A,__m128h __B)1253 _mm_mask_getexp_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
1254   return (__m128h)__builtin_ia32_getexpsh128_round_mask(
1255       (__v8hf)__A, (__v8hf)__B, (__v8hf)__W, (__mmask8)__U,
1256       _MM_FROUND_CUR_DIRECTION);
1257 }
1258 
/* Masked form with rounding argument R: W is the passthrough operand and U
 * the mask. */
#define _mm_mask_getexp_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_getexpsh128_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)(__m128h)(W), \
      (__mmask8)(U), \
      (int)(R)))
1263 
1264 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_getexp_sh(__mmask8 __U,__m128h __A,__m128h __B)1265 _mm_maskz_getexp_sh(__mmask8 __U, __m128h __A, __m128h __B) {
1266   return (__m128h)__builtin_ia32_getexpsh128_round_mask(
1267       (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
1268       _MM_FROUND_CUR_DIRECTION);
1269 }
1270 
/* Zero-masked form with rounding argument R: zero passthrough vector, U is
 * the mask. */
#define _mm_maskz_getexp_round_sh(U, A, B, R) \
  ((__m128h)__builtin_ia32_getexpsh128_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), \
      (int)(R)))
1275 
/* Unmasked form with rounding argument R: zero passthrough vector and an
 * all-ones mask. */
#define _mm_scalef_round_sh(A, B, R) \
  ((__m128h)__builtin_ia32_scalefsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, \
      (int)(R)))
1280 
_mm_scalef_sh(__m128h __A,__m128h __B)1281 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_scalef_sh(__m128h __A,
1282                                                               __m128h __B) {
1283   return (__m128h)__builtin_ia32_scalefsh_round_mask(
1284       (__v8hf)__A, (__v8hf)(__B), (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
1285       _MM_FROUND_CUR_DIRECTION);
1286 }
1287 
1288 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_scalef_sh(__m128h __W,__mmask8 __U,__m128h __A,__m128h __B)1289 _mm_mask_scalef_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
1290   return (__m128h)__builtin_ia32_scalefsh_round_mask((__v8hf)__A, (__v8hf)__B,
1291                                                      (__v8hf)__W, (__mmask8)__U,
1292                                                      _MM_FROUND_CUR_DIRECTION);
1293 }
1294 
/* Masked form with rounding argument R: W is the passthrough operand and U
 * the mask. */
#define _mm_mask_scalef_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_scalefsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)(__m128h)(W), \
      (__mmask8)(U), \
      (int)(R)))
1299 
1300 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_scalef_sh(__mmask8 __U,__m128h __A,__m128h __B)1301 _mm_maskz_scalef_sh(__mmask8 __U, __m128h __A, __m128h __B) {
1302   return (__m128h)__builtin_ia32_scalefsh_round_mask(
1303       (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
1304       _MM_FROUND_CUR_DIRECTION);
1305 }
1306 
/* Zero-masked form with rounding argument R: zero passthrough vector, U is
 * the mask. */
#define _mm_maskz_scalef_round_sh(U, A, B, R) \
  ((__m128h)__builtin_ia32_scalefsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), \
      (int)(R)))
1311 
/* Unmasked form with immediate imm and rounding argument R: zero passthrough
 * vector and an all-ones mask. */
#define _mm_roundscale_round_sh(A, B, imm, R) \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, \
      (int)(imm), \
      (int)(R)))
1316 
/* Unmasked form with immediate imm, using _MM_FROUND_CUR_DIRECTION: zero
 * passthrough vector and an all-ones mask. */
#define _mm_roundscale_sh(A, B, imm) \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, \
      (int)(imm), \
      _MM_FROUND_CUR_DIRECTION))
1321 
/* Masked form with immediate I, using _MM_FROUND_CUR_DIRECTION: W is the
 * passthrough operand and U the mask. */
#define _mm_mask_roundscale_sh(W, U, A, B, I) \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)(__m128h)(W), \
      (__mmask8)(U), \
      (int)(I), \
      _MM_FROUND_CUR_DIRECTION))
1326 
/* Masked form with immediate I and rounding argument R: W is the passthrough
 * operand and U the mask. */
#define _mm_mask_roundscale_round_sh(W, U, A, B, I, R) \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)(__m128h)(W), \
      (__mmask8)(U), \
      (int)(I), \
      (int)(R)))
1331 
/* Zero-masked form with immediate I, using _MM_FROUND_CUR_DIRECTION: zero
 * passthrough vector, U is the mask. */
#define _mm_maskz_roundscale_sh(U, A, B, I) \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), \
      (int)(I), \
      _MM_FROUND_CUR_DIRECTION))
1336 
/* Zero-masked form with immediate I and rounding argument R: zero
 * passthrough vector, U is the mask. */
#define _mm_maskz_roundscale_round_sh(U, A, B, I, R) \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), \
      (int)(I), \
      (int)(R)))
1341 
/* Unmasked form with immediate C, using _MM_FROUND_CUR_DIRECTION: zero
 * passthrough vector and an all-ones mask. */
#define _mm_reduce_sh(A, B, C) \
  ((__m128h)__builtin_ia32_reducesh_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, \
      (int)(C), \
      _MM_FROUND_CUR_DIRECTION))
1346 
/* Masked form with immediate C, using _MM_FROUND_CUR_DIRECTION: W is the
 * passthrough operand and U the mask. */
#define _mm_mask_reduce_sh(W, U, A, B, C) \
  ((__m128h)__builtin_ia32_reducesh_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)(__m128h)(W), \
      (__mmask8)(U), \
      (int)(C), \
      _MM_FROUND_CUR_DIRECTION))
1351 
/* Zero-masked form with immediate C, using _MM_FROUND_CUR_DIRECTION: zero
 * passthrough vector, U is the mask. */
#define _mm_maskz_reduce_sh(U, A, B, C) \
  ((__m128h)__builtin_ia32_reducesh_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), \
      (int)(C), \
      _MM_FROUND_CUR_DIRECTION))
1356 
/* Unmasked form with immediate C and rounding argument R: zero passthrough
 * vector and an all-ones mask. */
#define _mm_reduce_round_sh(A, B, C, R) \
  ((__m128h)__builtin_ia32_reducesh_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, \
      (int)(C), \
      (int)(R)))
1361 
/* Masked form with immediate C and rounding argument R: W is the passthrough
 * operand and U the mask. */
#define _mm_mask_reduce_round_sh(W, U, A, B, C, R) \
  ((__m128h)__builtin_ia32_reducesh_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)(__m128h)(W), \
      (__mmask8)(U), \
      (int)(C), \
      (int)(R)))
1366 
/* Zero-masked form with immediate C and rounding argument R: zero
 * passthrough vector, U is the mask. */
#define _mm_maskz_reduce_round_sh(U, A, B, C, R) \
  ((__m128h)__builtin_ia32_reducesh_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), \
      (int)(C), \
      (int)(R)))
1371 
/* Calls __builtin_ia32_sqrtph512 on A with the caller-supplied rounding
 * argument R. */
#define _mm512_sqrt_round_ph(A, R) \
  ((__m512h)__builtin_ia32_sqrtph512( \
      (__v32hf)(__m512h)(A), \
      (int)(R)))
1374 
/* Masked form built from a select: under mask U, chooses between the result
 * of _mm512_sqrt_round_ph(A, R) and W. */
#define _mm512_mask_sqrt_round_ph(W, U, A, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), \
      (__v32hf)_mm512_sqrt_round_ph((A), (R)), \
      (__v32hf)(__m512h)(W)))
1379 
/* Zero-masked form built from a select: under mask U, chooses between the
 * result of _mm512_sqrt_round_ph(A, R) and a zero vector. */
#define _mm512_maskz_sqrt_round_ph(U, A, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), \
      (__v32hf)_mm512_sqrt_round_ph((A), (R)), \
      (__v32hf)_mm512_setzero_ph()))
1384 
_mm512_sqrt_ph(__m512h __A)1385 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_sqrt_ph(__m512h __A) {
1386   return (__m512h)__builtin_ia32_sqrtph512((__v32hf)__A,
1387                                            _MM_FROUND_CUR_DIRECTION);
1388 }
1389 
1390 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_sqrt_ph(__m512h __W,__mmask32 __U,__m512h __A)1391 _mm512_mask_sqrt_ph(__m512h __W, __mmask32 __U, __m512h __A) {
1392   return (__m512h)__builtin_ia32_selectph_512(
1393       (__mmask32)(__U),
1394       (__v32hf)__builtin_ia32_sqrtph512((__A), (_MM_FROUND_CUR_DIRECTION)),
1395       (__v32hf)(__m512h)(__W));
1396 }
1397 
1398 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_sqrt_ph(__mmask32 __U,__m512h __A)1399 _mm512_maskz_sqrt_ph(__mmask32 __U, __m512h __A) {
1400   return (__m512h)__builtin_ia32_selectph_512(
1401       (__mmask32)(__U),
1402       (__v32hf)__builtin_ia32_sqrtph512((__A), (_MM_FROUND_CUR_DIRECTION)),
1403       (__v32hf)_mm512_setzero_ph());
1404 }
1405 
/* Unmasked form with rounding argument R: zero passthrough vector and an
 * all-ones mask. */
#define _mm_sqrt_round_sh(A, B, R) \
  ((__m128h)__builtin_ia32_sqrtsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, \
      (int)(R)))
1410 
/* Masked form with rounding argument R: W is the passthrough operand and U
 * the mask. */
#define _mm_mask_sqrt_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_sqrtsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)(__m128h)(W), \
      (__mmask8)(U), \
      (int)(R)))
1415 
/* Zero-masked form with rounding argument R: zero passthrough vector, U is
 * the mask. */
#define _mm_maskz_sqrt_round_sh(U, A, B, R) \
  ((__m128h)__builtin_ia32_sqrtsh_round_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), \
      (int)(R)))
1420 
_mm_sqrt_sh(__m128h __A,__m128h __B)1421 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sqrt_sh(__m128h __A,
1422                                                             __m128h __B) {
1423   return (__m128h)__builtin_ia32_sqrtsh_round_mask(
1424       (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)_mm_setzero_ph(),
1425       (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
1426 }
1427 
_mm_mask_sqrt_sh(__m128h __W,__mmask32 __U,__m128h __A,__m128h __B)1428 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sqrt_sh(__m128h __W,
1429                                                                  __mmask32 __U,
1430                                                                  __m128h __A,
1431                                                                  __m128h __B) {
1432   return (__m128h)__builtin_ia32_sqrtsh_round_mask(
1433       (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)(__m128h)(__W),
1434       (__mmask8)(__U), _MM_FROUND_CUR_DIRECTION);
1435 }
1436 
_mm_maskz_sqrt_sh(__mmask32 __U,__m128h __A,__m128h __B)1437 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sqrt_sh(__mmask32 __U,
1438                                                                   __m128h __A,
1439                                                                   __m128h __B) {
1440   return (__m128h)__builtin_ia32_sqrtsh_round_mask(
1441       (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)_mm_setzero_ph(),
1442       (__mmask8)(__U), _MM_FROUND_CUR_DIRECTION);
1443 }
1444 
/* Masked form: calls __builtin_ia32_fpclassph512_mask on A with immediate
 * imm and mask U; the result is a __mmask32. */
#define _mm512_mask_fpclass_ph_mask(U, A, imm) \
  ((__mmask32)__builtin_ia32_fpclassph512_mask( \
      (__v32hf)(__m512h)(A), \
      (int)(imm), \
      (__mmask32)(U)))
1448 
/* Unmasked form: calls __builtin_ia32_fpclassph512_mask on A with immediate
 * imm and an all-ones mask; the result is a __mmask32. */
#define _mm512_fpclass_ph_mask(A, imm) \
  ((__mmask32)__builtin_ia32_fpclassph512_mask( \
      (__v32hf)(__m512h)(A), \
      (int)(imm), \
      (__mmask32)-1))
1452 
/* Unmasked form: calls __builtin_ia32_fpclasssh_mask on A with immediate imm
 * and an all-ones mask; the result is a __mmask8. */
#define _mm_fpclass_sh_mask(A, imm) \
  ((__mmask8)__builtin_ia32_fpclasssh_mask( \
      (__v8hf)(__m128h)(A), \
      (int)(imm), \
      (__mmask8)-1))
1456 
/* Masked form: calls __builtin_ia32_fpclasssh_mask on A with immediate imm
 * and mask U; the result is a __mmask8. */
#define _mm_mask_fpclass_sh_mask(U, A, imm) \
  ((__mmask8)__builtin_ia32_fpclasssh_mask( \
      (__v8hf)(__m128h)(A), \
      (int)(imm), \
      (__mmask8)(U)))
1460 
/* Unmasked conversion of a __v8df operand to __m128h with rounding argument
 * R: undefined passthrough operand and an all-ones mask. */
#define _mm512_cvt_roundpd_ph(A, R) \
  ((__m128h)__builtin_ia32_vcvtpd2ph512_mask( \
      (__v8df)(A), \
      (__v8hf)_mm_undefined_ph(), \
      (__mmask8)(-1), \
      (int)(R)))
1464 
/* Masked conversion of a __v8df operand to __m128h with rounding argument R:
 * W is the passthrough operand and U the mask. */
#define _mm512_mask_cvt_roundpd_ph(W, U, A, R) \
  ((__m128h)__builtin_ia32_vcvtpd2ph512_mask( \
      (__v8df)(A), \
      (__v8hf)(W), \
      (__mmask8)(U), \
      (int)(R)))
1468 
/* Zero-masked conversion of a __v8df operand to __m128h with rounding
 * argument R: zero passthrough vector, U is the mask. */
#define _mm512_maskz_cvt_roundpd_ph(U, A, R) \
  ((__m128h)__builtin_ia32_vcvtpd2ph512_mask( \
      (__v8df)(A), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), \
      (int)(R)))
1472 
_mm512_cvtpd_ph(__m512d __A)1473 static __inline__ __m128h __DEFAULT_FN_ATTRS512 _mm512_cvtpd_ph(__m512d __A) {
1474   return (__m128h)__builtin_ia32_vcvtpd2ph512_mask(
1475       (__v8df)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
1476       _MM_FROUND_CUR_DIRECTION);
1477 }
1478 
1479 static __inline__ __m128h __DEFAULT_FN_ATTRS512
_mm512_mask_cvtpd_ph(__m128h __W,__mmask8 __U,__m512d __A)1480 _mm512_mask_cvtpd_ph(__m128h __W, __mmask8 __U, __m512d __A) {
1481   return (__m128h)__builtin_ia32_vcvtpd2ph512_mask(
1482       (__v8df)__A, (__v8hf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
1483 }
1484 
1485 static __inline__ __m128h __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtpd_ph(__mmask8 __U,__m512d __A)1486 _mm512_maskz_cvtpd_ph(__mmask8 __U, __m512d __A) {
1487   return (__m128h)__builtin_ia32_vcvtpd2ph512_mask(
1488       (__v8df)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
1489       _MM_FROUND_CUR_DIRECTION);
1490 }
1491 
/* Unmasked conversion of a __v8hf operand to __m512d with rounding argument
 * R: undefined passthrough operand and an all-ones mask. */
#define _mm512_cvt_roundph_pd(A, R) \
  ((__m512d)__builtin_ia32_vcvtph2pd512_mask( \
      (__v8hf)(A), \
      (__v8df)_mm512_undefined_pd(), \
      (__mmask8)(-1), \
      (int)(R)))
1495 
/* Masked conversion of a __v8hf operand to __m512d with rounding argument R:
 * W is the passthrough operand and U the mask. */
#define _mm512_mask_cvt_roundph_pd(W, U, A, R) \
  ((__m512d)__builtin_ia32_vcvtph2pd512_mask( \
      (__v8hf)(A), \
      (__v8df)(W), \
      (__mmask8)(U), \
      (int)(R)))
1499 
/* Zero-masked conversion of a __v8hf operand to __m512d with rounding
 * argument R: zero passthrough vector, U is the mask. */
#define _mm512_maskz_cvt_roundph_pd(U, A, R) \
  ((__m512d)__builtin_ia32_vcvtph2pd512_mask( \
      (__v8hf)(A), \
      (__v8df)_mm512_setzero_pd(), \
      (__mmask8)(U), \
      (int)(R)))
1503 
_mm512_cvtph_pd(__m128h __A)1504 static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_cvtph_pd(__m128h __A) {
1505   return (__m512d)__builtin_ia32_vcvtph2pd512_mask(
1506       (__v8hf)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)-1,
1507       _MM_FROUND_CUR_DIRECTION);
1508 }
1509 
1510 static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_cvtph_pd(__m512d __W,__mmask8 __U,__m128h __A)1511 _mm512_mask_cvtph_pd(__m512d __W, __mmask8 __U, __m128h __A) {
1512   return (__m512d)__builtin_ia32_vcvtph2pd512_mask(
1513       (__v8hf)__A, (__v8df)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
1514 }
1515 
1516 static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtph_pd(__mmask8 __U,__m128h __A)1517 _mm512_maskz_cvtph_pd(__mmask8 __U, __m128h __A) {
1518   return (__m512d)__builtin_ia32_vcvtph2pd512_mask(
1519       (__v8hf)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U,
1520       _MM_FROUND_CUR_DIRECTION);
1521 }
1522 
/* Unmasked form with rounding argument R: A is a __v4sf operand, B a __v8hf
 * operand; undefined passthrough operand and an all-ones mask. */
#define _mm_cvt_roundsh_ss(A, B, R) \
  ((__m128)__builtin_ia32_vcvtsh2ss_round_mask( \
      (__v4sf)(A), \
      (__v8hf)(B), \
      (__v4sf)_mm_undefined_ps(), \
      (__mmask8)(-1), \
      (int)(R)))
1527 
/* Masked form with rounding argument R: A is a __v4sf operand, B a __v8hf
 * operand; W is the passthrough operand and U the mask. */
#define _mm_mask_cvt_roundsh_ss(W, U, A, B, R) \
  ((__m128)__builtin_ia32_vcvtsh2ss_round_mask( \
      (__v4sf)(A), \
      (__v8hf)(B), \
      (__v4sf)(W), \
      (__mmask8)(U), \
      (int)(R)))
1531 
/* Zero-masked form with rounding argument R: A is a __v4sf operand, B a
 * __v8hf operand; zero passthrough vector, U is the mask. */
#define _mm_maskz_cvt_roundsh_ss(U, A, B, R) \
  ((__m128)__builtin_ia32_vcvtsh2ss_round_mask( \
      (__v4sf)(A), \
      (__v8hf)(B), \
      (__v4sf)_mm_setzero_ps(), \
      (__mmask8)(U), \
      (int)(R)))
1536 
_mm_cvtsh_ss(__m128 __A,__m128h __B)1537 static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtsh_ss(__m128 __A,
1538                                                             __m128h __B) {
1539   return (__m128)__builtin_ia32_vcvtsh2ss_round_mask(
1540       (__v4sf)__A, (__v8hf)__B, (__v4sf)_mm_undefined_ps(), (__mmask8)-1,
1541       _MM_FROUND_CUR_DIRECTION);
1542 }
1543 
_mm_mask_cvtsh_ss(__m128 __W,__mmask8 __U,__m128 __A,__m128h __B)1544 static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_cvtsh_ss(__m128 __W,
1545                                                                  __mmask8 __U,
1546                                                                  __m128 __A,
1547                                                                  __m128h __B) {
1548   return (__m128)__builtin_ia32_vcvtsh2ss_round_mask((__v4sf)__A, (__v8hf)__B,
1549                                                      (__v4sf)__W, (__mmask8)__U,
1550                                                      _MM_FROUND_CUR_DIRECTION);
1551 }
1552 
_mm_maskz_cvtsh_ss(__mmask8 __U,__m128 __A,__m128h __B)1553 static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_cvtsh_ss(__mmask8 __U,
1554                                                                   __m128 __A,
1555                                                                   __m128h __B) {
1556   return (__m128)__builtin_ia32_vcvtsh2ss_round_mask(
1557       (__v4sf)__A, (__v8hf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U,
1558       _MM_FROUND_CUR_DIRECTION);
1559 }
1560 
/* Unmasked form with rounding argument R: A is a __v8hf operand, B a __v4sf
 * operand; undefined passthrough operand and an all-ones mask. */
#define _mm_cvt_roundss_sh(A, B, R) \
  ((__m128h)__builtin_ia32_vcvtss2sh_round_mask( \
      (__v8hf)(A), \
      (__v4sf)(B), \
      (__v8hf)_mm_undefined_ph(), \
      (__mmask8)(-1), \
      (int)(R)))
1565 
/* Masked form with rounding argument R: A is a __v8hf operand, B a __v4sf
 * operand; W is the passthrough operand and U the mask. */
#define _mm_mask_cvt_roundss_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_vcvtss2sh_round_mask( \
      (__v8hf)(A), \
      (__v4sf)(B), \
      (__v8hf)(W), \
      (__mmask8)(U), \
      (int)(R)))
1569 
/* Zero-masked form with rounding argument R: A is a __v8hf operand, B a
 * __v4sf operand; zero passthrough vector, U is the mask. */
#define _mm_maskz_cvt_roundss_sh(U, A, B, R) \
  ((__m128h)__builtin_ia32_vcvtss2sh_round_mask( \
      (__v8hf)(A), \
      (__v4sf)(B), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), \
      (int)(R)))
1574 
_mm_cvtss_sh(__m128h __A,__m128 __B)1575 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtss_sh(__m128h __A,
1576                                                              __m128 __B) {
1577   return (__m128h)__builtin_ia32_vcvtss2sh_round_mask(
1578       (__v8hf)__A, (__v4sf)__B, (__v8hf)_mm_undefined_ph(), (__mmask8)-1,
1579       _MM_FROUND_CUR_DIRECTION);
1580 }
1581 
_mm_mask_cvtss_sh(__m128h __W,__mmask8 __U,__m128h __A,__m128 __B)1582 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_cvtss_sh(__m128h __W,
1583                                                                   __mmask8 __U,
1584                                                                   __m128h __A,
1585                                                                   __m128 __B) {
1586   return (__m128h)__builtin_ia32_vcvtss2sh_round_mask(
1587       (__v8hf)__A, (__v4sf)__B, (__v8hf)__W, (__mmask8)__U,
1588       _MM_FROUND_CUR_DIRECTION);
1589 }
1590 
_mm_maskz_cvtss_sh(__mmask8 __U,__m128h __A,__m128 __B)1591 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_cvtss_sh(__mmask8 __U,
1592                                                                    __m128h __A,
1593                                                                    __m128 __B) {
1594   return (__m128h)__builtin_ia32_vcvtss2sh_round_mask(
1595       (__v8hf)__A, (__v4sf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
1596       _MM_FROUND_CUR_DIRECTION);
1597 }
1598 
/* Unmasked form with rounding argument R: A is a __v8hf operand, B a __v2df
 * operand; undefined passthrough operand and an all-ones mask. */
#define _mm_cvt_roundsd_sh(A, B, R) \
  ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask( \
      (__v8hf)(A), \
      (__v2df)(B), \
      (__v8hf)_mm_undefined_ph(), \
      (__mmask8)(-1), \
      (int)(R)))
1603 
/* Masked form with rounding argument R: A is a __v8hf operand, B a __v2df
 * operand; W is the passthrough operand and U the mask. */
#define _mm_mask_cvt_roundsd_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask( \
      (__v8hf)(A), \
      (__v2df)(B), \
      (__v8hf)(W), \
      (__mmask8)(U), \
      (int)(R)))
1607 
/* Zero-masked form with rounding argument R: A is a __v8hf operand, B a
 * __v2df operand; zero passthrough vector, U is the mask. */
#define _mm_maskz_cvt_roundsd_sh(U, A, B, R) \
  ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask( \
      (__v8hf)(A), \
      (__v2df)(B), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), \
      (int)(R)))
1612 
_mm_cvtsd_sh(__m128h __A,__m128d __B)1613 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtsd_sh(__m128h __A,
1614                                                              __m128d __B) {
1615   return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask(
1616       (__v8hf)__A, (__v2df)__B, (__v8hf)_mm_undefined_ph(), (__mmask8)-1,
1617       _MM_FROUND_CUR_DIRECTION);
1618 }
1619 
_mm_mask_cvtsd_sh(__m128h __W,__mmask8 __U,__m128h __A,__m128d __B)1620 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_cvtsd_sh(__m128h __W,
1621                                                                   __mmask8 __U,
1622                                                                   __m128h __A,
1623                                                                   __m128d __B) {
1624   return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask(
1625       (__v8hf)__A, (__v2df)__B, (__v8hf)__W, (__mmask8)__U,
1626       _MM_FROUND_CUR_DIRECTION);
1627 }
1628 
1629 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_cvtsd_sh(__mmask8 __U,__m128h __A,__m128d __B)1630 _mm_maskz_cvtsd_sh(__mmask8 __U, __m128h __A, __m128d __B) {
1631   return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask(
1632       (__v8hf)__A, (__v2df)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
1633       _MM_FROUND_CUR_DIRECTION);
1634 }
1635 
/* Unmasked form with rounding argument R: A is a __v2df operand, B a __v8hf
 * operand; undefined passthrough operand and an all-ones mask. */
#define _mm_cvt_roundsh_sd(A, B, R) \
  ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask( \
      (__v2df)(A), \
      (__v8hf)(B), \
      (__v2df)_mm_undefined_pd(), \
      (__mmask8)(-1), \
      (int)(R)))
1640 
/* Masked form with rounding argument R: A is a __v2df operand, B a __v8hf
 * operand; W is the passthrough operand and U the mask. */
#define _mm_mask_cvt_roundsh_sd(W, U, A, B, R) \
  ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask( \
      (__v2df)(A), \
      (__v8hf)(B), \
      (__v2df)(W), \
      (__mmask8)(U), \
      (int)(R)))
1644 
/* Zero-masked form with rounding argument R: A is a __v2df operand, B a
 * __v8hf operand; zero passthrough vector, U is the mask. */
#define _mm_maskz_cvt_roundsh_sd(U, A, B, R) \
  ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask( \
      (__v2df)(A), \
      (__v8hf)(B), \
      (__v2df)_mm_setzero_pd(), \
      (__mmask8)(U), \
      (int)(R)))
1649 
_mm_cvtsh_sd(__m128d __A,__m128h __B)1650 static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_cvtsh_sd(__m128d __A,
1651                                                              __m128h __B) {
1652   return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask(
1653       (__v2df)__A, (__v8hf)__B, (__v2df)_mm_undefined_pd(), (__mmask8)-1,
1654       _MM_FROUND_CUR_DIRECTION);
1655 }
1656 
_mm_mask_cvtsh_sd(__m128d __W,__mmask8 __U,__m128d __A,__m128h __B)1657 static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_cvtsh_sd(__m128d __W,
1658                                                                   __mmask8 __U,
1659                                                                   __m128d __A,
1660                                                                   __m128h __B) {
1661   return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask(
1662       (__v2df)__A, (__v8hf)__B, (__v2df)__W, (__mmask8)__U,
1663       _MM_FROUND_CUR_DIRECTION);
1664 }
1665 
1666 static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_maskz_cvtsh_sd(__mmask8 __U,__m128d __A,__m128h __B)1667 _mm_maskz_cvtsh_sd(__mmask8 __U, __m128d __A, __m128h __B) {
1668   return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask(
1669       (__v2df)__A, (__v8hf)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U,
1670       _MM_FROUND_CUR_DIRECTION);
1671 }
1672 
/* Unmasked conversion of a __v32hf operand to __m512i with rounding argument
 * R: undefined passthrough operand (viewed as __v32hi), all-ones mask. */
#define _mm512_cvt_roundph_epi16(A, R) \
  ((__m512i)__builtin_ia32_vcvtph2w512_mask( \
      (__v32hf)(A), \
      (__v32hi)_mm512_undefined_epi32(), \
      (__mmask32)(-1), \
      (int)(R)))
1677 
/* Masked conversion of a __v32hf operand to __m512i with rounding argument
 * R: W (viewed as __v32hi) is the passthrough operand and U the mask. */
#define _mm512_mask_cvt_roundph_epi16(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2w512_mask( \
      (__v32hf)(A), \
      (__v32hi)(W), \
      (__mmask32)(U), \
      (int)(R)))
1681 
/* Zero-masked conversion of a __v32hf operand to __m512i with rounding
 * argument R: zero passthrough (viewed as __v32hi), U is the mask. */
#define _mm512_maskz_cvt_roundph_epi16(U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2w512_mask( \
      (__v32hf)(A), \
      (__v32hi)_mm512_setzero_epi32(), \
      (__mmask32)(U), \
      (int)(R)))
1686 
1687 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtph_epi16(__m512h __A)1688 _mm512_cvtph_epi16(__m512h __A) {
1689   return (__m512i)__builtin_ia32_vcvtph2w512_mask(
1690       (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)-1,
1691       _MM_FROUND_CUR_DIRECTION);
1692 }
1693 
1694 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtph_epi16(__m512i __W,__mmask32 __U,__m512h __A)1695 _mm512_mask_cvtph_epi16(__m512i __W, __mmask32 __U, __m512h __A) {
1696   return (__m512i)__builtin_ia32_vcvtph2w512_mask(
1697       (__v32hf)__A, (__v32hi)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
1698 }
1699 
1700 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtph_epi16(__mmask32 __U,__m512h __A)1701 _mm512_maskz_cvtph_epi16(__mmask32 __U, __m512h __A) {
1702   return (__m512i)__builtin_ia32_vcvtph2w512_mask(
1703       (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)__U,
1704       _MM_FROUND_CUR_DIRECTION);
1705 }
1706 
/* Explicit-rounding macro forms of __builtin_ia32_vcvttph2w512_mask; R is
 * forwarded as the last argument.  The plain form passes an undefined vector
 * as the second operand with an all-ones mask, the mask form passes W and U,
 * and the maskz form passes a zeroed vector and U. */
#define _mm512_cvtt_roundph_epi16(A, R) \
  ((__m512i)__builtin_ia32_vcvttph2w512_mask( \
      (__v32hf)(A), (__v32hi)_mm512_undefined_epi32(), \
      (__mmask32)(-1), (int)(R)))

#define _mm512_mask_cvtt_roundph_epi16(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvttph2w512_mask( \
      (__v32hf)(A), (__v32hi)(W), \
      (__mmask32)(U), (int)(R)))

#define _mm512_maskz_cvtt_roundph_epi16(U, A, R) \
  ((__m512i)__builtin_ia32_vcvttph2w512_mask( \
      (__v32hf)(A), (__v32hi)_mm512_setzero_epi32(), \
      (__mmask32)(U), (int)(R)))
1720 
1721 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvttph_epi16(__m512h __A)1722 _mm512_cvttph_epi16(__m512h __A) {
1723   return (__m512i)__builtin_ia32_vcvttph2w512_mask(
1724       (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)-1,
1725       _MM_FROUND_CUR_DIRECTION);
1726 }
1727 
1728 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvttph_epi16(__m512i __W,__mmask32 __U,__m512h __A)1729 _mm512_mask_cvttph_epi16(__m512i __W, __mmask32 __U, __m512h __A) {
1730   return (__m512i)__builtin_ia32_vcvttph2w512_mask(
1731       (__v32hf)__A, (__v32hi)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
1732 }
1733 
1734 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvttph_epi16(__mmask32 __U,__m512h __A)1735 _mm512_maskz_cvttph_epi16(__mmask32 __U, __m512h __A) {
1736   return (__m512i)__builtin_ia32_vcvttph2w512_mask(
1737       (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)__U,
1738       _MM_FROUND_CUR_DIRECTION);
1739 }
1740 
/* Explicit-rounding macro forms of __builtin_ia32_vcvtw2ph512_mask; R is
 * forwarded as the rounding argument.  The plain form passes an undefined
 * vector as the second operand with an all-ones mask, the mask form passes W
 * and U, and the maskz form passes a zeroed vector and U. */
#define _mm512_cvt_roundepi16_ph(A, R) \
  ((__m512h)__builtin_ia32_vcvtw2ph512_mask( \
      (__v32hi)(A), (__v32hf)_mm512_undefined_ph(), \
      (__mmask32)(-1), (int)(R)))

#define _mm512_mask_cvt_roundepi16_ph(W, U, A, R) \
  ((__m512h)__builtin_ia32_vcvtw2ph512_mask( \
      (__v32hi)(A), (__v32hf)(W), \
      (__mmask32)(U), (int)(R)))

#define _mm512_maskz_cvt_roundepi16_ph(U, A, R) \
  ((__m512h)__builtin_ia32_vcvtw2ph512_mask( \
      (__v32hi)(A), (__v32hf)_mm512_setzero_ph(), \
      (__mmask32)(U), (int)(R)))
1753 
1754 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_cvtepi16_ph(__m512i __A)1755 _mm512_cvtepi16_ph(__m512i __A) {
1756   return (__m512h)__builtin_ia32_vcvtw2ph512_mask(
1757       (__v32hi)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)-1,
1758       _MM_FROUND_CUR_DIRECTION);
1759 }
1760 
1761 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepi16_ph(__m512h __W,__mmask32 __U,__m512i __A)1762 _mm512_mask_cvtepi16_ph(__m512h __W, __mmask32 __U, __m512i __A) {
1763   return (__m512h)__builtin_ia32_vcvtw2ph512_mask(
1764       (__v32hi)__A, (__v32hf)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
1765 }
1766 
1767 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepi16_ph(__mmask32 __U,__m512i __A)1768 _mm512_maskz_cvtepi16_ph(__mmask32 __U, __m512i __A) {
1769   return (__m512h)__builtin_ia32_vcvtw2ph512_mask(
1770       (__v32hi)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
1771       _MM_FROUND_CUR_DIRECTION);
1772 }
1773 
/* Explicit-rounding macro forms of __builtin_ia32_vcvtph2uw512_mask; R is
 * forwarded as the rounding argument.  The plain form passes an undefined
 * vector as the second operand with an all-ones mask, the mask form passes W
 * and U, and the maskz form passes a zeroed vector and U. */
#define _mm512_cvt_roundph_epu16(A, R) \
  ((__m512i)__builtin_ia32_vcvtph2uw512_mask( \
      (__v32hf)(A), (__v32hu)_mm512_undefined_epi32(), \
      (__mmask32)(-1), (int)(R)))

#define _mm512_mask_cvt_roundph_epu16(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2uw512_mask( \
      (__v32hf)(A), (__v32hu)(W), \
      (__mmask32)(U), (int)(R)))

#define _mm512_maskz_cvt_roundph_epu16(U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2uw512_mask( \
      (__v32hf)(A), (__v32hu)_mm512_setzero_epi32(), \
      (__mmask32)(U), (int)(R)))
1787 
1788 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtph_epu16(__m512h __A)1789 _mm512_cvtph_epu16(__m512h __A) {
1790   return (__m512i)__builtin_ia32_vcvtph2uw512_mask(
1791       (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)-1,
1792       _MM_FROUND_CUR_DIRECTION);
1793 }
1794 
1795 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtph_epu16(__m512i __W,__mmask32 __U,__m512h __A)1796 _mm512_mask_cvtph_epu16(__m512i __W, __mmask32 __U, __m512h __A) {
1797   return (__m512i)__builtin_ia32_vcvtph2uw512_mask(
1798       (__v32hf)__A, (__v32hu)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
1799 }
1800 
1801 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtph_epu16(__mmask32 __U,__m512h __A)1802 _mm512_maskz_cvtph_epu16(__mmask32 __U, __m512h __A) {
1803   return (__m512i)__builtin_ia32_vcvtph2uw512_mask(
1804       (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)__U,
1805       _MM_FROUND_CUR_DIRECTION);
1806 }
1807 
/* Explicit-rounding macro forms of __builtin_ia32_vcvttph2uw512_mask; R is
 * forwarded as the last argument.  The plain form passes an undefined vector
 * as the second operand with an all-ones mask, the mask form passes W and U,
 * and the maskz form passes a zeroed vector and U. */
#define _mm512_cvtt_roundph_epu16(A, R) \
  ((__m512i)__builtin_ia32_vcvttph2uw512_mask( \
      (__v32hf)(A), (__v32hu)_mm512_undefined_epi32(), \
      (__mmask32)(-1), (int)(R)))

#define _mm512_mask_cvtt_roundph_epu16(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvttph2uw512_mask( \
      (__v32hf)(A), (__v32hu)(W), \
      (__mmask32)(U), (int)(R)))

#define _mm512_maskz_cvtt_roundph_epu16(U, A, R) \
  ((__m512i)__builtin_ia32_vcvttph2uw512_mask( \
      (__v32hf)(A), (__v32hu)_mm512_setzero_epi32(), \
      (__mmask32)(U), (int)(R)))
1821 
1822 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvttph_epu16(__m512h __A)1823 _mm512_cvttph_epu16(__m512h __A) {
1824   return (__m512i)__builtin_ia32_vcvttph2uw512_mask(
1825       (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)-1,
1826       _MM_FROUND_CUR_DIRECTION);
1827 }
1828 
1829 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvttph_epu16(__m512i __W,__mmask32 __U,__m512h __A)1830 _mm512_mask_cvttph_epu16(__m512i __W, __mmask32 __U, __m512h __A) {
1831   return (__m512i)__builtin_ia32_vcvttph2uw512_mask(
1832       (__v32hf)__A, (__v32hu)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
1833 }
1834 
1835 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvttph_epu16(__mmask32 __U,__m512h __A)1836 _mm512_maskz_cvttph_epu16(__mmask32 __U, __m512h __A) {
1837   return (__m512i)__builtin_ia32_vcvttph2uw512_mask(
1838       (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)__U,
1839       _MM_FROUND_CUR_DIRECTION);
1840 }
1841 
/* Explicit-rounding macro forms of __builtin_ia32_vcvtuw2ph512_mask; R is
 * forwarded as the rounding argument.  The plain form passes an undefined
 * vector as the second operand with an all-ones mask, the mask form passes W
 * and U, and the maskz form passes a zeroed vector and U. */
#define _mm512_cvt_roundepu16_ph(A, R) \
  ((__m512h)__builtin_ia32_vcvtuw2ph512_mask( \
      (__v32hu)(A), (__v32hf)_mm512_undefined_ph(), \
      (__mmask32)(-1), (int)(R)))

#define _mm512_mask_cvt_roundepu16_ph(W, U, A, R) \
  ((__m512h)__builtin_ia32_vcvtuw2ph512_mask( \
      (__v32hu)(A), (__v32hf)(W), \
      (__mmask32)(U), (int)(R)))

#define _mm512_maskz_cvt_roundepu16_ph(U, A, R) \
  ((__m512h)__builtin_ia32_vcvtuw2ph512_mask( \
      (__v32hu)(A), (__v32hf)_mm512_setzero_ph(), \
      (__mmask32)(U), (int)(R)))
1854 
1855 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_cvtepu16_ph(__m512i __A)1856 _mm512_cvtepu16_ph(__m512i __A) {
1857   return (__m512h)__builtin_ia32_vcvtuw2ph512_mask(
1858       (__v32hu)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)-1,
1859       _MM_FROUND_CUR_DIRECTION);
1860 }
1861 
1862 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepu16_ph(__m512h __W,__mmask32 __U,__m512i __A)1863 _mm512_mask_cvtepu16_ph(__m512h __W, __mmask32 __U, __m512i __A) {
1864   return (__m512h)__builtin_ia32_vcvtuw2ph512_mask(
1865       (__v32hu)__A, (__v32hf)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
1866 }
1867 
1868 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepu16_ph(__mmask32 __U,__m512i __A)1869 _mm512_maskz_cvtepu16_ph(__mmask32 __U, __m512i __A) {
1870   return (__m512h)__builtin_ia32_vcvtuw2ph512_mask(
1871       (__v32hu)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
1872       _MM_FROUND_CUR_DIRECTION);
1873 }
1874 
/* Explicit-rounding macro forms of __builtin_ia32_vcvtph2dq512_mask; R is
 * forwarded as the rounding argument.  The plain form passes an undefined
 * vector as the second operand with an all-ones mask, the mask form passes W
 * and U, and the maskz form passes a zeroed vector and U. */
#define _mm512_cvt_roundph_epi32(A, R) \
  ((__m512i)__builtin_ia32_vcvtph2dq512_mask( \
      (__v16hf)(A), (__v16si)_mm512_undefined_epi32(), \
      (__mmask16)(-1), (int)(R)))

#define _mm512_mask_cvt_roundph_epi32(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2dq512_mask( \
      (__v16hf)(A), (__v16si)(W), \
      (__mmask16)(U), (int)(R)))

#define _mm512_maskz_cvt_roundph_epi32(U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2dq512_mask( \
      (__v16hf)(A), (__v16si)_mm512_setzero_epi32(), \
      (__mmask16)(U), (int)(R)))
1888 
1889 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtph_epi32(__m256h __A)1890 _mm512_cvtph_epi32(__m256h __A) {
1891   return (__m512i)__builtin_ia32_vcvtph2dq512_mask(
1892       (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)-1,
1893       _MM_FROUND_CUR_DIRECTION);
1894 }
1895 
1896 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtph_epi32(__m512i __W,__mmask16 __U,__m256h __A)1897 _mm512_mask_cvtph_epi32(__m512i __W, __mmask16 __U, __m256h __A) {
1898   return (__m512i)__builtin_ia32_vcvtph2dq512_mask(
1899       (__v16hf)__A, (__v16si)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
1900 }
1901 
1902 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtph_epi32(__mmask16 __U,__m256h __A)1903 _mm512_maskz_cvtph_epi32(__mmask16 __U, __m256h __A) {
1904   return (__m512i)__builtin_ia32_vcvtph2dq512_mask(
1905       (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)__U,
1906       _MM_FROUND_CUR_DIRECTION);
1907 }
1908 
/* Explicit-rounding macro forms of __builtin_ia32_vcvtph2udq512_mask; R is
 * forwarded as the rounding argument.  The plain form passes an undefined
 * vector as the second operand with an all-ones mask, the mask form passes W
 * and U, and the maskz form passes a zeroed vector and U. */
#define _mm512_cvt_roundph_epu32(A, R) \
  ((__m512i)__builtin_ia32_vcvtph2udq512_mask( \
      (__v16hf)(A), (__v16su)_mm512_undefined_epi32(), \
      (__mmask16)(-1), (int)(R)))

#define _mm512_mask_cvt_roundph_epu32(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2udq512_mask( \
      (__v16hf)(A), (__v16su)(W), \
      (__mmask16)(U), (int)(R)))

#define _mm512_maskz_cvt_roundph_epu32(U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2udq512_mask( \
      (__v16hf)(A), (__v16su)_mm512_setzero_epi32(), \
      (__mmask16)(U), (int)(R)))
1922 
1923 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtph_epu32(__m256h __A)1924 _mm512_cvtph_epu32(__m256h __A) {
1925   return (__m512i)__builtin_ia32_vcvtph2udq512_mask(
1926       (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)-1,
1927       _MM_FROUND_CUR_DIRECTION);
1928 }
1929 
1930 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtph_epu32(__m512i __W,__mmask16 __U,__m256h __A)1931 _mm512_mask_cvtph_epu32(__m512i __W, __mmask16 __U, __m256h __A) {
1932   return (__m512i)__builtin_ia32_vcvtph2udq512_mask(
1933       (__v16hf)__A, (__v16su)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
1934 }
1935 
1936 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtph_epu32(__mmask16 __U,__m256h __A)1937 _mm512_maskz_cvtph_epu32(__mmask16 __U, __m256h __A) {
1938   return (__m512i)__builtin_ia32_vcvtph2udq512_mask(
1939       (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)__U,
1940       _MM_FROUND_CUR_DIRECTION);
1941 }
1942 
/* Explicit-rounding macro forms of __builtin_ia32_vcvtdq2ph512_mask; R is
 * forwarded as the rounding argument.  The plain form passes an undefined
 * vector as the second operand with an all-ones mask, the mask form passes W
 * and U, and the maskz form passes a zeroed vector and U. */
#define _mm512_cvt_roundepi32_ph(A, R) \
  ((__m256h)__builtin_ia32_vcvtdq2ph512_mask( \
      (__v16si)(A), (__v16hf)_mm256_undefined_ph(), \
      (__mmask16)(-1), (int)(R)))

#define _mm512_mask_cvt_roundepi32_ph(W, U, A, R) \
  ((__m256h)__builtin_ia32_vcvtdq2ph512_mask( \
      (__v16si)(A), (__v16hf)(W), \
      (__mmask16)(U), (int)(R)))

#define _mm512_maskz_cvt_roundepi32_ph(U, A, R) \
  ((__m256h)__builtin_ia32_vcvtdq2ph512_mask( \
      (__v16si)(A), (__v16hf)_mm256_setzero_ph(), \
      (__mmask16)(U), (int)(R)))
1955 
1956 static __inline__ __m256h __DEFAULT_FN_ATTRS512
_mm512_cvtepi32_ph(__m512i __A)1957 _mm512_cvtepi32_ph(__m512i __A) {
1958   return (__m256h)__builtin_ia32_vcvtdq2ph512_mask(
1959       (__v16si)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1,
1960       _MM_FROUND_CUR_DIRECTION);
1961 }
1962 
1963 static __inline__ __m256h __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepi32_ph(__m256h __W,__mmask16 __U,__m512i __A)1964 _mm512_mask_cvtepi32_ph(__m256h __W, __mmask16 __U, __m512i __A) {
1965   return (__m256h)__builtin_ia32_vcvtdq2ph512_mask(
1966       (__v16si)__A, (__v16hf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
1967 }
1968 
1969 static __inline__ __m256h __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepi32_ph(__mmask16 __U,__m512i __A)1970 _mm512_maskz_cvtepi32_ph(__mmask16 __U, __m512i __A) {
1971   return (__m256h)__builtin_ia32_vcvtdq2ph512_mask(
1972       (__v16si)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U,
1973       _MM_FROUND_CUR_DIRECTION);
1974 }
1975 
/* Explicit-rounding macro forms of __builtin_ia32_vcvtudq2ph512_mask; R is
 * forwarded as the rounding argument.  The plain form passes an undefined
 * vector as the second operand with an all-ones mask, the mask form passes W
 * and U, and the maskz form passes a zeroed vector and U. */
#define _mm512_cvt_roundepu32_ph(A, R) \
  ((__m256h)__builtin_ia32_vcvtudq2ph512_mask( \
      (__v16su)(A), (__v16hf)_mm256_undefined_ph(), \
      (__mmask16)(-1), (int)(R)))

#define _mm512_mask_cvt_roundepu32_ph(W, U, A, R) \
  ((__m256h)__builtin_ia32_vcvtudq2ph512_mask( \
      (__v16su)(A), (__v16hf)(W), \
      (__mmask16)(U), (int)(R)))

#define _mm512_maskz_cvt_roundepu32_ph(U, A, R) \
  ((__m256h)__builtin_ia32_vcvtudq2ph512_mask( \
      (__v16su)(A), (__v16hf)_mm256_setzero_ph(), \
      (__mmask16)(U), (int)(R)))
1988 
1989 static __inline__ __m256h __DEFAULT_FN_ATTRS512
_mm512_cvtepu32_ph(__m512i __A)1990 _mm512_cvtepu32_ph(__m512i __A) {
1991   return (__m256h)__builtin_ia32_vcvtudq2ph512_mask(
1992       (__v16su)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1,
1993       _MM_FROUND_CUR_DIRECTION);
1994 }
1995 
1996 static __inline__ __m256h __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepu32_ph(__m256h __W,__mmask16 __U,__m512i __A)1997 _mm512_mask_cvtepu32_ph(__m256h __W, __mmask16 __U, __m512i __A) {
1998   return (__m256h)__builtin_ia32_vcvtudq2ph512_mask(
1999       (__v16su)__A, (__v16hf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
2000 }
2001 
2002 static __inline__ __m256h __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepu32_ph(__mmask16 __U,__m512i __A)2003 _mm512_maskz_cvtepu32_ph(__mmask16 __U, __m512i __A) {
2004   return (__m256h)__builtin_ia32_vcvtudq2ph512_mask(
2005       (__v16su)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U,
2006       _MM_FROUND_CUR_DIRECTION);
2007 }
2008 
/* Explicit-rounding macro forms of __builtin_ia32_vcvttph2dq512_mask; R is
 * forwarded as the last argument.  The plain form passes an undefined vector
 * as the second operand with an all-ones mask, the mask form passes W and U,
 * and the maskz form passes a zeroed vector and U. */
#define _mm512_cvtt_roundph_epi32(A, R) \
  ((__m512i)__builtin_ia32_vcvttph2dq512_mask( \
      (__v16hf)(A), (__v16si)_mm512_undefined_epi32(), \
      (__mmask16)(-1), (int)(R)))

#define _mm512_mask_cvtt_roundph_epi32(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvttph2dq512_mask( \
      (__v16hf)(A), (__v16si)(W), \
      (__mmask16)(U), (int)(R)))

#define _mm512_maskz_cvtt_roundph_epi32(U, A, R) \
  ((__m512i)__builtin_ia32_vcvttph2dq512_mask( \
      (__v16hf)(A), (__v16si)_mm512_setzero_epi32(), \
      (__mmask16)(U), (int)(R)))
2022 
2023 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvttph_epi32(__m256h __A)2024 _mm512_cvttph_epi32(__m256h __A) {
2025   return (__m512i)__builtin_ia32_vcvttph2dq512_mask(
2026       (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)-1,
2027       _MM_FROUND_CUR_DIRECTION);
2028 }
2029 
2030 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvttph_epi32(__m512i __W,__mmask16 __U,__m256h __A)2031 _mm512_mask_cvttph_epi32(__m512i __W, __mmask16 __U, __m256h __A) {
2032   return (__m512i)__builtin_ia32_vcvttph2dq512_mask(
2033       (__v16hf)__A, (__v16si)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
2034 }
2035 
2036 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvttph_epi32(__mmask16 __U,__m256h __A)2037 _mm512_maskz_cvttph_epi32(__mmask16 __U, __m256h __A) {
2038   return (__m512i)__builtin_ia32_vcvttph2dq512_mask(
2039       (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)__U,
2040       _MM_FROUND_CUR_DIRECTION);
2041 }
2042 
/* Explicit-rounding macro forms of __builtin_ia32_vcvttph2udq512_mask; R is
 * forwarded as the last argument.  The plain form passes an undefined vector
 * as the second operand with an all-ones mask, the mask form passes W and U,
 * and the maskz form passes a zeroed vector and U. */
#define _mm512_cvtt_roundph_epu32(A, R) \
  ((__m512i)__builtin_ia32_vcvttph2udq512_mask( \
      (__v16hf)(A), (__v16su)_mm512_undefined_epi32(), \
      (__mmask16)(-1), (int)(R)))

#define _mm512_mask_cvtt_roundph_epu32(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvttph2udq512_mask( \
      (__v16hf)(A), (__v16su)(W), \
      (__mmask16)(U), (int)(R)))

#define _mm512_maskz_cvtt_roundph_epu32(U, A, R) \
  ((__m512i)__builtin_ia32_vcvttph2udq512_mask( \
      (__v16hf)(A), (__v16su)_mm512_setzero_epi32(), \
      (__mmask16)(U), (int)(R)))
2056 
2057 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvttph_epu32(__m256h __A)2058 _mm512_cvttph_epu32(__m256h __A) {
2059   return (__m512i)__builtin_ia32_vcvttph2udq512_mask(
2060       (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)-1,
2061       _MM_FROUND_CUR_DIRECTION);
2062 }
2063 
2064 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvttph_epu32(__m512i __W,__mmask16 __U,__m256h __A)2065 _mm512_mask_cvttph_epu32(__m512i __W, __mmask16 __U, __m256h __A) {
2066   return (__m512i)__builtin_ia32_vcvttph2udq512_mask(
2067       (__v16hf)__A, (__v16su)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
2068 }
2069 
2070 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvttph_epu32(__mmask16 __U,__m256h __A)2071 _mm512_maskz_cvttph_epu32(__mmask16 __U, __m256h __A) {
2072   return (__m512i)__builtin_ia32_vcvttph2udq512_mask(
2073       (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)__U,
2074       _MM_FROUND_CUR_DIRECTION);
2075 }
2076 
/* Explicit-rounding macro forms of __builtin_ia32_vcvtqq2ph512_mask; R is
 * forwarded as the rounding argument.  The plain form passes an undefined
 * vector as the second operand with an all-ones mask, the mask form passes W
 * and U, and the maskz form passes a zeroed vector and U. */
#define _mm512_cvt_roundepi64_ph(A, R) \
  ((__m128h)__builtin_ia32_vcvtqq2ph512_mask( \
      (__v8di)(A), (__v8hf)_mm_undefined_ph(), \
      (__mmask8)(-1), (int)(R)))

#define _mm512_mask_cvt_roundepi64_ph(W, U, A, R) \
  ((__m128h)__builtin_ia32_vcvtqq2ph512_mask( \
      (__v8di)(A), (__v8hf)(W), \
      (__mmask8)(U), (int)(R)))

#define _mm512_maskz_cvt_roundepi64_ph(U, A, R) \
  ((__m128h)__builtin_ia32_vcvtqq2ph512_mask( \
      (__v8di)(A), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), (int)(R)))
2088 
2089 static __inline__ __m128h __DEFAULT_FN_ATTRS512
_mm512_cvtepi64_ph(__m512i __A)2090 _mm512_cvtepi64_ph(__m512i __A) {
2091   return (__m128h)__builtin_ia32_vcvtqq2ph512_mask(
2092       (__v8di)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
2093       _MM_FROUND_CUR_DIRECTION);
2094 }
2095 
2096 static __inline__ __m128h __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepi64_ph(__m128h __W,__mmask8 __U,__m512i __A)2097 _mm512_mask_cvtepi64_ph(__m128h __W, __mmask8 __U, __m512i __A) {
2098   return (__m128h)__builtin_ia32_vcvtqq2ph512_mask(
2099       (__v8di)__A, (__v8hf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2100 }
2101 
2102 static __inline__ __m128h __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepi64_ph(__mmask8 __U,__m512i __A)2103 _mm512_maskz_cvtepi64_ph(__mmask8 __U, __m512i __A) {
2104   return (__m128h)__builtin_ia32_vcvtqq2ph512_mask(
2105       (__v8di)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
2106       _MM_FROUND_CUR_DIRECTION);
2107 }
2108 
/* Explicit-rounding macro forms of __builtin_ia32_vcvtph2qq512_mask; R is
 * forwarded as the rounding argument.  The plain form passes an undefined
 * vector as the second operand with an all-ones mask, the mask form passes W
 * and U, and the maskz form passes a zeroed vector and U. */
#define _mm512_cvt_roundph_epi64(A, R) \
  ((__m512i)__builtin_ia32_vcvtph2qq512_mask( \
      (__v8hf)(A), (__v8di)_mm512_undefined_epi32(), \
      (__mmask8)(-1), (int)(R)))

#define _mm512_mask_cvt_roundph_epi64(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2qq512_mask( \
      (__v8hf)(A), (__v8di)(W), \
      (__mmask8)(U), (int)(R)))

#define _mm512_maskz_cvt_roundph_epi64(U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2qq512_mask( \
      (__v8hf)(A), (__v8di)_mm512_setzero_epi32(), \
      (__mmask8)(U), (int)(R)))
2121 
2122 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtph_epi64(__m128h __A)2123 _mm512_cvtph_epi64(__m128h __A) {
2124   return (__m512i)__builtin_ia32_vcvtph2qq512_mask(
2125       (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)-1,
2126       _MM_FROUND_CUR_DIRECTION);
2127 }
2128 
2129 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtph_epi64(__m512i __W,__mmask8 __U,__m128h __A)2130 _mm512_mask_cvtph_epi64(__m512i __W, __mmask8 __U, __m128h __A) {
2131   return (__m512i)__builtin_ia32_vcvtph2qq512_mask(
2132       (__v8hf)__A, (__v8di)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2133 }
2134 
2135 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtph_epi64(__mmask8 __U,__m128h __A)2136 _mm512_maskz_cvtph_epi64(__mmask8 __U, __m128h __A) {
2137   return (__m512i)__builtin_ia32_vcvtph2qq512_mask(
2138       (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)__U,
2139       _MM_FROUND_CUR_DIRECTION);
2140 }
2141 
/* Explicit-rounding macro forms of __builtin_ia32_vcvtuqq2ph512_mask; R is
 * forwarded as the rounding argument.  The plain form passes an undefined
 * vector as the second operand with an all-ones mask, the mask form passes W
 * and U, and the maskz form passes a zeroed vector and U. */
#define _mm512_cvt_roundepu64_ph(A, R) \
  ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask( \
      (__v8du)(A), (__v8hf)_mm_undefined_ph(), \
      (__mmask8)(-1), (int)(R)))

#define _mm512_mask_cvt_roundepu64_ph(W, U, A, R) \
  ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask( \
      (__v8du)(A), (__v8hf)(W), \
      (__mmask8)(U), (int)(R)))

#define _mm512_maskz_cvt_roundepu64_ph(U, A, R) \
  ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask( \
      (__v8du)(A), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), (int)(R)))
2153 
2154 static __inline__ __m128h __DEFAULT_FN_ATTRS512
_mm512_cvtepu64_ph(__m512i __A)2155 _mm512_cvtepu64_ph(__m512i __A) {
2156   return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask(
2157       (__v8du)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
2158       _MM_FROUND_CUR_DIRECTION);
2159 }
2160 
2161 static __inline__ __m128h __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepu64_ph(__m128h __W,__mmask8 __U,__m512i __A)2162 _mm512_mask_cvtepu64_ph(__m128h __W, __mmask8 __U, __m512i __A) {
2163   return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask(
2164       (__v8du)__A, (__v8hf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2165 }
2166 
2167 static __inline__ __m128h __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepu64_ph(__mmask8 __U,__m512i __A)2168 _mm512_maskz_cvtepu64_ph(__mmask8 __U, __m512i __A) {
2169   return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask(
2170       (__v8du)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
2171       _MM_FROUND_CUR_DIRECTION);
2172 }
2173 
/* Explicit-rounding macro forms of __builtin_ia32_vcvtph2uqq512_mask; R is
 * forwarded as the rounding argument.  The plain form passes an undefined
 * vector as the second operand with an all-ones mask, the mask form passes W
 * and U, and the maskz form passes a zeroed vector and U. */
#define _mm512_cvt_roundph_epu64(A, R) \
  ((__m512i)__builtin_ia32_vcvtph2uqq512_mask( \
      (__v8hf)(A), (__v8du)_mm512_undefined_epi32(), \
      (__mmask8)(-1), (int)(R)))

#define _mm512_mask_cvt_roundph_epu64(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2uqq512_mask( \
      (__v8hf)(A), (__v8du)(W), \
      (__mmask8)(U), (int)(R)))

#define _mm512_maskz_cvt_roundph_epu64(U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2uqq512_mask( \
      (__v8hf)(A), (__v8du)_mm512_setzero_epi32(), \
      (__mmask8)(U), (int)(R)))
2186 
2187 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtph_epu64(__m128h __A)2188 _mm512_cvtph_epu64(__m128h __A) {
2189   return (__m512i)__builtin_ia32_vcvtph2uqq512_mask(
2190       (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)-1,
2191       _MM_FROUND_CUR_DIRECTION);
2192 }
2193 
2194 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtph_epu64(__m512i __W,__mmask8 __U,__m128h __A)2195 _mm512_mask_cvtph_epu64(__m512i __W, __mmask8 __U, __m128h __A) {
2196   return (__m512i)__builtin_ia32_vcvtph2uqq512_mask(
2197       (__v8hf)__A, (__v8du)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2198 }
2199 
2200 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtph_epu64(__mmask8 __U,__m128h __A)2201 _mm512_maskz_cvtph_epu64(__mmask8 __U, __m128h __A) {
2202   return (__m512i)__builtin_ia32_vcvtph2uqq512_mask(
2203       (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)__U,
2204       _MM_FROUND_CUR_DIRECTION);
2205 }
2206 
/* Explicit-rounding macro forms of __builtin_ia32_vcvttph2qq512_mask; R is
 * forwarded as the last argument.  The plain form passes an undefined vector
 * as the second operand with an all-ones mask, the mask form passes W and U,
 * and the maskz form passes a zeroed vector and U. */
#define _mm512_cvtt_roundph_epi64(A, R) \
  ((__m512i)__builtin_ia32_vcvttph2qq512_mask( \
      (__v8hf)(A), (__v8di)_mm512_undefined_epi32(), \
      (__mmask8)(-1), (int)(R)))

#define _mm512_mask_cvtt_roundph_epi64(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvttph2qq512_mask( \
      (__v8hf)(A), (__v8di)(W), \
      (__mmask8)(U), (int)(R)))

#define _mm512_maskz_cvtt_roundph_epi64(U, A, R) \
  ((__m512i)__builtin_ia32_vcvttph2qq512_mask( \
      (__v8hf)(A), (__v8di)_mm512_setzero_epi32(), \
      (__mmask8)(U), (int)(R)))
2219 
2220 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvttph_epi64(__m128h __A)2221 _mm512_cvttph_epi64(__m128h __A) {
2222   return (__m512i)__builtin_ia32_vcvttph2qq512_mask(
2223       (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)-1,
2224       _MM_FROUND_CUR_DIRECTION);
2225 }
2226 
2227 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvttph_epi64(__m512i __W,__mmask8 __U,__m128h __A)2228 _mm512_mask_cvttph_epi64(__m512i __W, __mmask8 __U, __m128h __A) {
2229   return (__m512i)__builtin_ia32_vcvttph2qq512_mask(
2230       (__v8hf)__A, (__v8di)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2231 }
2232 
2233 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvttph_epi64(__mmask8 __U,__m128h __A)2234 _mm512_maskz_cvttph_epi64(__mmask8 __U, __m128h __A) {
2235   return (__m512i)__builtin_ia32_vcvttph2qq512_mask(
2236       (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)__U,
2237       _MM_FROUND_CUR_DIRECTION);
2238 }
2239 
/* Explicit-rounding macro forms of __builtin_ia32_vcvttph2uqq512_mask; R is
 * forwarded as the last argument.  The plain form passes an undefined vector
 * as the second operand with an all-ones mask, the mask form passes W and U,
 * and the maskz form passes a zeroed vector and U. */
#define _mm512_cvtt_roundph_epu64(A, R) \
  ((__m512i)__builtin_ia32_vcvttph2uqq512_mask( \
      (__v8hf)(A), (__v8du)_mm512_undefined_epi32(), \
      (__mmask8)(-1), (int)(R)))

#define _mm512_mask_cvtt_roundph_epu64(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvttph2uqq512_mask( \
      (__v8hf)(A), (__v8du)(W), \
      (__mmask8)(U), (int)(R)))

#define _mm512_maskz_cvtt_roundph_epu64(U, A, R) \
  ((__m512i)__builtin_ia32_vcvttph2uqq512_mask( \
      (__v8hf)(A), (__v8du)_mm512_setzero_epi32(), \
      (__mmask8)(U), (int)(R)))
2252 
2253 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvttph_epu64(__m128h __A)2254 _mm512_cvttph_epu64(__m128h __A) {
2255   return (__m512i)__builtin_ia32_vcvttph2uqq512_mask(
2256       (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)-1,
2257       _MM_FROUND_CUR_DIRECTION);
2258 }
2259 
2260 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvttph_epu64(__m512i __W,__mmask8 __U,__m128h __A)2261 _mm512_mask_cvttph_epu64(__m512i __W, __mmask8 __U, __m128h __A) {
2262   return (__m512i)__builtin_ia32_vcvttph2uqq512_mask(
2263       (__v8hf)__A, (__v8du)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2264 }
2265 
2266 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvttph_epu64(__mmask8 __U,__m128h __A)2267 _mm512_maskz_cvttph_epu64(__mmask8 __U, __m128h __A) {
2268   return (__m512i)__builtin_ia32_vcvttph2uqq512_mask(
2269       (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)__U,
2270       _MM_FROUND_CUR_DIRECTION);
2271 }
2272 
/* Expands to __builtin_ia32_vcvtsh2si32 on A with R forwarded as the
 * rounding argument; the result is cast to int. */
#define _mm_cvt_roundsh_i32(A, R) \
  ((int)__builtin_ia32_vcvtsh2si32( \
      (__v8hf)(A), (int)(R)))
2275 
_mm_cvtsh_i32(__m128h __A)2276 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_cvtsh_i32(__m128h __A) {
2277   return (int)__builtin_ia32_vcvtsh2si32((__v8hf)__A, _MM_FROUND_CUR_DIRECTION);
2278 }
2279 
/* Expands to __builtin_ia32_vcvtsh2usi32 on A with R forwarded as the
 * rounding argument; the result is cast to unsigned int. */
#define _mm_cvt_roundsh_u32(A, R) \
  ((unsigned int)__builtin_ia32_vcvtsh2usi32( \
      (__v8hf)(A), (int)(R)))
2282 
2283 static __inline__ unsigned int __DEFAULT_FN_ATTRS128
_mm_cvtsh_u32(__m128h __A)2284 _mm_cvtsh_u32(__m128h __A) {
2285   return (unsigned int)__builtin_ia32_vcvtsh2usi32((__v8hf)__A,
2286                                                    _MM_FROUND_CUR_DIRECTION);
2287 }
2288 
2289 #ifdef __x86_64__
2290 #define _mm_cvt_roundsh_i64(A, R)                                              \
2291   ((long long)__builtin_ia32_vcvtsh2si64((__v8hf)(A), (int)(R)))
2292 
_mm_cvtsh_i64(__m128h __A)2293 static __inline__ long long __DEFAULT_FN_ATTRS128 _mm_cvtsh_i64(__m128h __A) {
2294   return (long long)__builtin_ia32_vcvtsh2si64((__v8hf)__A,
2295                                                _MM_FROUND_CUR_DIRECTION);
2296 }
2297 
2298 #define _mm_cvt_roundsh_u64(A, R)                                              \
2299   ((unsigned long long)__builtin_ia32_vcvtsh2usi64((__v8hf)(A), (int)(R)))
2300 
2301 static __inline__ unsigned long long __DEFAULT_FN_ATTRS128
_mm_cvtsh_u64(__m128h __A)2302 _mm_cvtsh_u64(__m128h __A) {
2303   return (unsigned long long)__builtin_ia32_vcvtsh2usi64(
2304       (__v8hf)__A, _MM_FROUND_CUR_DIRECTION);
2305 }
2306 #endif // __x86_64__
2307 
/* Explicit-rounding form: forwards A (as __v8hf), B (as unsigned int) and the
 * rounding argument R to __builtin_ia32_vcvtusi2sh; the result is cast to
 * __m128h. */
#define _mm_cvt_roundu32_sh(A, B, R)                                           \
  ((__m128h)__builtin_ia32_vcvtusi2sh((__v8hf)(A), (unsigned int)(B), (int)(R)))
2310 
2311 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_cvtu32_sh(__m128h __A,unsigned int __B)2312 _mm_cvtu32_sh(__m128h __A, unsigned int __B) {
2313   __A[0] = __B;
2314   return __A;
2315 }
2316 
2317 #ifdef __x86_64__
2318 #define _mm_cvt_roundu64_sh(A, B, R)                                           \
2319   ((__m128h)__builtin_ia32_vcvtusi642sh((__v8hf)(A), (unsigned long long)(B),  \
2320                                         (int)(R)))
2321 
2322 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_cvtu64_sh(__m128h __A,unsigned long long __B)2323 _mm_cvtu64_sh(__m128h __A, unsigned long long __B) {
2324   __A[0] = __B;
2325   return __A;
2326 }
2327 #endif
2328 
/* Explicit-rounding form: forwards A (as __v8hf), B (as int) and the rounding
 * argument R to __builtin_ia32_vcvtsi2sh; the result is cast to __m128h. */
#define _mm_cvt_roundi32_sh(A, B, R)                                           \
  ((__m128h)__builtin_ia32_vcvtsi2sh((__v8hf)(A), (int)(B), (int)(R)))
2331 
_mm_cvti32_sh(__m128h __A,int __B)2332 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvti32_sh(__m128h __A,
2333                                                               int __B) {
2334   __A[0] = __B;
2335   return __A;
2336 }
2337 
2338 #ifdef __x86_64__
2339 #define _mm_cvt_roundi64_sh(A, B, R)                                           \
2340   ((__m128h)__builtin_ia32_vcvtsi642sh((__v8hf)(A), (long long)(B), (int)(R)))
2341 
_mm_cvti64_sh(__m128h __A,long long __B)2342 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvti64_sh(__m128h __A,
2343                                                               long long __B) {
2344   __A[0] = __B;
2345   return __A;
2346 }
2347 #endif
2348 
/* Explicit-rounding form: forwards A (as __v8hf) and the rounding argument R
 * to __builtin_ia32_vcvttsh2si32 and casts the result to int. */
#define _mm_cvtt_roundsh_i32(A, R)                                             \
  ((int)__builtin_ia32_vcvttsh2si32((__v8hf)(A), (int)(R)))
2351 
_mm_cvttsh_i32(__m128h __A)2352 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_cvttsh_i32(__m128h __A) {
2353   return (int)__builtin_ia32_vcvttsh2si32((__v8hf)__A,
2354                                           _MM_FROUND_CUR_DIRECTION);
2355 }
2356 
2357 #ifdef __x86_64__
2358 #define _mm_cvtt_roundsh_i64(A, R)                                             \
2359   ((long long)__builtin_ia32_vcvttsh2si64((__v8hf)(A), (int)(R)))
2360 
_mm_cvttsh_i64(__m128h __A)2361 static __inline__ long long __DEFAULT_FN_ATTRS128 _mm_cvttsh_i64(__m128h __A) {
2362   return (long long)__builtin_ia32_vcvttsh2si64((__v8hf)__A,
2363                                                 _MM_FROUND_CUR_DIRECTION);
2364 }
2365 #endif
2366 
/* Explicit-rounding form: forwards A (as __v8hf) and the rounding argument R
 * to __builtin_ia32_vcvttsh2usi32 and casts the result to unsigned int. */
#define _mm_cvtt_roundsh_u32(A, R)                                             \
  ((unsigned int)__builtin_ia32_vcvttsh2usi32((__v8hf)(A), (int)(R)))
2369 
2370 static __inline__ unsigned int __DEFAULT_FN_ATTRS128
_mm_cvttsh_u32(__m128h __A)2371 _mm_cvttsh_u32(__m128h __A) {
2372   return (unsigned int)__builtin_ia32_vcvttsh2usi32((__v8hf)__A,
2373                                                     _MM_FROUND_CUR_DIRECTION);
2374 }
2375 
2376 #ifdef __x86_64__
2377 #define _mm_cvtt_roundsh_u64(A, R)                                             \
2378   ((unsigned long long)__builtin_ia32_vcvttsh2usi64((__v8hf)(A), (int)(R)))
2379 
2380 static __inline__ unsigned long long __DEFAULT_FN_ATTRS128
_mm_cvttsh_u64(__m128h __A)2381 _mm_cvttsh_u64(__m128h __A) {
2382   return (unsigned long long)__builtin_ia32_vcvttsh2usi64(
2383       (__v8hf)__A, _MM_FROUND_CUR_DIRECTION);
2384 }
2385 #endif
2386 
/* Explicit-rounding wrappers around __builtin_ia32_vcvtph2psx512_mask. Each
 * takes the source A as __v16hf and the rounding argument R, and differs only
 * in the second vector operand and the mask:
 *   plain:  _mm512_undefined_ps() with an all-ones mask
 *   mask:   W with mask U
 *   maskz:  _mm512_setzero_ps() with mask U */
#define _mm512_cvtx_roundph_ps(A, R)                                           \
  ((__m512)__builtin_ia32_vcvtph2psx512_mask((__v16hf)(A),                     \
                                             (__v16sf)_mm512_undefined_ps(),   \
                                             (__mmask16)(-1), (int)(R)))

#define _mm512_mask_cvtx_roundph_ps(W, U, A, R)                                \
  ((__m512)__builtin_ia32_vcvtph2psx512_mask((__v16hf)(A), (__v16sf)(W),       \
                                             (__mmask16)(U), (int)(R)))

#define _mm512_maskz_cvtx_roundph_ps(U, A, R)                                  \
  ((__m512)__builtin_ia32_vcvtph2psx512_mask(                                  \
      (__v16hf)(A), (__v16sf)_mm512_setzero_ps(), (__mmask16)(U), (int)(R)))
2399 
_mm512_cvtxph_ps(__m256h __A)2400 static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_cvtxph_ps(__m256h __A) {
2401   return (__m512)__builtin_ia32_vcvtph2psx512_mask(
2402       (__v16hf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)-1,
2403       _MM_FROUND_CUR_DIRECTION);
2404 }
2405 
2406 static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_cvtxph_ps(__m512 __W,__mmask16 __U,__m256h __A)2407 _mm512_mask_cvtxph_ps(__m512 __W, __mmask16 __U, __m256h __A) {
2408   return (__m512)__builtin_ia32_vcvtph2psx512_mask(
2409       (__v16hf)__A, (__v16sf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
2410 }
2411 
2412 static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtxph_ps(__mmask16 __U,__m256h __A)2413 _mm512_maskz_cvtxph_ps(__mmask16 __U, __m256h __A) {
2414   return (__m512)__builtin_ia32_vcvtph2psx512_mask(
2415       (__v16hf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U,
2416       _MM_FROUND_CUR_DIRECTION);
2417 }
2418 
/* Explicit-rounding wrappers around __builtin_ia32_vcvtps2phx512_mask. Each
 * takes the source A as __v16sf and the rounding argument R, and differs only
 * in the second vector operand and the mask:
 *   plain:  _mm256_undefined_ph() with an all-ones mask
 *   mask:   W with mask U
 *   maskz:  _mm256_setzero_ph() with mask U */
#define _mm512_cvtx_roundps_ph(A, R)                                           \
  ((__m256h)__builtin_ia32_vcvtps2phx512_mask((__v16sf)(A),                    \
                                              (__v16hf)_mm256_undefined_ph(),  \
                                              (__mmask16)(-1), (int)(R)))

#define _mm512_mask_cvtx_roundps_ph(W, U, A, R)                                \
  ((__m256h)__builtin_ia32_vcvtps2phx512_mask((__v16sf)(A), (__v16hf)(W),      \
                                              (__mmask16)(U), (int)(R)))

#define _mm512_maskz_cvtx_roundps_ph(U, A, R)                                  \
  ((__m256h)__builtin_ia32_vcvtps2phx512_mask(                                 \
      (__v16sf)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))
2431 
_mm512_cvtxps_ph(__m512 __A)2432 static __inline__ __m256h __DEFAULT_FN_ATTRS512 _mm512_cvtxps_ph(__m512 __A) {
2433   return (__m256h)__builtin_ia32_vcvtps2phx512_mask(
2434       (__v16sf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1,
2435       _MM_FROUND_CUR_DIRECTION);
2436 }
2437 
2438 static __inline__ __m256h __DEFAULT_FN_ATTRS512
_mm512_mask_cvtxps_ph(__m256h __W,__mmask16 __U,__m512 __A)2439 _mm512_mask_cvtxps_ph(__m256h __W, __mmask16 __U, __m512 __A) {
2440   return (__m256h)__builtin_ia32_vcvtps2phx512_mask(
2441       (__v16sf)__A, (__v16hf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
2442 }
2443 
2444 static __inline__ __m256h __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtxps_ph(__mmask16 __U,__m512 __A)2445 _mm512_maskz_cvtxps_ph(__mmask16 __U, __m512 __A) {
2446   return (__m256h)__builtin_ia32_vcvtps2phx512_mask(
2447       (__v16sf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U,
2448       _MM_FROUND_CUR_DIRECTION);
2449 }
2450 
/* Explicit-rounding fmadd macros. All four pass A, B and C unmodified (each
 * cast through __m512h to __v32hf) plus the rounding argument R; they differ
 * in the builtin suffix and the mask:
 *   plain:  vfmaddph512_mask  with an all-ones mask
 *   mask:   vfmaddph512_mask  with mask U
 *   mask3:  vfmaddph512_mask3 with mask U
 *   maskz:  vfmaddph512_maskz with mask U */
#define _mm512_fmadd_round_ph(A, B, C, R)                                      \
  ((__m512h)__builtin_ia32_vfmaddph512_mask(                                   \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
      (__mmask32)-1, (int)(R)))

#define _mm512_mask_fmadd_round_ph(A, U, B, C, R)                              \
  ((__m512h)__builtin_ia32_vfmaddph512_mask(                                   \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
      (__mmask32)(U), (int)(R)))

#define _mm512_mask3_fmadd_round_ph(A, B, C, U, R)                             \
  ((__m512h)__builtin_ia32_vfmaddph512_mask3(                                  \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
      (__mmask32)(U), (int)(R)))

#define _mm512_maskz_fmadd_round_ph(U, A, B, C, R)                             \
  ((__m512h)__builtin_ia32_vfmaddph512_maskz(                                  \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
      (__mmask32)(U), (int)(R)))
2470 
/* Explicit-rounding fmsub macros, expressed through the vfmaddph512 builtins
 * by negating the third operand C:
 *   plain:  vfmaddph512_mask  with an all-ones mask
 *   mask:   vfmaddph512_mask  with mask U
 *   maskz:  vfmaddph512_maskz with mask U */
#define _mm512_fmsub_round_ph(A, B, C, R)                                      \
  ((__m512h)__builtin_ia32_vfmaddph512_mask(                                   \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),    \
      (__mmask32)-1, (int)(R)))

#define _mm512_mask_fmsub_round_ph(A, U, B, C, R)                              \
  ((__m512h)__builtin_ia32_vfmaddph512_mask(                                   \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),    \
      (__mmask32)(U), (int)(R)))

#define _mm512_maskz_fmsub_round_ph(U, A, B, C, R)                             \
  ((__m512h)__builtin_ia32_vfmaddph512_maskz(                                  \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),    \
      (__mmask32)(U), (int)(R)))
2485 
/* Explicit-rounding fnmadd macros, expressed through the vfmaddph512 builtins
 * by negating one multiplicand. Note which operand carries the negation:
 *   plain:  vfmaddph512_mask,  B negated, all-ones mask
 *   mask3:  vfmaddph512_mask3, A negated, mask U
 *   maskz:  vfmaddph512_maskz, A negated, mask U */
#define _mm512_fnmadd_round_ph(A, B, C, R)                                     \
  ((__m512h)__builtin_ia32_vfmaddph512_mask(                                   \
      (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),    \
      (__mmask32)-1, (int)(R)))

#define _mm512_mask3_fnmadd_round_ph(A, B, C, U, R)                            \
  ((__m512h)__builtin_ia32_vfmaddph512_mask3(                                  \
      -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),    \
      (__mmask32)(U), (int)(R)))

#define _mm512_maskz_fnmadd_round_ph(U, A, B, C, R)                            \
  ((__m512h)__builtin_ia32_vfmaddph512_maskz(                                  \
      -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),    \
      (__mmask32)(U), (int)(R)))
2500 
/* Explicit-rounding fnmsub macros, expressed through the vfmaddph512 builtins
 * by negating one multiplicand and the third operand C:
 *   plain:  vfmaddph512_mask,  B and C negated, all-ones mask
 *   maskz:  vfmaddph512_maskz, A and C negated, mask U */
#define _mm512_fnmsub_round_ph(A, B, C, R)                                     \
  ((__m512h)__builtin_ia32_vfmaddph512_mask(                                   \
      (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),   \
      (__mmask32)-1, (int)(R)))

#define _mm512_maskz_fnmsub_round_ph(U, A, B, C, R)                            \
  ((__m512h)__builtin_ia32_vfmaddph512_maskz(                                  \
      -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),   \
      (__mmask32)(U), (int)(R)))
2510 
_mm512_fmadd_ph(__m512h __A,__m512h __B,__m512h __C)2511 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmadd_ph(__m512h __A,
2512                                                                 __m512h __B,
2513                                                                 __m512h __C) {
2514   return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
2515                                                   (__v32hf)__C, (__mmask32)-1,
2516                                                   _MM_FROUND_CUR_DIRECTION);
2517 }
2518 
2519 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fmadd_ph(__m512h __A,__mmask32 __U,__m512h __B,__m512h __C)2520 _mm512_mask_fmadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
2521   return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
2522                                                   (__v32hf)__C, (__mmask32)__U,
2523                                                   _MM_FROUND_CUR_DIRECTION);
2524 }
2525 
2526 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask3_fmadd_ph(__m512h __A,__m512h __B,__m512h __C,__mmask32 __U)2527 _mm512_mask3_fmadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
2528   return (__m512h)__builtin_ia32_vfmaddph512_mask3((__v32hf)__A, (__v32hf)__B,
2529                                                    (__v32hf)__C, (__mmask32)__U,
2530                                                    _MM_FROUND_CUR_DIRECTION);
2531 }
2532 
2533 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fmadd_ph(__mmask32 __U,__m512h __A,__m512h __B,__m512h __C)2534 _mm512_maskz_fmadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2535   return (__m512h)__builtin_ia32_vfmaddph512_maskz((__v32hf)__A, (__v32hf)__B,
2536                                                    (__v32hf)__C, (__mmask32)__U,
2537                                                    _MM_FROUND_CUR_DIRECTION);
2538 }
2539 
_mm512_fmsub_ph(__m512h __A,__m512h __B,__m512h __C)2540 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmsub_ph(__m512h __A,
2541                                                                 __m512h __B,
2542                                                                 __m512h __C) {
2543   return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
2544                                                   -(__v32hf)__C, (__mmask32)-1,
2545                                                   _MM_FROUND_CUR_DIRECTION);
2546 }
2547 
2548 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fmsub_ph(__m512h __A,__mmask32 __U,__m512h __B,__m512h __C)2549 _mm512_mask_fmsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
2550   return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
2551                                                   -(__v32hf)__C, (__mmask32)__U,
2552                                                   _MM_FROUND_CUR_DIRECTION);
2553 }
2554 
2555 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fmsub_ph(__mmask32 __U,__m512h __A,__m512h __B,__m512h __C)2556 _mm512_maskz_fmsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2557   return (__m512h)__builtin_ia32_vfmaddph512_maskz(
2558       (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
2559       _MM_FROUND_CUR_DIRECTION);
2560 }
2561 
_mm512_fnmadd_ph(__m512h __A,__m512h __B,__m512h __C)2562 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fnmadd_ph(__m512h __A,
2563                                                                  __m512h __B,
2564                                                                  __m512h __C) {
2565   return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
2566                                                   (__v32hf)__C, (__mmask32)-1,
2567                                                   _MM_FROUND_CUR_DIRECTION);
2568 }
2569 
2570 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask3_fnmadd_ph(__m512h __A,__m512h __B,__m512h __C,__mmask32 __U)2571 _mm512_mask3_fnmadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
2572   return (__m512h)__builtin_ia32_vfmaddph512_mask3(-(__v32hf)__A, (__v32hf)__B,
2573                                                    (__v32hf)__C, (__mmask32)__U,
2574                                                    _MM_FROUND_CUR_DIRECTION);
2575 }
2576 
2577 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fnmadd_ph(__mmask32 __U,__m512h __A,__m512h __B,__m512h __C)2578 _mm512_maskz_fnmadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2579   return (__m512h)__builtin_ia32_vfmaddph512_maskz(-(__v32hf)__A, (__v32hf)__B,
2580                                                    (__v32hf)__C, (__mmask32)__U,
2581                                                    _MM_FROUND_CUR_DIRECTION);
2582 }
2583 
_mm512_fnmsub_ph(__m512h __A,__m512h __B,__m512h __C)2584 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fnmsub_ph(__m512h __A,
2585                                                                  __m512h __B,
2586                                                                  __m512h __C) {
2587   return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
2588                                                   -(__v32hf)__C, (__mmask32)-1,
2589                                                   _MM_FROUND_CUR_DIRECTION);
2590 }
2591 
2592 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fnmsub_ph(__mmask32 __U,__m512h __A,__m512h __B,__m512h __C)2593 _mm512_maskz_fnmsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2594   return (__m512h)__builtin_ia32_vfmaddph512_maskz(
2595       -(__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
2596       _MM_FROUND_CUR_DIRECTION);
2597 }
2598 
/* Explicit-rounding fmaddsub macros. All four pass A, B and C unmodified
 * (each cast through __m512h to __v32hf) plus the rounding argument R; they
 * differ in the builtin suffix and the mask:
 *   plain:  vfmaddsubph512_mask  with an all-ones mask
 *   mask:   vfmaddsubph512_mask  with mask U
 *   mask3:  vfmaddsubph512_mask3 with mask U
 *   maskz:  vfmaddsubph512_maskz with mask U */
#define _mm512_fmaddsub_round_ph(A, B, C, R)                                   \
  ((__m512h)__builtin_ia32_vfmaddsubph512_mask(                                \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
      (__mmask32)-1, (int)(R)))

#define _mm512_mask_fmaddsub_round_ph(A, U, B, C, R)                           \
  ((__m512h)__builtin_ia32_vfmaddsubph512_mask(                                \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
      (__mmask32)(U), (int)(R)))

#define _mm512_mask3_fmaddsub_round_ph(A, B, C, U, R)                          \
  ((__m512h)__builtin_ia32_vfmaddsubph512_mask3(                               \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
      (__mmask32)(U), (int)(R)))

#define _mm512_maskz_fmaddsub_round_ph(U, A, B, C, R)                          \
  ((__m512h)__builtin_ia32_vfmaddsubph512_maskz(                               \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
      (__mmask32)(U), (int)(R)))
2618 
/* Explicit-rounding fmsubadd macros, expressed through the vfmaddsubph512
 * builtins by negating the third operand C:
 *   plain:  vfmaddsubph512_mask  with an all-ones mask
 *   mask:   vfmaddsubph512_mask  with mask U
 *   maskz:  vfmaddsubph512_maskz with mask U */
#define _mm512_fmsubadd_round_ph(A, B, C, R)                                   \
  ((__m512h)__builtin_ia32_vfmaddsubph512_mask(                                \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),    \
      (__mmask32)-1, (int)(R)))

#define _mm512_mask_fmsubadd_round_ph(A, U, B, C, R)                           \
  ((__m512h)__builtin_ia32_vfmaddsubph512_mask(                                \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),    \
      (__mmask32)(U), (int)(R)))

#define _mm512_maskz_fmsubadd_round_ph(U, A, B, C, R)                          \
  ((__m512h)__builtin_ia32_vfmaddsubph512_maskz(                               \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),    \
      (__mmask32)(U), (int)(R)))
2633 
2634 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_fmaddsub_ph(__m512h __A,__m512h __B,__m512h __C)2635 _mm512_fmaddsub_ph(__m512h __A, __m512h __B, __m512h __C) {
2636   return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
2637       (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)-1,
2638       _MM_FROUND_CUR_DIRECTION);
2639 }
2640 
2641 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fmaddsub_ph(__m512h __A,__mmask32 __U,__m512h __B,__m512h __C)2642 _mm512_mask_fmaddsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
2643   return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
2644       (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
2645       _MM_FROUND_CUR_DIRECTION);
2646 }
2647 
2648 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask3_fmaddsub_ph(__m512h __A,__m512h __B,__m512h __C,__mmask32 __U)2649 _mm512_mask3_fmaddsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
2650   return (__m512h)__builtin_ia32_vfmaddsubph512_mask3(
2651       (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
2652       _MM_FROUND_CUR_DIRECTION);
2653 }
2654 
2655 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fmaddsub_ph(__mmask32 __U,__m512h __A,__m512h __B,__m512h __C)2656 _mm512_maskz_fmaddsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2657   return (__m512h)__builtin_ia32_vfmaddsubph512_maskz(
2658       (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
2659       _MM_FROUND_CUR_DIRECTION);
2660 }
2661 
2662 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_fmsubadd_ph(__m512h __A,__m512h __B,__m512h __C)2663 _mm512_fmsubadd_ph(__m512h __A, __m512h __B, __m512h __C) {
2664   return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
2665       (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)-1,
2666       _MM_FROUND_CUR_DIRECTION);
2667 }
2668 
2669 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fmsubadd_ph(__m512h __A,__mmask32 __U,__m512h __B,__m512h __C)2670 _mm512_mask_fmsubadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
2671   return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
2672       (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
2673       _MM_FROUND_CUR_DIRECTION);
2674 }
2675 
2676 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fmsubadd_ph(__mmask32 __U,__m512h __A,__m512h __B,__m512h __C)2677 _mm512_maskz_fmsubadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2678   return (__m512h)__builtin_ia32_vfmaddsubph512_maskz(
2679       (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
2680       _MM_FROUND_CUR_DIRECTION);
2681 }
2682 
/* Explicit-rounding mask3 fmsub: uses the dedicated
 * __builtin_ia32_vfmsubph512_mask3 builtin (no operand is negated here), with
 * mask U and rounding argument R. */
#define _mm512_mask3_fmsub_round_ph(A, B, C, U, R)                             \
  ((__m512h)__builtin_ia32_vfmsubph512_mask3(                                  \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
      (__mmask32)(U), (int)(R)))
2687 
2688 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask3_fmsub_ph(__m512h __A,__m512h __B,__m512h __C,__mmask32 __U)2689 _mm512_mask3_fmsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
2690   return (__m512h)__builtin_ia32_vfmsubph512_mask3((__v32hf)__A, (__v32hf)__B,
2691                                                    (__v32hf)__C, (__mmask32)__U,
2692                                                    _MM_FROUND_CUR_DIRECTION);
2693 }
2694 
/* Explicit-rounding mask3 fmsubadd: uses the dedicated
 * __builtin_ia32_vfmsubaddph512_mask3 builtin (no operand is negated here),
 * with mask U and rounding argument R. */
#define _mm512_mask3_fmsubadd_round_ph(A, B, C, U, R)                          \
  ((__m512h)__builtin_ia32_vfmsubaddph512_mask3(                               \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
      (__mmask32)(U), (int)(R)))
2699 
2700 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask3_fmsubadd_ph(__m512h __A,__m512h __B,__m512h __C,__mmask32 __U)2701 _mm512_mask3_fmsubadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
2702   return (__m512h)__builtin_ia32_vfmsubaddph512_mask3(
2703       (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
2704       _MM_FROUND_CUR_DIRECTION);
2705 }
2706 
/* Explicit-rounding masked fnmadd: calls __builtin_ia32_vfmaddph512_mask with
 * B negated, mask U and rounding argument R. */
#define _mm512_mask_fnmadd_round_ph(A, U, B, C, R)                             \
  ((__m512h)__builtin_ia32_vfmaddph512_mask(                                   \
      (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),    \
      (__mmask32)(U), (int)(R)))
2711 
2712 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fnmadd_ph(__m512h __A,__mmask32 __U,__m512h __B,__m512h __C)2713 _mm512_mask_fnmadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
2714   return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
2715                                                   (__v32hf)__C, (__mmask32)__U,
2716                                                   _MM_FROUND_CUR_DIRECTION);
2717 }
2718 
/* Explicit-rounding masked fnmsub: calls __builtin_ia32_vfmaddph512_mask with
 * both B and C negated, mask U and rounding argument R. */
#define _mm512_mask_fnmsub_round_ph(A, U, B, C, R)                             \
  ((__m512h)__builtin_ia32_vfmaddph512_mask(                                   \
      (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),   \
      (__mmask32)(U), (int)(R)))

/* Explicit-rounding mask3 fnmsub: calls __builtin_ia32_vfmsubph512_mask3 with
 * only A negated, mask U and rounding argument R. */
#define _mm512_mask3_fnmsub_round_ph(A, B, C, U, R)                            \
  ((__m512h)__builtin_ia32_vfmsubph512_mask3(                                  \
      -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),    \
      (__mmask32)(U), (int)(R)))
2728 
2729 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fnmsub_ph(__m512h __A,__mmask32 __U,__m512h __B,__m512h __C)2730 _mm512_mask_fnmsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
2731   return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
2732                                                   -(__v32hf)__C, (__mmask32)__U,
2733                                                   _MM_FROUND_CUR_DIRECTION);
2734 }
2735 
2736 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask3_fnmsub_ph(__m512h __A,__m512h __B,__m512h __C,__mmask32 __U)2737 _mm512_mask3_fnmsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
2738   return (__m512h)__builtin_ia32_vfmsubph512_mask3(-(__v32hf)__A, (__v32hf)__B,
2739                                                    (__v32hf)__C, (__mmask32)__U,
2740                                                    _MM_FROUND_CUR_DIRECTION);
2741 }
2742 
_mm_fmadd_sh(__m128h __W,__m128h __A,__m128h __B)2743 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_sh(__m128h __W,
2744                                                              __m128h __A,
2745                                                              __m128h __B) {
2746   return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A, (__v8hf)__B,
2747                                        (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
2748 }
2749 
_mm_mask_fmadd_sh(__m128h __W,__mmask8 __U,__m128h __A,__m128h __B)2750 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmadd_sh(__m128h __W,
2751                                                                   __mmask8 __U,
2752                                                                   __m128h __A,
2753                                                                   __m128h __B) {
2754   return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A, (__v8hf)__B,
2755                                        (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2756 }
2757 
/* Explicit-rounding scalar fmadd: A, B and C go unchanged to
 * __builtin_ia32_vfmaddsh3_mask with an all-ones mask and rounding
 * argument R. */
#define _mm_fmadd_round_sh(A, B, C, R)                                         \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask(                                     \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C),        \
      (__mmask8)-1, (int)(R)))

/* Explicit-rounding masked scalar fmadd: W, A and B go unchanged to
 * __builtin_ia32_vfmaddsh3_mask with mask U and rounding argument R. */
#define _mm_mask_fmadd_round_sh(W, U, A, B, R)                                 \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask(                                     \
      (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B),        \
      (__mmask8)(U), (int)(R)))
2767 
2768 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fmadd_sh(__mmask8 __U,__m128h __A,__m128h __B,__m128h __C)2769 _mm_maskz_fmadd_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2770   return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, (__v8hf)__B, (__v8hf)__C,
2771                                         (__mmask8)__U,
2772                                         _MM_FROUND_CUR_DIRECTION);
2773 }
2774 
/* Explicit-rounding maskz scalar fmadd: A, B and C go unchanged to
 * __builtin_ia32_vfmaddsh3_maskz with mask U and rounding argument R. */
#define _mm_maskz_fmadd_round_sh(U, A, B, C, R)                                \
  ((__m128h)__builtin_ia32_vfmaddsh3_maskz(                                    \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C),        \
      (__mmask8)(U), (int)(R)))
2779 
2780 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fmadd_sh(__m128h __W,__m128h __X,__m128h __Y,__mmask8 __U)2781 _mm_mask3_fmadd_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
2782   return __builtin_ia32_vfmaddsh3_mask3((__v8hf)__W, (__v8hf)__X, (__v8hf)__Y,
2783                                         (__mmask8)__U,
2784                                         _MM_FROUND_CUR_DIRECTION);
2785 }
2786 
/* Explicit-rounding mask3 scalar fmadd: W, X and Y go unchanged to
 * __builtin_ia32_vfmaddsh3_mask3 with mask U and rounding argument R. */
#define _mm_mask3_fmadd_round_sh(W, X, Y, U, R)                                \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask3(                                    \
      (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y),        \
      (__mmask8)(U), (int)(R)))
2791 
_mm_fmsub_sh(__m128h __W,__m128h __A,__m128h __B)2792 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmsub_sh(__m128h __W,
2793                                                              __m128h __A,
2794                                                              __m128h __B) {
2795   return (__m128h)__builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A,
2796                                                 -(__v8hf)__B, (__mmask8)-1,
2797                                                 _MM_FROUND_CUR_DIRECTION);
2798 }
2799 
_mm_mask_fmsub_sh(__m128h __W,__mmask8 __U,__m128h __A,__m128h __B)2800 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmsub_sh(__m128h __W,
2801                                                                   __mmask8 __U,
2802                                                                   __m128h __A,
2803                                                                   __m128h __B) {
2804   return (__m128h)__builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A,
2805                                                 -(__v8hf)__B, (__mmask8)__U,
2806                                                 _MM_FROUND_CUR_DIRECTION);
2807 }
2808 
/* Explicit-rounding scalar fmsub: calls __builtin_ia32_vfmaddsh3_mask with C
 * negated, an all-ones mask and rounding argument R. */
#define _mm_fmsub_round_sh(A, B, C, R)                                         \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask(                                     \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C),       \
      (__mmask8)-1, (int)(R)))

/* Explicit-rounding masked scalar fmsub: calls __builtin_ia32_vfmaddsh3_mask
 * with B (the third operand here) negated, mask U and rounding argument R. */
#define _mm_mask_fmsub_round_sh(W, U, A, B, R)                                 \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask(                                     \
      (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B),       \
      (__mmask8)(U), (int)(R)))
2818 
2819 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fmsub_sh(__mmask8 __U,__m128h __A,__m128h __B,__m128h __C)2820 _mm_maskz_fmsub_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2821   return (__m128h)__builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, (__v8hf)__B,
2822                                                  -(__v8hf)__C, (__mmask8)__U,
2823                                                  _MM_FROUND_CUR_DIRECTION);
2824 }
2825 
/* Explicit-rounding maskz scalar fmsub: calls __builtin_ia32_vfmaddsh3_maskz
 * with A and B unchanged, C negated, mask U and rounding argument R.
 *
 * R is parenthesized before the (int) cast, like every other parameter, so an
 * expression passed as R is converted as a whole rather than having the cast
 * bind only to its first operand. */
#define _mm_maskz_fmsub_round_sh(U, A, B, C, R)                                \
  ((__m128h)__builtin_ia32_vfmaddsh3_maskz(                                    \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C),       \
      (__mmask8)(U), (int)(R)))
2830 
2831 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fmsub_sh(__m128h __W,__m128h __X,__m128h __Y,__mmask8 __U)2832 _mm_mask3_fmsub_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
2833   return __builtin_ia32_vfmsubsh3_mask3((__v8hf)__W, (__v8hf)__X, (__v8hf)__Y,
2834                                         (__mmask8)__U,
2835                                         _MM_FROUND_CUR_DIRECTION);
2836 }
2837 
/* Explicit-rounding mask3 multiply-subtract wrapper around
 * __builtin_ia32_vfmsubsh3_mask3; R is forwarded as the rounding argument. */
#define _mm_mask3_fmsub_round_sh(W, X, Y, U, R) \
  ((__m128h)__builtin_ia32_vfmsubsh3_mask3((__v8hf)(__m128h)(W), \
                                           (__v8hf)(__m128h)(X), \
                                           (__v8hf)(__m128h)(Y), \
                                           (__mmask8)(U), (int)(R)))
2842 
_mm_fnmadd_sh(__m128h __W,__m128h __A,__m128h __B)2843 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmadd_sh(__m128h __W,
2844                                                               __m128h __A,
2845                                                               __m128h __B) {
2846   return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, (__v8hf)__B,
2847                                        (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
2848 }
2849 
2850 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fnmadd_sh(__m128h __W,__mmask8 __U,__m128h __A,__m128h __B)2851 _mm_mask_fnmadd_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
2852   return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, (__v8hf)__B,
2853                                        (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2854 }
2855 
/* Explicit-rounding negated multiply-add wrappers. Both negate the second
 * operand given to __builtin_ia32_vfmaddsh3_mask and forward R as the
 * rounding argument; the unmasked form supplies an all-ones mask. */
#define _mm_fnmadd_round_sh(A, B, C, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask((__v8hf)(__m128h)(A), \
                                          -(__v8hf)(__m128h)(B), \
                                          (__v8hf)(__m128h)(C), \
                                          (__mmask8)-1, (int)(R)))

#define _mm_mask_fnmadd_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask((__v8hf)(__m128h)(W), \
                                          -(__v8hf)(__m128h)(A), \
                                          (__v8hf)(__m128h)(B), \
                                          (__mmask8)(U), (int)(R)))
2865 
2866 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fnmadd_sh(__mmask8 __U,__m128h __A,__m128h __B,__m128h __C)2867 _mm_maskz_fnmadd_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2868   return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, -(__v8hf)__B, (__v8hf)__C,
2869                                         (__mmask8)__U,
2870                                         _MM_FROUND_CUR_DIRECTION);
2871 }
2872 
/* Explicit-rounding maskz negated multiply-add wrapper: negates B and
 * forwards to __builtin_ia32_vfmaddsh3_maskz with rounding argument R. */
#define _mm_maskz_fnmadd_round_sh(U, A, B, C, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_maskz((__v8hf)(__m128h)(A), \
                                           -(__v8hf)(__m128h)(B), \
                                           (__v8hf)(__m128h)(C), \
                                           (__mmask8)(U), (int)(R)))
2877 
2878 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fnmadd_sh(__m128h __W,__m128h __X,__m128h __Y,__mmask8 __U)2879 _mm_mask3_fnmadd_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
2880   return __builtin_ia32_vfmaddsh3_mask3((__v8hf)__W, -(__v8hf)__X, (__v8hf)__Y,
2881                                         (__mmask8)__U,
2882                                         _MM_FROUND_CUR_DIRECTION);
2883 }
2884 
/* Explicit-rounding mask3 negated multiply-add wrapper: negates X and
 * forwards to __builtin_ia32_vfmaddsh3_mask3 with rounding argument R. */
#define _mm_mask3_fnmadd_round_sh(W, X, Y, U, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask3((__v8hf)(__m128h)(W), \
                                           -(__v8hf)(__m128h)(X), \
                                           (__v8hf)(__m128h)(Y), \
                                           (__mmask8)(U), (int)(R)))
2889 
_mm_fnmsub_sh(__m128h __W,__m128h __A,__m128h __B)2890 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmsub_sh(__m128h __W,
2891                                                               __m128h __A,
2892                                                               __m128h __B) {
2893   return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, -(__v8hf)__B,
2894                                        (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
2895 }
2896 
2897 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fnmsub_sh(__m128h __W,__mmask8 __U,__m128h __A,__m128h __B)2898 _mm_mask_fnmsub_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
2899   return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, -(__v8hf)__B,
2900                                        (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2901 }
2902 
/* Explicit-rounding negated multiply-subtract wrappers. Both negate the
 * second and third operands given to __builtin_ia32_vfmaddsh3_mask and
 * forward R as the rounding argument; the unmasked form supplies an all-ones
 * mask. */
#define _mm_fnmsub_round_sh(A, B, C, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask((__v8hf)(__m128h)(A), \
                                          -(__v8hf)(__m128h)(B), \
                                          -(__v8hf)(__m128h)(C), \
                                          (__mmask8)-1, (int)(R)))

#define _mm_mask_fnmsub_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask((__v8hf)(__m128h)(W), \
                                          -(__v8hf)(__m128h)(A), \
                                          -(__v8hf)(__m128h)(B), \
                                          (__mmask8)(U), (int)(R)))
2912 
2913 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fnmsub_sh(__mmask8 __U,__m128h __A,__m128h __B,__m128h __C)2914 _mm_maskz_fnmsub_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2915   return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, -(__v8hf)__B, -(__v8hf)__C,
2916                                         (__mmask8)__U,
2917                                         _MM_FROUND_CUR_DIRECTION);
2918 }
2919 
/* Explicit-rounding maskz negated multiply-subtract wrapper: negates B and C
 * and forwards to __builtin_ia32_vfmaddsh3_maskz with rounding argument R. */
#define _mm_maskz_fnmsub_round_sh(U, A, B, C, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_maskz((__v8hf)(__m128h)(A), \
                                           -(__v8hf)(__m128h)(B), \
                                           -(__v8hf)(__m128h)(C), \
                                           (__mmask8)(U), (int)(R)))
2924 
2925 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fnmsub_sh(__m128h __W,__m128h __X,__m128h __Y,__mmask8 __U)2926 _mm_mask3_fnmsub_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
2927   return __builtin_ia32_vfmsubsh3_mask3((__v8hf)__W, -(__v8hf)__X, (__v8hf)__Y,
2928                                         (__mmask8)__U,
2929                                         _MM_FROUND_CUR_DIRECTION);
2930 }
2931 
/* Explicit-rounding mask3 negated multiply-subtract wrapper: negates X and
 * forwards to __builtin_ia32_vfmsubsh3_mask3 with rounding argument R. */
#define _mm_mask3_fnmsub_round_sh(W, X, Y, U, R) \
  ((__m128h)__builtin_ia32_vfmsubsh3_mask3((__v8hf)(__m128h)(W), \
                                           -(__v8hf)(__m128h)(X), \
                                           (__v8hf)(__m128h)(Y), \
                                           (__mmask8)(U), (int)(R)))
2936 
_mm_fcmadd_sch(__m128h __A,__m128h __B,__m128h __C)2937 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmadd_sch(__m128h __A,
2938                                                                __m128h __B,
2939                                                                __m128h __C) {
2940   return (__m128h)__builtin_ia32_vfcmaddcsh_mask((__v4sf)__A, (__v4sf)__B,
2941                                                  (__v4sf)__C, (__mmask8)-1,
2942                                                  _MM_FROUND_CUR_DIRECTION);
2943 }
2944 
2945 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fcmadd_sch(__m128h __A,__mmask8 __U,__m128h __B,__m128h __C)2946 _mm_mask_fcmadd_sch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
2947   return (__m128h)__builtin_ia32_vfcmaddcsh_round_mask(
2948       (__v4sf)__A, (__v4sf)(__B), (__v4sf)(__C), __U, _MM_FROUND_CUR_DIRECTION);
2949 }
2950 
2951 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fcmadd_sch(__mmask8 __U,__m128h __A,__m128h __B,__m128h __C)2952 _mm_maskz_fcmadd_sch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2953   return (__m128h)__builtin_ia32_vfcmaddcsh_maskz((__v4sf)__A, (__v4sf)__B,
2954                                                   (__v4sf)__C, (__mmask8)__U,
2955                                                   _MM_FROUND_CUR_DIRECTION);
2956 }
2957 
2958 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fcmadd_sch(__m128h __A,__m128h __B,__m128h __C,__mmask8 __U)2959 _mm_mask3_fcmadd_sch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
2960   return (__m128h)__builtin_ia32_vfcmaddcsh_round_mask3(
2961       (__v4sf)__A, (__v4sf)__B, (__v4sf)__C, __U, _MM_FROUND_CUR_DIRECTION);
2962 }
2963 
/* Explicit-rounding forms of the fcmadd_sch wrappers. Each reinterprets its
 * vector operands as __v4sf and forwards R as the rounding argument; the
 * unmasked form supplies an all-ones mask. */
#define _mm_fcmadd_round_sch(A, B, C, R) \
  ((__m128h)__builtin_ia32_vfcmaddcsh_mask((__v4sf)(__m128h)(A), \
                                           (__v4sf)(__m128h)(B), \
                                           (__v4sf)(__m128h)(C), \
                                           (__mmask8)-1, (int)(R)))

#define _mm_mask_fcmadd_round_sch(A, U, B, C, R) \
  ((__m128h)__builtin_ia32_vfcmaddcsh_round_mask((__v4sf)(__m128h)(A), \
                                                 (__v4sf)(__m128h)(B), \
                                                 (__v4sf)(__m128h)(C), \
                                                 (__mmask8)(U), (int)(R)))

#define _mm_maskz_fcmadd_round_sch(U, A, B, C, R) \
  ((__m128h)__builtin_ia32_vfcmaddcsh_maskz((__v4sf)(__m128h)(A), \
                                            (__v4sf)(__m128h)(B), \
                                            (__v4sf)(__m128h)(C), \
                                            (__mmask8)(U), (int)(R)))

#define _mm_mask3_fcmadd_round_sch(A, B, C, U, R) \
  ((__m128h)__builtin_ia32_vfcmaddcsh_round_mask3((__v4sf)(__m128h)(A), \
                                                  (__v4sf)(__m128h)(B), \
                                                  (__v4sf)(__m128h)(C), \
                                                  (__mmask8)(U), (int)(R)))
2983 
_mm_fmadd_sch(__m128h __A,__m128h __B,__m128h __C)2984 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_sch(__m128h __A,
2985                                                               __m128h __B,
2986                                                               __m128h __C) {
2987   return (__m128h)__builtin_ia32_vfmaddcsh_mask((__v4sf)__A, (__v4sf)__B,
2988                                                 (__v4sf)__C, (__mmask8)-1,
2989                                                 _MM_FROUND_CUR_DIRECTION);
2990 }
2991 
2992 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fmadd_sch(__m128h __A,__mmask8 __U,__m128h __B,__m128h __C)2993 _mm_mask_fmadd_sch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
2994   return (__m128h)__builtin_ia32_vfmaddcsh_round_mask(
2995       (__v4sf)__A, (__v4sf)(__B), (__v4sf)(__C), __U, _MM_FROUND_CUR_DIRECTION);
2996 }
2997 
2998 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fmadd_sch(__mmask8 __U,__m128h __A,__m128h __B,__m128h __C)2999 _mm_maskz_fmadd_sch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
3000   return (__m128h)__builtin_ia32_vfmaddcsh_maskz((__v4sf)__A, (__v4sf)__B,
3001                                                  (__v4sf)__C, (__mmask8)__U,
3002                                                  _MM_FROUND_CUR_DIRECTION);
3003 }
3004 
3005 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fmadd_sch(__m128h __A,__m128h __B,__m128h __C,__mmask8 __U)3006 _mm_mask3_fmadd_sch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
3007   return (__m128h)__builtin_ia32_vfmaddcsh_round_mask3(
3008       (__v4sf)__A, (__v4sf)__B, (__v4sf)__C, __U, _MM_FROUND_CUR_DIRECTION);
3009 }
3010 
/* Explicit-rounding forms of the fmadd_sch wrappers. Each reinterprets its
 * vector operands as __v4sf and forwards R as the rounding argument; the
 * unmasked form supplies an all-ones mask. */
#define _mm_fmadd_round_sch(A, B, C, R) \
  ((__m128h)__builtin_ia32_vfmaddcsh_mask((__v4sf)(__m128h)(A), \
                                          (__v4sf)(__m128h)(B), \
                                          (__v4sf)(__m128h)(C), \
                                          (__mmask8)-1, (int)(R)))

#define _mm_mask_fmadd_round_sch(A, U, B, C, R) \
  ((__m128h)__builtin_ia32_vfmaddcsh_round_mask((__v4sf)(__m128h)(A), \
                                                (__v4sf)(__m128h)(B), \
                                                (__v4sf)(__m128h)(C), \
                                                (__mmask8)(U), (int)(R)))

#define _mm_maskz_fmadd_round_sch(U, A, B, C, R) \
  ((__m128h)__builtin_ia32_vfmaddcsh_maskz((__v4sf)(__m128h)(A), \
                                           (__v4sf)(__m128h)(B), \
                                           (__v4sf)(__m128h)(C), \
                                           (__mmask8)(U), (int)(R)))

#define _mm_mask3_fmadd_round_sch(A, B, C, U, R) \
  ((__m128h)__builtin_ia32_vfmaddcsh_round_mask3((__v4sf)(__m128h)(A), \
                                                 (__v4sf)(__m128h)(B), \
                                                 (__v4sf)(__m128h)(C), \
                                                 (__mmask8)(U), (int)(R)))
3030 
_mm_fcmul_sch(__m128h __A,__m128h __B)3031 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmul_sch(__m128h __A,
3032                                                               __m128h __B) {
3033   return (__m128h)__builtin_ia32_vfcmulcsh_mask(
3034       (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1,
3035       _MM_FROUND_CUR_DIRECTION);
3036 }
3037 
3038 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fcmul_sch(__m128h __W,__mmask8 __U,__m128h __A,__m128h __B)3039 _mm_mask_fcmul_sch(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
3040   return (__m128h)__builtin_ia32_vfcmulcsh_mask((__v4sf)__A, (__v4sf)__B,
3041                                                 (__v4sf)__W, (__mmask8)__U,
3042                                                 _MM_FROUND_CUR_DIRECTION);
3043 }
3044 
3045 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fcmul_sch(__mmask8 __U,__m128h __A,__m128h __B)3046 _mm_maskz_fcmul_sch(__mmask8 __U, __m128h __A, __m128h __B) {
3047   return (__m128h)__builtin_ia32_vfcmulcsh_mask(
3048       (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U,
3049       _MM_FROUND_CUR_DIRECTION);
3050 }
3051 
/* Explicit-rounding forms of the fcmul_sch wrappers, all built on
 * __builtin_ia32_vfcmulcsh_mask. The third builtin operand is an undefined
 * vector (unmasked), W (mask) or a zero vector (maskz); R is forwarded as
 * the rounding argument. */
#define _mm_fcmul_round_sch(A, B, R) \
  ((__m128h)__builtin_ia32_vfcmulcsh_mask((__v4sf)(__m128h)(A), \
                                          (__v4sf)(__m128h)(B), \
                                          (__v4sf)(__m128h)_mm_undefined_ph(), \
                                          (__mmask8)-1, (int)(R)))

#define _mm_mask_fcmul_round_sch(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_vfcmulcsh_mask((__v4sf)(__m128h)(A), \
                                          (__v4sf)(__m128h)(B), \
                                          (__v4sf)(__m128h)(W), \
                                          (__mmask8)(U), (int)(R)))

#define _mm_maskz_fcmul_round_sch(U, A, B, R) \
  ((__m128h)__builtin_ia32_vfcmulcsh_mask((__v4sf)(__m128h)(A), \
                                          (__v4sf)(__m128h)(B), \
                                          (__v4sf)(__m128h)_mm_setzero_ph(), \
                                          (__mmask8)(U), (int)(R)))
3066 
_mm_fmul_sch(__m128h __A,__m128h __B)3067 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmul_sch(__m128h __A,
3068                                                              __m128h __B) {
3069   return (__m128h)__builtin_ia32_vfmulcsh_mask(
3070       (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1,
3071       _MM_FROUND_CUR_DIRECTION);
3072 }
3073 
_mm_mask_fmul_sch(__m128h __W,__mmask8 __U,__m128h __A,__m128h __B)3074 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmul_sch(__m128h __W,
3075                                                                   __mmask8 __U,
3076                                                                   __m128h __A,
3077                                                                   __m128h __B) {
3078   return (__m128h)__builtin_ia32_vfmulcsh_mask((__v4sf)__A, (__v4sf)__B,
3079                                                (__v4sf)__W, (__mmask8)__U,
3080                                                _MM_FROUND_CUR_DIRECTION);
3081 }
3082 
3083 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fmul_sch(__mmask8 __U,__m128h __A,__m128h __B)3084 _mm_maskz_fmul_sch(__mmask8 __U, __m128h __A, __m128h __B) {
3085   return (__m128h)__builtin_ia32_vfmulcsh_mask(
3086       (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U,
3087       _MM_FROUND_CUR_DIRECTION);
3088 }
3089 
/* Explicit-rounding forms of the fmul_sch wrappers, all built on
 * __builtin_ia32_vfmulcsh_mask. The third builtin operand is an undefined
 * vector (unmasked), W (mask) or a zero vector (maskz); R is forwarded as
 * the rounding argument. */
#define _mm_fmul_round_sch(A, B, R) \
  ((__m128h)__builtin_ia32_vfmulcsh_mask((__v4sf)(__m128h)(A), \
                                         (__v4sf)(__m128h)(B), \
                                         (__v4sf)(__m128h)_mm_undefined_ph(), \
                                         (__mmask8)-1, (int)(R)))

#define _mm_mask_fmul_round_sch(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_vfmulcsh_mask((__v4sf)(__m128h)(A), \
                                         (__v4sf)(__m128h)(B), \
                                         (__v4sf)(__m128h)(W), \
                                         (__mmask8)(U), (int)(R)))

#define _mm_maskz_fmul_round_sch(U, A, B, R) \
  ((__m128h)__builtin_ia32_vfmulcsh_mask((__v4sf)(__m128h)(A), \
                                         (__v4sf)(__m128h)(B), \
                                         (__v4sf)(__m128h)_mm_setzero_ph(), \
                                         (__mmask8)(U), (int)(R)))
3104 
_mm512_fcmul_pch(__m512h __A,__m512h __B)3105 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fcmul_pch(__m512h __A,
3106                                                                  __m512h __B) {
3107   return (__m512h)__builtin_ia32_vfcmulcph512_mask(
3108       (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_undefined_ph(), (__mmask16)-1,
3109       _MM_FROUND_CUR_DIRECTION);
3110 }
3111 
3112 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fcmul_pch(__m512h __W,__mmask16 __U,__m512h __A,__m512h __B)3113 _mm512_mask_fcmul_pch(__m512h __W, __mmask16 __U, __m512h __A, __m512h __B) {
3114   return (__m512h)__builtin_ia32_vfcmulcph512_mask((__v16sf)__A, (__v16sf)__B,
3115                                                    (__v16sf)__W, (__mmask16)__U,
3116                                                    _MM_FROUND_CUR_DIRECTION);
3117 }
3118 
3119 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fcmul_pch(__mmask16 __U,__m512h __A,__m512h __B)3120 _mm512_maskz_fcmul_pch(__mmask16 __U, __m512h __A, __m512h __B) {
3121   return (__m512h)__builtin_ia32_vfcmulcph512_mask(
3122       (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ph(), (__mmask16)__U,
3123       _MM_FROUND_CUR_DIRECTION);
3124 }
3125 
/* Explicit-rounding forms of the 512-bit fcmul_pch wrappers, all built on
 * __builtin_ia32_vfcmulcph512_mask. The third builtin operand is an
 * undefined vector (unmasked), W (mask) or a zero vector (maskz); R is
 * forwarded as the rounding argument. */
#define _mm512_fcmul_round_pch(A, B, R) \
  ((__m512h)__builtin_ia32_vfcmulcph512_mask( \
      (__v16sf)(__m512h)(A), \
      (__v16sf)(__m512h)(B), \
      (__v16sf)(__m512h)_mm512_undefined_ph(), \
      (__mmask16)-1, (int)(R)))

#define _mm512_mask_fcmul_round_pch(W, U, A, B, R) \
  ((__m512h)__builtin_ia32_vfcmulcph512_mask( \
      (__v16sf)(__m512h)(A), \
      (__v16sf)(__m512h)(B), \
      (__v16sf)(__m512h)(W), \
      (__mmask16)(U), (int)(R)))

#define _mm512_maskz_fcmul_round_pch(U, A, B, R) \
  ((__m512h)__builtin_ia32_vfcmulcph512_mask( \
      (__v16sf)(__m512h)(A), \
      (__v16sf)(__m512h)(B), \
      (__v16sf)(__m512h)_mm512_setzero_ph(), \
      (__mmask16)(U), (int)(R)))
3140 
_mm512_fmul_pch(__m512h __A,__m512h __B)3141 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmul_pch(__m512h __A,
3142                                                                 __m512h __B) {
3143   return (__m512h)__builtin_ia32_vfmulcph512_mask(
3144       (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_undefined_ph(), (__mmask16)-1,
3145       _MM_FROUND_CUR_DIRECTION);
3146 }
3147 
3148 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fmul_pch(__m512h __W,__mmask16 __U,__m512h __A,__m512h __B)3149 _mm512_mask_fmul_pch(__m512h __W, __mmask16 __U, __m512h __A, __m512h __B) {
3150   return (__m512h)__builtin_ia32_vfmulcph512_mask((__v16sf)__A, (__v16sf)__B,
3151                                                   (__v16sf)__W, (__mmask16)__U,
3152                                                   _MM_FROUND_CUR_DIRECTION);
3153 }
3154 
3155 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fmul_pch(__mmask16 __U,__m512h __A,__m512h __B)3156 _mm512_maskz_fmul_pch(__mmask16 __U, __m512h __A, __m512h __B) {
3157   return (__m512h)__builtin_ia32_vfmulcph512_mask(
3158       (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ph(), (__mmask16)__U,
3159       _MM_FROUND_CUR_DIRECTION);
3160 }
3161 
/* Explicit-rounding forms of the 512-bit fmul_pch wrappers, all built on
 * __builtin_ia32_vfmulcph512_mask. The third builtin operand is an undefined
 * vector (unmasked), W (mask) or a zero vector (maskz); R is forwarded as
 * the rounding argument. */
#define _mm512_fmul_round_pch(A, B, R) \
  ((__m512h)__builtin_ia32_vfmulcph512_mask( \
      (__v16sf)(__m512h)(A), \
      (__v16sf)(__m512h)(B), \
      (__v16sf)(__m512h)_mm512_undefined_ph(), \
      (__mmask16)-1, (int)(R)))

#define _mm512_mask_fmul_round_pch(W, U, A, B, R) \
  ((__m512h)__builtin_ia32_vfmulcph512_mask( \
      (__v16sf)(__m512h)(A), \
      (__v16sf)(__m512h)(B), \
      (__v16sf)(__m512h)(W), \
      (__mmask16)(U), (int)(R)))

#define _mm512_maskz_fmul_round_pch(U, A, B, R) \
  ((__m512h)__builtin_ia32_vfmulcph512_mask( \
      (__v16sf)(__m512h)(A), \
      (__v16sf)(__m512h)(B), \
      (__v16sf)(__m512h)_mm512_setzero_ph(), \
      (__mmask16)(U), (int)(R)))
3176 
_mm512_fcmadd_pch(__m512h __A,__m512h __B,__m512h __C)3177 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fcmadd_pch(__m512h __A,
3178                                                                   __m512h __B,
3179                                                                   __m512h __C) {
3180   return (__m512h)__builtin_ia32_vfcmaddcph512_mask3(
3181       (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)-1,
3182       _MM_FROUND_CUR_DIRECTION);
3183 }
3184 
3185 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fcmadd_pch(__m512h __A,__mmask16 __U,__m512h __B,__m512h __C)3186 _mm512_mask_fcmadd_pch(__m512h __A, __mmask16 __U, __m512h __B, __m512h __C) {
3187   return (__m512h)__builtin_ia32_vfcmaddcph512_mask(
3188       (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
3189       _MM_FROUND_CUR_DIRECTION);
3190 }
3191 
3192 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask3_fcmadd_pch(__m512h __A,__m512h __B,__m512h __C,__mmask16 __U)3193 _mm512_mask3_fcmadd_pch(__m512h __A, __m512h __B, __m512h __C, __mmask16 __U) {
3194   return (__m512h)__builtin_ia32_vfcmaddcph512_mask3(
3195       (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
3196       _MM_FROUND_CUR_DIRECTION);
3197 }
3198 
3199 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fcmadd_pch(__mmask16 __U,__m512h __A,__m512h __B,__m512h __C)3200 _mm512_maskz_fcmadd_pch(__mmask16 __U, __m512h __A, __m512h __B, __m512h __C) {
3201   return (__m512h)__builtin_ia32_vfcmaddcph512_maskz(
3202       (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
3203       _MM_FROUND_CUR_DIRECTION);
3204 }
3205 
/* Explicit-rounding forms of the 512-bit fcmadd_pch wrappers. R is forwarded
 * as the rounding argument. The unmasked form uses the mask3 builtin with an
 * all-ones mask. */
#define _mm512_fcmadd_round_pch(A, B, C, R) \
  ((__m512h)__builtin_ia32_vfcmaddcph512_mask3( \
      (__v16sf)(__m512h)(A), \
      (__v16sf)(__m512h)(B), \
      (__v16sf)(__m512h)(C), \
      (__mmask16)-1, (int)(R)))

#define _mm512_mask_fcmadd_round_pch(A, U, B, C, R) \
  ((__m512h)__builtin_ia32_vfcmaddcph512_mask( \
      (__v16sf)(__m512h)(A), \
      (__v16sf)(__m512h)(B), \
      (__v16sf)(__m512h)(C), \
      (__mmask16)(U), (int)(R)))

#define _mm512_mask3_fcmadd_round_pch(A, B, C, U, R) \
  ((__m512h)__builtin_ia32_vfcmaddcph512_mask3( \
      (__v16sf)(__m512h)(A), \
      (__v16sf)(__m512h)(B), \
      (__v16sf)(__m512h)(C), \
      (__mmask16)(U), (int)(R)))

#define _mm512_maskz_fcmadd_round_pch(U, A, B, C, R) \
  ((__m512h)__builtin_ia32_vfcmaddcph512_maskz( \
      (__v16sf)(__m512h)(A), \
      (__v16sf)(__m512h)(B), \
      (__v16sf)(__m512h)(C), \
      (__mmask16)(U), (int)(R)))
3225 
_mm512_fmadd_pch(__m512h __A,__m512h __B,__m512h __C)3226 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmadd_pch(__m512h __A,
3227                                                                  __m512h __B,
3228                                                                  __m512h __C) {
3229   return (__m512h)__builtin_ia32_vfmaddcph512_mask3((__v16sf)__A, (__v16sf)__B,
3230                                                     (__v16sf)__C, (__mmask16)-1,
3231                                                     _MM_FROUND_CUR_DIRECTION);
3232 }
3233 
3234 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fmadd_pch(__m512h __A,__mmask16 __U,__m512h __B,__m512h __C)3235 _mm512_mask_fmadd_pch(__m512h __A, __mmask16 __U, __m512h __B, __m512h __C) {
3236   return (__m512h)__builtin_ia32_vfmaddcph512_mask((__v16sf)__A, (__v16sf)__B,
3237                                                    (__v16sf)__C, (__mmask16)__U,
3238                                                    _MM_FROUND_CUR_DIRECTION);
3239 }
3240 
3241 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask3_fmadd_pch(__m512h __A,__m512h __B,__m512h __C,__mmask16 __U)3242 _mm512_mask3_fmadd_pch(__m512h __A, __m512h __B, __m512h __C, __mmask16 __U) {
3243   return (__m512h)__builtin_ia32_vfmaddcph512_mask3(
3244       (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
3245       _MM_FROUND_CUR_DIRECTION);
3246 }
3247 
3248 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fmadd_pch(__mmask16 __U,__m512h __A,__m512h __B,__m512h __C)3249 _mm512_maskz_fmadd_pch(__mmask16 __U, __m512h __A, __m512h __B, __m512h __C) {
3250   return (__m512h)__builtin_ia32_vfmaddcph512_maskz(
3251       (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
3252       _MM_FROUND_CUR_DIRECTION);
3253 }
3254 
/* Explicit-rounding forms of the 512-bit fmadd_pch wrappers. R is forwarded
 * as the rounding argument. The unmasked form uses the mask3 builtin with an
 * all-ones mask. */
#define _mm512_fmadd_round_pch(A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddcph512_mask3( \
      (__v16sf)(__m512h)(A), \
      (__v16sf)(__m512h)(B), \
      (__v16sf)(__m512h)(C), \
      (__mmask16)-1, (int)(R)))

#define _mm512_mask_fmadd_round_pch(A, U, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddcph512_mask( \
      (__v16sf)(__m512h)(A), \
      (__v16sf)(__m512h)(B), \
      (__v16sf)(__m512h)(C), \
      (__mmask16)(U), (int)(R)))

#define _mm512_mask3_fmadd_round_pch(A, B, C, U, R) \
  ((__m512h)__builtin_ia32_vfmaddcph512_mask3( \
      (__v16sf)(__m512h)(A), \
      (__v16sf)(__m512h)(B), \
      (__v16sf)(__m512h)(C), \
      (__mmask16)(U), (int)(R)))

#define _mm512_maskz_fmadd_round_pch(U, A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddcph512_maskz( \
      (__v16sf)(__m512h)(A), \
      (__v16sf)(__m512h)(B), \
      (__v16sf)(__m512h)(C), \
      (__mmask16)(U), (int)(R)))
3274 
3275 static __inline__ _Float16 __DEFAULT_FN_ATTRS512
_mm512_reduce_add_ph(__m512h __W)3276 _mm512_reduce_add_ph(__m512h __W) {
3277   return __builtin_ia32_reduce_fadd_ph512(-0.0f16, __W);
3278 }
3279 
3280 static __inline__ _Float16 __DEFAULT_FN_ATTRS512
_mm512_reduce_mul_ph(__m512h __W)3281 _mm512_reduce_mul_ph(__m512h __W) {
3282   return __builtin_ia32_reduce_fmul_ph512(1.0f16, __W);
3283 }
3284 
3285 static __inline__ _Float16 __DEFAULT_FN_ATTRS512
_mm512_reduce_max_ph(__m512h __V)3286 _mm512_reduce_max_ph(__m512h __V) {
3287   return __builtin_ia32_reduce_fmax_ph512(__V);
3288 }
3289 
3290 static __inline__ _Float16 __DEFAULT_FN_ATTRS512
_mm512_reduce_min_ph(__m512h __V)3291 _mm512_reduce_min_ph(__m512h __V) {
3292   return __builtin_ia32_reduce_fmin_ph512(__V);
3293 }
3294 
3295 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_blend_ph(__mmask32 __U,__m512h __A,__m512h __W)3296 _mm512_mask_blend_ph(__mmask32 __U, __m512h __A, __m512h __W) {
3297   return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U, (__v32hf)__W,
3298                                               (__v32hf)__A);
3299 }
3300 
3301 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_permutex2var_ph(__m512h __A,__m512i __I,__m512h __B)3302 _mm512_permutex2var_ph(__m512h __A, __m512i __I, __m512h __B) {
3303   return (__m512h)__builtin_ia32_vpermi2varhi512((__v32hi)__A, (__v32hi)__I,
3304                                                  (__v32hi)__B);
3305 }
3306 
3307 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_permutexvar_ph(__m512i __A,__m512h __B)3308 _mm512_permutexvar_ph(__m512i __A, __m512h __B) {
3309   return (__m512h)__builtin_ia32_permvarhi512((__v32hi)__B, (__v32hi)__A);
3310 }
3311 
// The intrinsics below are aliases for the corresponding f*mul_*ch intrinsics.
/* _mm512_*mul_pch names: each expands to the _mm512_*fmul_pch intrinsic with
 * the same arguments. */
#define _mm512_mul_pch(A, B) \
  _mm512_fmul_pch(A, B)
#define _mm512_mask_mul_pch(W, U, A, B) \
  _mm512_mask_fmul_pch(W, U, A, B)
#define _mm512_maskz_mul_pch(U, A, B) \
  _mm512_maskz_fmul_pch(U, A, B)
#define _mm512_mul_round_pch(A, B, R) \
  _mm512_fmul_round_pch(A, B, R)
#define _mm512_mask_mul_round_pch(W, U, A, B, R) \
  _mm512_mask_fmul_round_pch(W, U, A, B, R)
#define _mm512_maskz_mul_round_pch(U, A, B, R) \
  _mm512_maskz_fmul_round_pch(U, A, B, R)
3321 
/* _mm512_*cmul_pch names: each expands to the _mm512_*fcmul_pch intrinsic
 * with the same arguments. */
#define _mm512_cmul_pch(A, B) \
  _mm512_fcmul_pch(A, B)
#define _mm512_mask_cmul_pch(W, U, A, B) \
  _mm512_mask_fcmul_pch(W, U, A, B)
#define _mm512_maskz_cmul_pch(U, A, B) \
  _mm512_maskz_fcmul_pch(U, A, B)
#define _mm512_cmul_round_pch(A, B, R) \
  _mm512_fcmul_round_pch(A, B, R)
#define _mm512_mask_cmul_round_pch(W, U, A, B, R) \
  _mm512_mask_fcmul_round_pch(W, U, A, B, R)
#define _mm512_maskz_cmul_round_pch(U, A, B, R) \
  _mm512_maskz_fcmul_round_pch(U, A, B, R)
3330 
/* _mm_*mul_sch names: each expands to the _mm_*fmul_sch intrinsic with the
 * same arguments. */
#define _mm_mul_sch(A, B) \
  _mm_fmul_sch(A, B)
#define _mm_mask_mul_sch(W, U, A, B) \
  _mm_mask_fmul_sch(W, U, A, B)
#define _mm_maskz_mul_sch(U, A, B) \
  _mm_maskz_fmul_sch(U, A, B)
#define _mm_mul_round_sch(A, B, R) \
  _mm_fmul_round_sch(A, B, R)
#define _mm_mask_mul_round_sch(W, U, A, B, R) \
  _mm_mask_fmul_round_sch(W, U, A, B, R)
#define _mm_maskz_mul_round_sch(U, A, B, R) \
  _mm_maskz_fmul_round_sch(U, A, B, R)
3338 
/* _mm_*cmul_sch names: each expands to the _mm_*fcmul_sch intrinsic with the
 * same arguments. */
#define _mm_cmul_sch(A, B) \
  _mm_fcmul_sch(A, B)
#define _mm_mask_cmul_sch(W, U, A, B) \
  _mm_mask_fcmul_sch(W, U, A, B)
#define _mm_maskz_cmul_sch(U, A, B) \
  _mm_maskz_fcmul_sch(U, A, B)
#define _mm_cmul_round_sch(A, B, R) \
  _mm_fcmul_round_sch(A, B, R)
#define _mm_mask_cmul_round_sch(W, U, A, B, R) \
  _mm_mask_fcmul_round_sch(W, U, A, B, R)
#define _mm_maskz_cmul_round_sch(U, A, B, R) \
  _mm_maskz_fcmul_round_sch(U, A, B, R)
3347 
/* Drop the header-local attribute helper macros so they do not leak into
 * code that includes this header. */
#undef __DEFAULT_FN_ATTRS512
#undef __DEFAULT_FN_ATTRS256
#undef __DEFAULT_FN_ATTRS128
3351 
3352 #endif
3353 #endif
3354