1 /*===----------- avx512fp16intrin.h - AVX512-FP16 intrinsics ---------------===
2 *
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 *
7 *===-----------------------------------------------------------------------===
8 */
9 #ifndef __IMMINTRIN_H
10 #error "Never use <avx512fp16intrin.h> directly; include <immintrin.h> instead."
11 #endif
12
13 #ifdef __SSE2__
14
15 #ifndef __AVX512FP16INTRIN_H
16 #define __AVX512FP16INTRIN_H
17
18 /* Define the default attributes for the functions in this file. */
19 typedef _Float16 __v32hf __attribute__((__vector_size__(64), __aligned__(64)));
20 typedef _Float16 __m512h __attribute__((__vector_size__(64), __aligned__(64)));
21 typedef _Float16 __m512h_u __attribute__((__vector_size__(64), __aligned__(1)));
22
23 /* Define the default attributes for the functions in this file. */
24 #define __DEFAULT_FN_ATTRS512 \
25 __attribute__((__always_inline__, __nodebug__, \
26 __target__("avx512fp16,evex512"), __min_vector_width__(512)))
27 #define __DEFAULT_FN_ATTRS256 \
28 __attribute__((__always_inline__, __nodebug__, \
29 __target__("avx512fp16,no-evex512"), \
30 __min_vector_width__(256)))
31 #define __DEFAULT_FN_ATTRS128 \
32 __attribute__((__always_inline__, __nodebug__, \
33 __target__("avx512fp16,no-evex512"), \
34 __min_vector_width__(128)))
35
_mm512_cvtsh_h(__m512h __a)36 static __inline__ _Float16 __DEFAULT_FN_ATTRS512 _mm512_cvtsh_h(__m512h __a) {
37 return __a[0];
38 }
39
_mm_setzero_ph(void)40 static __inline __m128h __DEFAULT_FN_ATTRS128 _mm_setzero_ph(void) {
41 return (__m128h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
42 }
43
_mm256_setzero_ph(void)44 static __inline __m256h __DEFAULT_FN_ATTRS256 _mm256_setzero_ph(void) {
45 return (__m256h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
46 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
47 }
48
_mm256_undefined_ph(void)49 static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_undefined_ph(void) {
50 return (__m256h)__builtin_ia32_undef256();
51 }
52
_mm512_setzero_ph(void)53 static __inline __m512h __DEFAULT_FN_ATTRS512 _mm512_setzero_ph(void) {
54 return (__m512h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
55 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
56 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
57 }
58
_mm_undefined_ph(void)59 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_undefined_ph(void) {
60 return (__m128h)__builtin_ia32_undef128();
61 }
62
_mm512_undefined_ph(void)63 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_undefined_ph(void) {
64 return (__m512h)__builtin_ia32_undef512();
65 }
66
_mm512_set1_ph(_Float16 __h)67 static __inline __m512h __DEFAULT_FN_ATTRS512 _mm512_set1_ph(_Float16 __h) {
68 return (__m512h)(__v32hf){__h, __h, __h, __h, __h, __h, __h, __h,
69 __h, __h, __h, __h, __h, __h, __h, __h,
70 __h, __h, __h, __h, __h, __h, __h, __h,
71 __h, __h, __h, __h, __h, __h, __h, __h};
72 }
73
74 static __inline __m512h __DEFAULT_FN_ATTRS512
_mm512_set_ph(_Float16 __h1,_Float16 __h2,_Float16 __h3,_Float16 __h4,_Float16 __h5,_Float16 __h6,_Float16 __h7,_Float16 __h8,_Float16 __h9,_Float16 __h10,_Float16 __h11,_Float16 __h12,_Float16 __h13,_Float16 __h14,_Float16 __h15,_Float16 __h16,_Float16 __h17,_Float16 __h18,_Float16 __h19,_Float16 __h20,_Float16 __h21,_Float16 __h22,_Float16 __h23,_Float16 __h24,_Float16 __h25,_Float16 __h26,_Float16 __h27,_Float16 __h28,_Float16 __h29,_Float16 __h30,_Float16 __h31,_Float16 __h32)75 _mm512_set_ph(_Float16 __h1, _Float16 __h2, _Float16 __h3, _Float16 __h4,
76 _Float16 __h5, _Float16 __h6, _Float16 __h7, _Float16 __h8,
77 _Float16 __h9, _Float16 __h10, _Float16 __h11, _Float16 __h12,
78 _Float16 __h13, _Float16 __h14, _Float16 __h15, _Float16 __h16,
79 _Float16 __h17, _Float16 __h18, _Float16 __h19, _Float16 __h20,
80 _Float16 __h21, _Float16 __h22, _Float16 __h23, _Float16 __h24,
81 _Float16 __h25, _Float16 __h26, _Float16 __h27, _Float16 __h28,
82 _Float16 __h29, _Float16 __h30, _Float16 __h31, _Float16 __h32) {
83 return (__m512h)(__v32hf){__h32, __h31, __h30, __h29, __h28, __h27, __h26,
84 __h25, __h24, __h23, __h22, __h21, __h20, __h19,
85 __h18, __h17, __h16, __h15, __h14, __h13, __h12,
86 __h11, __h10, __h9, __h8, __h7, __h6, __h5,
87 __h4, __h3, __h2, __h1};
88 }
89
90 #define _mm512_setr_ph(h1, h2, h3, h4, h5, h6, h7, h8, h9, h10, h11, h12, h13, \
91 h14, h15, h16, h17, h18, h19, h20, h21, h22, h23, h24, \
92 h25, h26, h27, h28, h29, h30, h31, h32) \
93 _mm512_set_ph((h32), (h31), (h30), (h29), (h28), (h27), (h26), (h25), (h24), \
94 (h23), (h22), (h21), (h20), (h19), (h18), (h17), (h16), (h15), \
95 (h14), (h13), (h12), (h11), (h10), (h9), (h8), (h7), (h6), \
96 (h5), (h4), (h3), (h2), (h1))
97
98 static __inline __m512h __DEFAULT_FN_ATTRS512
_mm512_set1_pch(_Float16 _Complex __h)99 _mm512_set1_pch(_Float16 _Complex __h) {
100 return (__m512h)_mm512_set1_ps(__builtin_bit_cast(float, __h));
101 }
102
_mm_castph_ps(__m128h __a)103 static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_castph_ps(__m128h __a) {
104 return (__m128)__a;
105 }
106
_mm256_castph_ps(__m256h __a)107 static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_castph_ps(__m256h __a) {
108 return (__m256)__a;
109 }
110
_mm512_castph_ps(__m512h __a)111 static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_castph_ps(__m512h __a) {
112 return (__m512)__a;
113 }
114
_mm_castph_pd(__m128h __a)115 static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_castph_pd(__m128h __a) {
116 return (__m128d)__a;
117 }
118
_mm256_castph_pd(__m256h __a)119 static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_castph_pd(__m256h __a) {
120 return (__m256d)__a;
121 }
122
_mm512_castph_pd(__m512h __a)123 static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_castph_pd(__m512h __a) {
124 return (__m512d)__a;
125 }
126
_mm_castph_si128(__m128h __a)127 static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_castph_si128(__m128h __a) {
128 return (__m128i)__a;
129 }
130
131 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_castph_si256(__m256h __a)132 _mm256_castph_si256(__m256h __a) {
133 return (__m256i)__a;
134 }
135
136 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_castph_si512(__m512h __a)137 _mm512_castph_si512(__m512h __a) {
138 return (__m512i)__a;
139 }
140
_mm_castps_ph(__m128 __a)141 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_castps_ph(__m128 __a) {
142 return (__m128h)__a;
143 }
144
_mm256_castps_ph(__m256 __a)145 static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_castps_ph(__m256 __a) {
146 return (__m256h)__a;
147 }
148
_mm512_castps_ph(__m512 __a)149 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_castps_ph(__m512 __a) {
150 return (__m512h)__a;
151 }
152
_mm_castpd_ph(__m128d __a)153 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_castpd_ph(__m128d __a) {
154 return (__m128h)__a;
155 }
156
_mm256_castpd_ph(__m256d __a)157 static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_castpd_ph(__m256d __a) {
158 return (__m256h)__a;
159 }
160
_mm512_castpd_ph(__m512d __a)161 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_castpd_ph(__m512d __a) {
162 return (__m512h)__a;
163 }
164
_mm_castsi128_ph(__m128i __a)165 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_castsi128_ph(__m128i __a) {
166 return (__m128h)__a;
167 }
168
169 static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_castsi256_ph(__m256i __a)170 _mm256_castsi256_ph(__m256i __a) {
171 return (__m256h)__a;
172 }
173
174 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_castsi512_ph(__m512i __a)175 _mm512_castsi512_ph(__m512i __a) {
176 return (__m512h)__a;
177 }
178
179 static __inline__ __m128h __DEFAULT_FN_ATTRS256
_mm256_castph256_ph128(__m256h __a)180 _mm256_castph256_ph128(__m256h __a) {
181 return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7);
182 }
183
184 static __inline__ __m128h __DEFAULT_FN_ATTRS512
_mm512_castph512_ph128(__m512h __a)185 _mm512_castph512_ph128(__m512h __a) {
186 return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7);
187 }
188
189 static __inline__ __m256h __DEFAULT_FN_ATTRS512
_mm512_castph512_ph256(__m512h __a)190 _mm512_castph512_ph256(__m512h __a) {
191 return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
192 12, 13, 14, 15);
193 }
194
195 static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_castph128_ph256(__m128h __a)196 _mm256_castph128_ph256(__m128h __a) {
197 return __builtin_shufflevector(__a, __builtin_nondeterministic_value(__a),
198 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
199 }
200
201 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_castph128_ph512(__m128h __a)202 _mm512_castph128_ph512(__m128h __a) {
203 __m256h __b = __builtin_nondeterministic_value(__b);
204 return __builtin_shufflevector(
205 __builtin_shufflevector(__a, __builtin_nondeterministic_value(__a),
206 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15),
207 __b, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
208 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
209 }
210
211 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_castph256_ph512(__m256h __a)212 _mm512_castph256_ph512(__m256h __a) {
213 return __builtin_shufflevector(__a, __builtin_nondeterministic_value(__a), 0,
214 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
215 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
216 27, 28, 29, 30, 31);
217 }
218
219 /// Constructs a 256-bit floating-point vector of [16 x half] from a
220 /// 128-bit floating-point vector of [8 x half]. The lower 128 bits
221 /// contain the value of the source vector. The upper 384 bits are set
222 /// to zero.
223 ///
224 /// \headerfile <x86intrin.h>
225 ///
226 /// This intrinsic has no corresponding instruction.
227 ///
228 /// \param __a
229 /// A 128-bit vector of [8 x half].
230 /// \returns A 512-bit floating-point vector of [16 x half]. The lower 128 bits
231 /// contain the value of the parameter. The upper 384 bits are set to zero.
232 static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_zextph128_ph256(__m128h __a)233 _mm256_zextph128_ph256(__m128h __a) {
234 return __builtin_shufflevector(__a, (__v8hf)_mm_setzero_ph(), 0, 1, 2, 3, 4,
235 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
236 }
237
238 /// Constructs a 512-bit floating-point vector of [32 x half] from a
239 /// 128-bit floating-point vector of [8 x half]. The lower 128 bits
240 /// contain the value of the source vector. The upper 384 bits are set
241 /// to zero.
242 ///
243 /// \headerfile <x86intrin.h>
244 ///
245 /// This intrinsic has no corresponding instruction.
246 ///
247 /// \param __a
248 /// A 128-bit vector of [8 x half].
249 /// \returns A 512-bit floating-point vector of [32 x half]. The lower 128 bits
250 /// contain the value of the parameter. The upper 384 bits are set to zero.
251 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_zextph128_ph512(__m128h __a)252 _mm512_zextph128_ph512(__m128h __a) {
253 return __builtin_shufflevector(
254 __a, (__v8hf)_mm_setzero_ph(), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
255 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15);
256 }
257
258 /// Constructs a 512-bit floating-point vector of [32 x half] from a
259 /// 256-bit floating-point vector of [16 x half]. The lower 256 bits
260 /// contain the value of the source vector. The upper 256 bits are set
261 /// to zero.
262 ///
263 /// \headerfile <x86intrin.h>
264 ///
265 /// This intrinsic has no corresponding instruction.
266 ///
267 /// \param __a
268 /// A 256-bit vector of [16 x half].
269 /// \returns A 512-bit floating-point vector of [32 x half]. The lower 256 bits
270 /// contain the value of the parameter. The upper 256 bits are set to zero.
271 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_zextph256_ph512(__m256h __a)272 _mm512_zextph256_ph512(__m256h __a) {
273 return __builtin_shufflevector(__a, (__v16hf)_mm256_setzero_ph(), 0, 1, 2, 3,
274 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
275 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
276 29, 30, 31);
277 }
278
/* Scalar half-precision compare returning int. A and B are the 128-bit
 * operands, P is the comparison predicate and R the rounding/exception control;
 * P and R are cast to int and passed straight to the vcomish builtin.
 *
 * A and B are parenthesized before being cast so that a compound argument
 * (e.g. `c ? x : y`) is cast as a whole instead of only its first operand. */
#define _mm_comi_round_sh(A, B, P, R)                                          \
  (__builtin_ia32_vcomish((__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B),          \
                          (int)(P), (int)(R)))

/* Same compare with R fixed to _MM_FROUND_CUR_DIRECTION. */
#define _mm_comi_sh(A, B, pred)                                                \
  _mm_comi_round_sh((A), (B), (pred), _MM_FROUND_CUR_DIRECTION)
284
_mm_comieq_sh(__m128h __A,__m128h __B)285 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comieq_sh(__m128h __A,
286 __m128h __B) {
287 return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_EQ_OS,
288 _MM_FROUND_CUR_DIRECTION);
289 }
290
_mm_comilt_sh(__m128h __A,__m128h __B)291 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comilt_sh(__m128h __A,
292 __m128h __B) {
293 return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_LT_OS,
294 _MM_FROUND_CUR_DIRECTION);
295 }
296
_mm_comile_sh(__m128h __A,__m128h __B)297 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comile_sh(__m128h __A,
298 __m128h __B) {
299 return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_LE_OS,
300 _MM_FROUND_CUR_DIRECTION);
301 }
302
_mm_comigt_sh(__m128h __A,__m128h __B)303 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comigt_sh(__m128h __A,
304 __m128h __B) {
305 return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_GT_OS,
306 _MM_FROUND_CUR_DIRECTION);
307 }
308
_mm_comige_sh(__m128h __A,__m128h __B)309 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comige_sh(__m128h __A,
310 __m128h __B) {
311 return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_GE_OS,
312 _MM_FROUND_CUR_DIRECTION);
313 }
314
_mm_comineq_sh(__m128h __A,__m128h __B)315 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comineq_sh(__m128h __A,
316 __m128h __B) {
317 return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_NEQ_US,
318 _MM_FROUND_CUR_DIRECTION);
319 }
320
_mm_ucomieq_sh(__m128h __A,__m128h __B)321 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomieq_sh(__m128h __A,
322 __m128h __B) {
323 return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_EQ_OQ,
324 _MM_FROUND_CUR_DIRECTION);
325 }
326
_mm_ucomilt_sh(__m128h __A,__m128h __B)327 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomilt_sh(__m128h __A,
328 __m128h __B) {
329 return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_LT_OQ,
330 _MM_FROUND_CUR_DIRECTION);
331 }
332
_mm_ucomile_sh(__m128h __A,__m128h __B)333 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomile_sh(__m128h __A,
334 __m128h __B) {
335 return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_LE_OQ,
336 _MM_FROUND_CUR_DIRECTION);
337 }
338
_mm_ucomigt_sh(__m128h __A,__m128h __B)339 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomigt_sh(__m128h __A,
340 __m128h __B) {
341 return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_GT_OQ,
342 _MM_FROUND_CUR_DIRECTION);
343 }
344
_mm_ucomige_sh(__m128h __A,__m128h __B)345 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomige_sh(__m128h __A,
346 __m128h __B) {
347 return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_GE_OQ,
348 _MM_FROUND_CUR_DIRECTION);
349 }
350
_mm_ucomineq_sh(__m128h __A,__m128h __B)351 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomineq_sh(__m128h __A,
352 __m128h __B) {
353 return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_NEQ_UQ,
354 _MM_FROUND_CUR_DIRECTION);
355 }
356
_mm512_add_ph(__m512h __A,__m512h __B)357 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_add_ph(__m512h __A,
358 __m512h __B) {
359 return (__m512h)((__v32hf)__A + (__v32hf)__B);
360 }
361
362 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_add_ph(__m512h __W,__mmask32 __U,__m512h __A,__m512h __B)363 _mm512_mask_add_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
364 return (__m512h)__builtin_ia32_selectph_512(
365 (__mmask32)__U, (__v32hf)_mm512_add_ph(__A, __B), (__v32hf)__W);
366 }
367
368 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_add_ph(__mmask32 __U,__m512h __A,__m512h __B)369 _mm512_maskz_add_ph(__mmask32 __U, __m512h __A, __m512h __B) {
370 return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
371 (__v32hf)_mm512_add_ph(__A, __B),
372 (__v32hf)_mm512_setzero_ph());
373 }
374
375 #define _mm512_add_round_ph(A, B, R) \
376 ((__m512h)__builtin_ia32_addph512((__v32hf)(__m512h)(A), \
377 (__v32hf)(__m512h)(B), (int)(R)))
378
379 #define _mm512_mask_add_round_ph(W, U, A, B, R) \
380 ((__m512h)__builtin_ia32_selectph_512( \
381 (__mmask32)(U), (__v32hf)_mm512_add_round_ph((A), (B), (R)), \
382 (__v32hf)(__m512h)(W)))
383
384 #define _mm512_maskz_add_round_ph(U, A, B, R) \
385 ((__m512h)__builtin_ia32_selectph_512( \
386 (__mmask32)(U), (__v32hf)_mm512_add_round_ph((A), (B), (R)), \
387 (__v32hf)_mm512_setzero_ph()))
388
_mm512_sub_ph(__m512h __A,__m512h __B)389 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_sub_ph(__m512h __A,
390 __m512h __B) {
391 return (__m512h)((__v32hf)__A - (__v32hf)__B);
392 }
393
394 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_sub_ph(__m512h __W,__mmask32 __U,__m512h __A,__m512h __B)395 _mm512_mask_sub_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
396 return (__m512h)__builtin_ia32_selectph_512(
397 (__mmask32)__U, (__v32hf)_mm512_sub_ph(__A, __B), (__v32hf)__W);
398 }
399
400 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_sub_ph(__mmask32 __U,__m512h __A,__m512h __B)401 _mm512_maskz_sub_ph(__mmask32 __U, __m512h __A, __m512h __B) {
402 return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
403 (__v32hf)_mm512_sub_ph(__A, __B),
404 (__v32hf)_mm512_setzero_ph());
405 }
406
407 #define _mm512_sub_round_ph(A, B, R) \
408 ((__m512h)__builtin_ia32_subph512((__v32hf)(__m512h)(A), \
409 (__v32hf)(__m512h)(B), (int)(R)))
410
411 #define _mm512_mask_sub_round_ph(W, U, A, B, R) \
412 ((__m512h)__builtin_ia32_selectph_512( \
413 (__mmask32)(U), (__v32hf)_mm512_sub_round_ph((A), (B), (R)), \
414 (__v32hf)(__m512h)(W)))
415
416 #define _mm512_maskz_sub_round_ph(U, A, B, R) \
417 ((__m512h)__builtin_ia32_selectph_512( \
418 (__mmask32)(U), (__v32hf)_mm512_sub_round_ph((A), (B), (R)), \
419 (__v32hf)_mm512_setzero_ph()))
420
_mm512_mul_ph(__m512h __A,__m512h __B)421 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_mul_ph(__m512h __A,
422 __m512h __B) {
423 return (__m512h)((__v32hf)__A * (__v32hf)__B);
424 }
425
426 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_mul_ph(__m512h __W,__mmask32 __U,__m512h __A,__m512h __B)427 _mm512_mask_mul_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
428 return (__m512h)__builtin_ia32_selectph_512(
429 (__mmask32)__U, (__v32hf)_mm512_mul_ph(__A, __B), (__v32hf)__W);
430 }
431
432 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_mul_ph(__mmask32 __U,__m512h __A,__m512h __B)433 _mm512_maskz_mul_ph(__mmask32 __U, __m512h __A, __m512h __B) {
434 return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
435 (__v32hf)_mm512_mul_ph(__A, __B),
436 (__v32hf)_mm512_setzero_ph());
437 }
438
439 #define _mm512_mul_round_ph(A, B, R) \
440 ((__m512h)__builtin_ia32_mulph512((__v32hf)(__m512h)(A), \
441 (__v32hf)(__m512h)(B), (int)(R)))
442
443 #define _mm512_mask_mul_round_ph(W, U, A, B, R) \
444 ((__m512h)__builtin_ia32_selectph_512( \
445 (__mmask32)(U), (__v32hf)_mm512_mul_round_ph((A), (B), (R)), \
446 (__v32hf)(__m512h)(W)))
447
448 #define _mm512_maskz_mul_round_ph(U, A, B, R) \
449 ((__m512h)__builtin_ia32_selectph_512( \
450 (__mmask32)(U), (__v32hf)_mm512_mul_round_ph((A), (B), (R)), \
451 (__v32hf)_mm512_setzero_ph()))
452
_mm512_div_ph(__m512h __A,__m512h __B)453 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_div_ph(__m512h __A,
454 __m512h __B) {
455 return (__m512h)((__v32hf)__A / (__v32hf)__B);
456 }
457
458 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_div_ph(__m512h __W,__mmask32 __U,__m512h __A,__m512h __B)459 _mm512_mask_div_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
460 return (__m512h)__builtin_ia32_selectph_512(
461 (__mmask32)__U, (__v32hf)_mm512_div_ph(__A, __B), (__v32hf)__W);
462 }
463
464 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_div_ph(__mmask32 __U,__m512h __A,__m512h __B)465 _mm512_maskz_div_ph(__mmask32 __U, __m512h __A, __m512h __B) {
466 return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
467 (__v32hf)_mm512_div_ph(__A, __B),
468 (__v32hf)_mm512_setzero_ph());
469 }
470
471 #define _mm512_div_round_ph(A, B, R) \
472 ((__m512h)__builtin_ia32_divph512((__v32hf)(__m512h)(A), \
473 (__v32hf)(__m512h)(B), (int)(R)))
474
475 #define _mm512_mask_div_round_ph(W, U, A, B, R) \
476 ((__m512h)__builtin_ia32_selectph_512( \
477 (__mmask32)(U), (__v32hf)_mm512_div_round_ph((A), (B), (R)), \
478 (__v32hf)(__m512h)(W)))
479
480 #define _mm512_maskz_div_round_ph(U, A, B, R) \
481 ((__m512h)__builtin_ia32_selectph_512( \
482 (__mmask32)(U), (__v32hf)_mm512_div_round_ph((A), (B), (R)), \
483 (__v32hf)_mm512_setzero_ph()))
484
_mm512_min_ph(__m512h __A,__m512h __B)485 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_min_ph(__m512h __A,
486 __m512h __B) {
487 return (__m512h)__builtin_ia32_minph512((__v32hf)__A, (__v32hf)__B,
488 _MM_FROUND_CUR_DIRECTION);
489 }
490
491 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_min_ph(__m512h __W,__mmask32 __U,__m512h __A,__m512h __B)492 _mm512_mask_min_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
493 return (__m512h)__builtin_ia32_selectph_512(
494 (__mmask32)__U, (__v32hf)_mm512_min_ph(__A, __B), (__v32hf)__W);
495 }
496
497 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_min_ph(__mmask32 __U,__m512h __A,__m512h __B)498 _mm512_maskz_min_ph(__mmask32 __U, __m512h __A, __m512h __B) {
499 return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
500 (__v32hf)_mm512_min_ph(__A, __B),
501 (__v32hf)_mm512_setzero_ph());
502 }
503
504 #define _mm512_min_round_ph(A, B, R) \
505 ((__m512h)__builtin_ia32_minph512((__v32hf)(__m512h)(A), \
506 (__v32hf)(__m512h)(B), (int)(R)))
507
508 #define _mm512_mask_min_round_ph(W, U, A, B, R) \
509 ((__m512h)__builtin_ia32_selectph_512( \
510 (__mmask32)(U), (__v32hf)_mm512_min_round_ph((A), (B), (R)), \
511 (__v32hf)(__m512h)(W)))
512
513 #define _mm512_maskz_min_round_ph(U, A, B, R) \
514 ((__m512h)__builtin_ia32_selectph_512( \
515 (__mmask32)(U), (__v32hf)_mm512_min_round_ph((A), (B), (R)), \
516 (__v32hf)_mm512_setzero_ph()))
517
_mm512_max_ph(__m512h __A,__m512h __B)518 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_max_ph(__m512h __A,
519 __m512h __B) {
520 return (__m512h)__builtin_ia32_maxph512((__v32hf)__A, (__v32hf)__B,
521 _MM_FROUND_CUR_DIRECTION);
522 }
523
524 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_max_ph(__m512h __W,__mmask32 __U,__m512h __A,__m512h __B)525 _mm512_mask_max_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
526 return (__m512h)__builtin_ia32_selectph_512(
527 (__mmask32)__U, (__v32hf)_mm512_max_ph(__A, __B), (__v32hf)__W);
528 }
529
530 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_max_ph(__mmask32 __U,__m512h __A,__m512h __B)531 _mm512_maskz_max_ph(__mmask32 __U, __m512h __A, __m512h __B) {
532 return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
533 (__v32hf)_mm512_max_ph(__A, __B),
534 (__v32hf)_mm512_setzero_ph());
535 }
536
537 #define _mm512_max_round_ph(A, B, R) \
538 ((__m512h)__builtin_ia32_maxph512((__v32hf)(__m512h)(A), \
539 (__v32hf)(__m512h)(B), (int)(R)))
540
541 #define _mm512_mask_max_round_ph(W, U, A, B, R) \
542 ((__m512h)__builtin_ia32_selectph_512( \
543 (__mmask32)(U), (__v32hf)_mm512_max_round_ph((A), (B), (R)), \
544 (__v32hf)(__m512h)(W)))
545
546 #define _mm512_maskz_max_round_ph(U, A, B, R) \
547 ((__m512h)__builtin_ia32_selectph_512( \
548 (__mmask32)(U), (__v32hf)_mm512_max_round_ph((A), (B), (R)), \
549 (__v32hf)_mm512_setzero_ph()))
550
_mm512_abs_ph(__m512h __A)551 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_abs_ph(__m512h __A) {
552 return (__m512h)_mm512_and_epi32(_mm512_set1_epi32(0x7FFF7FFF), (__m512i)__A);
553 }
554
_mm512_conj_pch(__m512h __A)555 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_conj_pch(__m512h __A) {
556 return (__m512h)_mm512_xor_epi32((__m512i)__A,
557 _mm512_set1_epi32(-2147483648));
558 }
559
560 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_conj_pch(__m512h __W,__mmask16 __U,__m512h __A)561 _mm512_mask_conj_pch(__m512h __W, __mmask16 __U, __m512h __A) {
562 return (__m512h)__builtin_ia32_selectps_512(
563 (__mmask16)__U, (__v16sf)_mm512_conj_pch(__A), (__v16sf)__W);
564 }
565
566 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_conj_pch(__mmask16 __U,__m512h __A)567 _mm512_maskz_conj_pch(__mmask16 __U, __m512h __A) {
568 return (__m512h)__builtin_ia32_selectps_512((__mmask16)__U,
569 (__v16sf)_mm512_conj_pch(__A),
570 (__v16sf)_mm512_setzero_ps());
571 }
572
_mm_add_sh(__m128h __A,__m128h __B)573 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_add_sh(__m128h __A,
574 __m128h __B) {
575 __A[0] += __B[0];
576 return __A;
577 }
578
_mm_mask_add_sh(__m128h __W,__mmask8 __U,__m128h __A,__m128h __B)579 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_add_sh(__m128h __W,
580 __mmask8 __U,
581 __m128h __A,
582 __m128h __B) {
583 __A = _mm_add_sh(__A, __B);
584 return __builtin_ia32_selectsh_128(__U, __A, __W);
585 }
586
_mm_maskz_add_sh(__mmask8 __U,__m128h __A,__m128h __B)587 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_add_sh(__mmask8 __U,
588 __m128h __A,
589 __m128h __B) {
590 __A = _mm_add_sh(__A, __B);
591 return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
592 }
593
594 #define _mm_add_round_sh(A, B, R) \
595 ((__m128h)__builtin_ia32_addsh_round_mask( \
596 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
597 (__mmask8)-1, (int)(R)))
598
599 #define _mm_mask_add_round_sh(W, U, A, B, R) \
600 ((__m128h)__builtin_ia32_addsh_round_mask( \
601 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
602 (__mmask8)(U), (int)(R)))
603
604 #define _mm_maskz_add_round_sh(U, A, B, R) \
605 ((__m128h)__builtin_ia32_addsh_round_mask( \
606 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
607 (__mmask8)(U), (int)(R)))
608
_mm_sub_sh(__m128h __A,__m128h __B)609 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sub_sh(__m128h __A,
610 __m128h __B) {
611 __A[0] -= __B[0];
612 return __A;
613 }
614
_mm_mask_sub_sh(__m128h __W,__mmask8 __U,__m128h __A,__m128h __B)615 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sub_sh(__m128h __W,
616 __mmask8 __U,
617 __m128h __A,
618 __m128h __B) {
619 __A = _mm_sub_sh(__A, __B);
620 return __builtin_ia32_selectsh_128(__U, __A, __W);
621 }
622
_mm_maskz_sub_sh(__mmask8 __U,__m128h __A,__m128h __B)623 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sub_sh(__mmask8 __U,
624 __m128h __A,
625 __m128h __B) {
626 __A = _mm_sub_sh(__A, __B);
627 return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
628 }
629
630 #define _mm_sub_round_sh(A, B, R) \
631 ((__m128h)__builtin_ia32_subsh_round_mask( \
632 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
633 (__mmask8)-1, (int)(R)))
634
635 #define _mm_mask_sub_round_sh(W, U, A, B, R) \
636 ((__m128h)__builtin_ia32_subsh_round_mask( \
637 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
638 (__mmask8)(U), (int)(R)))
639
640 #define _mm_maskz_sub_round_sh(U, A, B, R) \
641 ((__m128h)__builtin_ia32_subsh_round_mask( \
642 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
643 (__mmask8)(U), (int)(R)))
644
_mm_mul_sh(__m128h __A,__m128h __B)645 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mul_sh(__m128h __A,
646 __m128h __B) {
647 __A[0] *= __B[0];
648 return __A;
649 }
650
_mm_mask_mul_sh(__m128h __W,__mmask8 __U,__m128h __A,__m128h __B)651 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_mul_sh(__m128h __W,
652 __mmask8 __U,
653 __m128h __A,
654 __m128h __B) {
655 __A = _mm_mul_sh(__A, __B);
656 return __builtin_ia32_selectsh_128(__U, __A, __W);
657 }
658
_mm_maskz_mul_sh(__mmask8 __U,__m128h __A,__m128h __B)659 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_mul_sh(__mmask8 __U,
660 __m128h __A,
661 __m128h __B) {
662 __A = _mm_mul_sh(__A, __B);
663 return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
664 }
665
666 #define _mm_mul_round_sh(A, B, R) \
667 ((__m128h)__builtin_ia32_mulsh_round_mask( \
668 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
669 (__mmask8)-1, (int)(R)))
670
671 #define _mm_mask_mul_round_sh(W, U, A, B, R) \
672 ((__m128h)__builtin_ia32_mulsh_round_mask( \
673 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
674 (__mmask8)(U), (int)(R)))
675
676 #define _mm_maskz_mul_round_sh(U, A, B, R) \
677 ((__m128h)__builtin_ia32_mulsh_round_mask( \
678 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
679 (__mmask8)(U), (int)(R)))
680
_mm_div_sh(__m128h __A,__m128h __B)681 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_div_sh(__m128h __A,
682 __m128h __B) {
683 __A[0] /= __B[0];
684 return __A;
685 }
686
_mm_mask_div_sh(__m128h __W,__mmask8 __U,__m128h __A,__m128h __B)687 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_div_sh(__m128h __W,
688 __mmask8 __U,
689 __m128h __A,
690 __m128h __B) {
691 __A = _mm_div_sh(__A, __B);
692 return __builtin_ia32_selectsh_128(__U, __A, __W);
693 }
694
_mm_maskz_div_sh(__mmask8 __U,__m128h __A,__m128h __B)695 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_div_sh(__mmask8 __U,
696 __m128h __A,
697 __m128h __B) {
698 __A = _mm_div_sh(__A, __B);
699 return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
700 }
701
702 #define _mm_div_round_sh(A, B, R) \
703 ((__m128h)__builtin_ia32_divsh_round_mask( \
704 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
705 (__mmask8)-1, (int)(R)))
706
707 #define _mm_mask_div_round_sh(W, U, A, B, R) \
708 ((__m128h)__builtin_ia32_divsh_round_mask( \
709 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
710 (__mmask8)(U), (int)(R)))
711
712 #define _mm_maskz_div_round_sh(U, A, B, R) \
713 ((__m128h)__builtin_ia32_divsh_round_mask( \
714 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
715 (__mmask8)(U), (int)(R)))
716
_mm_min_sh(__m128h __A,__m128h __B)717 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_min_sh(__m128h __A,
718 __m128h __B) {
719 return (__m128h)__builtin_ia32_minsh_round_mask(
720 (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
721 _MM_FROUND_CUR_DIRECTION);
722 }
723
_mm_mask_min_sh(__m128h __W,__mmask8 __U,__m128h __A,__m128h __B)724 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_min_sh(__m128h __W,
725 __mmask8 __U,
726 __m128h __A,
727 __m128h __B) {
728 return (__m128h)__builtin_ia32_minsh_round_mask((__v8hf)__A, (__v8hf)__B,
729 (__v8hf)__W, (__mmask8)__U,
730 _MM_FROUND_CUR_DIRECTION);
731 }
732
_mm_maskz_min_sh(__mmask8 __U,__m128h __A,__m128h __B)733 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_min_sh(__mmask8 __U,
734 __m128h __A,
735 __m128h __B) {
736 return (__m128h)__builtin_ia32_minsh_round_mask(
737 (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
738 _MM_FROUND_CUR_DIRECTION);
739 }
740
741 #define _mm_min_round_sh(A, B, R) \
742 ((__m128h)__builtin_ia32_minsh_round_mask( \
743 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
744 (__mmask8)-1, (int)(R)))
745
746 #define _mm_mask_min_round_sh(W, U, A, B, R) \
747 ((__m128h)__builtin_ia32_minsh_round_mask( \
748 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
749 (__mmask8)(U), (int)(R)))
750
751 #define _mm_maskz_min_round_sh(U, A, B, R) \
752 ((__m128h)__builtin_ia32_minsh_round_mask( \
753 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
754 (__mmask8)(U), (int)(R)))
755
_mm_max_sh(__m128h __A,__m128h __B)756 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_max_sh(__m128h __A,
757 __m128h __B) {
758 return (__m128h)__builtin_ia32_maxsh_round_mask(
759 (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
760 _MM_FROUND_CUR_DIRECTION);
761 }
762
_mm_mask_max_sh(__m128h __W,__mmask8 __U,__m128h __A,__m128h __B)763 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_max_sh(__m128h __W,
764 __mmask8 __U,
765 __m128h __A,
766 __m128h __B) {
767 return (__m128h)__builtin_ia32_maxsh_round_mask((__v8hf)__A, (__v8hf)__B,
768 (__v8hf)__W, (__mmask8)__U,
769 _MM_FROUND_CUR_DIRECTION);
770 }
771
_mm_maskz_max_sh(__mmask8 __U,__m128h __A,__m128h __B)772 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_max_sh(__mmask8 __U,
773 __m128h __A,
774 __m128h __B) {
775 return (__m128h)__builtin_ia32_maxsh_round_mask(
776 (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
777 _MM_FROUND_CUR_DIRECTION);
778 }
779
// Explicit-rounding forms of the scalar max wrappers. All three expand to
// __builtin_ia32_maxsh_round_mask; they differ only in passthrough and mask.

// Unmasked: identical to the zero-masking form with an all-ones mask.
#define _mm_max_round_sh(A, B, R)                                              \
  _mm_maskz_max_round_sh((__mmask8)-1, (A), (B), (R))

// Merge-masking: W is the passthrough vector, U the mask.
#define _mm_mask_max_round_sh(W, U, A, B, R)                                   \
  ((__m128h)__builtin_ia32_maxsh_round_mask(                                   \
      (__v8hf)(__m128h)(A),                                                    \
      (__v8hf)(__m128h)(B),                                                    \
      (__v8hf)(__m128h)(W),                                                    \
      (__mmask8)(U),                                                           \
      (int)(R)))

// Zero-masking: the passthrough vector is _mm_setzero_ph().
#define _mm_maskz_max_round_sh(U, A, B, R)                                     \
  ((__m128h)__builtin_ia32_maxsh_round_mask(                                   \
      (__v8hf)(__m128h)(A),                                                    \
      (__v8hf)(__m128h)(B),                                                    \
      (__v8hf)_mm_setzero_ph(),                                                \
      (__mmask8)(U),                                                           \
      (int)(R)))
794
// 512-bit packed compare wrappers around __builtin_ia32_cmpph512_mask.
// P is the comparison predicate immediate, R the rounding/SAE argument.

// Unmasked: same as the masked form with an all-ones input mask.
#define _mm512_cmp_round_ph_mask(A, B, P, R)                                   \
  _mm512_mask_cmp_round_ph_mask((__mmask32)-1, (A), (B), (P), (R))

// Masked: U is forwarded as the builtin's mask operand.
#define _mm512_mask_cmp_round_ph_mask(U, A, B, P, R)                           \
  ((__mmask32)__builtin_ia32_cmpph512_mask(                                    \
      (__v32hf)(__m512h)(A),                                                   \
      (__v32hf)(__m512h)(B),                                                   \
      (int)(P),                                                                \
      (__mmask32)(U),                                                          \
      (int)(R)))

// Forms without an explicit R use _MM_FROUND_CUR_DIRECTION.
#define _mm512_cmp_ph_mask(A, B, P)                                            \
  _mm512_cmp_round_ph_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION)

#define _mm512_mask_cmp_ph_mask(U, A, B, P)                                    \
  _mm512_mask_cmp_round_ph_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION)
810
// Scalar compare wrappers around __builtin_ia32_cmpsh_mask.
// P is the comparison predicate immediate, R the rounding/SAE argument.

// Unmasked: same as the masked form with an all-ones input mask.
#define _mm_cmp_round_sh_mask(X, Y, P, R)                                      \
  _mm_mask_cmp_round_sh_mask((__mmask8)-1, (X), (Y), (P), (R))

// Masked: M is forwarded as the builtin's mask operand.
#define _mm_mask_cmp_round_sh_mask(M, X, Y, P, R)                              \
  ((__mmask8)__builtin_ia32_cmpsh_mask(                                        \
      (__v8hf)(__m128h)(X),                                                    \
      (__v8hf)(__m128h)(Y),                                                    \
      (int)(P),                                                                \
      (__mmask8)(M),                                                           \
      (int)(R)))

// Forms without an explicit R use _MM_FROUND_CUR_DIRECTION.
#define _mm_cmp_sh_mask(X, Y, P)                                               \
  _mm_cmp_round_sh_mask((X), (Y), (P), _MM_FROUND_CUR_DIRECTION)

#define _mm_mask_cmp_sh_mask(M, X, Y, P)                                       \
  _mm_mask_cmp_round_sh_mask((M), (X), (Y), (P), _MM_FROUND_CUR_DIRECTION)
830 // loads with vmovsh:
_mm_load_sh(void const * __dp)831 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_load_sh(void const *__dp) {
832 struct __mm_load_sh_struct {
833 _Float16 __u;
834 } __attribute__((__packed__, __may_alias__));
835 _Float16 __u = ((const struct __mm_load_sh_struct *)__dp)->__u;
836 return (__m128h){__u, 0, 0, 0, 0, 0, 0, 0};
837 }
838
839 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_load_sh(__m128h __W,__mmask8 __U,const void * __A)840 _mm_mask_load_sh(__m128h __W, __mmask8 __U, const void *__A) {
841 __m128h src = (__v8hf)__builtin_shufflevector(
842 (__v8hf)__W, (__v8hf)_mm_setzero_ph(), 0, 8, 8, 8, 8, 8, 8, 8);
843
844 return (__m128h)__builtin_ia32_loadsh128_mask((const __v8hf *)__A, src, __U & 1);
845 }
846
847 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_load_sh(__mmask8 __U,const void * __A)848 _mm_maskz_load_sh(__mmask8 __U, const void *__A) {
849 return (__m128h)__builtin_ia32_loadsh128_mask(
850 (const __v8hf *)__A, (__v8hf)_mm_setzero_ph(), __U & 1);
851 }
852
853 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_load_ph(void const * __p)854 _mm512_load_ph(void const *__p) {
855 return *(const __m512h *)__p;
856 }
857
858 static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_load_ph(void const * __p)859 _mm256_load_ph(void const *__p) {
860 return *(const __m256h *)__p;
861 }
862
_mm_load_ph(void const * __p)863 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_load_ph(void const *__p) {
864 return *(const __m128h *)__p;
865 }
866
867 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_loadu_ph(void const * __p)868 _mm512_loadu_ph(void const *__p) {
869 struct __loadu_ph {
870 __m512h_u __v;
871 } __attribute__((__packed__, __may_alias__));
872 return ((const struct __loadu_ph *)__p)->__v;
873 }
874
875 static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_loadu_ph(void const * __p)876 _mm256_loadu_ph(void const *__p) {
877 struct __loadu_ph {
878 __m256h_u __v;
879 } __attribute__((__packed__, __may_alias__));
880 return ((const struct __loadu_ph *)__p)->__v;
881 }
882
_mm_loadu_ph(void const * __p)883 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_loadu_ph(void const *__p) {
884 struct __loadu_ph {
885 __m128h_u __v;
886 } __attribute__((__packed__, __may_alias__));
887 return ((const struct __loadu_ph *)__p)->__v;
888 }
889
890 // stores with vmovsh:
_mm_store_sh(void * __dp,__m128h __a)891 static __inline__ void __DEFAULT_FN_ATTRS128 _mm_store_sh(void *__dp,
892 __m128h __a) {
893 struct __mm_store_sh_struct {
894 _Float16 __u;
895 } __attribute__((__packed__, __may_alias__));
896 ((struct __mm_store_sh_struct *)__dp)->__u = __a[0];
897 }
898
_mm_mask_store_sh(void * __W,__mmask8 __U,__m128h __A)899 static __inline__ void __DEFAULT_FN_ATTRS128 _mm_mask_store_sh(void *__W,
900 __mmask8 __U,
901 __m128h __A) {
902 __builtin_ia32_storesh128_mask((__v8hf *)__W, __A, __U & 1);
903 }
904
_mm512_store_ph(void * __P,__m512h __A)905 static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_store_ph(void *__P,
906 __m512h __A) {
907 *(__m512h *)__P = __A;
908 }
909
_mm256_store_ph(void * __P,__m256h __A)910 static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_store_ph(void *__P,
911 __m256h __A) {
912 *(__m256h *)__P = __A;
913 }
914
_mm_store_ph(void * __P,__m128h __A)915 static __inline__ void __DEFAULT_FN_ATTRS128 _mm_store_ph(void *__P,
916 __m128h __A) {
917 *(__m128h *)__P = __A;
918 }
919
_mm512_storeu_ph(void * __P,__m512h __A)920 static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_storeu_ph(void *__P,
921 __m512h __A) {
922 struct __storeu_ph {
923 __m512h_u __v;
924 } __attribute__((__packed__, __may_alias__));
925 ((struct __storeu_ph *)__P)->__v = __A;
926 }
927
_mm256_storeu_ph(void * __P,__m256h __A)928 static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_storeu_ph(void *__P,
929 __m256h __A) {
930 struct __storeu_ph {
931 __m256h_u __v;
932 } __attribute__((__packed__, __may_alias__));
933 ((struct __storeu_ph *)__P)->__v = __A;
934 }
935
_mm_storeu_ph(void * __P,__m128h __A)936 static __inline__ void __DEFAULT_FN_ATTRS128 _mm_storeu_ph(void *__P,
937 __m128h __A) {
938 struct __storeu_ph {
939 __m128h_u __v;
940 } __attribute__((__packed__, __may_alias__));
941 ((struct __storeu_ph *)__P)->__v = __A;
942 }
943
944 // moves with vmovsh:
_mm_move_sh(__m128h __a,__m128h __b)945 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_move_sh(__m128h __a,
946 __m128h __b) {
947 __a[0] = __b[0];
948 return __a;
949 }
950
_mm_mask_move_sh(__m128h __W,__mmask8 __U,__m128h __A,__m128h __B)951 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_move_sh(__m128h __W,
952 __mmask8 __U,
953 __m128h __A,
954 __m128h __B) {
955 return __builtin_ia32_selectsh_128(__U, _mm_move_sh(__A, __B), __W);
956 }
957
_mm_maskz_move_sh(__mmask8 __U,__m128h __A,__m128h __B)958 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_move_sh(__mmask8 __U,
959 __m128h __A,
960 __m128h __B) {
961 return __builtin_ia32_selectsh_128(__U, _mm_move_sh(__A, __B),
962 _mm_setzero_ph());
963 }
964
965 // vmovw:
_mm_cvtsi16_si128(short __a)966 static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtsi16_si128(short __a) {
967 return (__m128i)(__v8hi){__a, 0, 0, 0, 0, 0, 0, 0};
968 }
969
_mm_cvtsi128_si16(__m128i __a)970 static __inline__ short __DEFAULT_FN_ATTRS128 _mm_cvtsi128_si16(__m128i __a) {
971 __v8hi __b = (__v8hi)__a;
972 return __b[0];
973 }
974
_mm512_rcp_ph(__m512h __A)975 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_rcp_ph(__m512h __A) {
976 return (__m512h)__builtin_ia32_rcpph512_mask(
977 (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1);
978 }
979
980 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_rcp_ph(__m512h __W,__mmask32 __U,__m512h __A)981 _mm512_mask_rcp_ph(__m512h __W, __mmask32 __U, __m512h __A) {
982 return (__m512h)__builtin_ia32_rcpph512_mask((__v32hf)__A, (__v32hf)__W,
983 (__mmask32)__U);
984 }
985
986 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_rcp_ph(__mmask32 __U,__m512h __A)987 _mm512_maskz_rcp_ph(__mmask32 __U, __m512h __A) {
988 return (__m512h)__builtin_ia32_rcpph512_mask(
989 (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U);
990 }
991
_mm512_rsqrt_ph(__m512h __A)992 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_rsqrt_ph(__m512h __A) {
993 return (__m512h)__builtin_ia32_rsqrtph512_mask(
994 (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1);
995 }
996
997 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_rsqrt_ph(__m512h __W,__mmask32 __U,__m512h __A)998 _mm512_mask_rsqrt_ph(__m512h __W, __mmask32 __U, __m512h __A) {
999 return (__m512h)__builtin_ia32_rsqrtph512_mask((__v32hf)__A, (__v32hf)__W,
1000 (__mmask32)__U);
1001 }
1002
1003 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_rsqrt_ph(__mmask32 __U,__m512h __A)1004 _mm512_maskz_rsqrt_ph(__mmask32 __U, __m512h __A) {
1005 return (__m512h)__builtin_ia32_rsqrtph512_mask(
1006 (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U);
1007 }
1008
// 512-bit getmant wrappers around __builtin_ia32_getmantph512_mask. The two
// selector arguments B and C are packed into one immediate as ((C) << 2) | (B).
// Forms without an explicit rounding argument forward to the matching _round_
// form with _MM_FROUND_CUR_DIRECTION.

#define _mm512_getmant_ph(A, B, C)                                             \
  _mm512_getmant_round_ph((A), (B), (C), _MM_FROUND_CUR_DIRECTION)

#define _mm512_mask_getmant_ph(W, U, A, B, C)                                  \
  _mm512_mask_getmant_round_ph((W), (U), (A), (B), (C),                        \
                               _MM_FROUND_CUR_DIRECTION)

#define _mm512_maskz_getmant_ph(U, A, B, C)                                    \
  _mm512_maskz_getmant_round_ph((U), (A), (B), (C), _MM_FROUND_CUR_DIRECTION)

// Unmasked: undefined passthrough vector, all-ones mask.
#define _mm512_getmant_round_ph(A, B, C, R)                                    \
  ((__m512h)__builtin_ia32_getmantph512_mask(                                  \
      (__v32hf)(__m512h)(A),                                                   \
      (int)(((C) << 2) | (B)),                                                 \
      (__v32hf)_mm512_undefined_ph(),                                          \
      (__mmask32)-1,                                                           \
      (int)(R)))

// Merge-masking: W is the passthrough vector, U the mask.
#define _mm512_mask_getmant_round_ph(W, U, A, B, C, R)                         \
  ((__m512h)__builtin_ia32_getmantph512_mask(                                  \
      (__v32hf)(__m512h)(A),                                                   \
      (int)(((C) << 2) | (B)),                                                 \
      (__v32hf)(__m512h)(W),                                                   \
      (__mmask32)(U),                                                          \
      (int)(R)))

// Zero-masking: the passthrough vector is _mm512_setzero_ph().
#define _mm512_maskz_getmant_round_ph(U, A, B, C, R)                           \
  ((__m512h)__builtin_ia32_getmantph512_mask(                                  \
      (__v32hf)(__m512h)(A),                                                   \
      (int)(((C) << 2) | (B)),                                                 \
      (__v32hf)_mm512_setzero_ph(),                                            \
      (__mmask32)(U),                                                          \
      (int)(R)))
1039
_mm512_getexp_ph(__m512h __A)1040 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_getexp_ph(__m512h __A) {
1041 return (__m512h)__builtin_ia32_getexpph512_mask(
1042 (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1,
1043 _MM_FROUND_CUR_DIRECTION);
1044 }
1045
1046 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_getexp_ph(__m512h __W,__mmask32 __U,__m512h __A)1047 _mm512_mask_getexp_ph(__m512h __W, __mmask32 __U, __m512h __A) {
1048 return (__m512h)__builtin_ia32_getexpph512_mask(
1049 (__v32hf)__A, (__v32hf)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
1050 }
1051
1052 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_getexp_ph(__mmask32 __U,__m512h __A)1053 _mm512_maskz_getexp_ph(__mmask32 __U, __m512h __A) {
1054 return (__m512h)__builtin_ia32_getexpph512_mask(
1055 (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
1056 _MM_FROUND_CUR_DIRECTION);
1057 }
1058
// Explicit-rounding getexp wrappers around __builtin_ia32_getexpph512_mask.

// Unmasked: undefined passthrough vector, all-ones mask.
#define _mm512_getexp_round_ph(A, R)                                           \
  ((__m512h)__builtin_ia32_getexpph512_mask(                                   \
      (__v32hf)(__m512h)(A),                                                   \
      (__v32hf)_mm512_undefined_ph(),                                          \
      (__mmask32)-1,                                                           \
      (int)(R)))

// Merge-masking: W is the passthrough vector, U the mask.
#define _mm512_mask_getexp_round_ph(W, U, A, R)                                \
  ((__m512h)__builtin_ia32_getexpph512_mask(                                   \
      (__v32hf)(__m512h)(A),                                                   \
      (__v32hf)(__m512h)(W),                                                   \
      (__mmask32)(U),                                                          \
      (int)(R)))

// Zero-masking: the passthrough vector is _mm512_setzero_ph().
#define _mm512_maskz_getexp_round_ph(U, A, R)                                  \
  ((__m512h)__builtin_ia32_getexpph512_mask(                                   \
      (__v32hf)(__m512h)(A),                                                   \
      (__v32hf)_mm512_setzero_ph(),                                            \
      (__mmask32)(U),                                                          \
      (int)(R)))
1072
_mm512_scalef_ph(__m512h __A,__m512h __B)1073 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_scalef_ph(__m512h __A,
1074 __m512h __B) {
1075 return (__m512h)__builtin_ia32_scalefph512_mask(
1076 (__v32hf)__A, (__v32hf)__B, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1,
1077 _MM_FROUND_CUR_DIRECTION);
1078 }
1079
1080 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_scalef_ph(__m512h __W,__mmask32 __U,__m512h __A,__m512h __B)1081 _mm512_mask_scalef_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
1082 return (__m512h)__builtin_ia32_scalefph512_mask((__v32hf)__A, (__v32hf)__B,
1083 (__v32hf)__W, (__mmask32)__U,
1084 _MM_FROUND_CUR_DIRECTION);
1085 }
1086
1087 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_scalef_ph(__mmask32 __U,__m512h __A,__m512h __B)1088 _mm512_maskz_scalef_ph(__mmask32 __U, __m512h __A, __m512h __B) {
1089 return (__m512h)__builtin_ia32_scalefph512_mask(
1090 (__v32hf)__A, (__v32hf)__B, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
1091 _MM_FROUND_CUR_DIRECTION);
1092 }
1093
// Explicit-rounding scalef wrappers around __builtin_ia32_scalefph512_mask.

// Unmasked: undefined passthrough vector, all-ones mask.
#define _mm512_scalef_round_ph(A, B, R)                                        \
  ((__m512h)__builtin_ia32_scalefph512_mask(                                   \
      (__v32hf)(__m512h)(A),                                                   \
      (__v32hf)(__m512h)(B),                                                   \
      (__v32hf)_mm512_undefined_ph(),                                          \
      (__mmask32)-1,                                                           \
      (int)(R)))

// Merge-masking: W is the passthrough vector, U the mask.
#define _mm512_mask_scalef_round_ph(W, U, A, B, R)                             \
  ((__m512h)__builtin_ia32_scalefph512_mask(                                   \
      (__v32hf)(__m512h)(A),                                                   \
      (__v32hf)(__m512h)(B),                                                   \
      (__v32hf)(__m512h)(W),                                                   \
      (__mmask32)(U),                                                          \
      (int)(R)))

// Zero-masking: the passthrough vector is _mm512_setzero_ph().
#define _mm512_maskz_scalef_round_ph(U, A, B, R)                               \
  ((__m512h)__builtin_ia32_scalefph512_mask(                                   \
      (__v32hf)(__m512h)(A),                                                   \
      (__v32hf)(__m512h)(B),                                                   \
      (__v32hf)_mm512_setzero_ph(),                                            \
      (__mmask32)(U),                                                          \
      (int)(R)))
1108
// Unmasked roundscale: calls __builtin_ia32_rndscaleph_mask on A with
// immediate B, an all-ones mask and _MM_FROUND_CUR_DIRECTION.
// The passthrough operand is _mm512_undefined_ph(), as in
// _mm512_roundscale_round_ph, rather than a second copy of A: that way the
// macro argument A is expanded (and therefore evaluated) only once. With an
// all-ones mask the passthrough is not expected to reach the result.
#define _mm512_roundscale_ph(A, B)                                             \
  ((__m512h)__builtin_ia32_rndscaleph_mask(                                    \
      (__v32hf)(__m512h)(A), (int)(B), (__v32hf)_mm512_undefined_ph(),         \
      (__mmask32)-1, _MM_FROUND_CUR_DIRECTION))
1113
// Masked and explicit-rounding roundscale wrappers around
// __builtin_ia32_rndscaleph_mask. Note the parameter naming used here: in the
// mask forms A is the passthrough vector, B the mask and C the source; in the
// maskz forms A is the mask and B the source.

// Forms without an explicit rounding argument forward to the _round_ form
// with _MM_FROUND_CUR_DIRECTION.
#define _mm512_mask_roundscale_ph(A, B, C, imm)                                \
  _mm512_mask_roundscale_round_ph((A), (B), (C), (imm),                        \
                                  _MM_FROUND_CUR_DIRECTION)

#define _mm512_maskz_roundscale_ph(A, B, imm)                                  \
  _mm512_maskz_roundscale_round_ph((A), (B), (imm), _MM_FROUND_CUR_DIRECTION)

// Merge-masking with explicit rounding argument R.
#define _mm512_mask_roundscale_round_ph(A, B, C, imm, R)                       \
  ((__m512h)__builtin_ia32_rndscaleph_mask(                                    \
      (__v32hf)(__m512h)(C),                                                   \
      (int)(imm),                                                              \
      (__v32hf)(__m512h)(A),                                                   \
      (__mmask32)(B),                                                          \
      (int)(R)))

// Zero-masking with explicit rounding argument R.
#define _mm512_maskz_roundscale_round_ph(A, B, imm, R)                         \
  ((__m512h)__builtin_ia32_rndscaleph_mask(                                    \
      (__v32hf)(__m512h)(B),                                                   \
      (int)(imm),                                                              \
      (__v32hf)_mm512_setzero_ph(),                                            \
      (__mmask32)(A),                                                          \
      (int)(R)))

// Unmasked with explicit rounding argument R: undefined passthrough vector,
// all-ones mask.
#define _mm512_roundscale_round_ph(A, imm, R)                                  \
  ((__m512h)__builtin_ia32_rndscaleph_mask(                                    \
      (__v32hf)(__m512h)(A),                                                   \
      (int)(imm),                                                              \
      (__v32hf)_mm512_undefined_ph(),                                          \
      (__mmask32)-1,                                                           \
      (int)(R)))
1138
// 512-bit reduce wrappers around __builtin_ia32_reduceph512_mask. Forms
// without an explicit rounding argument forward to the matching _round_ form
// with _MM_FROUND_CUR_DIRECTION.

#define _mm512_reduce_ph(A, imm)                                               \
  _mm512_reduce_round_ph((A), (imm), _MM_FROUND_CUR_DIRECTION)

#define _mm512_mask_reduce_ph(W, U, A, imm)                                    \
  _mm512_mask_reduce_round_ph((W), (U), (A), (imm), _MM_FROUND_CUR_DIRECTION)

#define _mm512_maskz_reduce_ph(U, A, imm)                                      \
  _mm512_maskz_reduce_round_ph((U), (A), (imm), _MM_FROUND_CUR_DIRECTION)

// Merge-masking: W is the passthrough vector, U the mask.
#define _mm512_mask_reduce_round_ph(W, U, A, imm, R)                           \
  ((__m512h)__builtin_ia32_reduceph512_mask(                                   \
      (__v32hf)(__m512h)(A),                                                   \
      (int)(imm),                                                              \
      (__v32hf)(__m512h)(W),                                                   \
      (__mmask32)(U),                                                          \
      (int)(R)))

// Zero-masking: the passthrough vector is _mm512_setzero_ph().
#define _mm512_maskz_reduce_round_ph(U, A, imm, R)                             \
  ((__m512h)__builtin_ia32_reduceph512_mask(                                   \
      (__v32hf)(__m512h)(A),                                                   \
      (int)(imm),                                                              \
      (__v32hf)_mm512_setzero_ph(),                                            \
      (__mmask32)(U),                                                          \
      (int)(R)))

// Unmasked: undefined passthrough vector, all-ones mask.
#define _mm512_reduce_round_ph(A, imm, R)                                      \
  ((__m512h)__builtin_ia32_reduceph512_mask(                                   \
      (__v32hf)(__m512h)(A),                                                   \
      (int)(imm),                                                              \
      (__v32hf)_mm512_undefined_ph(),                                          \
      (__mmask32)-1,                                                           \
      (int)(R)))
1168
_mm_rcp_sh(__m128h __A,__m128h __B)1169 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_rcp_sh(__m128h __A,
1170 __m128h __B) {
1171 return (__m128h)__builtin_ia32_rcpsh_mask(
1172 (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1);
1173 }
1174
_mm_mask_rcp_sh(__m128h __W,__mmask8 __U,__m128h __A,__m128h __B)1175 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_rcp_sh(__m128h __W,
1176 __mmask8 __U,
1177 __m128h __A,
1178 __m128h __B) {
1179 return (__m128h)__builtin_ia32_rcpsh_mask((__v8hf)__A, (__v8hf)__B,
1180 (__v8hf)__W, (__mmask8)__U);
1181 }
1182
_mm_maskz_rcp_sh(__mmask8 __U,__m128h __A,__m128h __B)1183 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_rcp_sh(__mmask8 __U,
1184 __m128h __A,
1185 __m128h __B) {
1186 return (__m128h)__builtin_ia32_rcpsh_mask(
1187 (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
1188 }
1189
_mm_rsqrt_sh(__m128h __A,__m128h __B)1190 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_rsqrt_sh(__m128h __A,
1191 __m128h __B) {
1192 return (__m128h)__builtin_ia32_rsqrtsh_mask(
1193 (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1);
1194 }
1195
_mm_mask_rsqrt_sh(__m128h __W,__mmask8 __U,__m128h __A,__m128h __B)1196 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_rsqrt_sh(__m128h __W,
1197 __mmask8 __U,
1198 __m128h __A,
1199 __m128h __B) {
1200 return (__m128h)__builtin_ia32_rsqrtsh_mask((__v8hf)__A, (__v8hf)__B,
1201 (__v8hf)__W, (__mmask8)__U);
1202 }
1203
1204 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_rsqrt_sh(__mmask8 __U,__m128h __A,__m128h __B)1205 _mm_maskz_rsqrt_sh(__mmask8 __U, __m128h __A, __m128h __B) {
1206 return (__m128h)__builtin_ia32_rsqrtsh_mask(
1207 (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
1208 }
1209
// Scalar getmant wrappers around __builtin_ia32_getmantsh_round_mask. The two
// selector arguments C and D are packed into one immediate as ((D) << 2) | (C).
// Forms without an explicit rounding argument forward to the matching _round_
// form with _MM_FROUND_CUR_DIRECTION.

// Unmasked: zeroed passthrough vector, all-ones mask.
#define _mm_getmant_round_sh(A, B, C, D, R)                                    \
  ((__m128h)__builtin_ia32_getmantsh_round_mask(                               \
      (__v8hf)(__m128h)(A),                                                    \
      (__v8hf)(__m128h)(B),                                                    \
      (int)(((D) << 2) | (C)),                                                 \
      (__v8hf)_mm_setzero_ph(),                                                \
      (__mmask8)-1,                                                            \
      (int)(R)))

#define _mm_getmant_sh(A, B, C, D)                                             \
  _mm_getmant_round_sh((A), (B), (C), (D), _MM_FROUND_CUR_DIRECTION)

#define _mm_mask_getmant_sh(W, U, A, B, C, D)                                  \
  _mm_mask_getmant_round_sh((W), (U), (A), (B), (C), (D),                      \
                            _MM_FROUND_CUR_DIRECTION)

// Merge-masking: W is the passthrough vector, U the mask.
#define _mm_mask_getmant_round_sh(W, U, A, B, C, D, R)                         \
  ((__m128h)__builtin_ia32_getmantsh_round_mask(                               \
      (__v8hf)(__m128h)(A),                                                    \
      (__v8hf)(__m128h)(B),                                                    \
      (int)(((D) << 2) | (C)),                                                 \
      (__v8hf)(__m128h)(W),                                                    \
      (__mmask8)(U),                                                           \
      (int)(R)))

#define _mm_maskz_getmant_sh(U, A, B, C, D)                                    \
  _mm_maskz_getmant_round_sh((U), (A), (B), (C), (D),                          \
                             _MM_FROUND_CUR_DIRECTION)

// Zero-masking: the passthrough vector is _mm_setzero_ph().
#define _mm_maskz_getmant_round_sh(U, A, B, C, D, R)                           \
  ((__m128h)__builtin_ia32_getmantsh_round_mask(                               \
      (__v8hf)(__m128h)(A),                                                    \
      (__v8hf)(__m128h)(B),                                                    \
      (int)(((D) << 2) | (C)),                                                 \
      (__v8hf)_mm_setzero_ph(),                                                \
      (__mmask8)(U),                                                           \
      (int)(R)))
1239
// Unmasked, explicit-rounding form: forwards A and B to
// __builtin_ia32_getexpsh128_round_mask with a zeroed passthrough vector, an
// all-ones mask and rounding argument R.
#define _mm_getexp_round_sh(A, B, R)                                           \
  ((__m128h)__builtin_ia32_getexpsh128_round_mask(                             \
      (__v8hf)(__m128h)(A),                                                    \
      (__v8hf)(__m128h)(B),                                                    \
      (__v8hf)_mm_setzero_ph(),                                                \
      (__mmask8)-1,                                                            \
      (int)(R)))
1244
_mm_getexp_sh(__m128h __A,__m128h __B)1245 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_getexp_sh(__m128h __A,
1246 __m128h __B) {
1247 return (__m128h)__builtin_ia32_getexpsh128_round_mask(
1248 (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
1249 _MM_FROUND_CUR_DIRECTION);
1250 }
1251
1252 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_getexp_sh(__m128h __W,__mmask8 __U,__m128h __A,__m128h __B)1253 _mm_mask_getexp_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
1254 return (__m128h)__builtin_ia32_getexpsh128_round_mask(
1255 (__v8hf)__A, (__v8hf)__B, (__v8hf)__W, (__mmask8)__U,
1256 _MM_FROUND_CUR_DIRECTION);
1257 }
1258
// Merge-masking, explicit-rounding form: forwards A and B to
// __builtin_ia32_getexpsh128_round_mask with W as the passthrough vector,
// U as the mask and rounding argument R.
#define _mm_mask_getexp_round_sh(W, U, A, B, R)                                \
  ((__m128h)__builtin_ia32_getexpsh128_round_mask(                             \
      (__v8hf)(__m128h)(A),                                                    \
      (__v8hf)(__m128h)(B),                                                    \
      (__v8hf)(__m128h)(W),                                                    \
      (__mmask8)(U),                                                           \
      (int)(R)))
1263
1264 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_getexp_sh(__mmask8 __U,__m128h __A,__m128h __B)1265 _mm_maskz_getexp_sh(__mmask8 __U, __m128h __A, __m128h __B) {
1266 return (__m128h)__builtin_ia32_getexpsh128_round_mask(
1267 (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
1268 _MM_FROUND_CUR_DIRECTION);
1269 }
1270
// Zero-masking, explicit-rounding form: forwards A and B to
// __builtin_ia32_getexpsh128_round_mask with a zeroed passthrough vector,
// U as the mask and rounding argument R.
#define _mm_maskz_getexp_round_sh(U, A, B, R)                                  \
  ((__m128h)__builtin_ia32_getexpsh128_round_mask(                             \
      (__v8hf)(__m128h)(A),                                                    \
      (__v8hf)(__m128h)(B),                                                    \
      (__v8hf)_mm_setzero_ph(),                                                \
      (__mmask8)(U),                                                           \
      (int)(R)))
1275
// Unmasked, explicit-rounding form: forwards A and B to
// __builtin_ia32_scalefsh_round_mask with a zeroed passthrough vector, an
// all-ones mask and rounding argument R.
#define _mm_scalef_round_sh(A, B, R)                                           \
  ((__m128h)__builtin_ia32_scalefsh_round_mask(                                \
      (__v8hf)(__m128h)(A),                                                    \
      (__v8hf)(__m128h)(B),                                                    \
      (__v8hf)_mm_setzero_ph(),                                                \
      (__mmask8)-1,                                                            \
      (int)(R)))
1280
_mm_scalef_sh(__m128h __A,__m128h __B)1281 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_scalef_sh(__m128h __A,
1282 __m128h __B) {
1283 return (__m128h)__builtin_ia32_scalefsh_round_mask(
1284 (__v8hf)__A, (__v8hf)(__B), (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
1285 _MM_FROUND_CUR_DIRECTION);
1286 }
1287
1288 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_scalef_sh(__m128h __W,__mmask8 __U,__m128h __A,__m128h __B)1289 _mm_mask_scalef_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
1290 return (__m128h)__builtin_ia32_scalefsh_round_mask((__v8hf)__A, (__v8hf)__B,
1291 (__v8hf)__W, (__mmask8)__U,
1292 _MM_FROUND_CUR_DIRECTION);
1293 }
1294
// Merge-masking, explicit-rounding form: forwards A and B to
// __builtin_ia32_scalefsh_round_mask with W as the passthrough vector,
// U as the mask and rounding argument R.
#define _mm_mask_scalef_round_sh(W, U, A, B, R)                                \
  ((__m128h)__builtin_ia32_scalefsh_round_mask(                                \
      (__v8hf)(__m128h)(A),                                                    \
      (__v8hf)(__m128h)(B),                                                    \
      (__v8hf)(__m128h)(W),                                                    \
      (__mmask8)(U),                                                           \
      (int)(R)))
1299
1300 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_scalef_sh(__mmask8 __U,__m128h __A,__m128h __B)1301 _mm_maskz_scalef_sh(__mmask8 __U, __m128h __A, __m128h __B) {
1302 return (__m128h)__builtin_ia32_scalefsh_round_mask(
1303 (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
1304 _MM_FROUND_CUR_DIRECTION);
1305 }
1306
// Zero-masking, explicit-rounding form: forwards A and B to
// __builtin_ia32_scalefsh_round_mask with a zeroed passthrough vector,
// U as the mask and rounding argument R.
#define _mm_maskz_scalef_round_sh(U, A, B, R)                                  \
  ((__m128h)__builtin_ia32_scalefsh_round_mask(                                \
      (__v8hf)(__m128h)(A),                                                    \
      (__v8hf)(__m128h)(B),                                                    \
      (__v8hf)_mm_setzero_ph(),                                                \
      (__mmask8)(U),                                                           \
      (int)(R)))
1311
// Scalar roundscale wrappers around __builtin_ia32_rndscalesh_round_mask.
// Forms without an explicit rounding argument forward to the matching _round_
// form with _MM_FROUND_CUR_DIRECTION.

// Unmasked: zeroed passthrough vector, all-ones mask.
#define _mm_roundscale_round_sh(A, B, imm, R)                                  \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask(                              \
      (__v8hf)(__m128h)(A),                                                    \
      (__v8hf)(__m128h)(B),                                                    \
      (__v8hf)_mm_setzero_ph(),                                                \
      (__mmask8)-1,                                                            \
      (int)(imm),                                                              \
      (int)(R)))

#define _mm_roundscale_sh(A, B, imm)                                           \
  _mm_roundscale_round_sh((A), (B), (imm), _MM_FROUND_CUR_DIRECTION)

#define _mm_mask_roundscale_sh(W, U, A, B, I)                                  \
  _mm_mask_roundscale_round_sh((W), (U), (A), (B), (I),                        \
                               _MM_FROUND_CUR_DIRECTION)

// Merge-masking: W is the passthrough vector, U the mask.
#define _mm_mask_roundscale_round_sh(W, U, A, B, I, R)                         \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask(                              \
      (__v8hf)(__m128h)(A),                                                    \
      (__v8hf)(__m128h)(B),                                                    \
      (__v8hf)(__m128h)(W),                                                    \
      (__mmask8)(U),                                                           \
      (int)(I),                                                                \
      (int)(R)))

#define _mm_maskz_roundscale_sh(U, A, B, I)                                    \
  _mm_maskz_roundscale_round_sh((U), (A), (B), (I), _MM_FROUND_CUR_DIRECTION)

// Zero-masking: the passthrough vector is _mm_setzero_ph().
#define _mm_maskz_roundscale_round_sh(U, A, B, I, R)                           \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask(                              \
      (__v8hf)(__m128h)(A),                                                    \
      (__v8hf)(__m128h)(B),                                                    \
      (__v8hf)_mm_setzero_ph(),                                                \
      (__mmask8)(U),                                                           \
      (int)(I),                                                                \
      (int)(R)))
1341
// Scalar reduce wrappers around __builtin_ia32_reducesh_mask. C is the
// immediate operand. Forms without an explicit rounding argument forward to
// the matching _round_ form with _MM_FROUND_CUR_DIRECTION.

#define _mm_reduce_sh(A, B, C)                                                 \
  _mm_reduce_round_sh((A), (B), (C), _MM_FROUND_CUR_DIRECTION)

#define _mm_mask_reduce_sh(W, U, A, B, C)                                      \
  _mm_mask_reduce_round_sh((W), (U), (A), (B), (C), _MM_FROUND_CUR_DIRECTION)

#define _mm_maskz_reduce_sh(U, A, B, C)                                        \
  _mm_maskz_reduce_round_sh((U), (A), (B), (C), _MM_FROUND_CUR_DIRECTION)

// Unmasked: zeroed passthrough vector, all-ones mask.
#define _mm_reduce_round_sh(A, B, C, R)                                        \
  ((__m128h)__builtin_ia32_reducesh_mask(                                      \
      (__v8hf)(__m128h)(A),                                                    \
      (__v8hf)(__m128h)(B),                                                    \
      (__v8hf)_mm_setzero_ph(),                                                \
      (__mmask8)-1,                                                            \
      (int)(C),                                                                \
      (int)(R)))

// Merge-masking: W is the passthrough vector, U the mask.
#define _mm_mask_reduce_round_sh(W, U, A, B, C, R)                             \
  ((__m128h)__builtin_ia32_reducesh_mask(                                      \
      (__v8hf)(__m128h)(A),                                                    \
      (__v8hf)(__m128h)(B),                                                    \
      (__v8hf)(__m128h)(W),                                                    \
      (__mmask8)(U),                                                           \
      (int)(C),                                                                \
      (int)(R)))

// Zero-masking: the passthrough vector is _mm_setzero_ph().
#define _mm_maskz_reduce_round_sh(U, A, B, C, R)                               \
  ((__m128h)__builtin_ia32_reducesh_mask(                                      \
      (__v8hf)(__m128h)(A),                                                    \
      (__v8hf)(__m128h)(B),                                                    \
      (__v8hf)_mm_setzero_ph(),                                                \
      (__mmask8)(U),                                                           \
      (int)(C),                                                                \
      (int)(R)))
1371
// Explicit-rounding 512-bit sqrt wrappers. The unmasked form calls
// __builtin_ia32_sqrtph512 directly; the masked forms feed its result to
// __builtin_ia32_selectph_512 together with the mask U.

#define _mm512_sqrt_round_ph(A, R)                                             \
  ((__m512h)__builtin_ia32_sqrtph512(                                          \
      (__v32hf)(__m512h)(A),                                                   \
      (int)(R)))

// Merge-masking: the select's third operand is W.
#define _mm512_mask_sqrt_round_ph(W, U, A, R)                                  \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U),                                                          \
      (__v32hf)_mm512_sqrt_round_ph((A), (R)),                                 \
      (__v32hf)(__m512h)(W)))

// Zero-masking: the select's third operand is _mm512_setzero_ph().
#define _mm512_maskz_sqrt_round_ph(U, A, R)                                    \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U),                                                          \
      (__v32hf)_mm512_sqrt_round_ph((A), (R)),                                 \
      (__v32hf)_mm512_setzero_ph()))
1384
_mm512_sqrt_ph(__m512h __A)1385 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_sqrt_ph(__m512h __A) {
1386 return (__m512h)__builtin_ia32_sqrtph512((__v32hf)__A,
1387 _MM_FROUND_CUR_DIRECTION);
1388 }
1389
1390 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_sqrt_ph(__m512h __W,__mmask32 __U,__m512h __A)1391 _mm512_mask_sqrt_ph(__m512h __W, __mmask32 __U, __m512h __A) {
1392 return (__m512h)__builtin_ia32_selectph_512(
1393 (__mmask32)(__U),
1394 (__v32hf)__builtin_ia32_sqrtph512((__A), (_MM_FROUND_CUR_DIRECTION)),
1395 (__v32hf)(__m512h)(__W));
1396 }
1397
1398 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_sqrt_ph(__mmask32 __U,__m512h __A)1399 _mm512_maskz_sqrt_ph(__mmask32 __U, __m512h __A) {
1400 return (__m512h)__builtin_ia32_selectph_512(
1401 (__mmask32)(__U),
1402 (__v32hf)__builtin_ia32_sqrtph512((__A), (_MM_FROUND_CUR_DIRECTION)),
1403 (__v32hf)_mm512_setzero_ph());
1404 }
1405
// Explicit-rounding scalar sqrt wrappers around
// __builtin_ia32_sqrtsh_round_mask; they differ only in passthrough and mask.

// Unmasked: identical to the zero-masking form with an all-ones mask.
#define _mm_sqrt_round_sh(A, B, R)                                             \
  _mm_maskz_sqrt_round_sh((__mmask8)-1, (A), (B), (R))

// Merge-masking: W is the passthrough vector, U the mask.
#define _mm_mask_sqrt_round_sh(W, U, A, B, R)                                  \
  ((__m128h)__builtin_ia32_sqrtsh_round_mask(                                  \
      (__v8hf)(__m128h)(A),                                                    \
      (__v8hf)(__m128h)(B),                                                    \
      (__v8hf)(__m128h)(W),                                                    \
      (__mmask8)(U),                                                           \
      (int)(R)))

// Zero-masking: the passthrough vector is _mm_setzero_ph().
#define _mm_maskz_sqrt_round_sh(U, A, B, R)                                    \
  ((__m128h)__builtin_ia32_sqrtsh_round_mask(                                  \
      (__v8hf)(__m128h)(A),                                                    \
      (__v8hf)(__m128h)(B),                                                    \
      (__v8hf)_mm_setzero_ph(),                                                \
      (__mmask8)(U),                                                           \
      (int)(R)))
1420
_mm_sqrt_sh(__m128h __A,__m128h __B)1421 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sqrt_sh(__m128h __A,
1422 __m128h __B) {
1423 return (__m128h)__builtin_ia32_sqrtsh_round_mask(
1424 (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)_mm_setzero_ph(),
1425 (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
1426 }
1427
_mm_mask_sqrt_sh(__m128h __W,__mmask32 __U,__m128h __A,__m128h __B)1428 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sqrt_sh(__m128h __W,
1429 __mmask32 __U,
1430 __m128h __A,
1431 __m128h __B) {
1432 return (__m128h)__builtin_ia32_sqrtsh_round_mask(
1433 (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)(__m128h)(__W),
1434 (__mmask8)(__U), _MM_FROUND_CUR_DIRECTION);
1435 }
1436
_mm_maskz_sqrt_sh(__mmask32 __U,__m128h __A,__m128h __B)1437 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sqrt_sh(__mmask32 __U,
1438 __m128h __A,
1439 __m128h __B) {
1440 return (__m128h)__builtin_ia32_sqrtsh_round_mask(
1441 (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)_mm_setzero_ph(),
1442 (__mmask8)(__U), _MM_FROUND_CUR_DIRECTION);
1443 }
1444
// fpclass wrappers. imm is the immediate operand handed to the builtin;
// the unmasked forms are the masked forms with an all-ones input mask.

#define _mm512_mask_fpclass_ph_mask(U, A, imm)                                 \
  ((__mmask32)__builtin_ia32_fpclassph512_mask(                                \
      (__v32hf)(__m512h)(A),                                                   \
      (int)(imm),                                                              \
      (__mmask32)(U)))

#define _mm512_fpclass_ph_mask(A, imm)                                         \
  _mm512_mask_fpclass_ph_mask((__mmask32)-1, (A), (imm))

#define _mm_fpclass_sh_mask(A, imm)                                            \
  _mm_mask_fpclass_sh_mask((__mmask8)-1, (A), (imm))

#define _mm_mask_fpclass_sh_mask(U, A, imm)                                    \
  ((__mmask8)__builtin_ia32_fpclasssh_mask(                                    \
      (__v8hf)(__m128h)(A),                                                    \
      (int)(imm),                                                              \
      (__mmask8)(U)))
1460
// Explicit-rounding wrappers around __builtin_ia32_vcvtpd2ph512_mask: the
// source A is cast to __v8df and the result to __m128h.

// Unmasked: undefined passthrough vector, all-ones mask.
#define _mm512_cvt_roundpd_ph(A, R)                                            \
  ((__m128h)__builtin_ia32_vcvtpd2ph512_mask(                                  \
      (__v8df)(A),                                                             \
      (__v8hf)_mm_undefined_ph(),                                              \
      (__mmask8)(-1),                                                          \
      (int)(R)))

// Merge-masking: W is the passthrough vector, U the mask.
#define _mm512_mask_cvt_roundpd_ph(W, U, A, R)                                 \
  ((__m128h)__builtin_ia32_vcvtpd2ph512_mask(                                  \
      (__v8df)(A),                                                             \
      (__v8hf)(W),                                                             \
      (__mmask8)(U),                                                           \
      (int)(R)))

// Zero-masking: the passthrough vector is _mm_setzero_ph().
#define _mm512_maskz_cvt_roundpd_ph(U, A, R)                                   \
  ((__m128h)__builtin_ia32_vcvtpd2ph512_mask(                                  \
      (__v8df)(A),                                                             \
      (__v8hf)_mm_setzero_ph(),                                                \
      (__mmask8)(U),                                                           \
      (int)(R)))
1472
_mm512_cvtpd_ph(__m512d __A)1473 static __inline__ __m128h __DEFAULT_FN_ATTRS512 _mm512_cvtpd_ph(__m512d __A) {
1474 return (__m128h)__builtin_ia32_vcvtpd2ph512_mask(
1475 (__v8df)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
1476 _MM_FROUND_CUR_DIRECTION);
1477 }
1478
1479 static __inline__ __m128h __DEFAULT_FN_ATTRS512
_mm512_mask_cvtpd_ph(__m128h __W,__mmask8 __U,__m512d __A)1480 _mm512_mask_cvtpd_ph(__m128h __W, __mmask8 __U, __m512d __A) {
1481 return (__m128h)__builtin_ia32_vcvtpd2ph512_mask(
1482 (__v8df)__A, (__v8hf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
1483 }
1484
1485 static __inline__ __m128h __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtpd_ph(__mmask8 __U,__m512d __A)1486 _mm512_maskz_cvtpd_ph(__mmask8 __U, __m512d __A) {
1487 return (__m128h)__builtin_ia32_vcvtpd2ph512_mask(
1488 (__v8df)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
1489 _MM_FROUND_CUR_DIRECTION);
1490 }
1491
1492 #define _mm512_cvt_roundph_pd(A, R) \
1493 ((__m512d)__builtin_ia32_vcvtph2pd512_mask( \
1494 (__v8hf)(A), (__v8df)_mm512_undefined_pd(), (__mmask8)(-1), (int)(R)))
1495
/* Forwards A to __builtin_ia32_vcvtph2pd512_mask with W as the second
 * operand, U as the mask, and R as the final (int) operand. */
#define _mm512_mask_cvt_roundph_pd(W, U, A, R) \
  ((__m512d)__builtin_ia32_vcvtph2pd512_mask( \
      (__v8hf)(A), \
      (__v8df)(W), \
      (__mmask8)(U), \
      (int)(R)))
1499
/* Forwards A to __builtin_ia32_vcvtph2pd512_mask with a zeroed second
 * operand, U as the mask, and R as the final (int) operand. */
#define _mm512_maskz_cvt_roundph_pd(U, A, R) \
  ((__m512d)__builtin_ia32_vcvtph2pd512_mask( \
      (__v8hf)(A), \
      (__v8df)_mm512_setzero_pd(), \
      (__mmask8)(U), \
      (int)(R)))
1503
_mm512_cvtph_pd(__m128h __A)1504 static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_cvtph_pd(__m128h __A) {
1505 return (__m512d)__builtin_ia32_vcvtph2pd512_mask(
1506 (__v8hf)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)-1,
1507 _MM_FROUND_CUR_DIRECTION);
1508 }
1509
1510 static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_cvtph_pd(__m512d __W,__mmask8 __U,__m128h __A)1511 _mm512_mask_cvtph_pd(__m512d __W, __mmask8 __U, __m128h __A) {
1512 return (__m512d)__builtin_ia32_vcvtph2pd512_mask(
1513 (__v8hf)__A, (__v8df)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
1514 }
1515
1516 static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtph_pd(__mmask8 __U,__m128h __A)1517 _mm512_maskz_cvtph_pd(__mmask8 __U, __m128h __A) {
1518 return (__m512d)__builtin_ia32_vcvtph2pd512_mask(
1519 (__v8hf)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U,
1520 _MM_FROUND_CUR_DIRECTION);
1521 }
1522
/* Forwards A and B to __builtin_ia32_vcvtsh2ss_round_mask with an undefined
 * third operand, an all-ones mask, and R as the final (int) operand. */
#define _mm_cvt_roundsh_ss(A, B, R) \
  ((__m128)__builtin_ia32_vcvtsh2ss_round_mask( \
      (__v4sf)(A), \
      (__v8hf)(B), \
      (__v4sf)_mm_undefined_ps(), \
      (__mmask8)-1, \
      (int)(R)))
1527
/* Forwards A and B to __builtin_ia32_vcvtsh2ss_round_mask with W as the
 * third operand, U as the mask, and R as the final (int) operand. */
#define _mm_mask_cvt_roundsh_ss(W, U, A, B, R) \
  ((__m128)__builtin_ia32_vcvtsh2ss_round_mask( \
      (__v4sf)(A), \
      (__v8hf)(B), \
      (__v4sf)(W), \
      (__mmask8)(U), \
      (int)(R)))
1531
/* Forwards A and B to __builtin_ia32_vcvtsh2ss_round_mask with a zeroed
 * third operand, U as the mask, and R as the final (int) operand. */
#define _mm_maskz_cvt_roundsh_ss(U, A, B, R) \
  ((__m128)__builtin_ia32_vcvtsh2ss_round_mask( \
      (__v4sf)(A), \
      (__v8hf)(B), \
      (__v4sf)_mm_setzero_ps(), \
      (__mmask8)(U), \
      (int)(R)))
1536
_mm_cvtsh_ss(__m128 __A,__m128h __B)1537 static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtsh_ss(__m128 __A,
1538 __m128h __B) {
1539 return (__m128)__builtin_ia32_vcvtsh2ss_round_mask(
1540 (__v4sf)__A, (__v8hf)__B, (__v4sf)_mm_undefined_ps(), (__mmask8)-1,
1541 _MM_FROUND_CUR_DIRECTION);
1542 }
1543
_mm_mask_cvtsh_ss(__m128 __W,__mmask8 __U,__m128 __A,__m128h __B)1544 static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_cvtsh_ss(__m128 __W,
1545 __mmask8 __U,
1546 __m128 __A,
1547 __m128h __B) {
1548 return (__m128)__builtin_ia32_vcvtsh2ss_round_mask((__v4sf)__A, (__v8hf)__B,
1549 (__v4sf)__W, (__mmask8)__U,
1550 _MM_FROUND_CUR_DIRECTION);
1551 }
1552
_mm_maskz_cvtsh_ss(__mmask8 __U,__m128 __A,__m128h __B)1553 static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_cvtsh_ss(__mmask8 __U,
1554 __m128 __A,
1555 __m128h __B) {
1556 return (__m128)__builtin_ia32_vcvtsh2ss_round_mask(
1557 (__v4sf)__A, (__v8hf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U,
1558 _MM_FROUND_CUR_DIRECTION);
1559 }
1560
/* Forwards A and B to __builtin_ia32_vcvtss2sh_round_mask with an undefined
 * third operand, an all-ones mask, and R as the final (int) operand. */
#define _mm_cvt_roundss_sh(A, B, R) \
  ((__m128h)__builtin_ia32_vcvtss2sh_round_mask( \
      (__v8hf)(A), \
      (__v4sf)(B), \
      (__v8hf)_mm_undefined_ph(), \
      (__mmask8)-1, \
      (int)(R)))
1565
/* Forwards A and B to __builtin_ia32_vcvtss2sh_round_mask with W as the
 * third operand, U as the mask, and R as the final (int) operand. */
#define _mm_mask_cvt_roundss_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_vcvtss2sh_round_mask( \
      (__v8hf)(A), \
      (__v4sf)(B), \
      (__v8hf)(W), \
      (__mmask8)(U), \
      (int)(R)))
1569
/* Forwards A and B to __builtin_ia32_vcvtss2sh_round_mask with a zeroed
 * third operand, U as the mask, and R as the final (int) operand. */
#define _mm_maskz_cvt_roundss_sh(U, A, B, R) \
  ((__m128h)__builtin_ia32_vcvtss2sh_round_mask( \
      (__v8hf)(A), \
      (__v4sf)(B), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), \
      (int)(R)))
1574
_mm_cvtss_sh(__m128h __A,__m128 __B)1575 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtss_sh(__m128h __A,
1576 __m128 __B) {
1577 return (__m128h)__builtin_ia32_vcvtss2sh_round_mask(
1578 (__v8hf)__A, (__v4sf)__B, (__v8hf)_mm_undefined_ph(), (__mmask8)-1,
1579 _MM_FROUND_CUR_DIRECTION);
1580 }
1581
_mm_mask_cvtss_sh(__m128h __W,__mmask8 __U,__m128h __A,__m128 __B)1582 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_cvtss_sh(__m128h __W,
1583 __mmask8 __U,
1584 __m128h __A,
1585 __m128 __B) {
1586 return (__m128h)__builtin_ia32_vcvtss2sh_round_mask(
1587 (__v8hf)__A, (__v4sf)__B, (__v8hf)__W, (__mmask8)__U,
1588 _MM_FROUND_CUR_DIRECTION);
1589 }
1590
_mm_maskz_cvtss_sh(__mmask8 __U,__m128h __A,__m128 __B)1591 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_cvtss_sh(__mmask8 __U,
1592 __m128h __A,
1593 __m128 __B) {
1594 return (__m128h)__builtin_ia32_vcvtss2sh_round_mask(
1595 (__v8hf)__A, (__v4sf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
1596 _MM_FROUND_CUR_DIRECTION);
1597 }
1598
/* Forwards A and B to __builtin_ia32_vcvtsd2sh_round_mask with an undefined
 * third operand, an all-ones mask, and R as the final (int) operand. */
#define _mm_cvt_roundsd_sh(A, B, R) \
  ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask( \
      (__v8hf)(A), \
      (__v2df)(B), \
      (__v8hf)_mm_undefined_ph(), \
      (__mmask8)-1, \
      (int)(R)))
1603
/* Forwards A and B to __builtin_ia32_vcvtsd2sh_round_mask with W as the
 * third operand, U as the mask, and R as the final (int) operand. */
#define _mm_mask_cvt_roundsd_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask( \
      (__v8hf)(A), \
      (__v2df)(B), \
      (__v8hf)(W), \
      (__mmask8)(U), \
      (int)(R)))
1607
/* Forwards A and B to __builtin_ia32_vcvtsd2sh_round_mask with a zeroed
 * third operand, U as the mask, and R as the final (int) operand. */
#define _mm_maskz_cvt_roundsd_sh(U, A, B, R) \
  ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask( \
      (__v8hf)(A), \
      (__v2df)(B), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), \
      (int)(R)))
1612
_mm_cvtsd_sh(__m128h __A,__m128d __B)1613 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtsd_sh(__m128h __A,
1614 __m128d __B) {
1615 return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask(
1616 (__v8hf)__A, (__v2df)__B, (__v8hf)_mm_undefined_ph(), (__mmask8)-1,
1617 _MM_FROUND_CUR_DIRECTION);
1618 }
1619
_mm_mask_cvtsd_sh(__m128h __W,__mmask8 __U,__m128h __A,__m128d __B)1620 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_cvtsd_sh(__m128h __W,
1621 __mmask8 __U,
1622 __m128h __A,
1623 __m128d __B) {
1624 return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask(
1625 (__v8hf)__A, (__v2df)__B, (__v8hf)__W, (__mmask8)__U,
1626 _MM_FROUND_CUR_DIRECTION);
1627 }
1628
1629 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_cvtsd_sh(__mmask8 __U,__m128h __A,__m128d __B)1630 _mm_maskz_cvtsd_sh(__mmask8 __U, __m128h __A, __m128d __B) {
1631 return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask(
1632 (__v8hf)__A, (__v2df)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
1633 _MM_FROUND_CUR_DIRECTION);
1634 }
1635
/* Forwards A and B to __builtin_ia32_vcvtsh2sd_round_mask with an undefined
 * third operand, an all-ones mask, and R as the final (int) operand. */
#define _mm_cvt_roundsh_sd(A, B, R) \
  ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask( \
      (__v2df)(A), \
      (__v8hf)(B), \
      (__v2df)_mm_undefined_pd(), \
      (__mmask8)-1, \
      (int)(R)))
1640
/* Forwards A and B to __builtin_ia32_vcvtsh2sd_round_mask with W as the
 * third operand, U as the mask, and R as the final (int) operand. */
#define _mm_mask_cvt_roundsh_sd(W, U, A, B, R) \
  ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask( \
      (__v2df)(A), \
      (__v8hf)(B), \
      (__v2df)(W), \
      (__mmask8)(U), \
      (int)(R)))
1644
/* Forwards A and B to __builtin_ia32_vcvtsh2sd_round_mask with a zeroed
 * third operand, U as the mask, and R as the final (int) operand. */
#define _mm_maskz_cvt_roundsh_sd(U, A, B, R) \
  ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask( \
      (__v2df)(A), \
      (__v8hf)(B), \
      (__v2df)_mm_setzero_pd(), \
      (__mmask8)(U), \
      (int)(R)))
1649
_mm_cvtsh_sd(__m128d __A,__m128h __B)1650 static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_cvtsh_sd(__m128d __A,
1651 __m128h __B) {
1652 return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask(
1653 (__v2df)__A, (__v8hf)__B, (__v2df)_mm_undefined_pd(), (__mmask8)-1,
1654 _MM_FROUND_CUR_DIRECTION);
1655 }
1656
_mm_mask_cvtsh_sd(__m128d __W,__mmask8 __U,__m128d __A,__m128h __B)1657 static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_cvtsh_sd(__m128d __W,
1658 __mmask8 __U,
1659 __m128d __A,
1660 __m128h __B) {
1661 return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask(
1662 (__v2df)__A, (__v8hf)__B, (__v2df)__W, (__mmask8)__U,
1663 _MM_FROUND_CUR_DIRECTION);
1664 }
1665
1666 static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_maskz_cvtsh_sd(__mmask8 __U,__m128d __A,__m128h __B)1667 _mm_maskz_cvtsh_sd(__mmask8 __U, __m128d __A, __m128h __B) {
1668 return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask(
1669 (__v2df)__A, (__v8hf)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U,
1670 _MM_FROUND_CUR_DIRECTION);
1671 }
1672
/* Forwards A to __builtin_ia32_vcvtph2w512_mask with an undefined second
 * operand, an all-ones mask, and R as the final (int) operand. */
#define _mm512_cvt_roundph_epi16(A, R) \
  ((__m512i)__builtin_ia32_vcvtph2w512_mask( \
      (__v32hf)(A), \
      (__v32hi)_mm512_undefined_epi32(), \
      (__mmask32)-1, \
      (int)(R)))
1677
/* Forwards A to __builtin_ia32_vcvtph2w512_mask with W as the second
 * operand, U as the mask, and R as the final (int) operand. */
#define _mm512_mask_cvt_roundph_epi16(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2w512_mask( \
      (__v32hf)(A), \
      (__v32hi)(W), \
      (__mmask32)(U), \
      (int)(R)))
1681
/* Forwards A to __builtin_ia32_vcvtph2w512_mask with a zeroed second
 * operand, U as the mask, and R as the final (int) operand. */
#define _mm512_maskz_cvt_roundph_epi16(U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2w512_mask( \
      (__v32hf)(A), \
      (__v32hi)_mm512_setzero_epi32(), \
      (__mmask32)(U), \
      (int)(R)))
1686
1687 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtph_epi16(__m512h __A)1688 _mm512_cvtph_epi16(__m512h __A) {
1689 return (__m512i)__builtin_ia32_vcvtph2w512_mask(
1690 (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)-1,
1691 _MM_FROUND_CUR_DIRECTION);
1692 }
1693
1694 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtph_epi16(__m512i __W,__mmask32 __U,__m512h __A)1695 _mm512_mask_cvtph_epi16(__m512i __W, __mmask32 __U, __m512h __A) {
1696 return (__m512i)__builtin_ia32_vcvtph2w512_mask(
1697 (__v32hf)__A, (__v32hi)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
1698 }
1699
1700 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtph_epi16(__mmask32 __U,__m512h __A)1701 _mm512_maskz_cvtph_epi16(__mmask32 __U, __m512h __A) {
1702 return (__m512i)__builtin_ia32_vcvtph2w512_mask(
1703 (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)__U,
1704 _MM_FROUND_CUR_DIRECTION);
1705 }
1706
/* Forwards A to __builtin_ia32_vcvttph2w512_mask with an undefined second
 * operand, an all-ones mask, and R as the final (int) operand. */
#define _mm512_cvtt_roundph_epi16(A, R) \
  ((__m512i)__builtin_ia32_vcvttph2w512_mask( \
      (__v32hf)(A), \
      (__v32hi)_mm512_undefined_epi32(), \
      (__mmask32)-1, \
      (int)(R)))
1711
/* Forwards A to __builtin_ia32_vcvttph2w512_mask with W as the second
 * operand, U as the mask, and R as the final (int) operand. */
#define _mm512_mask_cvtt_roundph_epi16(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvttph2w512_mask( \
      (__v32hf)(A), \
      (__v32hi)(W), \
      (__mmask32)(U), \
      (int)(R)))
1715
/* Forwards A to __builtin_ia32_vcvttph2w512_mask with a zeroed second
 * operand, U as the mask, and R as the final (int) operand. */
#define _mm512_maskz_cvtt_roundph_epi16(U, A, R) \
  ((__m512i)__builtin_ia32_vcvttph2w512_mask( \
      (__v32hf)(A), \
      (__v32hi)_mm512_setzero_epi32(), \
      (__mmask32)(U), \
      (int)(R)))
1720
1721 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvttph_epi16(__m512h __A)1722 _mm512_cvttph_epi16(__m512h __A) {
1723 return (__m512i)__builtin_ia32_vcvttph2w512_mask(
1724 (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)-1,
1725 _MM_FROUND_CUR_DIRECTION);
1726 }
1727
1728 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvttph_epi16(__m512i __W,__mmask32 __U,__m512h __A)1729 _mm512_mask_cvttph_epi16(__m512i __W, __mmask32 __U, __m512h __A) {
1730 return (__m512i)__builtin_ia32_vcvttph2w512_mask(
1731 (__v32hf)__A, (__v32hi)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
1732 }
1733
1734 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvttph_epi16(__mmask32 __U,__m512h __A)1735 _mm512_maskz_cvttph_epi16(__mmask32 __U, __m512h __A) {
1736 return (__m512i)__builtin_ia32_vcvttph2w512_mask(
1737 (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)__U,
1738 _MM_FROUND_CUR_DIRECTION);
1739 }
1740
/* Forwards A to __builtin_ia32_vcvtw2ph512_mask with an undefined second
 * operand, an all-ones mask, and R as the final (int) operand. */
#define _mm512_cvt_roundepi16_ph(A, R) \
  ((__m512h)__builtin_ia32_vcvtw2ph512_mask( \
      (__v32hi)(A), \
      (__v32hf)_mm512_undefined_ph(), \
      (__mmask32)-1, \
      (int)(R)))
1745
/* Forwards A to __builtin_ia32_vcvtw2ph512_mask with W as the second
 * operand, U as the mask, and R as the final (int) operand. */
#define _mm512_mask_cvt_roundepi16_ph(W, U, A, R) \
  ((__m512h)__builtin_ia32_vcvtw2ph512_mask( \
      (__v32hi)(A), \
      (__v32hf)(W), \
      (__mmask32)(U), \
      (int)(R)))
1749
/* Forwards A to __builtin_ia32_vcvtw2ph512_mask with a zeroed second
 * operand, U as the mask, and R as the final (int) operand. */
#define _mm512_maskz_cvt_roundepi16_ph(U, A, R) \
  ((__m512h)__builtin_ia32_vcvtw2ph512_mask( \
      (__v32hi)(A), \
      (__v32hf)_mm512_setzero_ph(), \
      (__mmask32)(U), \
      (int)(R)))
1753
1754 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_cvtepi16_ph(__m512i __A)1755 _mm512_cvtepi16_ph(__m512i __A) {
1756 return (__m512h)__builtin_ia32_vcvtw2ph512_mask(
1757 (__v32hi)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)-1,
1758 _MM_FROUND_CUR_DIRECTION);
1759 }
1760
1761 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepi16_ph(__m512h __W,__mmask32 __U,__m512i __A)1762 _mm512_mask_cvtepi16_ph(__m512h __W, __mmask32 __U, __m512i __A) {
1763 return (__m512h)__builtin_ia32_vcvtw2ph512_mask(
1764 (__v32hi)__A, (__v32hf)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
1765 }
1766
1767 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepi16_ph(__mmask32 __U,__m512i __A)1768 _mm512_maskz_cvtepi16_ph(__mmask32 __U, __m512i __A) {
1769 return (__m512h)__builtin_ia32_vcvtw2ph512_mask(
1770 (__v32hi)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
1771 _MM_FROUND_CUR_DIRECTION);
1772 }
1773
/* Forwards A to __builtin_ia32_vcvtph2uw512_mask with an undefined second
 * operand, an all-ones mask, and R as the final (int) operand. */
#define _mm512_cvt_roundph_epu16(A, R) \
  ((__m512i)__builtin_ia32_vcvtph2uw512_mask( \
      (__v32hf)(A), \
      (__v32hu)_mm512_undefined_epi32(), \
      (__mmask32)-1, \
      (int)(R)))
1778
/* Forwards A to __builtin_ia32_vcvtph2uw512_mask with W as the second
 * operand, U as the mask, and R as the final (int) operand. */
#define _mm512_mask_cvt_roundph_epu16(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2uw512_mask( \
      (__v32hf)(A), \
      (__v32hu)(W), \
      (__mmask32)(U), \
      (int)(R)))
1782
/* Forwards A to __builtin_ia32_vcvtph2uw512_mask with a zeroed second
 * operand, U as the mask, and R as the final (int) operand. */
#define _mm512_maskz_cvt_roundph_epu16(U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2uw512_mask( \
      (__v32hf)(A), \
      (__v32hu)_mm512_setzero_epi32(), \
      (__mmask32)(U), \
      (int)(R)))
1787
1788 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtph_epu16(__m512h __A)1789 _mm512_cvtph_epu16(__m512h __A) {
1790 return (__m512i)__builtin_ia32_vcvtph2uw512_mask(
1791 (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)-1,
1792 _MM_FROUND_CUR_DIRECTION);
1793 }
1794
1795 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtph_epu16(__m512i __W,__mmask32 __U,__m512h __A)1796 _mm512_mask_cvtph_epu16(__m512i __W, __mmask32 __U, __m512h __A) {
1797 return (__m512i)__builtin_ia32_vcvtph2uw512_mask(
1798 (__v32hf)__A, (__v32hu)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
1799 }
1800
1801 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtph_epu16(__mmask32 __U,__m512h __A)1802 _mm512_maskz_cvtph_epu16(__mmask32 __U, __m512h __A) {
1803 return (__m512i)__builtin_ia32_vcvtph2uw512_mask(
1804 (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)__U,
1805 _MM_FROUND_CUR_DIRECTION);
1806 }
1807
/* Forwards A to __builtin_ia32_vcvttph2uw512_mask with an undefined second
 * operand, an all-ones mask, and R as the final (int) operand. */
#define _mm512_cvtt_roundph_epu16(A, R) \
  ((__m512i)__builtin_ia32_vcvttph2uw512_mask( \
      (__v32hf)(A), \
      (__v32hu)_mm512_undefined_epi32(), \
      (__mmask32)-1, \
      (int)(R)))
1812
/* Forwards A to __builtin_ia32_vcvttph2uw512_mask with W as the second
 * operand, U as the mask, and R as the final (int) operand. */
#define _mm512_mask_cvtt_roundph_epu16(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvttph2uw512_mask( \
      (__v32hf)(A), \
      (__v32hu)(W), \
      (__mmask32)(U), \
      (int)(R)))
1816
/* Forwards A to __builtin_ia32_vcvttph2uw512_mask with a zeroed second
 * operand, U as the mask, and R as the final (int) operand. */
#define _mm512_maskz_cvtt_roundph_epu16(U, A, R) \
  ((__m512i)__builtin_ia32_vcvttph2uw512_mask( \
      (__v32hf)(A), \
      (__v32hu)_mm512_setzero_epi32(), \
      (__mmask32)(U), \
      (int)(R)))
1821
1822 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvttph_epu16(__m512h __A)1823 _mm512_cvttph_epu16(__m512h __A) {
1824 return (__m512i)__builtin_ia32_vcvttph2uw512_mask(
1825 (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)-1,
1826 _MM_FROUND_CUR_DIRECTION);
1827 }
1828
1829 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvttph_epu16(__m512i __W,__mmask32 __U,__m512h __A)1830 _mm512_mask_cvttph_epu16(__m512i __W, __mmask32 __U, __m512h __A) {
1831 return (__m512i)__builtin_ia32_vcvttph2uw512_mask(
1832 (__v32hf)__A, (__v32hu)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
1833 }
1834
1835 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvttph_epu16(__mmask32 __U,__m512h __A)1836 _mm512_maskz_cvttph_epu16(__mmask32 __U, __m512h __A) {
1837 return (__m512i)__builtin_ia32_vcvttph2uw512_mask(
1838 (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)__U,
1839 _MM_FROUND_CUR_DIRECTION);
1840 }
1841
/* Forwards A to __builtin_ia32_vcvtuw2ph512_mask with an undefined second
 * operand, an all-ones mask, and R as the final (int) operand. */
#define _mm512_cvt_roundepu16_ph(A, R) \
  ((__m512h)__builtin_ia32_vcvtuw2ph512_mask( \
      (__v32hu)(A), \
      (__v32hf)_mm512_undefined_ph(), \
      (__mmask32)-1, \
      (int)(R)))
1846
/* Forwards A to __builtin_ia32_vcvtuw2ph512_mask with W as the second
 * operand, U as the mask, and R as the final (int) operand. */
#define _mm512_mask_cvt_roundepu16_ph(W, U, A, R) \
  ((__m512h)__builtin_ia32_vcvtuw2ph512_mask( \
      (__v32hu)(A), \
      (__v32hf)(W), \
      (__mmask32)(U), \
      (int)(R)))
1850
/* Forwards A to __builtin_ia32_vcvtuw2ph512_mask with a zeroed second
 * operand, U as the mask, and R as the final (int) operand. */
#define _mm512_maskz_cvt_roundepu16_ph(U, A, R) \
  ((__m512h)__builtin_ia32_vcvtuw2ph512_mask( \
      (__v32hu)(A), \
      (__v32hf)_mm512_setzero_ph(), \
      (__mmask32)(U), \
      (int)(R)))
1854
1855 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_cvtepu16_ph(__m512i __A)1856 _mm512_cvtepu16_ph(__m512i __A) {
1857 return (__m512h)__builtin_ia32_vcvtuw2ph512_mask(
1858 (__v32hu)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)-1,
1859 _MM_FROUND_CUR_DIRECTION);
1860 }
1861
1862 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepu16_ph(__m512h __W,__mmask32 __U,__m512i __A)1863 _mm512_mask_cvtepu16_ph(__m512h __W, __mmask32 __U, __m512i __A) {
1864 return (__m512h)__builtin_ia32_vcvtuw2ph512_mask(
1865 (__v32hu)__A, (__v32hf)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
1866 }
1867
1868 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepu16_ph(__mmask32 __U,__m512i __A)1869 _mm512_maskz_cvtepu16_ph(__mmask32 __U, __m512i __A) {
1870 return (__m512h)__builtin_ia32_vcvtuw2ph512_mask(
1871 (__v32hu)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
1872 _MM_FROUND_CUR_DIRECTION);
1873 }
1874
/* Forwards A to __builtin_ia32_vcvtph2dq512_mask with an undefined second
 * operand, an all-ones mask, and R as the final (int) operand. */
#define _mm512_cvt_roundph_epi32(A, R) \
  ((__m512i)__builtin_ia32_vcvtph2dq512_mask( \
      (__v16hf)(A), \
      (__v16si)_mm512_undefined_epi32(), \
      (__mmask16)-1, \
      (int)(R)))
1879
/* Forwards A to __builtin_ia32_vcvtph2dq512_mask with W as the second
 * operand, U as the mask, and R as the final (int) operand. */
#define _mm512_mask_cvt_roundph_epi32(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2dq512_mask( \
      (__v16hf)(A), \
      (__v16si)(W), \
      (__mmask16)(U), \
      (int)(R)))
1883
/* Forwards A to __builtin_ia32_vcvtph2dq512_mask with a zeroed second
 * operand, U as the mask, and R as the final (int) operand. */
#define _mm512_maskz_cvt_roundph_epi32(U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2dq512_mask( \
      (__v16hf)(A), \
      (__v16si)_mm512_setzero_epi32(), \
      (__mmask16)(U), \
      (int)(R)))
1888
1889 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtph_epi32(__m256h __A)1890 _mm512_cvtph_epi32(__m256h __A) {
1891 return (__m512i)__builtin_ia32_vcvtph2dq512_mask(
1892 (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)-1,
1893 _MM_FROUND_CUR_DIRECTION);
1894 }
1895
1896 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtph_epi32(__m512i __W,__mmask16 __U,__m256h __A)1897 _mm512_mask_cvtph_epi32(__m512i __W, __mmask16 __U, __m256h __A) {
1898 return (__m512i)__builtin_ia32_vcvtph2dq512_mask(
1899 (__v16hf)__A, (__v16si)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
1900 }
1901
1902 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtph_epi32(__mmask16 __U,__m256h __A)1903 _mm512_maskz_cvtph_epi32(__mmask16 __U, __m256h __A) {
1904 return (__m512i)__builtin_ia32_vcvtph2dq512_mask(
1905 (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)__U,
1906 _MM_FROUND_CUR_DIRECTION);
1907 }
1908
/* Forwards A to __builtin_ia32_vcvtph2udq512_mask with an undefined second
 * operand, an all-ones mask, and R as the final (int) operand. */
#define _mm512_cvt_roundph_epu32(A, R) \
  ((__m512i)__builtin_ia32_vcvtph2udq512_mask( \
      (__v16hf)(A), \
      (__v16su)_mm512_undefined_epi32(), \
      (__mmask16)-1, \
      (int)(R)))
1913
/* Forwards A to __builtin_ia32_vcvtph2udq512_mask with W as the second
 * operand, U as the mask, and R as the final (int) operand. */
#define _mm512_mask_cvt_roundph_epu32(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2udq512_mask( \
      (__v16hf)(A), \
      (__v16su)(W), \
      (__mmask16)(U), \
      (int)(R)))
1917
/* Forwards A to __builtin_ia32_vcvtph2udq512_mask with a zeroed second
 * operand, U as the mask, and R as the final (int) operand. */
#define _mm512_maskz_cvt_roundph_epu32(U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2udq512_mask( \
      (__v16hf)(A), \
      (__v16su)_mm512_setzero_epi32(), \
      (__mmask16)(U), \
      (int)(R)))
1922
1923 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtph_epu32(__m256h __A)1924 _mm512_cvtph_epu32(__m256h __A) {
1925 return (__m512i)__builtin_ia32_vcvtph2udq512_mask(
1926 (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)-1,
1927 _MM_FROUND_CUR_DIRECTION);
1928 }
1929
1930 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtph_epu32(__m512i __W,__mmask16 __U,__m256h __A)1931 _mm512_mask_cvtph_epu32(__m512i __W, __mmask16 __U, __m256h __A) {
1932 return (__m512i)__builtin_ia32_vcvtph2udq512_mask(
1933 (__v16hf)__A, (__v16su)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
1934 }
1935
1936 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtph_epu32(__mmask16 __U,__m256h __A)1937 _mm512_maskz_cvtph_epu32(__mmask16 __U, __m256h __A) {
1938 return (__m512i)__builtin_ia32_vcvtph2udq512_mask(
1939 (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)__U,
1940 _MM_FROUND_CUR_DIRECTION);
1941 }
1942
/* Forwards A to __builtin_ia32_vcvtdq2ph512_mask with an undefined second
 * operand, an all-ones mask, and R as the final (int) operand. */
#define _mm512_cvt_roundepi32_ph(A, R) \
  ((__m256h)__builtin_ia32_vcvtdq2ph512_mask( \
      (__v16si)(A), \
      (__v16hf)_mm256_undefined_ph(), \
      (__mmask16)-1, \
      (int)(R)))
1947
/* Forwards A to __builtin_ia32_vcvtdq2ph512_mask with W as the second
 * operand, U as the mask, and R as the final (int) operand. */
#define _mm512_mask_cvt_roundepi32_ph(W, U, A, R) \
  ((__m256h)__builtin_ia32_vcvtdq2ph512_mask( \
      (__v16si)(A), \
      (__v16hf)(W), \
      (__mmask16)(U), \
      (int)(R)))
1951
/* Forwards A to __builtin_ia32_vcvtdq2ph512_mask with a zeroed second
 * operand, U as the mask, and R as the final (int) operand. */
#define _mm512_maskz_cvt_roundepi32_ph(U, A, R) \
  ((__m256h)__builtin_ia32_vcvtdq2ph512_mask( \
      (__v16si)(A), \
      (__v16hf)_mm256_setzero_ph(), \
      (__mmask16)(U), \
      (int)(R)))
1955
1956 static __inline__ __m256h __DEFAULT_FN_ATTRS512
_mm512_cvtepi32_ph(__m512i __A)1957 _mm512_cvtepi32_ph(__m512i __A) {
1958 return (__m256h)__builtin_ia32_vcvtdq2ph512_mask(
1959 (__v16si)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1,
1960 _MM_FROUND_CUR_DIRECTION);
1961 }
1962
1963 static __inline__ __m256h __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepi32_ph(__m256h __W,__mmask16 __U,__m512i __A)1964 _mm512_mask_cvtepi32_ph(__m256h __W, __mmask16 __U, __m512i __A) {
1965 return (__m256h)__builtin_ia32_vcvtdq2ph512_mask(
1966 (__v16si)__A, (__v16hf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
1967 }
1968
1969 static __inline__ __m256h __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepi32_ph(__mmask16 __U,__m512i __A)1970 _mm512_maskz_cvtepi32_ph(__mmask16 __U, __m512i __A) {
1971 return (__m256h)__builtin_ia32_vcvtdq2ph512_mask(
1972 (__v16si)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U,
1973 _MM_FROUND_CUR_DIRECTION);
1974 }
1975
/* Forwards A to __builtin_ia32_vcvtudq2ph512_mask with an undefined second
 * operand, an all-ones mask, and R as the final (int) operand. */
#define _mm512_cvt_roundepu32_ph(A, R) \
  ((__m256h)__builtin_ia32_vcvtudq2ph512_mask( \
      (__v16su)(A), \
      (__v16hf)_mm256_undefined_ph(), \
      (__mmask16)-1, \
      (int)(R)))
1980
/* Forwards A to __builtin_ia32_vcvtudq2ph512_mask with W as the second
 * operand, U as the mask, and R as the final (int) operand. */
#define _mm512_mask_cvt_roundepu32_ph(W, U, A, R) \
  ((__m256h)__builtin_ia32_vcvtudq2ph512_mask( \
      (__v16su)(A), \
      (__v16hf)(W), \
      (__mmask16)(U), \
      (int)(R)))
1984
/* Forwards A to __builtin_ia32_vcvtudq2ph512_mask with a zeroed second
 * operand, U as the mask, and R as the final (int) operand. */
#define _mm512_maskz_cvt_roundepu32_ph(U, A, R) \
  ((__m256h)__builtin_ia32_vcvtudq2ph512_mask( \
      (__v16su)(A), \
      (__v16hf)_mm256_setzero_ph(), \
      (__mmask16)(U), \
      (int)(R)))
1988
1989 static __inline__ __m256h __DEFAULT_FN_ATTRS512
_mm512_cvtepu32_ph(__m512i __A)1990 _mm512_cvtepu32_ph(__m512i __A) {
1991 return (__m256h)__builtin_ia32_vcvtudq2ph512_mask(
1992 (__v16su)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1,
1993 _MM_FROUND_CUR_DIRECTION);
1994 }
1995
1996 static __inline__ __m256h __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepu32_ph(__m256h __W,__mmask16 __U,__m512i __A)1997 _mm512_mask_cvtepu32_ph(__m256h __W, __mmask16 __U, __m512i __A) {
1998 return (__m256h)__builtin_ia32_vcvtudq2ph512_mask(
1999 (__v16su)__A, (__v16hf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
2000 }
2001
2002 static __inline__ __m256h __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepu32_ph(__mmask16 __U,__m512i __A)2003 _mm512_maskz_cvtepu32_ph(__mmask16 __U, __m512i __A) {
2004 return (__m256h)__builtin_ia32_vcvtudq2ph512_mask(
2005 (__v16su)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U,
2006 _MM_FROUND_CUR_DIRECTION);
2007 }
2008
/* Forwards A to __builtin_ia32_vcvttph2dq512_mask with an undefined second
 * operand, an all-ones mask, and R as the final (int) operand. */
#define _mm512_cvtt_roundph_epi32(A, R) \
  ((__m512i)__builtin_ia32_vcvttph2dq512_mask( \
      (__v16hf)(A), \
      (__v16si)_mm512_undefined_epi32(), \
      (__mmask16)-1, \
      (int)(R)))
2013
/* Forwards A to __builtin_ia32_vcvttph2dq512_mask with W as the second
 * operand, U as the mask, and R as the final (int) operand. */
#define _mm512_mask_cvtt_roundph_epi32(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvttph2dq512_mask( \
      (__v16hf)(A), \
      (__v16si)(W), \
      (__mmask16)(U), \
      (int)(R)))
2017
/* Forwards A to __builtin_ia32_vcvttph2dq512_mask with a zeroed second
 * operand, U as the mask, and R as the final (int) operand. */
#define _mm512_maskz_cvtt_roundph_epi32(U, A, R) \
  ((__m512i)__builtin_ia32_vcvttph2dq512_mask( \
      (__v16hf)(A), \
      (__v16si)_mm512_setzero_epi32(), \
      (__mmask16)(U), \
      (int)(R)))
2022
2023 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvttph_epi32(__m256h __A)2024 _mm512_cvttph_epi32(__m256h __A) {
2025 return (__m512i)__builtin_ia32_vcvttph2dq512_mask(
2026 (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)-1,
2027 _MM_FROUND_CUR_DIRECTION);
2028 }
2029
2030 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvttph_epi32(__m512i __W,__mmask16 __U,__m256h __A)2031 _mm512_mask_cvttph_epi32(__m512i __W, __mmask16 __U, __m256h __A) {
2032 return (__m512i)__builtin_ia32_vcvttph2dq512_mask(
2033 (__v16hf)__A, (__v16si)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
2034 }
2035
2036 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvttph_epi32(__mmask16 __U,__m256h __A)2037 _mm512_maskz_cvttph_epi32(__mmask16 __U, __m256h __A) {
2038 return (__m512i)__builtin_ia32_vcvttph2dq512_mask(
2039 (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)__U,
2040 _MM_FROUND_CUR_DIRECTION);
2041 }
2042
/* Forwards A to __builtin_ia32_vcvttph2udq512_mask with an undefined second
 * operand, an all-ones mask, and R as the final (int) operand. */
#define _mm512_cvtt_roundph_epu32(A, R) \
  ((__m512i)__builtin_ia32_vcvttph2udq512_mask( \
      (__v16hf)(A), \
      (__v16su)_mm512_undefined_epi32(), \
      (__mmask16)-1, \
      (int)(R)))
2047
/* Forwards A to __builtin_ia32_vcvttph2udq512_mask with W as the second
 * operand, U as the mask, and R as the final (int) operand. */
#define _mm512_mask_cvtt_roundph_epu32(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvttph2udq512_mask( \
      (__v16hf)(A), \
      (__v16su)(W), \
      (__mmask16)(U), \
      (int)(R)))
2051
/* Forwards A to __builtin_ia32_vcvttph2udq512_mask with a zeroed second
 * operand, U as the mask, and R as the final (int) operand. */
#define _mm512_maskz_cvtt_roundph_epu32(U, A, R) \
  ((__m512i)__builtin_ia32_vcvttph2udq512_mask( \
      (__v16hf)(A), \
      (__v16su)_mm512_setzero_epi32(), \
      (__mmask16)(U), \
      (int)(R)))
2056
2057 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvttph_epu32(__m256h __A)2058 _mm512_cvttph_epu32(__m256h __A) {
2059 return (__m512i)__builtin_ia32_vcvttph2udq512_mask(
2060 (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)-1,
2061 _MM_FROUND_CUR_DIRECTION);
2062 }
2063
2064 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvttph_epu32(__m512i __W,__mmask16 __U,__m256h __A)2065 _mm512_mask_cvttph_epu32(__m512i __W, __mmask16 __U, __m256h __A) {
2066 return (__m512i)__builtin_ia32_vcvttph2udq512_mask(
2067 (__v16hf)__A, (__v16su)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
2068 }
2069
2070 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvttph_epu32(__mmask16 __U,__m256h __A)2071 _mm512_maskz_cvttph_epu32(__mmask16 __U, __m256h __A) {
2072 return (__m512i)__builtin_ia32_vcvttph2udq512_mask(
2073 (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)__U,
2074 _MM_FROUND_CUR_DIRECTION);
2075 }
2076
/* Forwards A to __builtin_ia32_vcvtqq2ph512_mask with an undefined second
 * operand, an all-ones mask, and R as the final (int) operand. */
#define _mm512_cvt_roundepi64_ph(A, R) \
  ((__m128h)__builtin_ia32_vcvtqq2ph512_mask( \
      (__v8di)(A), \
      (__v8hf)_mm_undefined_ph(), \
      (__mmask8)-1, \
      (int)(R)))
2080
/* Forwards A to __builtin_ia32_vcvtqq2ph512_mask with W as the second
 * operand, U as the mask, and R as the final (int) operand. */
#define _mm512_mask_cvt_roundepi64_ph(W, U, A, R) \
  ((__m128h)__builtin_ia32_vcvtqq2ph512_mask( \
      (__v8di)(A), \
      (__v8hf)(W), \
      (__mmask8)(U), \
      (int)(R)))
2084
/* Forwards A to __builtin_ia32_vcvtqq2ph512_mask with a zeroed second
 * operand, U as the mask, and R as the final (int) operand. */
#define _mm512_maskz_cvt_roundepi64_ph(U, A, R) \
  ((__m128h)__builtin_ia32_vcvtqq2ph512_mask( \
      (__v8di)(A), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), \
      (int)(R)))
2088
2089 static __inline__ __m128h __DEFAULT_FN_ATTRS512
_mm512_cvtepi64_ph(__m512i __A)2090 _mm512_cvtepi64_ph(__m512i __A) {
2091 return (__m128h)__builtin_ia32_vcvtqq2ph512_mask(
2092 (__v8di)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
2093 _MM_FROUND_CUR_DIRECTION);
2094 }
2095
2096 static __inline__ __m128h __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepi64_ph(__m128h __W,__mmask8 __U,__m512i __A)2097 _mm512_mask_cvtepi64_ph(__m128h __W, __mmask8 __U, __m512i __A) {
2098 return (__m128h)__builtin_ia32_vcvtqq2ph512_mask(
2099 (__v8di)__A, (__v8hf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2100 }
2101
2102 static __inline__ __m128h __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepi64_ph(__mmask8 __U,__m512i __A)2103 _mm512_maskz_cvtepi64_ph(__mmask8 __U, __m512i __A) {
2104 return (__m128h)__builtin_ia32_vcvtqq2ph512_mask(
2105 (__v8di)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
2106 _MM_FROUND_CUR_DIRECTION);
2107 }
2108
/* Forwards A to __builtin_ia32_vcvtph2qq512_mask with an undefined second
 * operand, an all-ones mask, and R as the final (int) operand. */
#define _mm512_cvt_roundph_epi64(A, R) \
  ((__m512i)__builtin_ia32_vcvtph2qq512_mask( \
      (__v8hf)(A), \
      (__v8di)_mm512_undefined_epi32(), \
      (__mmask8)-1, \
      (int)(R)))
2113
/* Forwards A to __builtin_ia32_vcvtph2qq512_mask with W as the second
 * operand, U as the mask, and R as the final (int) operand. */
#define _mm512_mask_cvt_roundph_epi64(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2qq512_mask( \
      (__v8hf)(A), \
      (__v8di)(W), \
      (__mmask8)(U), \
      (int)(R)))
2117
/* Forwards A to __builtin_ia32_vcvtph2qq512_mask with a zeroed second
 * operand, U as the mask, and R as the final (int) operand. */
#define _mm512_maskz_cvt_roundph_epi64(U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2qq512_mask( \
      (__v8hf)(A), \
      (__v8di)_mm512_setzero_epi32(), \
      (__mmask8)(U), \
      (int)(R)))
2121
2122 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtph_epi64(__m128h __A)2123 _mm512_cvtph_epi64(__m128h __A) {
2124 return (__m512i)__builtin_ia32_vcvtph2qq512_mask(
2125 (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)-1,
2126 _MM_FROUND_CUR_DIRECTION);
2127 }
2128
2129 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtph_epi64(__m512i __W,__mmask8 __U,__m128h __A)2130 _mm512_mask_cvtph_epi64(__m512i __W, __mmask8 __U, __m128h __A) {
2131 return (__m512i)__builtin_ia32_vcvtph2qq512_mask(
2132 (__v8hf)__A, (__v8di)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2133 }
2134
2135 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtph_epi64(__mmask8 __U,__m128h __A)2136 _mm512_maskz_cvtph_epi64(__mmask8 __U, __m128h __A) {
2137 return (__m512i)__builtin_ia32_vcvtph2qq512_mask(
2138 (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)__U,
2139 _MM_FROUND_CUR_DIRECTION);
2140 }
2141
/* Forwards A to __builtin_ia32_vcvtuqq2ph512_mask with an undefined second
 * operand, an all-ones mask, and R as the final (int) operand. */
#define _mm512_cvt_roundepu64_ph(A, R) \
  ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask( \
      (__v8du)(A), \
      (__v8hf)_mm_undefined_ph(), \
      (__mmask8)-1, \
      (int)(R)))
2145
/* Forwards A to __builtin_ia32_vcvtuqq2ph512_mask with W as the second
 * operand, U as the mask, and R as the final (int) operand. */
#define _mm512_mask_cvt_roundepu64_ph(W, U, A, R) \
  ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask( \
      (__v8du)(A), \
      (__v8hf)(W), \
      (__mmask8)(U), \
      (int)(R)))
2149
/* Forwards A to __builtin_ia32_vcvtuqq2ph512_mask with a zeroed second
 * operand, U as the mask, and R as the final (int) operand. */
#define _mm512_maskz_cvt_roundepu64_ph(U, A, R) \
  ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask( \
      (__v8du)(A), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), \
      (int)(R)))
2153
2154 static __inline__ __m128h __DEFAULT_FN_ATTRS512
_mm512_cvtepu64_ph(__m512i __A)2155 _mm512_cvtepu64_ph(__m512i __A) {
2156 return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask(
2157 (__v8du)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
2158 _MM_FROUND_CUR_DIRECTION);
2159 }
2160
2161 static __inline__ __m128h __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepu64_ph(__m128h __W,__mmask8 __U,__m512i __A)2162 _mm512_mask_cvtepu64_ph(__m128h __W, __mmask8 __U, __m512i __A) {
2163 return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask(
2164 (__v8du)__A, (__v8hf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2165 }
2166
2167 static __inline__ __m128h __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepu64_ph(__mmask8 __U,__m512i __A)2168 _mm512_maskz_cvtepu64_ph(__mmask8 __U, __m512i __A) {
2169 return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask(
2170 (__v8du)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
2171 _MM_FROUND_CUR_DIRECTION);
2172 }
2173
/* Forwards A to __builtin_ia32_vcvtph2uqq512_mask with an undefined second
 * operand, an all-ones mask, and R as the final (int) operand. */
#define _mm512_cvt_roundph_epu64(A, R) \
  ((__m512i)__builtin_ia32_vcvtph2uqq512_mask( \
      (__v8hf)(A), \
      (__v8du)_mm512_undefined_epi32(), \
      (__mmask8)-1, \
      (int)(R)))
2178
/* Forwards A to __builtin_ia32_vcvtph2uqq512_mask with W as the second
 * operand, U as the mask, and R as the final (int) operand. */
#define _mm512_mask_cvt_roundph_epu64(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2uqq512_mask( \
      (__v8hf)(A), \
      (__v8du)(W), \
      (__mmask8)(U), \
      (int)(R)))
2182
/* Forwards A to __builtin_ia32_vcvtph2uqq512_mask with a zeroed second
 * operand, U as the mask, and R as the final (int) operand. */
#define _mm512_maskz_cvt_roundph_epu64(U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2uqq512_mask( \
      (__v8hf)(A), \
      (__v8du)_mm512_setzero_epi32(), \
      (__mmask8)(U), \
      (int)(R)))
2186
2187 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtph_epu64(__m128h __A)2188 _mm512_cvtph_epu64(__m128h __A) {
2189 return (__m512i)__builtin_ia32_vcvtph2uqq512_mask(
2190 (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)-1,
2191 _MM_FROUND_CUR_DIRECTION);
2192 }
2193
2194 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtph_epu64(__m512i __W,__mmask8 __U,__m128h __A)2195 _mm512_mask_cvtph_epu64(__m512i __W, __mmask8 __U, __m128h __A) {
2196 return (__m512i)__builtin_ia32_vcvtph2uqq512_mask(
2197 (__v8hf)__A, (__v8du)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2198 }
2199
2200 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtph_epu64(__mmask8 __U,__m128h __A)2201 _mm512_maskz_cvtph_epu64(__mmask8 __U, __m128h __A) {
2202 return (__m512i)__builtin_ia32_vcvtph2uqq512_mask(
2203 (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)__U,
2204 _MM_FROUND_CUR_DIRECTION);
2205 }
2206
/* Forwards A to __builtin_ia32_vcvttph2qq512_mask with an undefined second
 * operand, an all-ones mask, and R as the final (int) operand. */
#define _mm512_cvtt_roundph_epi64(A, R) \
  ((__m512i)__builtin_ia32_vcvttph2qq512_mask( \
      (__v8hf)(A), \
      (__v8di)_mm512_undefined_epi32(), \
      (__mmask8)-1, \
      (int)(R)))
2211
/* Forwards A to __builtin_ia32_vcvttph2qq512_mask with W as the second
 * operand, U as the mask, and R as the final (int) operand. */
#define _mm512_mask_cvtt_roundph_epi64(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvttph2qq512_mask( \
      (__v8hf)(A), \
      (__v8di)(W), \
      (__mmask8)(U), \
      (int)(R)))
2215
/* Forwards A to __builtin_ia32_vcvttph2qq512_mask with a zeroed second
 * operand, U as the mask, and R as the final (int) operand. */
#define _mm512_maskz_cvtt_roundph_epi64(U, A, R) \
  ((__m512i)__builtin_ia32_vcvttph2qq512_mask( \
      (__v8hf)(A), \
      (__v8di)_mm512_setzero_epi32(), \
      (__mmask8)(U), \
      (int)(R)))
2219
2220 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvttph_epi64(__m128h __A)2221 _mm512_cvttph_epi64(__m128h __A) {
2222 return (__m512i)__builtin_ia32_vcvttph2qq512_mask(
2223 (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)-1,
2224 _MM_FROUND_CUR_DIRECTION);
2225 }
2226
2227 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvttph_epi64(__m512i __W,__mmask8 __U,__m128h __A)2228 _mm512_mask_cvttph_epi64(__m512i __W, __mmask8 __U, __m128h __A) {
2229 return (__m512i)__builtin_ia32_vcvttph2qq512_mask(
2230 (__v8hf)__A, (__v8di)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2231 }
2232
2233 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvttph_epi64(__mmask8 __U,__m128h __A)2234 _mm512_maskz_cvttph_epi64(__mmask8 __U, __m128h __A) {
2235 return (__m512i)__builtin_ia32_vcvttph2qq512_mask(
2236 (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)__U,
2237 _MM_FROUND_CUR_DIRECTION);
2238 }
2239
/* Wrappers around __builtin_ia32_vcvttph2uqq512_mask that take an explicit R,
 * forwarded (cast to int) as the builtin's final argument. A is the source,
 * cast to __v8hf. */

/* Mask of all ones; second operand comes from _mm512_undefined_epi32(). */
#define _mm512_cvtt_roundph_epu64(A, R) \
  ((__m512i)__builtin_ia32_vcvttph2uqq512_mask( \
      (__v8hf)(A), \
      (__v8du)_mm512_undefined_epi32(), \
      (__mmask8)(-1), \
      (int)(R)))

/* Caller-supplied second operand W and mask U. */
#define _mm512_mask_cvtt_roundph_epu64(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvttph2uqq512_mask( \
      (__v8hf)(A), \
      (__v8du)(W), \
      (__mmask8)(U), \
      (int)(R)))

/* Caller-supplied mask U; second operand comes from _mm512_setzero_epi32(). */
#define _mm512_maskz_cvtt_roundph_epu64(U, A, R) \
  ((__m512i)__builtin_ia32_vcvttph2uqq512_mask( \
      (__v8hf)(A), \
      (__v8du)_mm512_setzero_epi32(), \
      (__mmask8)(U), \
      (int)(R)))
2252
2253 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvttph_epu64(__m128h __A)2254 _mm512_cvttph_epu64(__m128h __A) {
2255 return (__m512i)__builtin_ia32_vcvttph2uqq512_mask(
2256 (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)-1,
2257 _MM_FROUND_CUR_DIRECTION);
2258 }
2259
2260 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvttph_epu64(__m512i __W,__mmask8 __U,__m128h __A)2261 _mm512_mask_cvttph_epu64(__m512i __W, __mmask8 __U, __m128h __A) {
2262 return (__m512i)__builtin_ia32_vcvttph2uqq512_mask(
2263 (__v8hf)__A, (__v8du)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2264 }
2265
2266 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvttph_epu64(__mmask8 __U,__m128h __A)2267 _mm512_maskz_cvttph_epu64(__mmask8 __U, __m128h __A) {
2268 return (__m512i)__builtin_ia32_vcvttph2uqq512_mask(
2269 (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)__U,
2270 _MM_FROUND_CUR_DIRECTION);
2271 }
2272
/* Passes A (cast to __v8hf) and R (cast to int) to
 * __builtin_ia32_vcvtsh2si32 and yields the result as int. */
#define _mm_cvt_roundsh_i32(A, R) \
  ((int)__builtin_ia32_vcvtsh2si32( \
      (__v8hf)(A), \
      (int)(R)))
2275
_mm_cvtsh_i32(__m128h __A)2276 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_cvtsh_i32(__m128h __A) {
2277 return (int)__builtin_ia32_vcvtsh2si32((__v8hf)__A, _MM_FROUND_CUR_DIRECTION);
2278 }
2279
/* Passes A (cast to __v8hf) and R (cast to int) to
 * __builtin_ia32_vcvtsh2usi32 and yields the result as unsigned int. */
#define _mm_cvt_roundsh_u32(A, R) \
  ((unsigned int)__builtin_ia32_vcvtsh2usi32( \
      (__v8hf)(A), \
      (int)(R)))
2282
2283 static __inline__ unsigned int __DEFAULT_FN_ATTRS128
_mm_cvtsh_u32(__m128h __A)2284 _mm_cvtsh_u32(__m128h __A) {
2285 return (unsigned int)__builtin_ia32_vcvtsh2usi32((__v8hf)__A,
2286 _MM_FROUND_CUR_DIRECTION);
2287 }
2288
2289 #ifdef __x86_64__
2290 #define _mm_cvt_roundsh_i64(A, R) \
2291 ((long long)__builtin_ia32_vcvtsh2si64((__v8hf)(A), (int)(R)))
2292
_mm_cvtsh_i64(__m128h __A)2293 static __inline__ long long __DEFAULT_FN_ATTRS128 _mm_cvtsh_i64(__m128h __A) {
2294 return (long long)__builtin_ia32_vcvtsh2si64((__v8hf)__A,
2295 _MM_FROUND_CUR_DIRECTION);
2296 }
2297
2298 #define _mm_cvt_roundsh_u64(A, R) \
2299 ((unsigned long long)__builtin_ia32_vcvtsh2usi64((__v8hf)(A), (int)(R)))
2300
2301 static __inline__ unsigned long long __DEFAULT_FN_ATTRS128
_mm_cvtsh_u64(__m128h __A)2302 _mm_cvtsh_u64(__m128h __A) {
2303 return (unsigned long long)__builtin_ia32_vcvtsh2usi64(
2304 (__v8hf)__A, _MM_FROUND_CUR_DIRECTION);
2305 }
2306 #endif // __x86_64__
2307
/* Passes A (cast to __v8hf), B (cast to unsigned int) and R (cast to int) to
 * __builtin_ia32_vcvtusi2sh and yields the result as __m128h. */
#define _mm_cvt_roundu32_sh(A, B, R) \
  ((__m128h)__builtin_ia32_vcvtusi2sh( \
      (__v8hf)(A), \
      (unsigned int)(B), \
      (int)(R)))
2310
2311 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_cvtu32_sh(__m128h __A,unsigned int __B)2312 _mm_cvtu32_sh(__m128h __A, unsigned int __B) {
2313 __A[0] = __B;
2314 return __A;
2315 }
2316
2317 #ifdef __x86_64__
2318 #define _mm_cvt_roundu64_sh(A, B, R) \
2319 ((__m128h)__builtin_ia32_vcvtusi642sh((__v8hf)(A), (unsigned long long)(B), \
2320 (int)(R)))
2321
2322 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_cvtu64_sh(__m128h __A,unsigned long long __B)2323 _mm_cvtu64_sh(__m128h __A, unsigned long long __B) {
2324 __A[0] = __B;
2325 return __A;
2326 }
2327 #endif
2328
/* Passes A (cast to __v8hf), B (cast to int) and R (cast to int) to
 * __builtin_ia32_vcvtsi2sh and yields the result as __m128h. */
#define _mm_cvt_roundi32_sh(A, B, R) \
  ((__m128h)__builtin_ia32_vcvtsi2sh( \
      (__v8hf)(A), \
      (int)(B), \
      (int)(R)))
2331
_mm_cvti32_sh(__m128h __A,int __B)2332 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvti32_sh(__m128h __A,
2333 int __B) {
2334 __A[0] = __B;
2335 return __A;
2336 }
2337
2338 #ifdef __x86_64__
2339 #define _mm_cvt_roundi64_sh(A, B, R) \
2340 ((__m128h)__builtin_ia32_vcvtsi642sh((__v8hf)(A), (long long)(B), (int)(R)))
2341
_mm_cvti64_sh(__m128h __A,long long __B)2342 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvti64_sh(__m128h __A,
2343 long long __B) {
2344 __A[0] = __B;
2345 return __A;
2346 }
2347 #endif
2348
/* Passes A (cast to __v8hf) and R (cast to int) to
 * __builtin_ia32_vcvttsh2si32 and yields the result as int. */
#define _mm_cvtt_roundsh_i32(A, R) \
  ((int)__builtin_ia32_vcvttsh2si32( \
      (__v8hf)(A), \
      (int)(R)))
2351
_mm_cvttsh_i32(__m128h __A)2352 static __inline__ int __DEFAULT_FN_ATTRS128 _mm_cvttsh_i32(__m128h __A) {
2353 return (int)__builtin_ia32_vcvttsh2si32((__v8hf)__A,
2354 _MM_FROUND_CUR_DIRECTION);
2355 }
2356
2357 #ifdef __x86_64__
2358 #define _mm_cvtt_roundsh_i64(A, R) \
2359 ((long long)__builtin_ia32_vcvttsh2si64((__v8hf)(A), (int)(R)))
2360
_mm_cvttsh_i64(__m128h __A)2361 static __inline__ long long __DEFAULT_FN_ATTRS128 _mm_cvttsh_i64(__m128h __A) {
2362 return (long long)__builtin_ia32_vcvttsh2si64((__v8hf)__A,
2363 _MM_FROUND_CUR_DIRECTION);
2364 }
2365 #endif
2366
/* Passes A (cast to __v8hf) and R (cast to int) to
 * __builtin_ia32_vcvttsh2usi32 and yields the result as unsigned int. */
#define _mm_cvtt_roundsh_u32(A, R) \
  ((unsigned int)__builtin_ia32_vcvttsh2usi32( \
      (__v8hf)(A), \
      (int)(R)))
2369
2370 static __inline__ unsigned int __DEFAULT_FN_ATTRS128
_mm_cvttsh_u32(__m128h __A)2371 _mm_cvttsh_u32(__m128h __A) {
2372 return (unsigned int)__builtin_ia32_vcvttsh2usi32((__v8hf)__A,
2373 _MM_FROUND_CUR_DIRECTION);
2374 }
2375
2376 #ifdef __x86_64__
2377 #define _mm_cvtt_roundsh_u64(A, R) \
2378 ((unsigned long long)__builtin_ia32_vcvttsh2usi64((__v8hf)(A), (int)(R)))
2379
2380 static __inline__ unsigned long long __DEFAULT_FN_ATTRS128
_mm_cvttsh_u64(__m128h __A)2381 _mm_cvttsh_u64(__m128h __A) {
2382 return (unsigned long long)__builtin_ia32_vcvttsh2usi64(
2383 (__v8hf)__A, _MM_FROUND_CUR_DIRECTION);
2384 }
2385 #endif
2386
/* Explicit-rounding wrappers around __builtin_ia32_vcvtph2psx512_mask.
 * A is the source (cast to __v16hf); R is forwarded, cast to int, as the
 * builtin's final argument. */

/* Mask of all ones; second operand comes from _mm512_undefined_ps(). */
#define _mm512_cvtx_roundph_ps(A, R) \
  ((__m512)__builtin_ia32_vcvtph2psx512_mask( \
      (__v16hf)(A), \
      (__v16sf)_mm512_undefined_ps(), \
      (__mmask16)(-1), \
      (int)(R)))

/* Caller-supplied second operand W and mask U. */
#define _mm512_mask_cvtx_roundph_ps(W, U, A, R) \
  ((__m512)__builtin_ia32_vcvtph2psx512_mask( \
      (__v16hf)(A), \
      (__v16sf)(W), \
      (__mmask16)(U), \
      (int)(R)))

/* Caller-supplied mask U; second operand comes from _mm512_setzero_ps(). */
#define _mm512_maskz_cvtx_roundph_ps(U, A, R) \
  ((__m512)__builtin_ia32_vcvtph2psx512_mask( \
      (__v16hf)(A), \
      (__v16sf)_mm512_setzero_ps(), \
      (__mmask16)(U), \
      (int)(R)))
2399
_mm512_cvtxph_ps(__m256h __A)2400 static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_cvtxph_ps(__m256h __A) {
2401 return (__m512)__builtin_ia32_vcvtph2psx512_mask(
2402 (__v16hf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)-1,
2403 _MM_FROUND_CUR_DIRECTION);
2404 }
2405
2406 static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_cvtxph_ps(__m512 __W,__mmask16 __U,__m256h __A)2407 _mm512_mask_cvtxph_ps(__m512 __W, __mmask16 __U, __m256h __A) {
2408 return (__m512)__builtin_ia32_vcvtph2psx512_mask(
2409 (__v16hf)__A, (__v16sf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
2410 }
2411
2412 static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtxph_ps(__mmask16 __U,__m256h __A)2413 _mm512_maskz_cvtxph_ps(__mmask16 __U, __m256h __A) {
2414 return (__m512)__builtin_ia32_vcvtph2psx512_mask(
2415 (__v16hf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U,
2416 _MM_FROUND_CUR_DIRECTION);
2417 }
2418
/* Explicit-rounding wrappers around __builtin_ia32_vcvtps2phx512_mask.
 * A is the source (cast to __v16sf); R is forwarded, cast to int, as the
 * builtin's final argument. */

/* Mask of all ones; second operand comes from _mm256_undefined_ph(). */
#define _mm512_cvtx_roundps_ph(A, R) \
  ((__m256h)__builtin_ia32_vcvtps2phx512_mask( \
      (__v16sf)(A), \
      (__v16hf)_mm256_undefined_ph(), \
      (__mmask16)(-1), \
      (int)(R)))

/* Caller-supplied second operand W and mask U. */
#define _mm512_mask_cvtx_roundps_ph(W, U, A, R) \
  ((__m256h)__builtin_ia32_vcvtps2phx512_mask( \
      (__v16sf)(A), \
      (__v16hf)(W), \
      (__mmask16)(U), \
      (int)(R)))

/* Caller-supplied mask U; second operand comes from _mm256_setzero_ph(). */
#define _mm512_maskz_cvtx_roundps_ph(U, A, R) \
  ((__m256h)__builtin_ia32_vcvtps2phx512_mask( \
      (__v16sf)(A), \
      (__v16hf)_mm256_setzero_ph(), \
      (__mmask16)(U), \
      (int)(R)))
2431
_mm512_cvtxps_ph(__m512 __A)2432 static __inline__ __m256h __DEFAULT_FN_ATTRS512 _mm512_cvtxps_ph(__m512 __A) {
2433 return (__m256h)__builtin_ia32_vcvtps2phx512_mask(
2434 (__v16sf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1,
2435 _MM_FROUND_CUR_DIRECTION);
2436 }
2437
2438 static __inline__ __m256h __DEFAULT_FN_ATTRS512
_mm512_mask_cvtxps_ph(__m256h __W,__mmask16 __U,__m512 __A)2439 _mm512_mask_cvtxps_ph(__m256h __W, __mmask16 __U, __m512 __A) {
2440 return (__m256h)__builtin_ia32_vcvtps2phx512_mask(
2441 (__v16sf)__A, (__v16hf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
2442 }
2443
2444 static __inline__ __m256h __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtxps_ph(__mmask16 __U,__m512 __A)2445 _mm512_maskz_cvtxps_ph(__mmask16 __U, __m512 __A) {
2446 return (__m256h)__builtin_ia32_vcvtps2phx512_mask(
2447 (__v16sf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U,
2448 _MM_FROUND_CUR_DIRECTION);
2449 }
2450
/* 512-bit fmadd with explicit R. A, B and C are each cast through __m512h to
 * __v32hf and passed unchanged; R is forwarded as int. The variants select
 * the _mask, _mask3 or _maskz flavour of __builtin_ia32_vfmaddph512. */

/* _mask builtin with an all-ones mask. */
#define _mm512_fmadd_round_ph(A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_mask( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      (__v32hf)(__m512h)(C), \
      (__mmask32)-1, \
      (int)(R)))

/* _mask builtin with caller mask U. */
#define _mm512_mask_fmadd_round_ph(A, U, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_mask( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      (__v32hf)(__m512h)(C), \
      (__mmask32)(U), \
      (int)(R)))

/* _mask3 builtin with caller mask U. */
#define _mm512_mask3_fmadd_round_ph(A, B, C, U, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_mask3( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      (__v32hf)(__m512h)(C), \
      (__mmask32)(U), \
      (int)(R)))

/* _maskz builtin with caller mask U. */
#define _mm512_maskz_fmadd_round_ph(U, A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_maskz( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      (__v32hf)(__m512h)(C), \
      (__mmask32)(U), \
      (int)(R)))
2470
/* 512-bit fmsub with explicit R, expressed through the vfmaddph512 builtins
 * by negating the C operand. A and B are passed unchanged. */

/* _mask builtin with an all-ones mask. */
#define _mm512_fmsub_round_ph(A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_mask( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      -(__v32hf)(__m512h)(C), \
      (__mmask32)-1, \
      (int)(R)))

/* _mask builtin with caller mask U. */
#define _mm512_mask_fmsub_round_ph(A, U, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_mask( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      -(__v32hf)(__m512h)(C), \
      (__mmask32)(U), \
      (int)(R)))

/* _maskz builtin with caller mask U. */
#define _mm512_maskz_fmsub_round_ph(U, A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_maskz( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      -(__v32hf)(__m512h)(C), \
      (__mmask32)(U), \
      (int)(R)))
2485
/* 512-bit fnmadd with explicit R, expressed through the vfmaddph512 builtins
 * by negating one multiplicand. Note which operand carries the minus sign:
 * B for the _mask form, A for the _mask3 and _maskz forms. */

/* _mask builtin, all-ones mask, B negated. */
#define _mm512_fnmadd_round_ph(A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_mask( \
      (__v32hf)(__m512h)(A), \
      -(__v32hf)(__m512h)(B), \
      (__v32hf)(__m512h)(C), \
      (__mmask32)-1, \
      (int)(R)))

/* _mask3 builtin, caller mask U, A negated. */
#define _mm512_mask3_fnmadd_round_ph(A, B, C, U, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_mask3( \
      -(__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      (__v32hf)(__m512h)(C), \
      (__mmask32)(U), \
      (int)(R)))

/* _maskz builtin, caller mask U, A negated. */
#define _mm512_maskz_fnmadd_round_ph(U, A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_maskz( \
      -(__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      (__v32hf)(__m512h)(C), \
      (__mmask32)(U), \
      (int)(R)))
2500
/* 512-bit fnmsub with explicit R, expressed through the vfmaddph512 builtins
 * by negating one multiplicand and the C operand. */

/* _mask builtin, all-ones mask, B and C negated. */
#define _mm512_fnmsub_round_ph(A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_mask( \
      (__v32hf)(__m512h)(A), \
      -(__v32hf)(__m512h)(B), \
      -(__v32hf)(__m512h)(C), \
      (__mmask32)-1, \
      (int)(R)))

/* _maskz builtin, caller mask U, A and C negated. */
#define _mm512_maskz_fnmsub_round_ph(U, A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_maskz( \
      -(__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      -(__v32hf)(__m512h)(C), \
      (__mmask32)(U), \
      (int)(R)))
2510
_mm512_fmadd_ph(__m512h __A,__m512h __B,__m512h __C)2511 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmadd_ph(__m512h __A,
2512 __m512h __B,
2513 __m512h __C) {
2514 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
2515 (__v32hf)__C, (__mmask32)-1,
2516 _MM_FROUND_CUR_DIRECTION);
2517 }
2518
2519 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fmadd_ph(__m512h __A,__mmask32 __U,__m512h __B,__m512h __C)2520 _mm512_mask_fmadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
2521 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
2522 (__v32hf)__C, (__mmask32)__U,
2523 _MM_FROUND_CUR_DIRECTION);
2524 }
2525
2526 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask3_fmadd_ph(__m512h __A,__m512h __B,__m512h __C,__mmask32 __U)2527 _mm512_mask3_fmadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
2528 return (__m512h)__builtin_ia32_vfmaddph512_mask3((__v32hf)__A, (__v32hf)__B,
2529 (__v32hf)__C, (__mmask32)__U,
2530 _MM_FROUND_CUR_DIRECTION);
2531 }
2532
2533 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fmadd_ph(__mmask32 __U,__m512h __A,__m512h __B,__m512h __C)2534 _mm512_maskz_fmadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2535 return (__m512h)__builtin_ia32_vfmaddph512_maskz((__v32hf)__A, (__v32hf)__B,
2536 (__v32hf)__C, (__mmask32)__U,
2537 _MM_FROUND_CUR_DIRECTION);
2538 }
2539
_mm512_fmsub_ph(__m512h __A,__m512h __B,__m512h __C)2540 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmsub_ph(__m512h __A,
2541 __m512h __B,
2542 __m512h __C) {
2543 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
2544 -(__v32hf)__C, (__mmask32)-1,
2545 _MM_FROUND_CUR_DIRECTION);
2546 }
2547
2548 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fmsub_ph(__m512h __A,__mmask32 __U,__m512h __B,__m512h __C)2549 _mm512_mask_fmsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
2550 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
2551 -(__v32hf)__C, (__mmask32)__U,
2552 _MM_FROUND_CUR_DIRECTION);
2553 }
2554
2555 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fmsub_ph(__mmask32 __U,__m512h __A,__m512h __B,__m512h __C)2556 _mm512_maskz_fmsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2557 return (__m512h)__builtin_ia32_vfmaddph512_maskz(
2558 (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
2559 _MM_FROUND_CUR_DIRECTION);
2560 }
2561
_mm512_fnmadd_ph(__m512h __A,__m512h __B,__m512h __C)2562 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fnmadd_ph(__m512h __A,
2563 __m512h __B,
2564 __m512h __C) {
2565 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
2566 (__v32hf)__C, (__mmask32)-1,
2567 _MM_FROUND_CUR_DIRECTION);
2568 }
2569
2570 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask3_fnmadd_ph(__m512h __A,__m512h __B,__m512h __C,__mmask32 __U)2571 _mm512_mask3_fnmadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
2572 return (__m512h)__builtin_ia32_vfmaddph512_mask3(-(__v32hf)__A, (__v32hf)__B,
2573 (__v32hf)__C, (__mmask32)__U,
2574 _MM_FROUND_CUR_DIRECTION);
2575 }
2576
2577 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fnmadd_ph(__mmask32 __U,__m512h __A,__m512h __B,__m512h __C)2578 _mm512_maskz_fnmadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2579 return (__m512h)__builtin_ia32_vfmaddph512_maskz(-(__v32hf)__A, (__v32hf)__B,
2580 (__v32hf)__C, (__mmask32)__U,
2581 _MM_FROUND_CUR_DIRECTION);
2582 }
2583
_mm512_fnmsub_ph(__m512h __A,__m512h __B,__m512h __C)2584 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fnmsub_ph(__m512h __A,
2585 __m512h __B,
2586 __m512h __C) {
2587 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
2588 -(__v32hf)__C, (__mmask32)-1,
2589 _MM_FROUND_CUR_DIRECTION);
2590 }
2591
2592 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fnmsub_ph(__mmask32 __U,__m512h __A,__m512h __B,__m512h __C)2593 _mm512_maskz_fnmsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2594 return (__m512h)__builtin_ia32_vfmaddph512_maskz(
2595 -(__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
2596 _MM_FROUND_CUR_DIRECTION);
2597 }
2598
/* 512-bit fmaddsub with explicit R. A, B and C are each cast through __m512h
 * to __v32hf and passed unchanged; R is forwarded as int. The variants select
 * the _mask, _mask3 or _maskz flavour of __builtin_ia32_vfmaddsubph512. */

/* _mask builtin with an all-ones mask. */
#define _mm512_fmaddsub_round_ph(A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddsubph512_mask( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      (__v32hf)(__m512h)(C), \
      (__mmask32)-1, \
      (int)(R)))

/* _mask builtin with caller mask U. */
#define _mm512_mask_fmaddsub_round_ph(A, U, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddsubph512_mask( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      (__v32hf)(__m512h)(C), \
      (__mmask32)(U), \
      (int)(R)))

/* _mask3 builtin with caller mask U. */
#define _mm512_mask3_fmaddsub_round_ph(A, B, C, U, R) \
  ((__m512h)__builtin_ia32_vfmaddsubph512_mask3( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      (__v32hf)(__m512h)(C), \
      (__mmask32)(U), \
      (int)(R)))

/* _maskz builtin with caller mask U. */
#define _mm512_maskz_fmaddsub_round_ph(U, A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddsubph512_maskz( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      (__v32hf)(__m512h)(C), \
      (__mmask32)(U), \
      (int)(R)))
2618
/* 512-bit fmsubadd with explicit R, expressed through the vfmaddsubph512
 * builtins by negating the C operand. A and B are passed unchanged. */

/* _mask builtin with an all-ones mask. */
#define _mm512_fmsubadd_round_ph(A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddsubph512_mask( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      -(__v32hf)(__m512h)(C), \
      (__mmask32)-1, \
      (int)(R)))

/* _mask builtin with caller mask U. */
#define _mm512_mask_fmsubadd_round_ph(A, U, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddsubph512_mask( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      -(__v32hf)(__m512h)(C), \
      (__mmask32)(U), \
      (int)(R)))

/* _maskz builtin with caller mask U. */
#define _mm512_maskz_fmsubadd_round_ph(U, A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddsubph512_maskz( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      -(__v32hf)(__m512h)(C), \
      (__mmask32)(U), \
      (int)(R)))
2633
2634 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_fmaddsub_ph(__m512h __A,__m512h __B,__m512h __C)2635 _mm512_fmaddsub_ph(__m512h __A, __m512h __B, __m512h __C) {
2636 return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
2637 (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)-1,
2638 _MM_FROUND_CUR_DIRECTION);
2639 }
2640
2641 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fmaddsub_ph(__m512h __A,__mmask32 __U,__m512h __B,__m512h __C)2642 _mm512_mask_fmaddsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
2643 return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
2644 (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
2645 _MM_FROUND_CUR_DIRECTION);
2646 }
2647
2648 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask3_fmaddsub_ph(__m512h __A,__m512h __B,__m512h __C,__mmask32 __U)2649 _mm512_mask3_fmaddsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
2650 return (__m512h)__builtin_ia32_vfmaddsubph512_mask3(
2651 (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
2652 _MM_FROUND_CUR_DIRECTION);
2653 }
2654
2655 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fmaddsub_ph(__mmask32 __U,__m512h __A,__m512h __B,__m512h __C)2656 _mm512_maskz_fmaddsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2657 return (__m512h)__builtin_ia32_vfmaddsubph512_maskz(
2658 (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
2659 _MM_FROUND_CUR_DIRECTION);
2660 }
2661
2662 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_fmsubadd_ph(__m512h __A,__m512h __B,__m512h __C)2663 _mm512_fmsubadd_ph(__m512h __A, __m512h __B, __m512h __C) {
2664 return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
2665 (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)-1,
2666 _MM_FROUND_CUR_DIRECTION);
2667 }
2668
2669 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fmsubadd_ph(__m512h __A,__mmask32 __U,__m512h __B,__m512h __C)2670 _mm512_mask_fmsubadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
2671 return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
2672 (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
2673 _MM_FROUND_CUR_DIRECTION);
2674 }
2675
2676 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fmsubadd_ph(__mmask32 __U,__m512h __A,__m512h __B,__m512h __C)2677 _mm512_maskz_fmsubadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2678 return (__m512h)__builtin_ia32_vfmaddsubph512_maskz(
2679 (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
2680 _MM_FROUND_CUR_DIRECTION);
2681 }
2682
/* Passes A, B and C (each cast through __m512h to __v32hf) unchanged to the
 * dedicated __builtin_ia32_vfmsubph512_mask3 builtin with mask U and R. */
#define _mm512_mask3_fmsub_round_ph(A, B, C, U, R) \
  ((__m512h)__builtin_ia32_vfmsubph512_mask3( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      (__v32hf)(__m512h)(C), \
      (__mmask32)(U), \
      (int)(R)))
2687
2688 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask3_fmsub_ph(__m512h __A,__m512h __B,__m512h __C,__mmask32 __U)2689 _mm512_mask3_fmsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
2690 return (__m512h)__builtin_ia32_vfmsubph512_mask3((__v32hf)__A, (__v32hf)__B,
2691 (__v32hf)__C, (__mmask32)__U,
2692 _MM_FROUND_CUR_DIRECTION);
2693 }
2694
/* Passes A, B and C (each cast through __m512h to __v32hf) unchanged to the
 * dedicated __builtin_ia32_vfmsubaddph512_mask3 builtin with mask U and R. */
#define _mm512_mask3_fmsubadd_round_ph(A, B, C, U, R) \
  ((__m512h)__builtin_ia32_vfmsubaddph512_mask3( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      (__v32hf)(__m512h)(C), \
      (__mmask32)(U), \
      (int)(R)))
2699
2700 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask3_fmsubadd_ph(__m512h __A,__m512h __B,__m512h __C,__mmask32 __U)2701 _mm512_mask3_fmsubadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
2702 return (__m512h)__builtin_ia32_vfmsubaddph512_mask3(
2703 (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
2704 _MM_FROUND_CUR_DIRECTION);
2705 }
2706
/* Built on __builtin_ia32_vfmaddph512_mask with B negated; A and C are passed
 * unchanged, U is the mask and R is forwarded as int. */
#define _mm512_mask_fnmadd_round_ph(A, U, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_mask( \
      (__v32hf)(__m512h)(A), \
      -(__v32hf)(__m512h)(B), \
      (__v32hf)(__m512h)(C), \
      (__mmask32)(U), \
      (int)(R)))
2711
2712 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fnmadd_ph(__m512h __A,__mmask32 __U,__m512h __B,__m512h __C)2713 _mm512_mask_fnmadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
2714 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
2715 (__v32hf)__C, (__mmask32)__U,
2716 _MM_FROUND_CUR_DIRECTION);
2717 }
2718
/* Built on __builtin_ia32_vfmaddph512_mask with both B and C negated; U is
 * the mask and R is forwarded as int. */
#define _mm512_mask_fnmsub_round_ph(A, U, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_mask( \
      (__v32hf)(__m512h)(A), \
      -(__v32hf)(__m512h)(B), \
      -(__v32hf)(__m512h)(C), \
      (__mmask32)(U), \
      (int)(R)))

/* Built on __builtin_ia32_vfmsubph512_mask3 with only A negated; U is the
 * mask and R is forwarded as int. */
#define _mm512_mask3_fnmsub_round_ph(A, B, C, U, R) \
  ((__m512h)__builtin_ia32_vfmsubph512_mask3( \
      -(__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      (__v32hf)(__m512h)(C), \
      (__mmask32)(U), \
      (int)(R)))
2728
2729 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fnmsub_ph(__m512h __A,__mmask32 __U,__m512h __B,__m512h __C)2730 _mm512_mask_fnmsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
2731 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
2732 -(__v32hf)__C, (__mmask32)__U,
2733 _MM_FROUND_CUR_DIRECTION);
2734 }
2735
2736 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask3_fnmsub_ph(__m512h __A,__m512h __B,__m512h __C,__mmask32 __U)2737 _mm512_mask3_fnmsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
2738 return (__m512h)__builtin_ia32_vfmsubph512_mask3(-(__v32hf)__A, (__v32hf)__B,
2739 (__v32hf)__C, (__mmask32)__U,
2740 _MM_FROUND_CUR_DIRECTION);
2741 }
2742
_mm_fmadd_sh(__m128h __W,__m128h __A,__m128h __B)2743 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_sh(__m128h __W,
2744 __m128h __A,
2745 __m128h __B) {
2746 return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A, (__v8hf)__B,
2747 (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
2748 }
2749
_mm_mask_fmadd_sh(__m128h __W,__mmask8 __U,__m128h __A,__m128h __B)2750 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmadd_sh(__m128h __W,
2751 __mmask8 __U,
2752 __m128h __A,
2753 __m128h __B) {
2754 return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A, (__v8hf)__B,
2755 (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2756 }
2757
/* Scalar fmadd with explicit R on __builtin_ia32_vfmaddsh3_mask. Operands are
 * cast through __m128h to __v8hf and passed unchanged. */

/* All-ones mask. */
#define _mm_fmadd_round_sh(A, B, C, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)(__m128h)(C), \
      (__mmask8)-1, \
      (int)(R)))

/* Caller mask U; W is the builtin's first operand. */
#define _mm_mask_fmadd_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
      (__v8hf)(__m128h)(W), \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__mmask8)(U), \
      (int)(R)))
2767
2768 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fmadd_sh(__mmask8 __U,__m128h __A,__m128h __B,__m128h __C)2769 _mm_maskz_fmadd_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2770 return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, (__v8hf)__B, (__v8hf)__C,
2771 (__mmask8)__U,
2772 _MM_FROUND_CUR_DIRECTION);
2773 }
2774
/* Passes A, B and C (cast through __m128h to __v8hf) unchanged to
 * __builtin_ia32_vfmaddsh3_maskz with mask U and R forwarded as int. */
#define _mm_maskz_fmadd_round_sh(U, A, B, C, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)(__m128h)(C), \
      (__mmask8)(U), \
      (int)(R)))
2779
2780 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fmadd_sh(__m128h __W,__m128h __X,__m128h __Y,__mmask8 __U)2781 _mm_mask3_fmadd_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
2782 return __builtin_ia32_vfmaddsh3_mask3((__v8hf)__W, (__v8hf)__X, (__v8hf)__Y,
2783 (__mmask8)__U,
2784 _MM_FROUND_CUR_DIRECTION);
2785 }
2786
/* Passes W, X and Y (cast through __m128h to __v8hf) unchanged to
 * __builtin_ia32_vfmaddsh3_mask3 with mask U and R forwarded as int. */
#define _mm_mask3_fmadd_round_sh(W, X, Y, U, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask3( \
      (__v8hf)(__m128h)(W), \
      (__v8hf)(__m128h)(X), \
      (__v8hf)(__m128h)(Y), \
      (__mmask8)(U), \
      (int)(R)))
2791
_mm_fmsub_sh(__m128h __W,__m128h __A,__m128h __B)2792 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmsub_sh(__m128h __W,
2793 __m128h __A,
2794 __m128h __B) {
2795 return (__m128h)__builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A,
2796 -(__v8hf)__B, (__mmask8)-1,
2797 _MM_FROUND_CUR_DIRECTION);
2798 }
2799
_mm_mask_fmsub_sh(__m128h __W,__mmask8 __U,__m128h __A,__m128h __B)2800 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmsub_sh(__m128h __W,
2801 __mmask8 __U,
2802 __m128h __A,
2803 __m128h __B) {
2804 return (__m128h)__builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A,
2805 -(__v8hf)__B, (__mmask8)__U,
2806 _MM_FROUND_CUR_DIRECTION);
2807 }
2808
/* Scalar fmsub with explicit R, expressed through
 * __builtin_ia32_vfmaddsh3_mask by negating the last vector operand. */

/* All-ones mask; C is negated. */
#define _mm_fmsub_round_sh(A, B, C, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      -(__v8hf)(__m128h)(C), \
      (__mmask8)-1, \
      (int)(R)))

/* Caller mask U; B is negated. */
#define _mm_mask_fmsub_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
      (__v8hf)(__m128h)(W), \
      (__v8hf)(__m128h)(A), \
      -(__v8hf)(__m128h)(B), \
      (__mmask8)(U), \
      (int)(R)))
2818
2819 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fmsub_sh(__mmask8 __U,__m128h __A,__m128h __B,__m128h __C)2820 _mm_maskz_fmsub_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2821 return (__m128h)__builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, (__v8hf)__B,
2822 -(__v8hf)__C, (__mmask8)__U,
2823 _MM_FROUND_CUR_DIRECTION);
2824 }
2825
/* Built on __builtin_ia32_vfmaddsh3_maskz with C negated; A and B are passed
 * unchanged, U is the mask and R is forwarded as int.
 * R is parenthesised inside the cast, as in the sibling *_round_sh macros, so
 * the cast applies to the whole argument expression rather than only to its
 * first token. */
#define _mm_maskz_fmsub_round_sh(U, A, B, C, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      -(__v8hf)(__m128h)(C), \
      (__mmask8)(U), \
      (int)(R)))
2830
2831 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fmsub_sh(__m128h __W,__m128h __X,__m128h __Y,__mmask8 __U)2832 _mm_mask3_fmsub_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
2833 return __builtin_ia32_vfmsubsh3_mask3((__v8hf)__W, (__v8hf)__X, (__v8hf)__Y,
2834 (__mmask8)__U,
2835 _MM_FROUND_CUR_DIRECTION);
2836 }
2837
/* Passes W, X and Y (cast through __m128h to __v8hf) unchanged to
 * __builtin_ia32_vfmsubsh3_mask3 with mask U and R forwarded as int. */
#define _mm_mask3_fmsub_round_sh(W, X, Y, U, R) \
  ((__m128h)__builtin_ia32_vfmsubsh3_mask3( \
      (__v8hf)(__m128h)(W), \
      (__v8hf)(__m128h)(X), \
      (__v8hf)(__m128h)(Y), \
      (__mmask8)(U), \
      (int)(R)))
2842
_mm_fnmadd_sh(__m128h __W,__m128h __A,__m128h __B)2843 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmadd_sh(__m128h __W,
2844 __m128h __A,
2845 __m128h __B) {
2846 return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, (__v8hf)__B,
2847 (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
2848 }
2849
2850 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fnmadd_sh(__m128h __W,__mmask8 __U,__m128h __A,__m128h __B)2851 _mm_mask_fnmadd_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
2852 return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, (__v8hf)__B,
2853 (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2854 }
2855
/* Scalar fnmadd with explicit R, expressed through
 * __builtin_ia32_vfmaddsh3_mask by negating the builtin's second operand. */

/* All-ones mask; B is negated. */
#define _mm_fnmadd_round_sh(A, B, C, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
      (__v8hf)(__m128h)(A), \
      -(__v8hf)(__m128h)(B), \
      (__v8hf)(__m128h)(C), \
      (__mmask8)-1, \
      (int)(R)))

/* Caller mask U; A is negated. */
#define _mm_mask_fnmadd_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
      (__v8hf)(__m128h)(W), \
      -(__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__mmask8)(U), \
      (int)(R)))
2865
2866 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fnmadd_sh(__mmask8 __U,__m128h __A,__m128h __B,__m128h __C)2867 _mm_maskz_fnmadd_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2868 return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, -(__v8hf)__B, (__v8hf)__C,
2869 (__mmask8)__U,
2870 _MM_FROUND_CUR_DIRECTION);
2871 }
2872
/* Built on __builtin_ia32_vfmaddsh3_maskz with B negated; A and C are passed
 * unchanged, U is the mask and R is forwarded as int. */
#define _mm_maskz_fnmadd_round_sh(U, A, B, C, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \
      (__v8hf)(__m128h)(A), \
      -(__v8hf)(__m128h)(B), \
      (__v8hf)(__m128h)(C), \
      (__mmask8)(U), \
      (int)(R)))
2877
2878 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fnmadd_sh(__m128h __W,__m128h __X,__m128h __Y,__mmask8 __U)2879 _mm_mask3_fnmadd_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
2880 return __builtin_ia32_vfmaddsh3_mask3((__v8hf)__W, -(__v8hf)__X, (__v8hf)__Y,
2881 (__mmask8)__U,
2882 _MM_FROUND_CUR_DIRECTION);
2883 }
2884
/* Built on __builtin_ia32_vfmaddsh3_mask3 with X negated; W and Y are passed
 * unchanged, U is the mask and R is forwarded as int. */
#define _mm_mask3_fnmadd_round_sh(W, X, Y, U, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask3( \
      (__v8hf)(__m128h)(W), \
      -(__v8hf)(__m128h)(X), \
      (__v8hf)(__m128h)(Y), \
      (__mmask8)(U), \
      (int)(R)))
2889
_mm_fnmsub_sh(__m128h __W,__m128h __A,__m128h __B)2890 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmsub_sh(__m128h __W,
2891 __m128h __A,
2892 __m128h __B) {
2893 return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, -(__v8hf)__B,
2894 (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
2895 }
2896
2897 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fnmsub_sh(__m128h __W,__mmask8 __U,__m128h __A,__m128h __B)2898 _mm_mask_fnmsub_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
2899 return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, -(__v8hf)__B,
2900 (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2901 }
2902
/* Scalar fnmsub with explicit R, expressed through
 * __builtin_ia32_vfmaddsh3_mask by negating the builtin's second and third
 * operands. */

/* All-ones mask; B and C are negated. */
#define _mm_fnmsub_round_sh(A, B, C, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
      (__v8hf)(__m128h)(A), \
      -(__v8hf)(__m128h)(B), \
      -(__v8hf)(__m128h)(C), \
      (__mmask8)-1, \
      (int)(R)))

/* Caller mask U; A and B are negated. */
#define _mm_mask_fnmsub_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
      (__v8hf)(__m128h)(W), \
      -(__v8hf)(__m128h)(A), \
      -(__v8hf)(__m128h)(B), \
      (__mmask8)(U), \
      (int)(R)))
2912
2913 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fnmsub_sh(__mmask8 __U,__m128h __A,__m128h __B,__m128h __C)2914 _mm_maskz_fnmsub_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2915 return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, -(__v8hf)__B, -(__v8hf)__C,
2916 (__mmask8)__U,
2917 _MM_FROUND_CUR_DIRECTION);
2918 }
2919
/* Built on __builtin_ia32_vfmaddsh3_maskz with B and C negated; A is passed
 * unchanged, U is the mask and R is forwarded as int. */
#define _mm_maskz_fnmsub_round_sh(U, A, B, C, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \
      (__v8hf)(__m128h)(A), \
      -(__v8hf)(__m128h)(B), \
      -(__v8hf)(__m128h)(C), \
      (__mmask8)(U), \
      (int)(R)))
2924
2925 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fnmsub_sh(__m128h __W,__m128h __X,__m128h __Y,__mmask8 __U)2926 _mm_mask3_fnmsub_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
2927 return __builtin_ia32_vfmsubsh3_mask3((__v8hf)__W, -(__v8hf)__X, (__v8hf)__Y,
2928 (__mmask8)__U,
2929 _MM_FROUND_CUR_DIRECTION);
2930 }
2931
/* Expands to __builtin_ia32_vfmsubsh3_mask3 called with W, the negation of X,
 * and Y (each viewed as __v8hf), mask U, and rounding argument R. The result
 * is cast to __m128h. */
#define _mm_mask3_fnmsub_round_sh(W, X, Y, U, R) \
  ((__m128h)__builtin_ia32_vfmsubsh3_mask3( \
      (__v8hf)(__m128h)(W), \
      -(__v8hf)(__m128h)(X), \
      (__v8hf)(__m128h)(Y), \
      (__mmask8)(U), \
      (int)(R)))
2936
_mm_fcmadd_sch(__m128h __A,__m128h __B,__m128h __C)2937 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmadd_sch(__m128h __A,
2938 __m128h __B,
2939 __m128h __C) {
2940 return (__m128h)__builtin_ia32_vfcmaddcsh_mask((__v4sf)__A, (__v4sf)__B,
2941 (__v4sf)__C, (__mmask8)-1,
2942 _MM_FROUND_CUR_DIRECTION);
2943 }
2944
2945 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fcmadd_sch(__m128h __A,__mmask8 __U,__m128h __B,__m128h __C)2946 _mm_mask_fcmadd_sch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
2947 return (__m128h)__builtin_ia32_vfcmaddcsh_round_mask(
2948 (__v4sf)__A, (__v4sf)(__B), (__v4sf)(__C), __U, _MM_FROUND_CUR_DIRECTION);
2949 }
2950
2951 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fcmadd_sch(__mmask8 __U,__m128h __A,__m128h __B,__m128h __C)2952 _mm_maskz_fcmadd_sch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2953 return (__m128h)__builtin_ia32_vfcmaddcsh_maskz((__v4sf)__A, (__v4sf)__B,
2954 (__v4sf)__C, (__mmask8)__U,
2955 _MM_FROUND_CUR_DIRECTION);
2956 }
2957
2958 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fcmadd_sch(__m128h __A,__m128h __B,__m128h __C,__mmask8 __U)2959 _mm_mask3_fcmadd_sch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
2960 return (__m128h)__builtin_ia32_vfcmaddcsh_round_mask3(
2961 (__v4sf)__A, (__v4sf)__B, (__v4sf)__C, __U, _MM_FROUND_CUR_DIRECTION);
2962 }
2963
/* Expands to __builtin_ia32_vfcmaddcsh_mask called with A, B and C (each
 * viewed as __v4sf), an all-ones mask, and rounding argument R. The result
 * is cast to __m128h. */
#define _mm_fcmadd_round_sch(A, B, C, R) \
  ((__m128h)__builtin_ia32_vfcmaddcsh_mask( \
      (__v4sf)(__m128h)(A), \
      (__v4sf)(__m128h)(B), \
      (__v4sf)(__m128h)(C), \
      (__mmask8)-1, \
      (int)(R)))
2968
/* Expands to __builtin_ia32_vfcmaddcsh_round_mask called with A, B and C
 * (each viewed as __v4sf), mask U, and rounding argument R. The result is
 * cast to __m128h. */
#define _mm_mask_fcmadd_round_sch(A, U, B, C, R) \
  ((__m128h)__builtin_ia32_vfcmaddcsh_round_mask( \
      (__v4sf)(__m128h)(A), \
      (__v4sf)(__m128h)(B), \
      (__v4sf)(__m128h)(C), \
      (__mmask8)(U), \
      (int)(R)))
2973
/* Expands to __builtin_ia32_vfcmaddcsh_maskz called with A, B and C (each
 * viewed as __v4sf), mask U, and rounding argument R. The result is cast to
 * __m128h. */
#define _mm_maskz_fcmadd_round_sch(U, A, B, C, R) \
  ((__m128h)__builtin_ia32_vfcmaddcsh_maskz( \
      (__v4sf)(__m128h)(A), \
      (__v4sf)(__m128h)(B), \
      (__v4sf)(__m128h)(C), \
      (__mmask8)(U), \
      (int)(R)))
2978
/* Expands to __builtin_ia32_vfcmaddcsh_round_mask3 called with A, B and C
 * (each viewed as __v4sf), mask U, and rounding argument R. The result is
 * cast to __m128h. */
#define _mm_mask3_fcmadd_round_sch(A, B, C, U, R) \
  ((__m128h)__builtin_ia32_vfcmaddcsh_round_mask3( \
      (__v4sf)(__m128h)(A), \
      (__v4sf)(__m128h)(B), \
      (__v4sf)(__m128h)(C), \
      (__mmask8)(U), \
      (int)(R)))
2983
_mm_fmadd_sch(__m128h __A,__m128h __B,__m128h __C)2984 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_sch(__m128h __A,
2985 __m128h __B,
2986 __m128h __C) {
2987 return (__m128h)__builtin_ia32_vfmaddcsh_mask((__v4sf)__A, (__v4sf)__B,
2988 (__v4sf)__C, (__mmask8)-1,
2989 _MM_FROUND_CUR_DIRECTION);
2990 }
2991
2992 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fmadd_sch(__m128h __A,__mmask8 __U,__m128h __B,__m128h __C)2993 _mm_mask_fmadd_sch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
2994 return (__m128h)__builtin_ia32_vfmaddcsh_round_mask(
2995 (__v4sf)__A, (__v4sf)(__B), (__v4sf)(__C), __U, _MM_FROUND_CUR_DIRECTION);
2996 }
2997
2998 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fmadd_sch(__mmask8 __U,__m128h __A,__m128h __B,__m128h __C)2999 _mm_maskz_fmadd_sch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
3000 return (__m128h)__builtin_ia32_vfmaddcsh_maskz((__v4sf)__A, (__v4sf)__B,
3001 (__v4sf)__C, (__mmask8)__U,
3002 _MM_FROUND_CUR_DIRECTION);
3003 }
3004
3005 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fmadd_sch(__m128h __A,__m128h __B,__m128h __C,__mmask8 __U)3006 _mm_mask3_fmadd_sch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
3007 return (__m128h)__builtin_ia32_vfmaddcsh_round_mask3(
3008 (__v4sf)__A, (__v4sf)__B, (__v4sf)__C, __U, _MM_FROUND_CUR_DIRECTION);
3009 }
3010
/* Expands to __builtin_ia32_vfmaddcsh_mask called with A, B and C (each
 * viewed as __v4sf), an all-ones mask, and rounding argument R. The result
 * is cast to __m128h. */
#define _mm_fmadd_round_sch(A, B, C, R) \
  ((__m128h)__builtin_ia32_vfmaddcsh_mask( \
      (__v4sf)(__m128h)(A), \
      (__v4sf)(__m128h)(B), \
      (__v4sf)(__m128h)(C), \
      (__mmask8)-1, \
      (int)(R)))
3015
/* Expands to __builtin_ia32_vfmaddcsh_round_mask called with A, B and C
 * (each viewed as __v4sf), mask U, and rounding argument R. The result is
 * cast to __m128h. */
#define _mm_mask_fmadd_round_sch(A, U, B, C, R) \
  ((__m128h)__builtin_ia32_vfmaddcsh_round_mask( \
      (__v4sf)(__m128h)(A), \
      (__v4sf)(__m128h)(B), \
      (__v4sf)(__m128h)(C), \
      (__mmask8)(U), \
      (int)(R)))
3020
/* Expands to __builtin_ia32_vfmaddcsh_maskz called with A, B and C (each
 * viewed as __v4sf), mask U, and rounding argument R. The result is cast to
 * __m128h. */
#define _mm_maskz_fmadd_round_sch(U, A, B, C, R) \
  ((__m128h)__builtin_ia32_vfmaddcsh_maskz( \
      (__v4sf)(__m128h)(A), \
      (__v4sf)(__m128h)(B), \
      (__v4sf)(__m128h)(C), \
      (__mmask8)(U), \
      (int)(R)))
3025
/* Expands to __builtin_ia32_vfmaddcsh_round_mask3 called with A, B and C
 * (each viewed as __v4sf), mask U, and rounding argument R. The result is
 * cast to __m128h. */
#define _mm_mask3_fmadd_round_sch(A, B, C, U, R) \
  ((__m128h)__builtin_ia32_vfmaddcsh_round_mask3( \
      (__v4sf)(__m128h)(A), \
      (__v4sf)(__m128h)(B), \
      (__v4sf)(__m128h)(C), \
      (__mmask8)(U), \
      (int)(R)))
3030
_mm_fcmul_sch(__m128h __A,__m128h __B)3031 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmul_sch(__m128h __A,
3032 __m128h __B) {
3033 return (__m128h)__builtin_ia32_vfcmulcsh_mask(
3034 (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1,
3035 _MM_FROUND_CUR_DIRECTION);
3036 }
3037
3038 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fcmul_sch(__m128h __W,__mmask8 __U,__m128h __A,__m128h __B)3039 _mm_mask_fcmul_sch(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
3040 return (__m128h)__builtin_ia32_vfcmulcsh_mask((__v4sf)__A, (__v4sf)__B,
3041 (__v4sf)__W, (__mmask8)__U,
3042 _MM_FROUND_CUR_DIRECTION);
3043 }
3044
3045 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fcmul_sch(__mmask8 __U,__m128h __A,__m128h __B)3046 _mm_maskz_fcmul_sch(__mmask8 __U, __m128h __A, __m128h __B) {
3047 return (__m128h)__builtin_ia32_vfcmulcsh_mask(
3048 (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U,
3049 _MM_FROUND_CUR_DIRECTION);
3050 }
3051
/* Expands to __builtin_ia32_vfcmulcsh_mask called with A and B (viewed as
 * __v4sf), _mm_undefined_ph() as the third vector operand, an all-ones mask,
 * and rounding argument R. The result is cast to __m128h. */
#define _mm_fcmul_round_sch(A, B, R) \
  ((__m128h)__builtin_ia32_vfcmulcsh_mask( \
      (__v4sf)(__m128h)(A), \
      (__v4sf)(__m128h)(B), \
      (__v4sf)(__m128h)_mm_undefined_ph(), \
      (__mmask8)-1, \
      (int)(R)))
3056
/* Expands to __builtin_ia32_vfcmulcsh_mask called with A and B (viewed as
 * __v4sf), W as the third vector operand, mask U, and rounding argument R.
 * The result is cast to __m128h. */
#define _mm_mask_fcmul_round_sch(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_vfcmulcsh_mask( \
      (__v4sf)(__m128h)(A), \
      (__v4sf)(__m128h)(B), \
      (__v4sf)(__m128h)(W), \
      (__mmask8)(U), \
      (int)(R)))
3061
/* Expands to __builtin_ia32_vfcmulcsh_mask called with A and B (viewed as
 * __v4sf), _mm_setzero_ph() as the third vector operand, mask U, and
 * rounding argument R. The result is cast to __m128h. */
#define _mm_maskz_fcmul_round_sch(U, A, B, R) \
  ((__m128h)__builtin_ia32_vfcmulcsh_mask( \
      (__v4sf)(__m128h)(A), \
      (__v4sf)(__m128h)(B), \
      (__v4sf)(__m128h)_mm_setzero_ph(), \
      (__mmask8)(U), \
      (int)(R)))
3066
_mm_fmul_sch(__m128h __A,__m128h __B)3067 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmul_sch(__m128h __A,
3068 __m128h __B) {
3069 return (__m128h)__builtin_ia32_vfmulcsh_mask(
3070 (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1,
3071 _MM_FROUND_CUR_DIRECTION);
3072 }
3073
_mm_mask_fmul_sch(__m128h __W,__mmask8 __U,__m128h __A,__m128h __B)3074 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmul_sch(__m128h __W,
3075 __mmask8 __U,
3076 __m128h __A,
3077 __m128h __B) {
3078 return (__m128h)__builtin_ia32_vfmulcsh_mask((__v4sf)__A, (__v4sf)__B,
3079 (__v4sf)__W, (__mmask8)__U,
3080 _MM_FROUND_CUR_DIRECTION);
3081 }
3082
3083 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fmul_sch(__mmask8 __U,__m128h __A,__m128h __B)3084 _mm_maskz_fmul_sch(__mmask8 __U, __m128h __A, __m128h __B) {
3085 return (__m128h)__builtin_ia32_vfmulcsh_mask(
3086 (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U,
3087 _MM_FROUND_CUR_DIRECTION);
3088 }
3089
/* Expands to __builtin_ia32_vfmulcsh_mask called with A and B (viewed as
 * __v4sf), _mm_undefined_ph() as the third vector operand, an all-ones mask,
 * and rounding argument R. The result is cast to __m128h. */
#define _mm_fmul_round_sch(A, B, R) \
  ((__m128h)__builtin_ia32_vfmulcsh_mask( \
      (__v4sf)(__m128h)(A), \
      (__v4sf)(__m128h)(B), \
      (__v4sf)(__m128h)_mm_undefined_ph(), \
      (__mmask8)-1, \
      (int)(R)))
3094
/* Expands to __builtin_ia32_vfmulcsh_mask called with A and B (viewed as
 * __v4sf), W as the third vector operand, mask U, and rounding argument R.
 * The result is cast to __m128h. */
#define _mm_mask_fmul_round_sch(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_vfmulcsh_mask( \
      (__v4sf)(__m128h)(A), \
      (__v4sf)(__m128h)(B), \
      (__v4sf)(__m128h)(W), \
      (__mmask8)(U), \
      (int)(R)))
3099
/* Expands to __builtin_ia32_vfmulcsh_mask called with A and B (viewed as
 * __v4sf), _mm_setzero_ph() as the third vector operand, mask U, and
 * rounding argument R. The result is cast to __m128h. */
#define _mm_maskz_fmul_round_sch(U, A, B, R) \
  ((__m128h)__builtin_ia32_vfmulcsh_mask( \
      (__v4sf)(__m128h)(A), \
      (__v4sf)(__m128h)(B), \
      (__v4sf)(__m128h)_mm_setzero_ph(), \
      (__mmask8)(U), \
      (int)(R)))
3104
_mm512_fcmul_pch(__m512h __A,__m512h __B)3105 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fcmul_pch(__m512h __A,
3106 __m512h __B) {
3107 return (__m512h)__builtin_ia32_vfcmulcph512_mask(
3108 (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_undefined_ph(), (__mmask16)-1,
3109 _MM_FROUND_CUR_DIRECTION);
3110 }
3111
3112 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fcmul_pch(__m512h __W,__mmask16 __U,__m512h __A,__m512h __B)3113 _mm512_mask_fcmul_pch(__m512h __W, __mmask16 __U, __m512h __A, __m512h __B) {
3114 return (__m512h)__builtin_ia32_vfcmulcph512_mask((__v16sf)__A, (__v16sf)__B,
3115 (__v16sf)__W, (__mmask16)__U,
3116 _MM_FROUND_CUR_DIRECTION);
3117 }
3118
3119 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fcmul_pch(__mmask16 __U,__m512h __A,__m512h __B)3120 _mm512_maskz_fcmul_pch(__mmask16 __U, __m512h __A, __m512h __B) {
3121 return (__m512h)__builtin_ia32_vfcmulcph512_mask(
3122 (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ph(), (__mmask16)__U,
3123 _MM_FROUND_CUR_DIRECTION);
3124 }
3125
/* Expands to __builtin_ia32_vfcmulcph512_mask called with A and B (viewed as
 * __v16sf), _mm512_undefined_ph() as the third vector operand, an all-ones
 * mask, and rounding argument R. The result is cast to __m512h. */
#define _mm512_fcmul_round_pch(A, B, R) \
  ((__m512h)__builtin_ia32_vfcmulcph512_mask( \
      (__v16sf)(__m512h)(A), \
      (__v16sf)(__m512h)(B), \
      (__v16sf)(__m512h)_mm512_undefined_ph(), \
      (__mmask16)-1, \
      (int)(R)))
3130
/* Expands to __builtin_ia32_vfcmulcph512_mask called with A and B (viewed as
 * __v16sf), W as the third vector operand, mask U, and rounding argument R.
 * The result is cast to __m512h. */
#define _mm512_mask_fcmul_round_pch(W, U, A, B, R) \
  ((__m512h)__builtin_ia32_vfcmulcph512_mask( \
      (__v16sf)(__m512h)(A), \
      (__v16sf)(__m512h)(B), \
      (__v16sf)(__m512h)(W), \
      (__mmask16)(U), \
      (int)(R)))
3135
/* Expands to __builtin_ia32_vfcmulcph512_mask called with A and B (viewed as
 * __v16sf), _mm512_setzero_ph() as the third vector operand, mask U, and
 * rounding argument R. The result is cast to __m512h. */
#define _mm512_maskz_fcmul_round_pch(U, A, B, R) \
  ((__m512h)__builtin_ia32_vfcmulcph512_mask( \
      (__v16sf)(__m512h)(A), \
      (__v16sf)(__m512h)(B), \
      (__v16sf)(__m512h)_mm512_setzero_ph(), \
      (__mmask16)(U), \
      (int)(R)))
3140
_mm512_fmul_pch(__m512h __A,__m512h __B)3141 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmul_pch(__m512h __A,
3142 __m512h __B) {
3143 return (__m512h)__builtin_ia32_vfmulcph512_mask(
3144 (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_undefined_ph(), (__mmask16)-1,
3145 _MM_FROUND_CUR_DIRECTION);
3146 }
3147
3148 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fmul_pch(__m512h __W,__mmask16 __U,__m512h __A,__m512h __B)3149 _mm512_mask_fmul_pch(__m512h __W, __mmask16 __U, __m512h __A, __m512h __B) {
3150 return (__m512h)__builtin_ia32_vfmulcph512_mask((__v16sf)__A, (__v16sf)__B,
3151 (__v16sf)__W, (__mmask16)__U,
3152 _MM_FROUND_CUR_DIRECTION);
3153 }
3154
3155 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fmul_pch(__mmask16 __U,__m512h __A,__m512h __B)3156 _mm512_maskz_fmul_pch(__mmask16 __U, __m512h __A, __m512h __B) {
3157 return (__m512h)__builtin_ia32_vfmulcph512_mask(
3158 (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ph(), (__mmask16)__U,
3159 _MM_FROUND_CUR_DIRECTION);
3160 }
3161
/* Expands to __builtin_ia32_vfmulcph512_mask called with A and B (viewed as
 * __v16sf), _mm512_undefined_ph() as the third vector operand, an all-ones
 * mask, and rounding argument R. The result is cast to __m512h. */
#define _mm512_fmul_round_pch(A, B, R) \
  ((__m512h)__builtin_ia32_vfmulcph512_mask( \
      (__v16sf)(__m512h)(A), \
      (__v16sf)(__m512h)(B), \
      (__v16sf)(__m512h)_mm512_undefined_ph(), \
      (__mmask16)-1, \
      (int)(R)))
3166
/* Expands to __builtin_ia32_vfmulcph512_mask called with A and B (viewed as
 * __v16sf), W as the third vector operand, mask U, and rounding argument R.
 * The result is cast to __m512h. */
#define _mm512_mask_fmul_round_pch(W, U, A, B, R) \
  ((__m512h)__builtin_ia32_vfmulcph512_mask( \
      (__v16sf)(__m512h)(A), \
      (__v16sf)(__m512h)(B), \
      (__v16sf)(__m512h)(W), \
      (__mmask16)(U), \
      (int)(R)))
3171
/* Expands to __builtin_ia32_vfmulcph512_mask called with A and B (viewed as
 * __v16sf), _mm512_setzero_ph() as the third vector operand, mask U, and
 * rounding argument R. The result is cast to __m512h. */
#define _mm512_maskz_fmul_round_pch(U, A, B, R) \
  ((__m512h)__builtin_ia32_vfmulcph512_mask( \
      (__v16sf)(__m512h)(A), \
      (__v16sf)(__m512h)(B), \
      (__v16sf)(__m512h)_mm512_setzero_ph(), \
      (__mmask16)(U), \
      (int)(R)))
3176
_mm512_fcmadd_pch(__m512h __A,__m512h __B,__m512h __C)3177 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fcmadd_pch(__m512h __A,
3178 __m512h __B,
3179 __m512h __C) {
3180 return (__m512h)__builtin_ia32_vfcmaddcph512_mask3(
3181 (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)-1,
3182 _MM_FROUND_CUR_DIRECTION);
3183 }
3184
3185 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fcmadd_pch(__m512h __A,__mmask16 __U,__m512h __B,__m512h __C)3186 _mm512_mask_fcmadd_pch(__m512h __A, __mmask16 __U, __m512h __B, __m512h __C) {
3187 return (__m512h)__builtin_ia32_vfcmaddcph512_mask(
3188 (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
3189 _MM_FROUND_CUR_DIRECTION);
3190 }
3191
3192 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask3_fcmadd_pch(__m512h __A,__m512h __B,__m512h __C,__mmask16 __U)3193 _mm512_mask3_fcmadd_pch(__m512h __A, __m512h __B, __m512h __C, __mmask16 __U) {
3194 return (__m512h)__builtin_ia32_vfcmaddcph512_mask3(
3195 (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
3196 _MM_FROUND_CUR_DIRECTION);
3197 }
3198
3199 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fcmadd_pch(__mmask16 __U,__m512h __A,__m512h __B,__m512h __C)3200 _mm512_maskz_fcmadd_pch(__mmask16 __U, __m512h __A, __m512h __B, __m512h __C) {
3201 return (__m512h)__builtin_ia32_vfcmaddcph512_maskz(
3202 (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
3203 _MM_FROUND_CUR_DIRECTION);
3204 }
3205
/* Expands to __builtin_ia32_vfcmaddcph512_mask3 called with A, B and C (each
 * viewed as __v16sf), an all-ones mask, and rounding argument R. The result
 * is cast to __m512h. */
#define _mm512_fcmadd_round_pch(A, B, C, R) \
  ((__m512h)__builtin_ia32_vfcmaddcph512_mask3( \
      (__v16sf)(__m512h)(A), \
      (__v16sf)(__m512h)(B), \
      (__v16sf)(__m512h)(C), \
      (__mmask16)-1, \
      (int)(R)))
3210
/* Expands to __builtin_ia32_vfcmaddcph512_mask called with A, B and C (each
 * viewed as __v16sf), mask U, and rounding argument R. The result is cast to
 * __m512h. */
#define _mm512_mask_fcmadd_round_pch(A, U, B, C, R) \
  ((__m512h)__builtin_ia32_vfcmaddcph512_mask( \
      (__v16sf)(__m512h)(A), \
      (__v16sf)(__m512h)(B), \
      (__v16sf)(__m512h)(C), \
      (__mmask16)(U), \
      (int)(R)))
3215
/* Expands to __builtin_ia32_vfcmaddcph512_mask3 called with A, B and C (each
 * viewed as __v16sf), mask U, and rounding argument R. The result is cast to
 * __m512h. */
#define _mm512_mask3_fcmadd_round_pch(A, B, C, U, R) \
  ((__m512h)__builtin_ia32_vfcmaddcph512_mask3( \
      (__v16sf)(__m512h)(A), \
      (__v16sf)(__m512h)(B), \
      (__v16sf)(__m512h)(C), \
      (__mmask16)(U), \
      (int)(R)))
3220
/* Expands to __builtin_ia32_vfcmaddcph512_maskz called with A, B and C (each
 * viewed as __v16sf), mask U, and rounding argument R. The result is cast to
 * __m512h. */
#define _mm512_maskz_fcmadd_round_pch(U, A, B, C, R) \
  ((__m512h)__builtin_ia32_vfcmaddcph512_maskz( \
      (__v16sf)(__m512h)(A), \
      (__v16sf)(__m512h)(B), \
      (__v16sf)(__m512h)(C), \
      (__mmask16)(U), \
      (int)(R)))
3225
_mm512_fmadd_pch(__m512h __A,__m512h __B,__m512h __C)3226 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmadd_pch(__m512h __A,
3227 __m512h __B,
3228 __m512h __C) {
3229 return (__m512h)__builtin_ia32_vfmaddcph512_mask3((__v16sf)__A, (__v16sf)__B,
3230 (__v16sf)__C, (__mmask16)-1,
3231 _MM_FROUND_CUR_DIRECTION);
3232 }
3233
3234 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fmadd_pch(__m512h __A,__mmask16 __U,__m512h __B,__m512h __C)3235 _mm512_mask_fmadd_pch(__m512h __A, __mmask16 __U, __m512h __B, __m512h __C) {
3236 return (__m512h)__builtin_ia32_vfmaddcph512_mask((__v16sf)__A, (__v16sf)__B,
3237 (__v16sf)__C, (__mmask16)__U,
3238 _MM_FROUND_CUR_DIRECTION);
3239 }
3240
3241 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask3_fmadd_pch(__m512h __A,__m512h __B,__m512h __C,__mmask16 __U)3242 _mm512_mask3_fmadd_pch(__m512h __A, __m512h __B, __m512h __C, __mmask16 __U) {
3243 return (__m512h)__builtin_ia32_vfmaddcph512_mask3(
3244 (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
3245 _MM_FROUND_CUR_DIRECTION);
3246 }
3247
3248 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fmadd_pch(__mmask16 __U,__m512h __A,__m512h __B,__m512h __C)3249 _mm512_maskz_fmadd_pch(__mmask16 __U, __m512h __A, __m512h __B, __m512h __C) {
3250 return (__m512h)__builtin_ia32_vfmaddcph512_maskz(
3251 (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
3252 _MM_FROUND_CUR_DIRECTION);
3253 }
3254
/* Expands to __builtin_ia32_vfmaddcph512_mask3 called with A, B and C (each
 * viewed as __v16sf), an all-ones mask, and rounding argument R. The result
 * is cast to __m512h. */
#define _mm512_fmadd_round_pch(A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddcph512_mask3( \
      (__v16sf)(__m512h)(A), \
      (__v16sf)(__m512h)(B), \
      (__v16sf)(__m512h)(C), \
      (__mmask16)-1, \
      (int)(R)))
3259
/* Expands to __builtin_ia32_vfmaddcph512_mask called with A, B and C (each
 * viewed as __v16sf), mask U, and rounding argument R. The result is cast to
 * __m512h. */
#define _mm512_mask_fmadd_round_pch(A, U, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddcph512_mask( \
      (__v16sf)(__m512h)(A), \
      (__v16sf)(__m512h)(B), \
      (__v16sf)(__m512h)(C), \
      (__mmask16)(U), \
      (int)(R)))
3264
/* Expands to __builtin_ia32_vfmaddcph512_mask3 called with A, B and C (each
 * viewed as __v16sf), mask U, and rounding argument R. The result is cast to
 * __m512h. */
#define _mm512_mask3_fmadd_round_pch(A, B, C, U, R) \
  ((__m512h)__builtin_ia32_vfmaddcph512_mask3( \
      (__v16sf)(__m512h)(A), \
      (__v16sf)(__m512h)(B), \
      (__v16sf)(__m512h)(C), \
      (__mmask16)(U), \
      (int)(R)))
3269
/* Expands to __builtin_ia32_vfmaddcph512_maskz called with A, B and C (each
 * viewed as __v16sf), mask U, and rounding argument R. The result is cast to
 * __m512h. */
#define _mm512_maskz_fmadd_round_pch(U, A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddcph512_maskz( \
      (__v16sf)(__m512h)(A), \
      (__v16sf)(__m512h)(B), \
      (__v16sf)(__m512h)(C), \
      (__mmask16)(U), \
      (int)(R)))
3274
3275 static __inline__ _Float16 __DEFAULT_FN_ATTRS512
_mm512_reduce_add_ph(__m512h __W)3276 _mm512_reduce_add_ph(__m512h __W) {
3277 return __builtin_ia32_reduce_fadd_ph512(-0.0f16, __W);
3278 }
3279
3280 static __inline__ _Float16 __DEFAULT_FN_ATTRS512
_mm512_reduce_mul_ph(__m512h __W)3281 _mm512_reduce_mul_ph(__m512h __W) {
3282 return __builtin_ia32_reduce_fmul_ph512(1.0f16, __W);
3283 }
3284
3285 static __inline__ _Float16 __DEFAULT_FN_ATTRS512
_mm512_reduce_max_ph(__m512h __V)3286 _mm512_reduce_max_ph(__m512h __V) {
3287 return __builtin_ia32_reduce_fmax_ph512(__V);
3288 }
3289
3290 static __inline__ _Float16 __DEFAULT_FN_ATTRS512
_mm512_reduce_min_ph(__m512h __V)3291 _mm512_reduce_min_ph(__m512h __V) {
3292 return __builtin_ia32_reduce_fmin_ph512(__V);
3293 }
3294
3295 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_blend_ph(__mmask32 __U,__m512h __A,__m512h __W)3296 _mm512_mask_blend_ph(__mmask32 __U, __m512h __A, __m512h __W) {
3297 return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U, (__v32hf)__W,
3298 (__v32hf)__A);
3299 }
3300
3301 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_permutex2var_ph(__m512h __A,__m512i __I,__m512h __B)3302 _mm512_permutex2var_ph(__m512h __A, __m512i __I, __m512h __B) {
3303 return (__m512h)__builtin_ia32_vpermi2varhi512((__v32hi)__A, (__v32hi)__I,
3304 (__v32hi)__B);
3305 }
3306
3307 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_permutexvar_ph(__m512i __A,__m512h __B)3308 _mm512_permutexvar_ph(__m512i __A, __m512h __B) {
3309 return (__m512h)__builtin_ia32_permvarhi512((__v32hi)__B, (__v32hi)__A);
3310 }
3311
// The intrinsics below are aliases for the corresponding f*mul_*ch
// intrinsics. This group maps the _mm512_*mul_pch names onto the
// _mm512_*fmul_pch family, forwarding all arguments unchanged.
#define _mm512_mul_pch(A, B) \
  _mm512_fmul_pch(A, B)
#define _mm512_mask_mul_pch(W, U, A, B) \
  _mm512_mask_fmul_pch(W, U, A, B)
#define _mm512_maskz_mul_pch(U, A, B) \
  _mm512_maskz_fmul_pch(U, A, B)
#define _mm512_mul_round_pch(A, B, R) \
  _mm512_fmul_round_pch(A, B, R)
#define _mm512_mask_mul_round_pch(W, U, A, B, R) \
  _mm512_mask_fmul_round_pch(W, U, A, B, R)
#define _mm512_maskz_mul_round_pch(U, A, B, R) \
  _mm512_maskz_fmul_round_pch(U, A, B, R)
3321
// Aliases: the _mm512_*cmul_pch names expand to the matching
// _mm512_*fcmul_pch intrinsics, forwarding all arguments unchanged.
#define _mm512_cmul_pch(A, B) \
  _mm512_fcmul_pch(A, B)
#define _mm512_mask_cmul_pch(W, U, A, B) \
  _mm512_mask_fcmul_pch(W, U, A, B)
#define _mm512_maskz_cmul_pch(U, A, B) \
  _mm512_maskz_fcmul_pch(U, A, B)
#define _mm512_cmul_round_pch(A, B, R) \
  _mm512_fcmul_round_pch(A, B, R)
#define _mm512_mask_cmul_round_pch(W, U, A, B, R) \
  _mm512_mask_fcmul_round_pch(W, U, A, B, R)
#define _mm512_maskz_cmul_round_pch(U, A, B, R) \
  _mm512_maskz_fcmul_round_pch(U, A, B, R)
3330
// Aliases: the _mm_*mul_sch names expand to the matching _mm_*fmul_sch
// intrinsics, forwarding all arguments unchanged.
#define _mm_mul_sch(A, B) \
  _mm_fmul_sch(A, B)
#define _mm_mask_mul_sch(W, U, A, B) \
  _mm_mask_fmul_sch(W, U, A, B)
#define _mm_maskz_mul_sch(U, A, B) \
  _mm_maskz_fmul_sch(U, A, B)
#define _mm_mul_round_sch(A, B, R) \
  _mm_fmul_round_sch(A, B, R)
#define _mm_mask_mul_round_sch(W, U, A, B, R) \
  _mm_mask_fmul_round_sch(W, U, A, B, R)
#define _mm_maskz_mul_round_sch(U, A, B, R) \
  _mm_maskz_fmul_round_sch(U, A, B, R)
3338
// Aliases: the _mm_*cmul_sch names expand to the matching _mm_*fcmul_sch
// intrinsics, forwarding all arguments unchanged.
#define _mm_cmul_sch(A, B) \
  _mm_fcmul_sch(A, B)
#define _mm_mask_cmul_sch(W, U, A, B) \
  _mm_mask_fcmul_sch(W, U, A, B)
#define _mm_maskz_cmul_sch(U, A, B) \
  _mm_maskz_fcmul_sch(U, A, B)
#define _mm_cmul_round_sch(A, B, R) \
  _mm_fcmul_round_sch(A, B, R)
#define _mm_mask_cmul_round_sch(W, U, A, B, R) \
  _mm_mask_fcmul_round_sch(W, U, A, B, R)
#define _mm_maskz_cmul_round_sch(U, A, B, R) \
  _mm_maskz_fcmul_round_sch(U, A, B, R)
3347
3348 #undef __DEFAULT_FN_ATTRS128
3349 #undef __DEFAULT_FN_ATTRS256
3350 #undef __DEFAULT_FN_ATTRS512
3351
3352 #endif
3353 #endif
3354